Exemplo n.º 1
0
def _collect_and_validate_regions(regions):
    """Validate every record in the BED file at regions["BED"].

    Blank lines and lines starting with "#" are ignored. Every other line
    is parsed as a BEDRecord and must
      1. contain at least the first 6 BED fields,
      2. name a contig known to _collect_fasta_contigs(regions), and
      3. satisfy 0 <= start < end <= length of that contig.

    Raises MakefileError describing the offending record if the line
    cannot be parsed or if any of the checks above fails. Line numbers in
    error messages are 1-based.

    NOTE(review): the record names are gathered in `sequences`, but the
    visible code never returns or otherwise uses the set -- confirm whether
    the tail of this function (e.g. a return statement) is missing.
    """
    # Looked up by contig name; values are compared against BED end
    # coordinates below, i.e. they are used as contig lengths.
    contigs = _collect_fasta_contigs(regions)
    sequences = set()
    with open(regions["BED"]) as bedhandle:
        # Count lines from 1, so that every error message reports the same
        # (human-readable) number for a given line.
        for (line_num, line) in enumerate(bedhandle, 1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = BEDRecord(line)
            except ValueError as error:
                raise MakefileError(
                    ("Error parsing line %i in regions file:\n"
                     "  Path = %r\n  Line = %r\n\n%s") %
                    (line_num, regions["BED"], line, error))

            if len(bed) < 6:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                # The name is the 4th BED field; it may be absent here.
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise MakefileError(("Region at line #%i (%s) does not "
                                     "contain the expected number of fields; "
                                     "the first 6 fields are required. C.f. "
                                     "defination at\n   %s\n\nPath = %r") %
                                    (line_num, name, url, regions["BED"]))

            contig_len = contigs.get(bed.contig)
            if contig_len is None:
                raise MakefileError(("Regions file contains contig not found "
                                     "in reference:\n  Path = %r\n  Contig = "
                                     "%r\n\nPlease ensure that all contig "
                                     "names match the reference names!") %
                                    (regions["BED"], bed.contig))
            elif not (0 <= bed.start < bed.end <= contig_len):
                raise MakefileError(("Regions file contains invalid region:\n"
                                     "  Path   = %r\n  Contig = %r\n"
                                     "  Start  = %s\n  End    = %s\n\n"
                                     "Expected 0 <= Start < End <= %i!") %
                                    (regions["BED"], bed.contig, bed.start,
                                     bed.end, contig_len))

            sequences.add(bed.name)
Exemplo n.º 2
0
 def _stat_areas_of_interest(cls, prefixes):
     """Collect summary statistics for every region-of-interest BED file.

     For each prefix in `prefixes` (a mapping of prefix name to a prefix
     dict), every (name, filename) pair under its optional
     "RegionsOfInterest" key is read as a BED file. Blank lines and lines
     starting with "#" are skipped.

     Returns a dict keyed by (prefix name, region name), where each value
     is a dict containing
       "Size"       -- sum of (end - start) over all intervals,
       "NFeatures"  -- number of distinct interval names; intervals with
                       fewer than 4 fields are grouped as "<contig>*",
       "NIntervals" -- total number of intervals,
       "Genome"     -- prefix["Name"] (required; KeyError if missing),
       "Name"       -- the region name,
       "Label"      -- "<label>:<region name>", where the label is
                       prefix["Label"], defaulting to the prefix name,
       "Path"       -- the BED filename.
     """
     areas_of_interest = {}
     for (prefix_name, prefix) in prefixes.items():
         prefix_label = prefix.get("Label", prefix_name)
         regions = prefix.get("RegionsOfInterest", {})
         for (roi_name, roi_filename) in regions.items():
             count, names, size = 0, set(), 0
             with open(roi_filename) as handle:
                 for line in handle:
                     # Blank lines and comments are not intervals; the
                     # check uses a stripped copy so that real records
                     # reach BEDRecord unmodified.
                     stripped = line.strip()
                     if not stripped or stripped.startswith("#"):
                         continue

                     bed = BEDRecord(line)
                     if len(bed) >= 4:
                         names.add(bed.name)
                     else:
                         # Unnamed intervals are counted as one feature
                         # per contig.
                         names.add(bed.contig + "*")
                     size += (bed.end - bed.start)
                     count += 1

             areas_of_interest[(prefix_name, roi_name)] = {
                 "Size": size,
                 "NFeatures": len(names),
                 "NIntervals": count,
                 "Genome": prefix["Name"],
                 "Name": roi_name,
                 "Label": "%s:%s" % (prefix_label, roi_name),
                 "Path": roi_filename,
             }
     return areas_of_interest