def find_cds():
    """Report whether the annotated CDS of the current record contains a stop codon.

    Reads the module-level ``record_dict`` and ``keys``. The record description
    is split on ``|`` and the first field that starts with ``CDS`` is parsed as
    ``CDS:<start>-<end>`` (start is shifted by -1, so the coordinates are
    presumably 1-based inclusive -- confirm against the data source).

    Returns:
        1 if the translated CDS contains a ``*`` (stop codon), otherwise 0.
        0 is also returned when the description carries no CDS field.
    """
    record = record_dict[keys]
    for field in str(record.description).split("|"):
        # Keep scanning: the CDS annotation is not necessarily the first field.
        if not re.match("CDS", field):
            continue
        _feature, cds_start, cds_end = re.split(":|-", field)
        location = FeatureLocation(int(cds_start) - 1, int(cds_end))
        protein_sequence = location.extract(record.seq).translate()
        return 1 if "*" in protein_sequence else 0
    return 0
def find_cds():
    """Classify the annotated CDS of the current record by stop-codon content.

    Reads the module-level ``record_dict`` and ``keys``. The record description
    is split on ``|`` and the first field that starts with ``CDS:`` is parsed as
    ``CDS:<start>-<end>`` (start is shifted by -1, so the coordinates are
    presumably 1-based inclusive -- confirm against the data source). The id of
    a record with such a field is printed.

    Returns:
        3 if the translated CDS contains no ``*``,
        1 if it contains a ``*`` (stop codon),
        0 if no field starts with ``CDS:``.
    """
    record = record_dict[keys]
    for des in str(record.description).split("|"):
        if re.match("CDS:", des) is None:
            continue
        # Single-argument call form prints identically on Python 2 and 3.
        print(record.id)
        _feature, cds_start, cds_end = re.split(":|-", des)
        location = FeatureLocation(int(cds_start) - 1, int(cds_end))
        protein_sequence = location.extract(record.seq).translate()
        return 1 if "*" in protein_sequence else 3
    # Covers both "no CDS text at all" and "CDS: appears only mid-field",
    # which previously fell through and returned None.
    return 0
Пример #3
0
def find_gaps(ppr, mingap=30, maxgap=None, skip_introns=True):
	"""Find the gaps between consecutive PPR motifs whose size is within range.

	Features of ``ppr`` are ordered by start position and examined pairwise.
	A pair only yields a gap when both features share the same ``frame``
	qualifier and the distance from the end of the first to the start of
	the second satisfies ``mingap <= distance <= maxgap``.

	Args:
		ppr: record-like object exposing ``features``.
		mingap: smallest gap to report; a falsy value (None or 0) disables
			the lower bound.
		maxgap: largest gap to report; a falsy value (None or 0) disables
			the upper bound.
		skip_introns: accepted for interface compatibility; not used here.

	Returns:
		A list of forward-strand ``FeatureLocation`` objects, each carrying
		the flanking features as ``prev`` and ``next`` attributes.
	"""
	lower = mingap or -float('inf')
	upper = maxgap or float('inf')

	loc = []
	feats = sorted(ppr.features, key=lambda p: int(p.location.start))
	for a, b in pairwise(feats):
		# ignore any gaps if a and b aren't in the same frame
		if a.qualifiers['frame'] != b.qualifiers['frame']:
			continue

		gap_start = int(a.location.end)
		gap_end = int(b.location.start)
		if lower <= gap_end - gap_start <= upper:
			# We've found a gap; remember which motifs flank it
			g = FeatureLocation(gap_start, gap_end, strand=1)
			g.prev = a
			g.next = b
			loc.append(g)

	return loc
Пример #4
0
    def __init__(self, logger, sequences, reference, dateFormat):
        """Build the sequence set from parsed JSON data.

        Args:
            logger: stored as ``self.log``.
            sequences: mapping of sequence name to a dict holding a ``"seq"``
                string and an ``"attributes"`` dict; the attributes must
                contain ``"raw_date"``.
            reference: dict with ``"included"``, ``"strain"`` and ``"seq"``,
                and optionally ``"genes"`` mapping a gene name to a dict with
                ``"start"``, ``"end"`` and ``"strand"``.
            dateFormat: format forwarded to ``parse_date``.
        """
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.items():
            self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
                   id=name, name=name, description=name)
            self.seqs[name].attributes = data["attributes"]
            # tidy up dates: elements 1 and 2 of the parse_date result are
            # stored as the numeric date and the date respectively
            date_struc = parse_date(self.seqs[name].attributes["raw_date"], dateFormat)
            self.seqs[name].attributes["num_date"] = date_struc[1]
            self.seqs[name].attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
               id=name, name=name, description=name)
        if "genes" in reference and len(reference["genes"]):
            self.proteins = {}
            for k, v in reference["genes"].items():
                feature = FeatureLocation(start=v["start"], end=v["end"], strand=v["strand"])

                # Translate sequences to identify any proteins ending with a stop codon.
                translation = Seq.translate(Seq(feature.extract(str(self.reference_seq.seq))))
                if translation.endswith("*"):
                    # Drop the stop codon. extract() reverse-complements
                    # minus-strand features, so there the final codon lies at
                    # the low-coordinate end and the start must move instead.
                    if v["strand"] == -1:
                        feature = FeatureLocation(start=v["start"] + 3, end=v["end"], strand=v["strand"])
                    else:
                        feature = FeatureLocation(start=v["start"], end=v["end"] - 3, strand=v["strand"])

                self.proteins[k] = feature
        else:
            self.proteins = None

        # other things:
        # working directory name: "temp", a UTC timestamp and a random integer
        self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
        self.nthreads = 2 # should load from config file
Пример #5
0
    def test_diagram_via_methods_pdf(self):
        """Build a diagram through the track/set factory methods and write PDFs."""
        record = self.record
        diagram = Diagram("Test Diagram")

        # Track 1 carries one feature set, shared by the genes and by the
        # restriction-site features added further down.
        feature_track = diagram.new_track(
            1,
            greytrack=True,
            name="CDS Features",
            greytrack_labels=0,
            height=0.5,
        )
        feature_set = feature_track.new_set()

        # Genes alternate between two shades depending on how many features
        # the set already holds.
        for feature in record.features:
            if feature.type != "gene":
                continue
            gene_color = "lightblue" if len(feature_set) % 2 else "blue"
            feature_set.add_feature(
                feature,
                color=gene_color,
                label_position="start",
                label_size=11,
                sigil="ARROW",
                label=True,
            )

        # Strandless example features: every occurrence of a few restriction
        # enzyme recognition sites.
        restriction_sites = (
            ("GAATTC", "EcoRI", "green"),
            ("CCCGGG", "SmaI", "orange"),
            ("AAGCTT", "HindIII", "red"),
            ("GGATCC", "BamHI", "purple"),
        )
        for site, enzyme, site_color in restriction_sites:
            position = record.seq.find(site, start=0)
            while position != -1:
                site_feature = SeqFeature(
                    FeatureLocation(position, position + 6), strand=None)

                # The URL is only built when the feature carries a protein_id
                # qualifier; otherwise no link is attached.
                try:
                    url = ("http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi" +
                           "?db=protein&id=%s" %
                           site_feature.qualifiers["protein_id"][0])
                except KeyError:
                    url = None

                feature_set.add_feature(
                    site_feature,
                    color=site_color,
                    url=url,
                    label_size=10,
                    label_color=site_color,
                    name=enzyme,
                    label=True,
                )
                # Resume the search just past this hit.
                position = record.seq.find(site, start=position + len(site))

        # Track 2: GC and AT content as line graphs over a sliding window.
        content_track = diagram.new_track(
            2,
            greytrack=True,
            name="AT and GC content",
            greytrack_labels=True,
        )
        content_graphs = content_track.new_set(type="graph")

        window = len(record) // 200
        graph_specs = (
            (calc_gc_content, "GC content",
             colors.lightgreen, colors.darkseagreen),
            (calc_at_content, "AT content",
             colors.orange, colors.red),
        )
        for calculator, title, line_color, alt_color in graph_specs:
            content_graphs.new_graph(
                apply_to_window(record.seq, window, calculator, window),
                title,
                style="line",
                color=line_color,
                altcolor=alt_color,
            )

        # Render and save the linear layout first ...
        diagram.draw(
            format="linear",
            orientation="landscape",
            tracklines=0,
            pagesize="A4",
            fragments=3,
        )
        diagram.write(os.path.join("Graphics", "GD_by_meth_linear.pdf"), "PDF")

        # ... then the circular one.
        diagram.draw(
            format="circular",
            tracklines=False,
            circle_core=0.8,
            pagesize=(20 * cm, 20 * cm),
            circular=True,
        )
        diagram.write(
            os.path.join("Graphics", "GD_by_meth_circular.pdf"), "PDF")
Пример #6
0
    def test_diagram_via_object_pdf(self):
        """Construct and draw PDF using object approach.

        Tracks, feature sets and graph sets are instantiated directly and
        attached to the diagram afterwards. Four PDFs are written to the
        ``Graphics`` directory: full and partial views in both circular and
        linear format.
        """
        genbank_entry = self.record
        gdd = Diagram("Test Diagram")

        # Tracks that will hold the CDS and gene feature sets.
        gdt1 = Track(
            "CDS features",
            greytrack=True,
            scale_largetick_interval=1e4,
            scale_smalltick_interval=1e3,
            greytrack_labels=10,
            greytrack_font_color="red",
            scale_format="SInt",
        )
        gdt2 = Track("gene features",
                     greytrack=1,
                     scale_largetick_interval=1e4)

        # First add some feature sets:
        # gdfsA/gdfsB receive the coloured background boxes used as
        # cross-link end points; gdfs1-4 receive the record's own features.
        gdfsA = FeatureSet(name="CDS backgrounds")
        gdfsB = FeatureSet(name="gene background")

        gdfs1 = FeatureSet(name="CDS features")
        gdfs2 = FeatureSet(name="gene features")
        gdfs3 = FeatureSet(name="misc_features")
        gdfs4 = FeatureSet(name="repeat regions")

        # Pair each CDS with the most recently seen gene feature and link a
        # background box for each of them across the two tracks. A gene is
        # consumed by the first CDS that follows it (prev_gene is reset).
        prev_gene = None
        cds_count = 0
        for feature in genbank_entry.features:
            if feature.type == "CDS":
                cds_count += 1
                if prev_gene:
                    # Assuming it goes with this CDS!
                    # The CDS counter's parity selects the colour pair.
                    if cds_count % 2 == 0:
                        dark, light = colors.peru, colors.tan
                    else:
                        dark, light = colors.burlywood, colors.bisque
                    # Background for CDS,
                    a = gdfsA.add_feature(
                        SeqFeature(
                            FeatureLocation(feature.location.start,
                                            feature.location.end,
                                            strand=0)),
                        color=dark,
                    )
                    # Background for gene,
                    b = gdfsB.add_feature(
                        SeqFeature(
                            FeatureLocation(
                                prev_gene.location.start,
                                prev_gene.location.end,
                                strand=0,
                            )),
                        color=dark,
                    )
                    # Cross link,
                    gdd.cross_track_links.append(CrossLink(a, b, light, dark))
                    prev_gene = None
            if feature.type == "gene":
                prev_gene = feature

        # The remaining cross links use hard-coded coordinates; each pair gets
        # its fill/border colours from fill_and_border().

        # Some cross links on the same linear diagram fragment,
        f, c = fill_and_border(colors.red)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(2220, 2230)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(2200, 2210)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, f, c))

        f, c = fill_and_border(colors.blue)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(2150, 2200)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(2220, 2290)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, f, c, flip=True))

        f, c = fill_and_border(colors.green)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(2250, 2560)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(2300, 2860)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, f, c))

        # Some cross links where both parts are saddling the linear diagram fragment boundary,
        f, c = fill_and_border(colors.red)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(3155, 3250)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(3130, 3300)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, f, c))
        # Nestled within that (drawn on top),
        f, c = fill_and_border(colors.blue)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(3160, 3275)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(3180, 3225)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, f, c, flip=True))

        # Some cross links where two features are on either side of the linear diagram fragment boundary,
        f, c = fill_and_border(colors.green)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(6450, 6550)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(6265, 6365)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, color=f, border=c))
        f, c = fill_and_border(colors.gold)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(6265, 6365)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(6450, 6550)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(CrossLink(a, b, color=f, border=c))
        f, c = fill_and_border(colors.red)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(6275, 6375)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(6430, 6530)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(
            CrossLink(a, b, color=f, border=c, flip=True))
        f, c = fill_and_border(colors.blue)
        a = gdfsA.add_feature(SeqFeature(FeatureLocation(6430, 6530)),
                              color=f,
                              border=c)
        b = gdfsB.add_feature(SeqFeature(FeatureLocation(6275, 6375)),
                              color=f,
                              border=c)
        gdd.cross_track_links.append(
            CrossLink(a, b, color=f, border=c, flip=True))

        # Second pass over the record: sort the real features into their
        # sets by type. CDS arrows alternate between pink and red.
        cds_count = 0
        for feature in genbank_entry.features:
            if feature.type == "CDS":
                cds_count += 1
                if cds_count % 2 == 0:
                    gdfs1.add_feature(feature,
                                      color=colors.pink,
                                      sigil="ARROW")
                else:
                    gdfs1.add_feature(feature, color=colors.red, sigil="ARROW")

            if feature.type == "gene":
                # Note we set the colour of ALL the genes later on as a test,
                gdfs2.add_feature(feature, sigil="ARROW")

            if feature.type == "misc_feature":
                gdfs3.add_feature(feature, color=colors.orange)

            if feature.type == "repeat_region":
                gdfs4.add_feature(feature, color=colors.purple)

        # gdd.cross_track_links = gdd.cross_track_links[:1]

        # Bulk attribute updates applied to every feature in a set.
        gdfs1.set_all_features("label", 1)
        gdfs2.set_all_features("label", 1)
        gdfs3.set_all_features("label", 1)
        gdfs4.set_all_features("label", 1)

        gdfs3.set_all_features("hide", 0)
        gdfs4.set_all_features("hide", 0)

        # gdfs1.set_all_features('color', colors.red)
        gdfs2.set_all_features("color", colors.blue)

        gdt1.add_set(gdfsA)  # Before CDS so under them!
        gdt1.add_set(gdfs1)

        gdt2.add_set(gdfsB)  # Before genes so under them!
        gdt2.add_set(gdfs2)

        gdt3 = Track("misc features and repeats",
                     greytrack=1,
                     scale_largetick_interval=1e4)
        gdt3.add_set(gdfs3)
        gdt3.add_set(gdfs4)

        # Now add some graph sets:

        # Use a fairly large step so we can easily tell the difference
        # between the bar and line graphs.
        # The same value is passed as both trailing arguments of
        # apply_to_window below.
        step = len(genbank_entry) // 200
        gdgs1 = GraphSet("GC skew")

        graphdata1 = apply_to_window(genbank_entry.seq, step, calc_gc_skew,
                                     step)
        gdgs1.new_graph(
            graphdata1,
            "GC Skew",
            style="bar",
            color=colors.violet,
            altcolor=colors.purple,
        )

        gdt4 = Track("GC Skew (bar)",
                     height=1.94,
                     greytrack=1,
                     scale_largetick_interval=1e4)
        gdt4.add_set(gdgs1)

        # GC and AT content share one graph set (and therefore one track).
        gdgs2 = GraphSet("GC and AT Content")
        gdgs2.new_graph(
            apply_to_window(genbank_entry.seq, step, calc_gc_content, step),
            "GC content",
            style="line",
            color=colors.lightgreen,
            altcolor=colors.darkseagreen,
        )

        gdgs2.new_graph(
            apply_to_window(genbank_entry.seq, step, calc_at_content, step),
            "AT content",
            style="line",
            color=colors.orange,
            altcolor=colors.red,
        )

        gdt5 = Track(
            "GC Content(green line), AT Content(red line)",
            height=1.94,
            greytrack=1,
            scale_largetick_interval=1e4,
        )
        gdt5.add_set(gdgs2)

        gdgs3 = GraphSet("Di-nucleotide count")
        step = len(genbank_entry) // 400  # smaller step
        gdgs3.new_graph(
            apply_to_window(genbank_entry.seq, step, calc_dinucleotide_counts,
                            step),
            "Di-nucleotide count",
            style="heat",
            color=colors.red,
            altcolor=colors.orange,
        )
        gdt6 = Track("Di-nucleotide count",
                     height=0.5,
                     greytrack=False,
                     scale=False)
        gdt6.add_set(gdgs3)

        # Add the tracks (from both features and graphs)
        # Leave some white space in the middle/bottom
        # (track levels 1 and 2 are left unused)
        gdd.add_track(gdt4, 3)  # GC skew
        gdd.add_track(gdt5, 4)  # GC and AT content
        gdd.add_track(gdt1, 5)  # CDS features
        gdd.add_track(gdt2, 6)  # Gene features
        gdd.add_track(gdt3, 7)  # Misc features and repeat feature
        gdd.add_track(gdt6, 8)  # Di-nucleotide count (heat graph)

        # Finally draw it in both formats, and full view and partial
        # Each draw() is followed by a write() of that rendering to its own file.
        gdd.draw(format="circular",
                 orientation="landscape",
                 tracklines=0,
                 pagesize="A0")
        output_filename = os.path.join("Graphics", "GD_by_obj_circular.pdf")
        gdd.write(output_filename, "PDF")

        # Clear the diagram's circular flag before drawing the partial
        # (start/end restricted) circular view.
        gdd.circular = False
        gdd.draw(
            format="circular",
            orientation="landscape",
            tracklines=0,
            pagesize="A0",
            start=3000,
            end=6300,
        )
        output_filename = os.path.join("Graphics",
                                       "GD_by_obj_frag_circular.pdf")
        gdd.write(output_filename, "PDF")

        # Full-length linear view split over three fragments.
        gdd.draw(
            format="linear",
            orientation="landscape",
            tracklines=0,
            pagesize="A0",
            fragments=3,
        )
        output_filename = os.path.join("Graphics", "GD_by_obj_linear.pdf")
        gdd.write(output_filename, "PDF")

        # Partial linear view; greytrack_labels is changed on every track first.
        gdd.set_all_tracks("greytrack_labels", 2)
        gdd.draw(
            format="linear",
            orientation="landscape",
            tracklines=0,
            pagesize=(30 * cm, 10 * cm),
            fragments=1,
            start=3000,
            end=6300,
        )
        output_filename = os.path.join("Graphics", "GD_by_obj_frag_linear.pdf")
        gdd.write(output_filename, "PDF")
import os

# Extract every 'mobile_element' feature from the EMBL genomes under
# genomes/, together with flanking sequence on both sides, and write each
# one to its own FASTA file in results/.
in_files = glob.glob('genomes/*.embl')
flanking_region = 100
try:
    os.mkdir("results")
except OSError:
    # Single-argument call form prints identically on Python 2 and 3.
    print("a 'results' dir already exists")
    print("Overwriting")

# Maps a base file name to the number of times it has been produced so far,
# so repeated element names get a numeric suffix instead of being overwritten.
stored = {}
for f in in_files:
    cur_genome = SeqIO.parse(f, "embl")
    # The record loop must run per input file; left outside, only the last
    # genome would ever be processed (and no input at all is a NameError).
    for record in cur_genome:
        for feat in record.features:
            if feat.type != 'mobile_element':
                continue
            s, e, strand = feat.location.start, feat.location.end, feat.location.strand
            element = feat.qualifiers['mobile_element_type'][0].split(':')[-1]
            header = ('>' + element + "," + element + ".." + str(s+1) + ".." + str(e)
                      + "(" + str(strand) + "),"
                      + "%dbp flanked,[EC958 IS]" % flanking_region)
            # Clamp at 0: a negative start would be read as an index from
            # the end of the sequence when the slice is taken.
            flank_start = max(0, s - flanking_region)
            flanked = FeatureLocation(flank_start, e+flanking_region, strand)
            out_seq = flanked.extract(record.seq)
            # Derive a short, filesystem-friendly name from the element type.
            fname = header[1:].split(',')[0].replace('unclassified','unc').replace('family', 'fam').replace('(', '').replace('partial', 'p').replace(')', '').replace(' ', '_').replace('/', '-').strip()+'.fna'
            if fname in stored:
                old = fname
                fname = fname.replace(".fna", "_"+str(stored[fname])+".fna")
                stored[old] = stored[old]+1
            else:
                stored[fname] = 1
            with open(os.path.join('results', fname), 'w') as out:
                out.write(header+'\n')
                out.write(str(out_seq)+'\n')
Пример #8
0
def lipoP_gff(lipoIn, gff3In, jBrowseOut):
    """Annotate GFF3 features with the "CleavII" cleavage sites listed in lipoIn.

    Args:
        lipoIn: iterable of tab-separated text rows. Rows starting with ``#``
            are skipped. Column 0 is the sequence ID, column 2 the site type
            and column 3 the integer site position; only rows whose type is
            ``CleavII`` are used.
        gff3In: GFF3 input handed to ``GFF.parse``.
        jBrowseOut: when true, only features that received a cleavage site are
            written and their original sub-features are replaced by the
            cleavage sites; when false, every feature is kept and cleavage
            sites are appended to the existing sub-features.

    Each parsed GFF record is written to ``sys.stdout``.
    """
    # ID -> list of cleavage positions, in input order
    orgIDs = {}
    for row in lipoIn:
        if row.startswith("#"):
            continue

        rowElem = row.split("\t")
        if len(rowElem) < 4:
            # blank or truncated line: nothing to read a site from
            continue

        if rowElem[2] == "CleavII":
            orgIDs.setdefault(rowElem[0], []).append(int(rowElem[3]))

    # Rebase
    for gff in GFF.parse(gff3In):
        keepSeq = []
        for xRec in gff.features:
            cdss = list(
                feature_lambda(
                    xRec.sub_features,
                    feature_test_type,
                    {"type": "CDS"},
                    subfeatures=False,
                ))
            # First CDS whose ID has cleavage sites recorded for it.
            cleaved_cds = next((cds for cds in cdss if cds.id in orgIDs), None)
            if cleaved_cds is None:
                if not jBrowseOut:
                    keepSeq.append(xRec)
                continue

            if jBrowseOut:
                xRec.sub_features = []

            # NOTE(review): the position is multiplied by 3 (presumably an
            # amino-acid index converted to bases) and always offset from
            # location.start, regardless of strand -- confirm this is right
            # for minus-strand CDSs.
            cds_start = cleaved_cds.location.start
            for i, cleaveBase in enumerate(orgIDs[cleaved_cds.id], start=1):
                tempQuals = xRec.qualifiers.copy()
                tempQuals["ID"] = xRec.id + "_cleavage_" + str(i)

                xRec.sub_features.append(
                    SeqFeature(
                        FeatureLocation(
                            cds_start + (cleaveBase * 3) - 1,
                            cds_start + (cleaveBase * 3) + 1,
                        ),
                        type="cleavage_site",
                        strand=xRec.location.strand,
                        qualifiers=tempQuals,
                    ))
            keepSeq.append(xRec)

        gff.features = keepSeq
        GFF.write([gff], sys.stdout)
Пример #9
0
    def create_regions(self,
                       superclusters: List[SuperCluster] = None,
                       subregions: List[SubRegion] = None) -> int:
        """ Builds non-overlapping Region features out of SuperClusters and
            SubRegions, merging every run of overlapping areas into one Region.

            Arguments left as None fall back to the Record's own superclusters
            and subregions.

            Returns the number of regions that were added.
        """
        if superclusters is None:
            superclusters = self._superclusters
        if subregions is None:
            subregions = self._subregions

        if not (superclusters or subregions):
            return 0

        def file_area(area, supers_bin, subs_bin):
            """ Appends the area to the bin matching its type """
            if isinstance(area, SuperCluster):
                supers_bin.append(area)
            else:
                assert isinstance(area, SubRegion), type(area)
                subs_bin.append(area)

        areas = []  # type: List[CDSCollection]
        areas += superclusters
        areas += subregions
        areas.sort()

        # the first area opens the first region, clamped to the record bounds
        first = areas[0]
        region_location = FeatureLocation(
            max(0, first.location.start),
            min(first.location.end, len(self)))
        supers = []
        subs = []
        file_area(first, supers, subs)

        regions_added = 0
        for area in areas[1:]:
            if not area.overlaps_with(region_location):
                # the region under construction is finished, start a new one
                self.add_region(Region(supers, subs))
                regions_added += 1
                region_location = area.location
                supers = []
                subs = []
            else:
                # still overlapping, so grow the region under construction
                region_location = combine_locations(area.location,
                                                    region_location)
            file_area(area, supers, subs)

        # whatever is still being built forms the last region
        self.add_region(Region(supers, subs))
        return regions_added + 1
Пример #10
0
 def test_get_cluster_type(self):
     "utils.get_cluster_type() yields the product of a cluster feature"
     qualifiers = {'product': ['fake']}
     location = FeatureLocation(23, 42)
     cluster = FakeFeature('cluster', location, qualifiers)
     found = utils.get_cluster_type(cluster)
     self.assertEqual('fake', found)
0
    def create_gff(self,
                   nested_element,
                   dirpath,
                   output_fasta_offset,
                   format='default'):

        if format not in format_dict:
            format = 'default'

        self._create_dirs(nested_element.id, dirpath)

        #TODO move to separate method
        # find closest parent
        nl = nested_element.nested_list
        parents = [-1] * len(nl)
        for i in range(len(parents) - 1):
            for j in range(i + 1, len(parents)):
                if intervals.contains(nl[j].location, nl[i].location):
                    parents[i] = j
                    break

        # append direct children
        direct_children = [[] for i in range(len(nl))]
        for i in reversed(range(len(direct_children))):
            parent = parents[i]
            if parent != -1:
                direct_children[parent].append(nl[i].location)

        # GFF
        rec = SeqRecord(nested_element.sequence, nested_element.id)
        features = []

        for i in range(len(nl)):
            #insert baseline
            base_type = format_dict[format]['te_base'] if format != 'default' else 'te_base'
            features.append(SeqFeature(
                FeatureLocation(
                    (nl[i].location[0]-1), nl[i].location[1]),
                    type=base_type,
                    strand=0,
                    qualifiers={
                        'name': 'TE_BASE {}'.format(i),
                        'ID': 'TE_BASE {}'.format(i)
                    }
            ))

            #insert element cropped by its children
            subseq = Seq('')
            children = direct_children[i]
            cropped = intervals.crop(nl[i].location, children)
            for subinterval in cropped:
                subseq += nested_element.sequence[subinterval[0] : subinterval[1]]
                te_type = format_dict[format]['te'] if format != 'default' else 'te'
                features.append(SeqFeature(
                    FeatureLocation((subinterval[0]-1), subinterval[1]),
                    type=te_type,
                    strand=0,
                    qualifiers={
                        'ID': 'TE {}'.format(i), 
                        'name': 'TE {}'.format(i), 
                        'Parent': 'TE_BASE {}'.format(i)
                    }
                ))

            # save transposon fasta
            subseq = (
                nested_element.sequence[(nl[i].location[0] - output_fasta_offset) : nl[i].location[0]]
                + subseq +
                nested_element.sequence[nl[i].location[1] : (nl[i].location[1] + output_fasta_offset)])
            with open('{}/{}/TE/{}.fa'.format(dirpath, nested_element.id, i), 'w') as fasta_out:
                SeqIO.write(
                    SeqRecord(subseq, id='{}|TE-{}'.format(nested_element.id, i), description='Cropped nested retrotransposon'),
                    fasta_out,
                    'fasta'
                )
            if len(cropped) > 1:
                subseq = nested_element.sequence[(nl[i].location[0] - output_fasta_offset) : (nl[i].location[1] + output_fasta_offset)]
                with open('{}/{}/TE/{}_full.fa'.format(dirpath, nested_element.id, i), 'w') as fasta_out:
                    SeqIO.write(
                        SeqRecord(subseq, id='{}|TE-{}'.format(nested_element.id, i), description='Cropped nested retrotransposon'),
                        fasta_out,
                        'fasta'
                    )

            # insert domains
            if 'domains' in nl[i].features:
                j = 0
                for domain in nl[i].features['domains']:
                    domain_location = domain.location
                    sign = (lambda x: x and (1, -1)[x < 0])(domain.frame[0])
                    if sign < 0:
                        domain_location = [domain_location[1], domain_location[0]]
                    overlap = [x for x in children if intervals.contains(domain_location, x)]
                    cropped_domain = intervals.crop(domain_location, overlap)
                    for part in cropped_domain:
                        domain_type = format_dict[format]['domain'] if format != 'default' else domain.type
                        features.append(SeqFeature(
                            FeatureLocation(part[0] - 1, part[1]),
                            type=domain_type,
                            strand=sign,
                            qualifiers={
                                'ID': 'DOMAIN {}-{}'.format(i, j), 
                                'name': domain.type,
                                'Parent': 'TE_BASE {}'.format(i)
                            }
                        ))
                    j += 1

            #insert pbs,ppt
            if 'pbs' in nl[i].features and not math.isnan(nl[i].features['pbs'][0]):
                pbs_tybe = format_dict[format]['pbs'] if format != 'default' else 'pbs'
                features.append(SeqFeature(
                    FeatureLocation(nl[i].features['pbs'][0] - 1, nl[i].features['pbs'][1]),
                    type=pbs_tybe,
                    strand=0,
                    qualifiers={
                        'ID': 'PBS {}'.format(i),
                        'name': 'pbs',
                        'Parent': 'TE_BASE {}'.format(i)
                    }
                ))

            if 'ppt' in nl[i].features and not math.isnan(nl[i].features['ppt'][0]):
                ppt_type = format_dict[format]['ppt'] if format != 'default' else 'ppt'
                features.append(SeqFeature(
                    FeatureLocation(nl[i].features['ppt'][0] - 1, nl[i].features['ppt'][1]),
                    type=ppt_type,
                    strand=0,
                    qualifiers={
                        'ID': 'PPT {}'.format(i),
                        'name': 'ppt',
                        'Parent': 'TE_BASE {}'.format(i)
                    }
                ))

            #insert ltrs            
            ltr_type = format_dict[format]['ltr'] if format != 'default' else 'ltr'
            features.append(SeqFeature(
                FeatureLocation(nl[i].ltr_right_location[0] - 1, nl[i].ltr_right_location[1]),
                type=ltr_type,
                strand=0,
                qualifiers={
                    'ID': 'LTR RIGHT {}'.format(i),
                    'name': 'ltr right',
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))
            features.append(SeqFeature(
                FeatureLocation(nl[i].ltr_left_location[0] - 1, nl[i].ltr_left_location[1]),
                type=ltr_type,
                strand=0,
                qualifiers={
                    'ID': 'LTR LEFT {}'.format(i),
                    'name': 'ltr left',
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))

            #insert tsrs
            if not math.isnan(nl[i].tsr_left[0]):
                tsr_type = format_dict[format]['tsr'] if format != 'default' else 'tsr'
                features.append(SeqFeature(
                    FeatureLocation(nl[i].tsr_left[0] - 1, nl[i].tsr_left[1]),
                    type=tsr_type,
                    strand=0,
                    qualifiers={
                        'ID': 'TSR LEFT {}'.format(i),
                        'name': 'tsr left',
                        'Parent': 'TE_BASE {}'.format(i)
                    }
                ))
                features.append(SeqFeature(
                    FeatureLocation(nl[i].tsr_right[0] - 1, nl[i].tsr_right[1]),
                    type=tsr_type,
                    strand=0,
                    qualifiers={
                        'ID': 'TSR RIGHT {}'.format(i),
                        'name': 'tsr right',
                        'Parent': 'TE_BASE {}'.format(i)
                    }
                ))

        # FOR END
        rec.features = features

        #create GFF
        filename = '{}/{}/{}'.format(dirpath, nested_element.id, nested_element.id)
        if format != 'default':
            filename += '_{}'.format(format)
        gff_filepath = '{}.gff'.format(filename)
        with open(gff_filepath, 'w+') as gff_out:
            GFF.write([rec], gff_out)

        return gff_filepath
Пример #12
0
    def create_clusters_from_borders(
            self, borders: Optional[List[ClusterBorder]] = None) -> int:
        """ Builds Clusters covering every ClusterBorder, merging overlaps.

            Borders are handled in sorted order. Each border is widened by its
            own extent on both sides and clamped to the record bounds; if that
            widened area overlaps the cluster currently being built, the border
            is merged into it, otherwise the current cluster is finished and a
            new one is started from the border.

            Arguments:
                borders: the borders to build from; when None, the borders
                         stored in self._cluster_borders are used

            Returns:
                the number of clusters created
        """
        if borders is None:
            borders = self._cluster_borders
        if not borders:
            return 0

        def padded_location(border):
            """ The border's location widened by its extent, kept inside the record """
            return FeatureLocation(
                max(0, border.location.start - border.extent),
                min(border.location.end + border.extent, len(self)))

        def open_cluster(border):
            """ Starts a cluster without products around a single border """
            fresh = Cluster(padded_location(border), border.cutoff,
                            border.extent, [])
            if border.rule:
                fresh.detection_rules.append(border.rule)
            return fresh

        def close_cluster(finished, contained):
            """ Flags contig edges, attaches products and registers the cluster """
            finished.contig_edge = (finished.location.start == 0
                                    or finished.location.end == len(self.seq))
            # products are derived from the naming attributes of the borders
            for product in _build_products_from_borders(contained):
                finished.add_product(product)
            self.add_cluster(finished)

        ordered = sorted(borders)
        current = open_cluster(ordered[0])
        members = [ordered[0]]
        created = 0

        for border in ordered[1:]:
            if not current.overlaps_with(padded_location(border)):
                # no overlap: the cluster being built is complete
                close_cluster(current, members)
                members.clear()
                created += 1
                current = open_cluster(border)
                members.append(border)
                continue

            # overlap: grow the current cluster so it also covers this border
            current.extent = max(current.extent, border.extent)
            current.cutoff = max(current.cutoff, border.cutoff)
            merged_start = max(0, min(current.location.start,
                                      border.location.start - border.extent))
            merged_end = min(len(self), max(current.location.end,
                                            border.location.end + border.extent))
            current.location = FeatureLocation(merged_start, merged_end)
            members.append(border)
            if border.rule:
                current.detection_rules.append(border.rule)

        # the cluster still being built when the borders ran out
        close_cluster(current, members)
        created += 1

        return created
Пример #13
0
    def test_eq_not_identical(self):
        """Test that differing compound locations compare as not equal."""

        def parts(strand=1):
            # The two segments shared by every case below.
            return [FeatureLocation(12, 17, strand),
                    FeatureLocation(23, 42, strand)]

        def joined(strand=1):
            first, second = parts(strand)
            return first + second

        unequal_pairs = (
            # one side has an additional segment
            (joined(), joined() + FeatureLocation(50, 60, 1)),
            # same coordinates, opposite strand
            (joined(), joined(-1)),
            # same segments, second built with the extra "order" argument
            (CompoundLocation(parts()), CompoundLocation(parts(), "order")),
            # compared against something that is not a location at all
            (joined(), 5),
        )
        for loc1, loc2 in unequal_pairs:
            self.assertNotEqual(loc1, loc2)
Пример #14
0
def convert_xmfa_to_gff3(xmfa_file,
                         relative_to='1',
                         sequences=None,
                         window_size=1000):
    """Turn the LCBs of an XMFA file into "match" features on one record.

    Every LCB that contains the ``relative_to`` sequence plus at least one
    other entry yields one "match" feature per other entry, placed at the
    coordinates of the ``relative_to`` entry. The alignment is then walked in
    windows of ``window_size`` columns and each window with a non-zero percent
    identity is attached as a "match_part" sub-feature.

    :param xmfa_file: passed to ``parse_xmfa``
    :param relative_to: id of the sequence whose coordinates are used
    :param sequences: passed to ``_id_tn_dict`` to build the id -> label map
    :param window_size: number of alignment columns per window
    :return: a one-element list holding the record that carries the features
    """
    label_convert = _id_tn_dict(sequences)

    lcbs = parse_xmfa(xmfa_file)

    target_record = SeqRecord(
        Seq("A"), id=label_convert.get(relative_to, relative_to))
    records = [target_record]

    for lcb in lcbs:
        anchors = [seq for seq in lcb if seq['id'] == relative_to]

        # Skip blocks that miss our sequence, or that contain nothing else.
        if not anchors or len(lcb) == 1:
            continue

        parent = anchors[0]
        others = [seq for seq in lcb if seq['id'] != relative_to]

        for other in others:
            qualifiers = {
                "source": "progressiveMauve",
                "target": label_convert.get(other['id'], other['id']),
                "ID": label_convert.get(other['id'], 'xmfa_' + other['rid'])
            }
            other['feature'] = SeqFeature(
                FeatureLocation(parent['start'], parent['end'] + 1),
                type="match",
                strand=parent['strand'],
                qualifiers=qualifiers)

        # A negative start marks the reverse strand; the same for every window.
        strand = -1 if parent['start'] < 0 else 1

        for i in range(0, len(lcb[0]['seq']), window_size):
            block_seq = parent['seq'][i:i + window_size]
            real_window_size = len(block_seq)
            # Gap characters do not advance the ungapped coordinate.
            gaps_before = parent['seq'][0:i].count('-')
            real_start = abs(parent['start']) - gaps_before + i
            real_end = real_start + real_window_size - block_seq.count('-')

            if (real_end - real_start) < 10:
                continue

            for other in others:
                pid = _percent_identity(
                    block_seq, other['seq'][i:i + real_window_size])
                # Ignore 0% identity sequences
                if pid == 0:
                    continue
                part = SeqFeature(FeatureLocation(real_start, real_end),
                                  type="match_part",
                                  strand=strand,
                                  qualifiers={
                                      "source": "progressiveMauve",
                                      'score': pid
                                  })
                other['feature'].sub_features.append(part)

        records[0].features.extend(other['feature'] for other in others)
    return records
Пример #15
0
def new_track(plot, record, track_num=1, name=None, start=None, end=None, links=()):
    """ Shorthand for adding a new track to a plot.

    Keyword arguments:
    plot -- A GenomeDiagram.Diagram object
    record -- A SeqRecord object with features
    track_num -- The track number passed on to plot.new_track (default 1)
    name -- The label for the track (default record.id)
    start -- Where to start from in the record (default 0)
    end -- Where to end in the record (default len(record))
    links -- Iterable of (link, psim) pairs (default empty). Each link is
        indexed by 'start' and 'end' and is drawn as a red region whose
        alpha is psim/1000.

    Returns:
    The newly created track. The plot object itself is updated; it is not
    drawn or written here.

    Raises:
    KeyError if a feature of the record has a type without an entry in the
    colour table below.
    """

    type_colours = {
        "gene": colors.Color(red=1/255, green=108/255, blue=154/255),
        "mnh120_REPET_SSRs": colors.Color(red=0, green=160/255, blue=138/255),
        "mnh120_REPET_TEs": colors.Color(red=249/255, green=132/255, blue=0)
    }
    if start is None:
        start = 0
    if end is None:
        end = len(record)
    if name is None:
        name = record.id

    # set large tick interval from the width of the displayed region
    span = end - start
    if span > 200000:
        interval = 100000
    elif span > 80000:
        interval = 50000
    elif span > 20000:
        interval = 10000
    else:
        interval = 5000

    track = plot.new_track(
        track_num,
        name=name,
        greytrack=False,
        greytrack_labels=1,
        start=start,
        end=end,
        height=0.5,
        scale_fontsize=10,
        scale_largeticks=1.2,
        scale_largetick_interval=interval,
        )
    track_features = track.new_set()

    # Links are added before the record's own features.
    for link, psim in links:
        colour = colors.Color(red=1, alpha=psim/1000)
        track_features.add_feature(
            SeqFeature(
                id='',
                location=FeatureLocation(
                    link['start'],
                    link['end'],
                    strand=0
                    )
                ),
            color=colour,
            border=colors.Color(red=1, alpha=0.)
            )

    for feature in record.features:
        colour = type_colours[feature.type]
        if feature.type == 'gene':
            # only genes get a text label
            label_args = {
                'label': True,
                'label_position': 'start',
                'label_angle': 90,
                'label_size': 10,
                'name': feature.id
                }
        else:
            label_args = {}

        track_features.add_feature(
            feature,
            sigil="BIGARROW",
            arrowshaft_height=1.0,
            color=colour,
            **label_args
            )
    return track
    piece_2 = rec_genome_seq[swap_idx_2:swap_idx_2+swap_size]
    rec_genome_seq[swap_idx_1:swap_idx_1+swap_size] = piece_2
    rec_genome_seq[swap_idx_2:swap_idx_2+swap_size] = piece_1


rec_genome = SeqRecord(rec_genome_seq, id='001', name='recoded_genome')

SYNTH_SEG_SIZE = 49000
SYNTH_FRAG_SIZE = 2500

# Annotate the genome with synthesis segments, and each segment with
# synthesis fragments. Counts use floor division (//) so they stay integers
# on Python 3 as well as Python 2; with "/" range() would receive a float and
# the "is this the last one" comparisons below would never match.
# NOTE(review): assumes genome_length is an integer -- confirm where it is set.
for seg_num in range(genome_length // SYNTH_SEG_SIZE):
    start_idx = seg_num * SYNTH_SEG_SIZE
    end_idx = start_idx + SYNTH_SEG_SIZE
    if seg_num == (genome_length // SYNTH_SEG_SIZE - 1):
        # the last segment also absorbs the remainder of the genome
        end_idx += genome_length % SYNTH_SEG_SIZE
    s_feat = SeqFeature(FeatureLocation(start_idx, end_idx),
                        type='synth_segment', qualifiers={'label':
                        'seg%02d' % (seg_num)})
    rec_genome.features.append(s_feat)
    for frag_num in range(SYNTH_SEG_SIZE // SYNTH_FRAG_SIZE):
        f_start_idx = start_idx + SYNTH_FRAG_SIZE * frag_num
        f_end_idx = f_start_idx + SYNTH_FRAG_SIZE
        if frag_num == (SYNTH_SEG_SIZE // SYNTH_FRAG_SIZE - 1):
            # the last fragment absorbs the remainder of the nominal segment
            f_end_idx += SYNTH_SEG_SIZE % SYNTH_FRAG_SIZE
        else:
            f_end_idx += 100  # simulated overlap for assembly
        f_feat = SeqFeature(FeatureLocation(f_start_idx, f_end_idx),
                            type='synth_fragment', qualifiers={'label':
                            'seg%02d_%03d' % (seg_num, frag_num)})
        rec_genome.features.append(f_feat)
Пример #17
0
    def predict_primer_set(self):
        """Predict primer pairs for every amplicon of every input record.

        Each FASTA record read from ``self.input_handle`` becomes a Gene. Its
        sequence is split on "//" into amplicons. Within an amplicon,
        "<...>" marks excluded regions and "[...]" marks target regions;
        the markers are stripped before the template is handed to Primer3.
        Records with an entry in the predefined primer sets (filled by
        ``self.parse_predefined_pairs`` when ``self.predefined_handle`` is
        set) use that entry for their first amplicon instead of running
        Primer3.

        Returns:
            list of Gene objects, one per input record.

        Raises:
            Exception: if Primer3 reports PRIMER_ERROR, or if no amplicon of
                a record ended up with a primer set.
        """
        predefined_sets = dict()
        if self.predefined_handle is not None:
            self.parse_predefined_pairs(predefined_sets)

        out_genes = []
        for record in SeqIO.parse(self.input_handle, "fasta"):
            gene = Gene(record.id)
            sequence = str(record.seq)
            # amplicon_idx must not be reused by the inner loops: it is
            # reported in the "no primer found" warning further down.
            for amplicon_idx, sel_sequence in enumerate(
                    re.split(r"//", sequence)):
                s = re.sub(r"[\[\]<>]", "", sel_sequence)
                amplicon = Amplicon(s)

                if record.id in predefined_sets:
                    amplicon.primer_set = predefined_sets[record.id]
                    gene.append(amplicon)
                    # consumed; leftovers are reported after the main loop
                    del predefined_sets[record.id]
                    continue

                input_string = ""
                input_string += "SEQUENCE_ID=" + record.id + "\n"
                input_string += "SEQUENCE_TEMPLATE=" + s + "\n"

                if sel_sequence.find("<") >= 0 and sel_sequence.find(">") >= 0:
                    input_string += "SEQUENCE_EXCLUDED_REGION="
                    spl_sequence = re.split(
                        r"[<>]",
                        sel_sequence.replace("[", "").replace("]", ""))
                    # odd-numbered pieces are the text between "<" and ">"
                    for k in range(0, len(spl_sequence) - 1, 2):
                        # length of everything in front of the marked piece
                        start = sum(
                            len(piece) for piece in spl_sequence[:k + 1])
                        input_string += (str(start + 1) + "," +
                                         str(len(spl_sequence[k + 1])) + " ")
                        amplicon.add_feature(
                            ExcludedRegion(
                                FeatureLocation(
                                    start + 1,
                                    start + len(spl_sequence[k + 1]))))
                    input_string += "\n"

                sel_sequence = sel_sequence.replace("<", "")
                sel_sequence = sel_sequence.replace(">", "")

                if sel_sequence.find("[") >= 0 and sel_sequence.find("]") >= 0:
                    input_string += "SEQUENCE_TARGET="
                    spl_sequence = re.split(r"[\[\]]", sel_sequence)
                    # odd-numbered pieces are the text between "[" and "]"
                    for k in range(0, len(spl_sequence) - 1, 2):
                        start = sum(
                            len(piece) for piece in spl_sequence[:k + 1])
                        input_string += (str(start + 1) + "," +
                                         str(len(spl_sequence[k + 1])) + " ")
                        amplicon.add_feature(
                            TargetRegion(
                                FeatureLocation(
                                    start + 1,
                                    start + len(spl_sequence[k + 1]))))
                    input_string += "\n"

                input_string += "P3_FILE_FLAG=0\n"
                # This is badly programmed and NEEDS that trailing slash
                input_string += f"PRIMER_THERMODYNAMIC_PARAMETERS_PATH={self.config.p3_thermo_path}/\n="

                log.info(input_string)
                p = run_and_feed(
                    self.config.p3_path,
                    p3_settings_file=self.config.p3_config_path,
                    _input_str=input_string,
                    _long_arg_prefix="-",
                )
                p3_output = p.stdout.strip()
                log.info("P3 output: %s", p3_output)

                m = re.search(r"(?<=PRIMER_ERROR=)\w+", p3_output)
                if m is not None:
                    raise Exception(
                        "Error for sequence (Probably no primer found in region): "
                        f"{record.id}: {m.group(0)}\n Start NEW Primerprediction."
                    )

                primer_set = PrimerPairSet(record.id)
                parse_p3_information(primer_set, p3_output)

                if len(primer_set) == 0:
                    log.warning("WARNING: No primer found for %s sequence %s.",
                                record.id, amplicon_idx + 1)
                    continue

                amplicon.primer_set = primer_set
                gene.append(amplicon)

            if len(gene) == 0:
                raise Exception(
                    f"No primer found for {gene.name}. Consider less restrictive Primer3 settings."
                )
            out_genes.append(gene)

        # predefined primer sets that matched no input record
        for key in predefined_sets:
            log.info(
                "WARNING: No input sequence could be found for the predefined primer %s",
                key,
            )

        return out_genes
Пример #18
0
def table_annotations(gff3In, tabularIn, fastaIn, out_gff3, out_changelog):
    """Apply the edits listed in a tab-separated table to GFF3 features.

    Every table row is matched by its "ID" column against the features of
    the GFF3 input (top-level features and two levels of sub-features).
    For a matched feature, Name, Alias, Parent, strand, start, end, Note and
    any other column are compared with the feature and updated where they
    differ. One line per row is written to the changelog: the changed
    attributes, "No Change", or "ID not Found".

    :param gff3In: handle of the GFF3 file to update
    :param tabularIn: handle of the tab-separated table of wanted values
    :param fastaIn: handle of the FASTA file with the sequences of the GFF3
    :param out_gff3: handle the updated GFF3 is written to when anything
        changed
    :param out_changelog: handle the changelog is written to
    :return: None; ``out_changelog`` is closed before returning, together
        with ``out_gff3`` (or ``gff3In`` when nothing changed)
    """
    # Reading ``fieldnames`` consumes the header line of the table.
    header = csv.DictReader(tabularIn, delimiter="\t")
    fieldnames = header.fieldnames
    for idx, column in enumerate(fieldnames):
        if column == "Boundary":
            # the first "Boundary" column is the start, any later one the end
            if "bound_s" not in fieldnames:
                fieldnames[idx] = "bound_s"
            else:
                fieldnames[idx] = "bound_e"
        elif column == "# Organism ID":
            fieldnames[idx] = "org_id"
        elif column == "User entered Notes":
            fieldnames[idx] = "Note"
    idDict = csv.DictReader(tabularIn,
                            delimiter="\t",
                            fieldnames=header.fieldnames)

    # BioPython parse GFF
    sourceG = list(
        GFF.parse(gff3In, SeqIO.to_dict(SeqIO.parse(fastaIn, "fasta"))))

    # Flatten the features, parents directly followed by their children,
    # down to the second level of sub-features.
    recG = []
    for record in sourceG:
        for top_feature in record.features:
            recG.append(top_feature)
            for child in top_feature.sub_features:
                recG.append(child)
                recG.extend(child.sub_features)

    # Columns that are handled by a dedicated branch below or must never be
    # copied into the qualifiers. "Alias" and "Parent" belong here as well:
    # otherwise an unchanged value would be stored again as a lowercase
    # "alias"/"parent" qualifier by the generic branch.
    reserved_columns = (
        "Target",
        "Gap",
        "Dbxref",
        "Ontology_term",
        "Is_circular",
        "Derives_from",
        "bound_s",
        "bound_e",
        "org_id",
        "Strand",
        "Name",
        "Note",
        "Alias",
        "Parent",
    )

    # Get a changelog ready
    out_changelog.write("ID\tChanges\tStatus\n")

    anyChange = False

    for row in idDict:
        if row["ID"] == "ID":
            continue  # Skip header
        Found = False

        for i in recG:
            if row["ID"] != i.id:
                continue

            Found = True
            strandC = False
            startC = False
            endC = False
            nameC = False
            noteC = False
            qualC = False
            aliasC = False
            parentC = False

            for qual in row:
                if qual == "ID" or qual == "":
                    continue

                if qual == "Strand":
                    # translate the table's "+"/other into +1/-1
                    if row["Strand"] == "+":
                        row["Strand"] = +1
                    else:
                        row["Strand"] = -1

                if qual == "Name" and row["Name"] != i.qualifiers["Name"][0]:
                    i.qualifiers["Name"][0] = row["Name"]
                    nameC = True

                elif (qual == "Alias"
                      and row["Alias"] != i.qualifiers["Alias"][0]):
                    i.qualifiers["Alias"][0] = row["Alias"]
                    aliasC = True

                elif (qual == "Parent"
                      and row["Parent"] != i.qualifiers["Parent"][0]):
                    i.qualifiers["Parent"][0] = row["Parent"]
                    parentC = True

                # Location object needs to be rebuilt, can't individually
                # set start/end/strand
                elif qual == "Strand" and i.strand != row["Strand"]:
                    strandC = True
                    i.location = FeatureLocation(i.location.start,
                                                 i.location.end,
                                                 row["Strand"])

                elif qual == "bound_s" and i.location.start != int(
                        row["bound_s"]):
                    startC = True
                    i.location = FeatureLocation(int(row["bound_s"]),
                                                 i.location.end,
                                                 i.location.strand)

                elif qual == "bound_e" and i.location.end != int(
                        row["bound_e"]):
                    endC = True
                    i.location = FeatureLocation(i.location.start,
                                                 int(row["bound_e"]),
                                                 i.location.strand)

                elif qual == "Note":
                    temp = [str(row["Note"])]
                    if ("Note" in i.qualifiers) and i.qualifiers["Note"] != temp:
                        # NOTE(review): temp is a one-element list and thus
                        # always truthy, so the pop() branch never runs and
                        # an empty note is stored as [""] -- confirm intent.
                        if temp:
                            i.qualifiers["Note"] = temp
                        else:
                            i.qualifiers.pop("Note", None)
                        noteC = True
                    elif temp != [""] and not ("Note" in i.qualifiers):
                        i.qualifiers["Note"] = temp
                        noteC = True

                elif qual not in reserved_columns:
                    # any other column becomes a lowercase qualifier
                    temp = qual.lower().replace(" ", "_")
                    if temp in i.qualifiers:  # Edit
                        if row[qual] is None:
                            i.qualifiers.pop(temp, None)
                            qualC = True
                        elif i.qualifiers[temp] != [str(row[qual])]:
                            i.qualifiers[temp] = [str(row[qual])]
                            qualC = True
                    elif row[qual] is not None:  # Create
                        i.qualifiers[temp] = [str(row[qual])]
                        qualC = True

            changed = [
                label for flag, label in (
                    (nameC, "Name"),
                    (aliasC, "Alias"),
                    (parentC, "Parent"),
                    (startC, "Start"),
                    (endC, "End"),
                    (strandC, "Strand"),
                    (noteC, "Notes"),
                    (qualC, "Other Qualifiers"),
                ) if flag
            ]
            changeList = ", ".join(changed)

            if changeList != "":
                # On success, write out replaced attributes and success
                out_changelog.write("%s\t%s\tSuccess\n" % (i.id, changeList))
                anyChange = True
            else:
                # No changes detected
                out_changelog.write("%s\tNone\tNo Change\n" % i.id)

            break

        if not Found:
            # No such ID
            out_changelog.write("%s\tNone\tID not Found\n" % row["ID"])

    if anyChange:
        sourceG[0].annotations = {}
        sourceG[0].features = [
            x for x in sourceG[0].features if x.type != "remark"
        ]
        GFF.write(sourceG, out_gff3)
    else:
        out_changelog.write("GFF3\tNone\tGFF3 already equals Table\n")
        # NOTE(review): this only rebinds the local name, so the close()
        # below closes gff3In and the caller's out_gff3 stays open and
        # unwritten -- confirm that this is what callers expect.
        out_gff3 = gff3In

    out_changelog.close()
    out_gff3.close()
Пример #19
0
    def draw_gene_diagram(self, inv_pair, genes, save_path):
        """Draw the given genes and the two inversion positions into a PDF.

        :param inv_pair: pair of positions (convertible with int()); each is
            drawn as a one-base purple box labelled START and END
        :param genes: non-empty list of features, expected to be ordered by
            position; each needs 'locus_tag' and 'product' qualifiers
        :param save_path: where the PDF is written
        :return: None
        """
        # define the tick interval based on the start and end of the genes (should already be ordered)
        track_start = genes[0].location.start
        track_end = genes[-1].location.end
        s_tick_int = int((track_end - track_start) / 5)

        # create an empty genome diagram
        gdd = GenomeDiagram.Diagram(self.accession_num)
        gdt_features = gdd.new_track(1,
                                     greytrack=True,
                                     scale_smalltick_interval=s_tick_int,
                                     scale_smalltick_labels=True,
                                     scale_smallticks=0.1,
                                     scale_fontangle=0,
                                     scale_fontsize=4,
                                     name=self.accession_num)
        gds_features = gdt_features.new_set()

        # for each loci, annotate the diagram
        for orf in genes:

            # describe the orf
            loctag = orf.qualifiers['locus_tag'][0]
            product = orf.qualifiers['product'][0]

            # define label orientation based on strand; anything that is not
            # on the reverse strand (including unstranded features) is
            # labelled like a forward feature, so the placement is always
            # defined and never carried over from the previous gene
            if orf.strand == -1:
                angle = -195
                pos = 'right'
            else:
                angle = 15
                pos = 'left'

            # draw the orf
            gds_features.add_feature(orf,
                                     name=loctag + ": " + product,
                                     label=True,
                                     sigil="BIGARROW",
                                     label_size=4,
                                     arrowhead_length=0.2,
                                     label_angle=angle,
                                     label_position=pos,
                                     arrowshaft_height=0.3)

        # for the cluster, annotate inversion positions
        for position, marker_name in ((inv_pair[0], '   START'),
                                      (inv_pair[1], '   END')):
            marker = SeqFeature(FeatureLocation(int(position),
                                                int(position) + 1),
                                strand=0)
            gds_features.add_feature(marker,
                                     name=marker_name,
                                     label=True,
                                     color="purple",
                                     label_position="left",
                                     label_angle=45,
                                     sigil='BOX',
                                     label_color='purple',
                                     label_size=6)

        # draw and save the graph, with 500 positions of margin on each side
        gdd.draw(format='linear',
                 pagesize=(16 * cm, 10 * cm),
                 fragments=1,
                 start=track_start - 500,
                 end=track_end + 500)
        gdd.write(save_path, "pdf")

        return
Пример #20
0
    def test_eq_not_identical(self):
        """Test that locations differing in any single respect are not equal."""
        unequal_pairs = [
            # different start
            (FeatureLocation(22, 42, 1), FeatureLocation(23, 42, 1)),
            # different end
            (FeatureLocation(23, 42, 1), FeatureLocation(23, 43, 1)),
            # different strand
            (FeatureLocation(23, 42, 1), FeatureLocation(23, 42, -1)),
            # a plain tuple with the same numbers is not a location
            (FeatureLocation(23, 42, 1), (23, 42, 1)),
            # different fourth positional argument
            (FeatureLocation(23, 42, 1, "foo"),
             FeatureLocation(23, 42, 1, "bar")),
            # different fifth positional argument
            (FeatureLocation(23, 42, 1, "foo", "bar"),
             FeatureLocation(23, 42, 1, "foo", "baz")),
        ]
        for loc1, loc2 in unequal_pairs:
            self.assertNotEqual(loc1, loc2)
Пример #21
0
    def setUp(self):
        """Create a fake record holding two clusters and assorted features."""

        def fake(kind, start, end, *qualifiers):
            # Shorthand: every fixture feature is a type plus a simple span.
            return FakeFeature(kind, FeatureLocation(start, end), *qualifiers)

        first_cluster = fake('cluster', 25, 50,
                             {'note': ['Cluster number: 1']})
        second_cluster = fake('cluster', 4711, 4800,
                              {'note': ['Cluster number: 2']})

        self.features = [
            first_cluster,
            fake('CDS', 15, 20),
            fake('PFAM_domain', 15, 17),
            fake('CDS', 23, 42),
            fake('CDS', 45, 47),
            fake('CDS', 48, 55),
            fake('aSDomain', 4730, 4740),
            fake('CDS', 4700, 4710),
            fake('CDS', 4750, 4760),
            fake('CDS', 4790, 4812),
            second_cluster,
        ]

        self.record = FakeRecord(self.features)
Пример #22
0
def find_repeat(self, contig, fn, st, ppno, extra_dna):
    """
    Locate repeats that sit in the flanking DNA at both ends of a sequence.

    A repeat is only kept when its first copy starts within the first
    3 * extra_dna bases and its second copy starts within the last
    3 * extra_dna bases of the sequence.

    :param self: The data object (min_repeat_len, include_all_repeats and
                 record are read from it)
    :param contig: the name of the contig we are searching on
    :param fn: the nucleotide sequence to search
    :param st: the offset added to every repeat coordinate that is reported
    :param ppno: the prophage number
    :param extra_dna: the extra dna that flanks the sequence
    :return: a dict, keyed by a running integer, of dicts with the keys
             s1, e1, s2 and e2; an empty dict if the sequence is empty or
             the repeat finder raised
    """
    seq_len = len(fn)
    if seq_len == 0:
        log_and_message("Len sequence is 0 so ignoring\n",
                        c="RED",
                        stderr=True,
                        loglevel="WARNING")
        return {}

    try:
        # the last argument is the repeat finder's debugging switch
        found = PhiSpyRepeatFinder.repeatFinder(fn, 3, self.min_repeat_len,
                                                ppno, False)
    except Exception as e:
        log_and_message(
            f"There was an error running repeatfinder for {fn}:{e}\n",
            c="RED",
            stderr=True,
            loglevel="WARNING")
        return {}

    flank = 3 * extra_dna
    rep = {}
    for r in found:
        # skip anything that does not span from the left to the right flank
        if r['first_start'] >= flank or r['second_start'] <= seq_len - flank:
            continue

        # make sure start is always less than end; swapping the two
        # introduces an off by one error, so both values are incremented
        if r['first_end'] < r['first_start']:
            r['first_start'], r['first_end'] = (r['first_end'] + 1,
                                                r['first_start'] + 1)
        if r['second_end'] < r['second_start']:
            r['second_start'], r['second_end'] = (r['second_end'] + 1,
                                                  r['second_start'] + 1)

        coords = {
            's1': r['first_start'] + st,
            'e1': r['first_end'] + st,
            's2': r['second_start'] + st,
            'e2': r['second_end'] + st,
        }
        # the next free key equals the number of repeats stored so far
        rep[len(rep)] = coords

        if self.include_all_repeats:
            replen = max(coords['e1'] - coords['s1'],
                         coords['e2'] - coords['s2'])
            halves = [
                FeatureLocation(coords['s1'], coords['e1'], strand=+1),
                FeatureLocation(coords['s2'], coords['e2'], strand=+1),
            ]
            rptsf = SeqFeature(
                CompoundLocation(halves),
                type="repeat_region",
                qualifiers={
                    'note':
                    f"{replen}bp repeat identified by PhiSpy v{version.__version__}"
                })
            self.record.get_entry(contig).features.append(rptsf)

    return rep
Пример #23
0
                    start = ali_from - (enter.max_length - hmm_to)
                else:
                    start = ali_from
            elif enter.length is not False and \
                 enter.min_length is not False:
                if hmm_from > 1:
                    end = (hmm_from - 1) + ali_to
                else:
                    end = ali_to
                if enter.min_length < hmm_to < enter.max_length:
                    start = ali_from - (enter.max_length - hmm_to)
                elif hmm_to <= enter.min_length:
                    start = ali_from - (enter.min_length - hmm_to)
        start_pos = SeqFeature.ExactPosition(start - 1)
        end_pos = SeqFeature.ExactPosition(end)
        feature_location = FeatureLocation(start_pos, end_pos)
        feature_type = enter.feature
        from Bio.SeqFeature import SeqFeature
        note_qualifier = dict()
        note_qualifier['note'] = str(
            '%s score %s E-value %s' %
            (prog[2].replace('\n', ''), score, e_value))
        my_feature = MySeqFeature(location=feature_location,
                                  type=feature_type,
                                  strand=strnd,
                                  qualifiers=dict(
                                      list(qualifier.items()) +
                                      list(note_qualifier.items())))

        if (hmm_diff - ali_diff == 0 or
                hmm_diff - ali_diff == 1 or
Пример #24
0
## Load every contig of the genome FASTA, keeping three views of it:
##   genome  - contig id -> SeqRecord
##   entries - (contig id, length) tuples, sorted below in descending order of length
##   lengths - plain list of contig lengths (used for max_len further down)
genome = {}
entries = []
lengths = []
for i in SeqIO.parse("/Volumes/MP_HD/repDNA_data/Pm_all_genome.fasta", "fasta"):
    genome[i.id] = i
    entries.append((i.id,len(i.seq)))
    lengths.append(len(i.seq))

entries = sorted(entries,key=lambda x: x[1],reverse=True)
## Create gene SeqFeatures as subsets of contig seqRecord objects
## NOTE(review): `gene` and `centromeres` are not defined in this part of the
## script. From the indexing below each value looks like
## (contig id, start, end, strand sign character, locus tag) - confirm where
## they are built.
for i in genome:
    for j in gene:
        if gene[j][0] == genome[i].id:
            # gene[j][3] + '1' is converted to an int and used as the strand
            direc = int(gene[j][3] + '1')
            genome[i].features.append(SeqFeature(FeatureLocation(gene[j][1], gene[j][2], strand = direc),type='gene',id=j,qualifiers={'locus_tag':[gene[j][4]]}))
    for j in centromeres:
        if centromeres[j][0] == genome[i].id:
            # print j,gene[j][0],gene[j][3],genome[i].id
            # sl(0.5)
            # centromeres get no strand, but are still added with type 'gene'
            direc = None#int(gene[j][3] + '1')
            genome[i].features.append(SeqFeature(FeatureLocation(centromeres[j][1], centromeres[j][2], strand = direc),type='gene',id=j,qualifiers={'locus_tag':[centromeres[j][4]]}))


## telomere length - rounded ends of chromosome size
max_len = max(lengths)
telomere_length = 40000

## Set up the chromosome diagram and its page size
chr_diagram = BasicChromosome.Organism()
chr_diagram.page_size = (60*cm, 21*cm)
Пример #25
0
 def to_seqfeature(self):
     """Return a SeqFeature spanning start..end, with this object's value as its id.

     When the object has a ``confidence`` attribute, it is copied into the
     feature's ``confidence`` qualifier.
     """
     span = FeatureLocation(self.start, self.end)
     feat = SeqFeature(location=span, id=self.value)
     if not hasattr(self, 'confidence'):
         return feat
     feat.qualifiers['confidence'] = self.confidence
     return feat
Пример #26
0
def write_all_outputs(**kwargs):
    """
    Add the predicted prophages to the records and write the chosen outputs.

    The keyword arguments are wrapped in a Namespace that is handed on to
    every writer. The attributes read here are: quiet, pp, record,
    keep_dropped_predictions, droppedpp (only when dropped predictions are
    kept) and output_choice.

    Each kept prophage is added to its contig as a misc_feature on strand 1,
    and, if it has attachment sites, as a two-part repeat_region. Dropped
    predictions are added as misc_feature on strand -1 with the reason they
    were dropped.

    output_choice is the sum of the codes of the files to write:

        Code | File
        --- | ---
        1 | prophage_coordinates.tsv
        2 | GenBank format output
        4 | prophage and bacterial sequences
        8 | prophage_information.tsv
        16 | prophage.tsv
        32 | GFF3 format of the prophages
        64 | prophage.tbl
        128 | test data used in the random forest
        256 | GFF3 format of the genomes

    As explained in the README.

    :param kwargs: the settings and data described above
    :return: None
    """
    self = Namespace(**kwargs)
    # make all predicted pp list
    log_and_message("Creating output files",
                    c="GREEN",
                    stderr=True,
                    quiet=self.quiet)
    prophage_feature_type = 'misc_feature'  # / prophage_region

    for i in self.pp:
        self.record.get_entry(self.pp[i]['contig']).append_feature(
            SeqFeature(
                location=FeatureLocation(self.pp[i]['start'] - 1,
                                         self.pp[i]['stop'] - 1),
                type=prophage_feature_type,
                strand=1,
                qualifiers=OrderedDict({
                    'note':
                    f'prophage region pp{i} identified with PhiSpy v{version.__version__}'
                })))
        # NOTE(review): the presence test uses the key 'atts' but the
        # coordinates are read from 'att' - presumably both keys are always
        # set together by the caller; confirm.
        if 'atts' in self.pp[i]:
            self.record.get_entry(self.pp[i]['contig']).append_feature(
                SeqFeature(
                    location=FeatureLocation(int(self.pp[i]['att'][0]),
                                             int(self.pp[i]['att'][1])) +
                    FeatureLocation(int(self.pp[i]['att'][2]),
                                    int(self.pp[i]['att'][3])),
                    type='repeat_region',
                    strand=1,
                    qualifiers=OrderedDict({
                        'note':
                        f'prophage region pp{i} potential attachment sites'
                    })))

    if self.keep_dropped_predictions:
        for i in self.droppedpp:
            self.record.get_entry(self.droppedpp[i]['contig']).append_feature(
                SeqFeature(
                    location=FeatureLocation(self.droppedpp[i]['start'] - 1,
                                             self.droppedpp[i]['stop'] - 1),
                    type=prophage_feature_type,
                    strand=-1,
                    qualifiers=OrderedDict({
                        # the space after "not" was missing, which produced
                        # "but notkept because:" in the written note
                        'note':
                        f'Putative prophage region identified with PhiSpy v{version.__version__} but not '
                        f'kept because: {self.droppedpp[i]["dropped_reason"]}'
                    })))

    # Decide which files to write. The codes are peeled off from the largest
    # to the smallest, so the writers run in this fixed order.
    writers = (
        (256, genome_gff3),  # the genomic GFF3 format
        (128, write_test_data),  # the calculated data
        (64, write_prophage_tbl),  # the prophage location table
        (32, write_gff3),  # the prophage in GFF3 format
        (16, write_prophage_tsv),  # a tsv file of this data
        (8, write_prophage_information),  # prophage_information.tsv
        (4, write_phage_and_bact),  # bacteria and phage as fasta files
        (2, write_genbank),  # input GenBank file plus prophage regions
        (1, write_prophage_coordinates),  # the prophage coordinates
    )
    oc = self.output_choice
    for code, writer in writers:
        if oc >= code:
            writer(self)
            oc -= code
Пример #27
0
def scan_orfs(seq: str,
              direction: int,
              offset: int = 0) -> List[FeatureLocation]:
    """ Scan for open reading frames on a given sequence.
        Skips all complete ORFs (start codon to stop codon) where the stop
        codon begins 60 bases or fewer after the start codon begins.
        ORFs that run off either end of the sequence are reported with a
        fuzzy (Before/After) position on the open side and are not
        length-filtered.

        Arguments:
            seq: the sequence to examine
            direction: the search direction to use (all ORFs will use this as the strand)
            offset: an offset to add to any location discovered

        Returns:
            a list of FeatureLocations for each ORF, ordered by ascending position
    """
    # NOTE(review): for direction == -1 the coordinates found here are mirrored
    # around seq_len; presumably the caller passes the reverse complement in
    # that case - confirm against the callers.
    seq = seq.upper()
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAA', 'TAG', 'TGA')
    matches = []
    # cache the sequence length
    seq_len = len(seq)
    for frame in [0, 1, 2]:
        i = frame
        # position of the most recent stop codon seen in this frame; 0 doubles
        # as "no stop seen yet"
        last_stop = 0
        while i < seq_len - 2:
            if seq[i:i + 3] in stop_codons and last_stop == 0:
                # special case for unstarted stops
                # (a stop codon reached before any stop was recorded: report
                # an ORF with an open start that ends with this stop codon)
                last_stop = i
                new_orf = FeatureLocation(BeforePosition(offset),
                                          offset + i + 2 + 1, direction)
                if direction == -1:
                    # mirror the coordinates; the open side becomes the end
                    start = AfterPosition(seq_len + offset - new_orf.start)
                    end = seq_len + offset - new_orf.end
                    new_orf = FeatureLocation(end, start, strand=direction)
                matches.append(new_orf)
            if seq[i:i + 3] not in start_codons:
                # not a start codon, move on to the next codon in this frame
                i += 3
                continue
            # Look for the next stop codon in this frame
            for j in range(i, seq_len - 2, 3):
                if seq[j:j + 3] in stop_codons:
                    last_stop = j
                    # Skip Orfs that are shorter than 20 AA / 60 bases
                    if j - i <= 60:
                        break  # since no ORFs will be bigger before the stop
                    start = i
                    # one past the last base of the stop codon
                    end = j + 2 + 1
                    if direction == 1:
                        new_orf = FeatureLocation(offset + start, offset + end,
                                                  direction)
                    else:
                        # reversed, so convert back to the forward positions
                        new_orf = FeatureLocation(seq_len + offset - end,
                                                  seq_len + offset - start,
                                                  direction)
                    matches.append(new_orf)
                    # This was a good hit, update the last_stop cache.
                    break

            # if we found a matching stop, carry on looking for starts after this stop
            if last_stop > i:
                i = last_stop
                continue

            # Save orfs ending at the end of the sequence without stop codon
            if direction == 1:
                new_orf = FeatureLocation(i + offset,
                                          AfterPosition(seq_len + offset),
                                          direction)
            else:
                # reversed, so convert back to the forward positions
                new_orf = FeatureLocation(BeforePosition(offset),
                                          offset + seq_len - i, direction)
            matches.append(new_orf)
            # since there are no stop codons, just stop here
            break
    # order by the lower of the two coordinates of each location
    return sorted(matches, key=lambda x: min(x.start, x.end))
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    The genes are checked in three groups:
      1. single-exon genes gag, pol, env, vpr and vpu, each with its own
         length tolerance;
      2. vif, which is allowed to end up to three codons downstream;
      3. the two-exon genes tat and rev: each exon is located and checked on
         its own, then the two exons are joined and checked as a whole gene.

    Every gene is located with locate_gene and compared against the HXB2
    gene returned by get_gene_HXB2, then run through the check_* helpers
    (length, complete codons, start amino acid, end, premature stops).

    Parameters:
        refseq: the genomewide consensus; it is sliced, must expose .seq and
                is passed to FeatureLocation.extract
        VERBOSE: verbosity level forwarded to the helpers; 3 or more also
                 prints a message for every gene found

    Returns:
        True if all genes pass, False as soon as one check fails. Some
        recoverable deviations are only printed and do not fail the check.
    '''
    # Check single-exon genes
    # maximal length difference from HXB2 allowed for each gene
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end,
         start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'
        
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            # Translate from the gene start to the end of the sequence and
            # use the first stop codon found as the new gene end.
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            # distance in nucleotides between the new end and the located end
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            # a wrong start codon is tolerated for vpu only
            if genename != 'vpu':
                return False
            else:
                print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer
            # Same search for the first stop codon as above, but here an end
            # less than 90 nucleotides away in either direction is accepted.
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print genename.upper()+' ENDS '+str(end_new + 1 - (end - start) // 3)+' AMINO ACIDS DOWNSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: '+genename+' not found in genomewide!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of '+genename+' found'
    
    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start: end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # the end check is run silently here; the extension loop below reports
    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2
        # try extending the gene by one, two and three codons
        for nc in xrange(1, 4):
            gene_ext = refseq[start: end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print 'WARNING: '+genename+' actually ends '+str(nc)+' codons downstream'
                break
        else:
            # no extension produced a proper end
            print 'ERROR: '+genename+' does not end, not even slightly downstream'
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # first exon
        genename = genename_whole+'1'
        (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'
        
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start: end]
        # trim to whole codons before translating
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # second exon: searched only from 2000 positions past the end of the
        # first exon onwards
        genename = genename_whole+'2'
        (start, end, start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        # convert the coordinates back from the sliced to the full sequence
        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        # the second exon is translated starting at the offset from get_frame
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False

            else:
                # rev2 can end a bit early
                # use the last stop codon if it lies within the final 20
                # amino acids of the translation
                end_new = prot.rfind('*')
                if end_new != -1:
                    if len(prot) - 1 - end_new < 20:
                        print 'REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    # translate from the exon start to the end of the
                    # sequence and use the first stop codon found
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    end_new = prot_new.find('*')

                    if (start + 3 * end_new) - end < 200:
                        print 'REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!'
                        prot = prot_new[:end_new + 1]
                        end = start + ((end - start) % 3) + 3 * (end_new + 1)
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # whole gene: join the two exons and run the full set of checks
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        from Bio.SeqFeature import FeatureLocation
        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def kbaseGenomeToGenbank(genome_object, taxid=None):
    '''Convert a KBase genome object into a Genbank file incorporating as much info as we can
    as found in the NCBI genbank files. The result is written to standard output.

    Note - the genome object (not to be confused with a ModelSEED "annotation" object) has both
    annotations / translations AND the DNA sequence. It's obtained by calling annotate_genome on
    an object that only has the DNA sequence.

    Arguments:
        genome_object: mapping with the keys "scientific_name", "id", "contigs" (each with "id"
            and "dna") and "features" (each with "id", "type", "location" and optionally
            "function" and "protein_translation").
        taxid: the taxon ID to put in the "source" features. If None, it is looked up in the
            KBase CDMI, which needs the CDMI.py module to be importable.

    Exits with status 2 if taxid is None and either CDMI.py cannot be imported or the lookup
    returns nothing.

    Raises:
        AssertionError: if a feature has more than one location.
    '''

    organism_name = genome_object["scientific_name"]
    organism_id = genome_object["id"]

    # Get the TaxID
    # If none is specified the user has to provide one (or at least some unique integer, not necessarily a tax ID) for this system to work right.
    if taxid is None:
        # CDMI.py is from the KBase - we need it to get the Taxon ID
        # Download it at http://kbase.science.energy.gov/developer-zone/downloads/
        try:
            from CDMI import CDMI_EntityAPI
        except ImportError:
            sys.stderr.write(
                "ERROR: If no TaxID is provided, the CDMI.py file is necessary (http://kbase.science.energy.gov/developer-zone/downloads/) to attempt to guess it.\n"
            )
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to exist (e.g. under "python -S")
            sys.exit(2)
        URL = "https://www.kbase.us/services/cdmi_api/"
        cdmi_entity = CDMI_EntityAPI(URL)
        reldict = cdmi_entity.get_relationship_IsInTaxa(
            organism_id, [], [], ["id"])
        if reldict is None:
            sys.stderr.write(
                "ERROR: TaxID for Organism ID %s not found in the KBase CDMI. You will need to specify it manually if you want it\n"
                % (organism_id))
            sys.exit(2)
        else:
            taxidlist = getFieldFromRelationship(reldict, "id", "to")
            taxid = taxidlist[0]

    annotations = {'source': organism_name, 'organism': organism_name}

    # Specify contig data and "source" features for each contig (required by the genbank standard)
    contig_to_sequence = {}
    contig_to_feature_data = {}
    for contig in genome_object["contigs"]:
        contig_to_sequence[contig["id"]] = contig["dna"]
        qualifiers = {}
        qualifiers["organism"] = organism_name
        qualifiers["mol_type"] = "Genomic DNA"
        if taxid is not None:
            qualifiers["db_xref"] = "taxon:%s" % (taxid)
        source_feature = SeqFeature(FeatureLocation(0, len(contig["dna"])),
                                    strand=1,
                                    type="source",
                                    qualifiers=qualifiers)
        contig_to_feature_data[contig["id"]] = [source_feature]

    # "RNA" is not an official type in a GENBANK file.
    # We attempt to figure out based on the annotation whether it is a tRNA, rRNA, or other (misc_RNA) RNA.
    # These are the official RNA types (aside from mRNA but those don't have special fields in the Genome object)
    rRNA_finders = ("rRNA", "ribosomal", "5S", "16S", "23S", "5.8S", "28S",
                    "18S")
    tRNA_finders = ("tRNA", "transfer")

    # The contig references are inside the feature definitions in the Genome object file, but
    # in a genbank file the features in a contig must all be separated.
    # Therefore I have to keep track of them in one step and then create the SeqRecord objects
    # in a separate step.
    for feature in genome_object["features"]:
        # FIXME - What do I do with things that have more than one location?
        assert (len(feature["location"]) == 1)

        # First lets Deal with start and stop locations...
        # I verified against Pubseed that these semantics and calculations are correct, at least
        # for the proteins I checked that are the same between pubseed and KBase...
        loc = feature["location"][0]
        contig = loc[0]
        start = int(loc[1])
        strandstr = loc[2]
        strand = -1 if strandstr == "-" else 1
        # coerced like start, so a length given as a string works as well
        featurelen = int(loc[3])
        if strand == -1:
            stop = start - featurelen + 1
        else:
            stop = start + featurelen - 1
        # Now I need to convert these into Python slicing indexes...because that is what FeatureLocation wants.
        # This includes making the start always less than stop and offsetting the stop by 1 because slice [a,b] only goes up to position b-1
        seqstart = min(start, stop) - 1
        seqstop = max(start, stop)

        feature_id = feature["id"]
        feature_type = feature["type"]

        qualifiers = {}
        # Unfortunately there are features including proteins in the genome objects that have no function (not even "hypothetical protein")
        # Thankfully this isn't a required field in the Genbank file
        if "function" in feature:
            qualifiers["product"] = strip_control_characters(
                feature["function"])
        if feature_type == "CDS" or feature_type == "peg":
            qualifiers["protein_id"] = feature_id
            qualifiers["translation"] = feature["protein_translation"]

        if feature_type == "rna":
            # A feature may have no function at all (see above); treat that as
            # an empty annotation instead of failing with a KeyError.
            function = feature.get("function") or ""
            # tRNA takes precedence over rRNA when both kinds of keyword match
            if any(finder in function for finder in tRNA_finders):
                feature_type = "tRNA"
            elif any(finder in function for finder in rRNA_finders):
                feature_type = "rRNA"
            else:
                feature_type = "misc_RNA"

        # I checked that the above formulas give the correct positions in the genbank file (or at least, the same as the PubSEED genbank files).
        seq_feature = SeqFeature(FeatureLocation(seqstart, seqstop),
                                 strand=strand,
                                 type=feature_type,
                                 id=feature_id,
                                 qualifiers=qualifiers)

        # Attach the new features to the appropriate contig...
        contig_to_feature_data.setdefault(contig, []).append(seq_feature)

    # Create one record for each contig
    records = []
    for contig in contig_to_feature_data:
        seq = Seq(contig_to_sequence[contig], IUPAC.ambiguous_dna)
        record = SeqRecord(seq,
                           id=sanitizeString(contig, False),
                           description="%s contig %s" %
                           (organism_name, contig),
                           name=contig,
                           features=contig_to_feature_data[contig],
                           annotations=annotations)
        records.append(record)
    SeqIO.write(records, sys.stdout, "genbank")

    return
Пример #30
0
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Store NRPS/PKS domain predictions in ``seq_record``.

    For every core gene in ``pksnrpsvars.pksnrpscoregenes`` the 'sec_met'
    qualifiers are scanned.  Each qualifier describing an "NRPS/PKS Domain"
    is parsed for its domain type, amino-acid start/end, E-value and score;
    a new feature of type ``options.FeatureTags.pksnrpsdomains_tag`` is built
    from that information and appended to ``seq_record.features``, and the
    qualifier text itself is extended with the available predictions.
    Afterwards every cluster feature that has an entry in
    ``pksnrpsvars.compound_pred_dict`` gets two extra 'note' entries
    (monomer prediction and structure image path).

    Args:
        pksnrpsvars: container holding the core genes and the prediction
            dictionaries read below (sandpuma_*, pks_code_preds, minowa_*,
            kr_*_preds, consensuspreds*, consensuspred_gene_dict,
            compound_pred_dict).
        seq_record: record whose features and qualifiers are updated in place.
        options: provides ``FeatureTags.pksnrpsdomains_tag`` and
            ``QualifierTags.product_prediction``.

    Returns:
        None.  The process exits with status 1 (``sys.exit``) when a domain
        qualifier cannot be parsed.
    """
    # Workaround to extract positional information for CDS_motifs from the
    # sec_met qualifiers.  Raw string: the pattern is full of backslash
    # escapes that are invalid in a normal string literal.
    domain_pattern = re.compile(
        r"NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);")

    # NOTE(review): when the record holds several clusters, the qualifiers of
    # the LAST one decide the trans-AT handling for every gene -- confirm this
    # is intended.
    cluster_info = {}
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers
    # Without a cluster (or without a 'product' qualifier) fall back to the
    # regular PKS consensus instead of failing on an unbound name.
    cluster_products = cluster_info.get('product') or ['']
    is_transat = 'transatpks' in cluster_products[0]

    for feature in pksnrpsvars.pksnrpscoregenes:
        # Per-gene counters used to number the domains of each type.
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # NOTE(review): this list is filled but never attached to the feature;
        # it is kept so the lookups of the *_details dictionaries still happen.
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)
        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                # Not a domain description: carry over unchanged.
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
                continue

            # extract domain type, start and end position from qualifier string
            match_pos_obj = domain_pattern.search(qualifier)
            if not match_pos_obj:
                # logging.error, not logging.exception: there is no active
                # exception here whose traceback could be logged.
                logging.error("Exception: could not extract domain string from qualifier %s:" % qualifier)
                sys.exit(1)
            domain_type = match_pos_obj.group(1)
            start_aa = int(match_pos_obj.group(2))
            end_aa = int(match_pos_obj.group(3))
            evalue = float(match_pos_obj.group(4))
            score = float(match_pos_obj.group(5))

            # calculate respective positions based on aa coordinates
            # (3 nucleotides per amino acid, counted from the gene start on
            # the forward strand and from the gene end otherwise)
            if feature.location.strand == 1:
                start = feature.location.start + (3 * start_aa)
                end = feature.location.start + (3 * end_aa)
            else:
                end = feature.location.end - (3 * start_aa)
                start = feature.location.end - (3 * end_aa)
            loc = FeatureLocation(start, end, strand=feature.strand)

            # set up new CDS_motif feature
            domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
            domainFeature.qualifiers['domain'] = [domain_type]
            if 'locus_tag' in feature.qualifiers:
                domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                domainFeature.qualifiers['locus_tag'] = [gene_id]
            domainFeature.qualifiers['detection'] = ["hmmscan"]
            domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
            domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(evalue))]
            domainFeature.qualifiers['score'] = [score]
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            domainFeature.qualifiers['translation'] = [str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]

            domainFeature_specificity = []

            if domain_type == "AMP-binding":
                nra += 1
                domainname = gene_id + "_A" + str(nra)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                methods = pksnrpsvars.sandpuma_allmethods[domainname]
                predicat_prediction = "{}-{}".format(
                    methods['prediCAT_MP'],
                    resolve_predicat_domain_specificity(methods['prediCAT_MP']))
                domainFeature_specificity.append("Stachelhaus code: %s" % methods['ASM'])
                domainFeature_specificity.append("NRPSpredictor3 SVM: %s" % methods['SVM'])
                domainFeature_specificity.append("pHMM: %s" % methods['pHMM'])
                domainFeature_specificity.append("PrediCAT %s" % predicat_prediction)
                domainFeature_specificity.append("SANDPUMA ensemble: %s" % pksnrpsvars.sandpuma_res[domainname])
                domainFeature_specificity.append("PID to NN: %s" % pksnrpsvars.sandpuma_pid[domainname])
                domainFeature_specificity.append("SNN score: %s" % pksnrpsvars.sandpuma_snn[domainname])
                newqualifier = qualifier + (
                    " NRPS/PKS Domain: %s; Substrate specificity predictions: "
                    "%s (Stachelhaus code), %s (NRPSPredictor3 SVM), %s (pHMM), "
                    "%s (PrediCAT), %s (SANDPUMA ensemble); PID to NN: %s; "
                    "SNN score: %s ." % (
                        domainname,
                        methods['ASM'],
                        methods['SVM'],
                        methods['pHMM'],
                        predicat_prediction,
                        pksnrpsvars.sandpuma_res[domainname],
                        pksnrpsvars.sandpuma_pid[domainname],
                        pksnrpsvars.sandpuma_snn[domainname]))

                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + (
                    " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor3 SVM), %s (Stachelhaus code), %s (PrediCAT), %s (SANPDPUMA);" % (
                        domainname,
                        methods['SVM'],
                        methods['ASM'],
                        predicat_prediction,
                        pksnrpsvars.sandpuma_res[domainname]))
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "PKS_AT":
                nrat += 1
                domainname = gene_id + "_AT" + str(nrat)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("PKS signature: %s" % pksnrpsvars.pks_code_preds[domainname])
                domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_pks_preds[domainname])
                # transatpks clusters have their own consensus table; t1pks,
                # t2pks and t3pks share the regular one.  Only the selected
                # attribute is looked up.
                if is_transat:
                    consensus = pksnrpsvars.consensuspreds_transat[domainname]
                else:
                    consensus = pksnrpsvars.consensuspreds[domainname]
                domainFeature_specificity.append("consensus: %s" % consensus)
                newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (
                    pksnrpsvars.pks_code_preds[domainname],
                    pksnrpsvars.minowa_pks_preds[domainname],
                    consensus)
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (
                    pksnrpsvars.pks_code_preds_details[domainname],
                    pksnrpsvars.minowa_pks_preds_details[domainname],
                    consensus)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "CAL_domain":
                nrcal += 1
                domainname = gene_id + "_CAL" + str(nrcal)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_cal_preds[domainname])
                newqualifier = qualifier + " Substrate specificity predictions: %s (Minowa);" % (pksnrpsvars.minowa_cal_preds[domainname])
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (Minowa);" % (pksnrpsvars.minowa_cal_preds_details[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "PKS_KR":
                nrkr += 1
                domainname = gene_id + "_KR" + str(nrkr)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("KR activity: %s" % pksnrpsvars.kr_activity_preds[domainname])
                domainFeature_specificity.append("KR stereochemistry: %s" % pksnrpsvars.kr_stereo_preds[domainname])
                newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (
                    pksnrpsvars.kr_activity_preds[domainname],
                    pksnrpsvars.kr_stereo_preds[domainname])
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (
                    pksnrpsvars.kr_activity_preds[domainname],
                    pksnrpsvars.kr_stereo_preds[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            else:
                # Any other domain type: no label and no predictions, only a
                # numbered generic id; the qualifier text stays unchanged.
                nrXdom += 1
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom" + '{:02d}'.format(nrXdom)]
                updated_secmetqualifiers.append(qualifier)

            domainFeature.qualifiers['specificity'] = domainFeature_specificity
            # When a mapping exists, the raw type becomes the subtype and the
            # mapped value replaces the 'domain' qualifier.
            mapped_type = _map_domaintype(domain_type)
            if mapped_type:
                domainFeature.qualifiers['domain_subtype'] = [domain_type]
                domainFeature.qualifiers['domain'] = [mapped_type]
            domainFeatures.append(domainFeature)

        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        seq_record.features.extend(domainFeatures)

        if gene_id in pksnrpsvars.consensuspred_gene_dict:
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(pksnrpsvars.consensuspred_gene_dict[gene_id])

    #Save consensus structure + link to structure image to seq_record
    for cluster in utils.get_cluster_features(seq_record):
        clusternr = utils.get_cluster_number(cluster)
        if clusternr in pksnrpsvars.compound_pred_dict:
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
Пример #31
0
# Build one annotated SeqRecord per input sequence and collect them in
# listOfseqrecords.
for sequence in sequences:
    seq = Seq(sequence, IUPAC.unambiguous_dna)  # unambiguous_dna = only ACGT
    seqRecord = SeqRecord(
        seq,
        id='123',  # sequence ID
        name='EXAMPLE',  # Unique name (no blank spaces)
        description='Annotation for the sequence',  # enzyme annotation
        annotations={
            "molecule_type": "cDNA",
            "date": "25-MAR-2019"
        })
    qualifiers = {
        "EC_number": "9.9.9.9",  # enzyme EC
        "info": "value"  # other data
    }
    # One feature spanning the whole sequence.  The strand is set on the
    # location; the legacy SeqFeature arguments location_operator,
    # sub_features, ref and ref_db were only ever given empty values here
    # and are omitted.
    feature = SeqFeature(FeatureLocation(start=0, end=len(sequence), strand=0),
                         type='cDNA',
                         id=None,
                         qualifiers=qualifiers)
    seqRecord.features.append(feature)
    listOfseqrecords.append(seqRecord)

# Write the output file once, after all records are built, instead of
# truncating and rewriting it on every loop iteration; the context manager
# closes the handle.  No file is created when there is nothing to write.
if listOfseqrecords:
    with open('example.gb', 'w') as outputFile:
        SeqIO.write(listOfseqrecords, outputFile, 'genbank')
Пример #32
0
    def run_module(self):

        if self.id_list and os.access(self.id_list[0], os.R_OK):
            print("Detected supplied circRNA ID file.")
            with open(self.id_list[0]) as f:
                lines = f.read().splitlines()
            self.id_list = lines

        # let's first check if the temporary directory exists
        if not (os.access(self.temp_dir, os.W_OK)):
            print("Temporary directory %s not writable." % self.temp_dir)
            # exit with -1 error if we can't use it
            exit(-1)

        # next, check that the output directory is writable
        if not (os.access(self.output_dir, os.W_OK)):
            print("Output directory %s not writable." % self.output_dir)
            # exit with -1 error if we can't use it
            exit(-1)

        # the qPCR product range, if given, must consist of exactly two values
        if self.product_range and len(self.product_range) != 2:
            print(
                "Please specify a qPCR product range as range, e.g. \"-p 140 150\"."
            )
            # exit with -1 error if we can't use it
            exit(-1)

        if self.product_range[1] < self.product_range[0]:
            print("qPCR product range has to be > 0.")
            # exit with -1 error if we can't use it
            exit(-1)

        circ_rna_number = 0

        # define temporary files
        exon_storage_tmp = self.temp_dir + "circtools_flanking_exons.tmp"
        blast_storage_tmp = self.temp_dir + "circtools_blast_results.tmp"
        blast_xml_tmp = self.temp_dir + "circtools_blast_results.xml"

        output_html_file = self.output_dir + self.experiment_title.replace(
            " ", "_") + ".html"

        # erase old contents
        open(exon_storage_tmp, 'w').close()

        # define cache dicts
        exon_cache = {}
        flanking_exon_cache = {}
        primer_to_circ_cache = {}

        if self.input_circRNA:
            from Bio import SeqIO
            with open(exon_storage_tmp, 'a') as data_store:
                for record in SeqIO.parse(self.input_circRNA, "fasta"):

                    # from the FASTA file we cannot tell the coordinates of the circRNA
                    name = str(record.id) + "_0_0_" + str(len(
                        record.seq)) + "_0"

                    data_store.write("\t".join(
                        [name, str(record.seq), "", "\n"]))
                    exon_cache[name] = {1: str(record.seq), 2: ""}

        else:
            exons = self.read_annotation_file(self.gtf_file, entity="exon")

            with open(self.dcc_file) as fp:

                for line in fp:

                    # make sure we remove the header
                    if line.startswith('Chr\t'):
                        continue

                    line = line.rstrip()
                    current_line = line.split('\t')

                    if current_line[3] == "not_annotated":
                        continue

                    if self.gene_list and not self.id_list and current_line[
                            3] not in self.gene_list:
                        continue

                    sep = "_"
                    name = sep.join([
                        current_line[3], current_line[0], current_line[1],
                        current_line[2], current_line[5]
                    ])

                    if self.id_list and not self.gene_list and name not in self.id_list:
                        continue

                    circrna_length = int(current_line[2]) - int(
                        current_line[1])

                    if self.product_range[
                            0] > circrna_length or self.product_range[
                                1] > circrna_length:
                        print(
                            "Specified qPCR product size to large for circRNA \"%s\".\nCircRNA length:"
                            " %d, product size: %d to %d." %
                            (name, circrna_length, self.product_range[0],
                             self.product_range[1]))
                        exit(-1)

                    flanking_exon_cache[name] = {}

                    sep = "\t"
                    bed_string = sep.join([
                        current_line[0], current_line[1], current_line[2],
                        current_line[3],
                        str(0), current_line[5]
                    ])

                    virtual_bed_file = pybedtools.BedTool(bed_string,
                                                          from_string=True)
                    result = exons.intersect(virtual_bed_file, s=True)

                    fasta_bed_line_start = ""
                    fasta_bed_line_stop = ""

                    start = 0
                    stop = 0

                    for result_line in str(result).splitlines():
                        bed_feature = result_line.split('\t')

                        # this is a single-exon circRNA
                        if bed_feature[1] == current_line[1] and bed_feature[
                                2] == current_line[2]:
                            fasta_bed_line_start += result_line + "\n"
                            start = 1
                            stop = 1

                        if bed_feature[1] == current_line[1] and start == 0:
                            fasta_bed_line_start += result_line + "\n"
                            start = 1

                        if bed_feature[2] == current_line[2] and stop == 0:
                            fasta_bed_line_stop += result_line + "\n"
                            stop = 1

                        # these exons are kept for correctly drawing the circRNAs later
                        # not used for primer design
                        if bed_feature[1] > current_line[1] and bed_feature[
                                2] < current_line[2]:
                            flanking_exon_cache[name][bed_feature[1] + "_" +
                                                      bed_feature[2]] = 1

                    virtual_bed_file_start = pybedtools.BedTool(
                        fasta_bed_line_start, from_string=True)
                    virtual_bed_file_stop = pybedtools.BedTool(
                        fasta_bed_line_stop, from_string=True)

                    virtual_bed_file_start = virtual_bed_file_start.sequence(
                        fi=self.fasta_file)
                    virtual_bed_file_stop = virtual_bed_file_stop.sequence(
                        fi=self.fasta_file)

                    if stop == 0 or start == 0:
                        print(
                            "Could not identify the exact exon-border of the circRNA."
                        )
                        print(
                            "Will continue with non-annotated, manually extracted sequence."
                        )

                        # we have to manually reset the start position

                        fasta_bed_line = "\t".join([
                            current_line[0], current_line[1], current_line[2],
                            current_line[5]
                        ])

                        virtual_bed_file_start = pybedtools.BedTool(
                            fasta_bed_line, from_string=True)
                        virtual_bed_file_start = virtual_bed_file_start.sequence(
                            fi=self.fasta_file)
                        virtual_bed_file_stop = ""

                    exon1 = ""
                    exon2 = ""

                    if virtual_bed_file_start:
                        exon1 = open(
                            virtual_bed_file_start.seqfn).read().split(
                                "\n", 1)[1].rstrip()

                    if virtual_bed_file_stop:
                        exon2 = open(virtual_bed_file_stop.seqfn).read().split(
                            "\n", 1)[1].rstrip()

                    circ_rna_number += 1
                    print("extracting flanking exons for circRNA #",
                          circ_rna_number,
                          name,
                          end="\n",
                          flush=True)

                    if exon2 and not exon1:
                        exon1 = exon2
                        exon2 = ""

                    exon_cache[name] = {1: exon1, 2: exon2}

                    with open(exon_storage_tmp, 'a') as data_store:
                        data_store.write("\t".join([name, exon1, exon2, "\n"]))

        if not exon_cache:
            print(
                "Could not find any circRNAs matching your criteria, exiting.")
            exit(-1)

        # need to define path top R wrapper
        primer_script = 'circtools_primex_wrapper.R'

        # ------------------------------------ run script and check output -----------------------

        script_result = os.popen(primer_script + " " + exon_storage_tmp + " " +
                                 str(self.product_range[0]) + "," +
                                 str(self.product_range[1]) + " " +
                                 self.junction + " " +
                                 str(self.num_pairs)).read()

        # this is the first time we look through the input file
        # we collect the primer sequences and unify everything in one blast query

        blast_object_cache = {}
        blast_result_cache = {}

        blast_input_file = ""

        if circ_rna_number < 50:

            for line in script_result.splitlines():
                entry = line.split('\t')
                circular_rna_id = entry[0].split('_')

                if entry[1] == "NA":
                    continue

                # only blast 1
                elif entry[2] in blast_object_cache and not entry[
                        1] in blast_object_cache:
                    blast_input_file += "\n>" + entry[1] + "\n" + entry[1]
                    blast_object_cache[entry[1]] = 1
                    primer_to_circ_cache[entry[1]] = circular_rna_id[0]

                # only blast 2
                elif entry[1] in blast_object_cache and not entry[
                        2] in blast_object_cache:
                    blast_input_file += "\n>" + entry[2] + "\n" + entry[2]
                    blast_object_cache[entry[2]] = 1
                    primer_to_circ_cache[entry[2]] = circular_rna_id[0]

                # seen both already, skip
                elif entry[1] in blast_object_cache and entry[
                        2] in blast_object_cache:
                    continue

                # nothing seen yet, blast both
                else:
                    blast_input_file += "\n>" + entry[1] + "\n" + entry[
                        1] + "\n>" + entry[2] + "\n" + entry[2]
                    blast_object_cache[entry[1]] = 1
                    blast_object_cache[entry[2]] = 1
                    primer_to_circ_cache[entry[1]] = circular_rna_id[0]
                    primer_to_circ_cache[entry[2]] = circular_rna_id[0]
        else:
            print("Too many circRNAs selected, skipping BLAST step.")

        if self.no_blast:
            print("User disabled BLAST search, skipping.")

        run_blast = 0

        # check if we have to blast
        if not self.no_blast and blast_input_file:

            try:
                print("Sending " + str(len(blast_object_cache)) +
                      " primers to BLAST")
                print("This may take a few minutes, please be patient.")
                result_handle = self.call_blast(blast_input_file,
                                                self.organism)
                run_blast = 1
            except Exception as exc:
                print(exc)
                exit(-1)

            with open(blast_xml_tmp, "w") as out_handle:
                out_handle.write(result_handle.read())

            result_handle.close()
            result_handle = open(blast_xml_tmp)

            blast_records = NCBIXML.parse(result_handle)

            for blast_record in blast_records:

                if blast_record.query not in blast_result_cache:
                    blast_result_cache[blast_record.query] = []

                for description in blast_record.descriptions:

                    # filter out the host gene we're in now
                    # also filter out all "PREDICTED" stuff
                    if description.title.find(primer_to_circ_cache[blast_record.query]) == -1 and\
                            description.title.find("PREDICTED") == -1:
                        blast_result_cache[blast_record.query].append(
                            description.title)

        # if we encounter NAs nothing has been blasted, we manually set the values now

        blast_result_cache["NA"] = ["Not blasted, no primer pair found"]

        primex_data_with_blast_results = ""

        for line in script_result.splitlines():
            entry = line.split('\t')

            # split up the identifier for final plotting
            line = line.replace("_", "\t")

            if run_blast == 1:
                left_result = "No hits"
                right_result = "No hits"
            else:
                left_result = "Not blasted, no primer pair found"
                right_result = left_result

            if entry[1] in blast_result_cache:
                left_result = ";".join(blast_result_cache[entry[1]])

            if entry[2] in blast_result_cache:
                right_result = ";".join(blast_result_cache[entry[2]])

            # update line
            primex_data_with_blast_results += line + "\t" + left_result + "\t" + right_result + "\n"

        with open(blast_storage_tmp, 'w') as data_store:
            data_store.write(primex_data_with_blast_results)

        # need to define path top R wrapper
        primer_script = 'circtools_primex_formatter.R'

        # ------------------------------------ run script and check output -----------------------

        primex_data_formatted = os.popen(primer_script + " " +
                                         blast_storage_tmp + " " + "\"" +
                                         self.experiment_title + "\"").read()

        with open(output_html_file, 'w') as data_store:
            data_store.write(primex_data_formatted)

        print("Writing results to " + output_html_file)

        # here we create the circular graphics for primer visualisation
        for line in primex_data_with_blast_results.splitlines():
            entry = line.split('\t')

            # no primers, no graphics
            if entry[6] == "NA":
                continue

            circular_rna_id = "_".join(
                [entry[0], entry[1], entry[2], entry[3], entry[4]])

            if circular_rna_id in exon_cache:

                circular_rna_id_isoform = circular_rna_id + "_" + entry[5]

                circrna_length = int(entry[3]) - int(entry[2])

                exon1_length = len(exon_cache[circular_rna_id][1])
                exon2_length = len(exon_cache[circular_rna_id][2])

                exon2_colour = "#ffac68"

                if exon2_length == 0:
                    exon1_length = int(
                        len(exon_cache[circular_rna_id][1]) / 2) + 1
                    exon2_length = int(len(exon_cache[circular_rna_id][1]) / 2)
                    exon2_colour = "#ff6877"

                forward_primer_start = int(
                    entry[8].split(',')[0]) + circrna_length - exon2_length
                forward_primer_length = int(entry[8].split(',')[1])

                reverse_primer_start = int(
                    entry[9].split(',')[0]) - exon2_length
                reverse_primer_length = int(entry[9].split(',')[1])

                product_size = entry[14]

                gdd = GenomeDiagram.Diagram('circRNA primer diagram')
                gdt_features = gdd.new_track(
                    1,
                    greytrack=True,
                    name="",
                )
                gds_features = gdt_features.new_set()

                feature = SeqFeature(FeatureLocation(0, exon1_length),
                                     strand=+1)
                gds_features.add_feature(feature,
                                         name="Exon 1",
                                         label=False,
                                         color="#ff6877",
                                         label_size=22)

                feature = SeqFeature(FeatureLocation(
                    circrna_length - exon2_length, circrna_length),
                                     strand=+1)
                gds_features.add_feature(feature,
                                         name="Exon 2",
                                         label=False,
                                         color=exon2_colour,
                                         label_size=22)

                feature = SeqFeature(FeatureLocation(forward_primer_start,
                                                     circrna_length),
                                     strand=-1)
                gds_features.add_feature(feature,
                                         name="Product",
                                         label=False,
                                         color="#6881ff")

                feature = SeqFeature(FeatureLocation(0, reverse_primer_start),
                                     strand=-1)
                gds_features.add_feature(feature,
                                         name="Product: " + product_size +
                                         "bp",
                                         label=False,
                                         color="#6881ff",
                                         label_size=22,
                                         label_position="middle")

                if self.junction == "f":

                    feature = SeqFeature(FeatureLocation(
                        reverse_primer_start - reverse_primer_length,
                        reverse_primer_start),
                                         strand=-1)
                    gds_features.add_feature(feature,
                                             name="Reverse",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)

                    # the primer spans the BSJ, therefore we have to draw it in two pieces:
                    # piece 1: primer start to circRNA end
                    # piece 2: remaining primer portion beginning from 0

                    # piece 1:
                    feature = SeqFeature(
                        FeatureLocation(forward_primer_start, circrna_length))
                    gds_features.add_feature(feature,
                                             name="Forward",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)

                    # piece 2:
                    feature = SeqFeature(
                        FeatureLocation(
                            0, forward_primer_length -
                            (circrna_length - forward_primer_start)))
                    gds_features.add_feature(feature,
                                             name="Forward",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)
                elif self.junction == "r":
                    # the primer spans the BSJ, therefore we have to draw it in two pieces:
                    # piece 1: primer start of circRNA to circRNA end
                    # piece 2: remaining primer portion beginning from 0

                    # piece 1:
                    feature = SeqFeature(FeatureLocation(
                        circrna_length - reverse_primer_start, circrna_length),
                                         strand=-1)
                    gds_features.add_feature(feature,
                                             name="Reverse",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)

                    # piece 2:
                    feature = SeqFeature(FeatureLocation(
                        0, reverse_primer_start),
                                         strand=-1)
                    gds_features.add_feature(feature,
                                             name="Reverse",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)

                    feature = SeqFeature(
                        FeatureLocation(
                            forward_primer_start,
                            forward_primer_start + forward_primer_length))
                    gds_features.add_feature(feature,
                                             name="Forward",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)
                else:
                    feature = SeqFeature(FeatureLocation(
                        reverse_primer_start - reverse_primer_length,
                        reverse_primer_start),
                                         strand=-1)
                    gds_features.add_feature(feature,
                                             name="Reverse",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)

                    feature = SeqFeature(
                        FeatureLocation(
                            forward_primer_start,
                            forward_primer_start + forward_primer_length))
                    gds_features.add_feature(feature,
                                             name="Forward",
                                             label=False,
                                             sigil="BIGARROW",
                                             color="#75ff68",
                                             arrowshaft_height=0.3,
                                             arrowhead_length=0.1,
                                             label_size=22)

                feature = SeqFeature(FeatureLocation(0, 1))
                gds_features.add_feature(feature,
                                         name="BSJ",
                                         label=True,
                                         color="white",
                                         label_size=22)

                if circular_rna_id in flanking_exon_cache:
                    for exon in flanking_exon_cache[circular_rna_id]:
                        exon_start, exon_stop = exon.split('_')

                        exon_start = int(exon_start) - int(entry[2])
                        exon_stop = int(exon_stop) - int(entry[2])

                        feature = SeqFeature(FeatureLocation(
                            exon_start, exon_stop),
                                             strand=+1)
                        gds_features.add_feature(feature,
                                                 name="Exon",
                                                 label=False,
                                                 color="grey",
                                                 label_size=22)

                gdd.draw(format='circular',
                         pagesize=(600, 600),
                         circle_core=0.6,
                         track_size=0.3,
                         tracklines=0,
                         x=0.00,
                         y=0.00,
                         start=0,
                         end=circrna_length - 1)

                gdd.write(
                    self.output_dir + "/" + circular_rna_id_isoform + ".svg",
                    "svg")
Пример #33
0
def parse_gff(path):
    """Parses GFF and corresponding FASTA using GFFutils.

    Gene features are added to each record unchanged (apart from coordinate
    normalisation). CDS rows that share the same ID are merged into a single
    multi-part SeqFeature, regardless of whether other CDS rows start between
    their parts.

    Args:
        path (str):
            Path to GFF file. Should have a corresponding FASTA file of the same
            name with a valid FASTA suffix (.fa, .fasta, .fsa, .fna, .faa).
    Returns:
        list: SeqRecord objects corresponding to each scaffold in the file
    Raises:
        FileNotFoundError: If no partner FASTA file could be found for path.
        ValueError: If a FASTA record has no CDS features in the GFF file.
    """
    fasta_path = find_fasta(path)
    if not fasta_path:
        raise FileNotFoundError(f"Could not find partner FASTA file for {path}")

    # Parse FASTA and create GFFUtils database
    records = parse_fasta(fasta_path)
    gff = gffutils.create_db(
        str(path),
        ":memory:",
        force=True,
        merge_strategy="create_unique",
        sort_attribute_values=True
    )
    regions = find_regions(gff.directives)

    # Find features for each record in the FASTA file
    for record in records:
        # Normalise Feature location based on ##sequence-region directive.
        # Necessary for extracted GFF3 files that still store coordinates
        # relative to the entire region, not to the extracted FASTA.
        # If no sequence-region directive is found, assumes 1 (i.e. sequence start).
        try:
            record_start, _ = regions[record.id]
            record_start -= 1
        except KeyError:
            record_start = 0

        cds_features = []
        for feature in gff.region(seqid=record.id, featuretype=["gene", "CDS"]):
            feature = biopython_integration.to_seqfeature(feature)
            feature.location = FeatureLocation(
                feature.location.start - record_start,
                feature.location.end - record_start,
                strand=feature.location.strand
            )
            if feature.type == "CDS":
                cds_features.append(feature)
            else:
                record.features.append(feature)

        if not cds_features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        # Merge CDS features into singular SeqFeature objects, add them to record.
        # The merged feature is tracked per ID rather than by looking at the
        # previously visited row, so a CDS whose parts are interleaved with
        # another CDS (e.g. a gene nested inside an intron) is still joined
        # into one feature instead of being split.
        merged = {}
        for feature in sorted(cds_features, key=lambda f: f.location.start):
            seqid = feature.qualifiers["ID"][0]
            try:
                target = merged[seqid]
            except KeyError:
                # First part seen for this ID: it becomes the merged feature
                merged[seqid] = feature
                record.features.append(feature)
                continue

            if feature.location.strand == 1:
                # Parts arrive in ascending start order, so append
                target.location += feature.location
            else:
                # Reverse strand locations must be in biological order
                target.location = feature.location + target.location

        # Order all features (genes and merged CDS) by start position
        record.features.sort(key=lambda f: f.location.start)

    return records
Пример #34
0
    def short_sigils(self, glyph):
        """Draw sigils on top of grey box backgrounds.

        Adds one feature track with a single feature set, then draws three
        groups of features (forward strand, strandless, reverse strand) using
        the sigil named by ``glyph``. Every sigil is drawn over a grey box
        covering the same location; stranded groups also get a blue box on
        the opposite strand (only relevant for the BIGARROW sigil). The
        diagram is finished under the name ``GD_sigil_short_<glyph>``.
        """
        # Add a track of features, bigger height to emphasise any sigil errors
        self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
        # A single feature set holds everything drawn here
        self.gds_features = self.gdt_features.new_set()
        feature_set = self.gds_features

        # (name, strand, locations) for each group, in drawing order
        groups = (
            ("Forward", +1, ((15, 30), (55, 60), (75, 125))),
            ("Strandless", None, ((140, 155), (180, 185), (200, 250))),
            ("Reverse", -1, ((265, 280), (305, 310), (325, 375))),
        )
        # Sigil options by position within a group. For ARROW and BIGARROW:
        # - Green arrows just have small heads (meaning if there is a mitre
        #   it will escape the bounding box).
        # - Red arrows should be small triangles (so short no shaft shown)
        sigil_options = (
            {"arrowhead_length": 0.05},
            {"arrowhead_length": 1000, "color": "red"},
            {"arrowhead_length": 0.05},
        )

        for name, strand, locations in groups:
            for (start, end), options in zip(locations, sigil_options):
                if strand is not None:
                    # Blue box on the opposite strand of the sigil
                    opposite = SeqFeature(FeatureLocation(start, end),
                                          strand=-strand)
                    feature_set.add_feature(opposite, color="blue")
                # The same feature is added twice: grey background, then sigil
                feature = SeqFeature(FeatureLocation(start, end), strand=strand)
                feature_set.add_feature(feature, color="grey")
                feature_set.add_feature(feature,
                                        name=name,
                                        sigil=glyph,
                                        **options)

        self.finish("GD_sigil_short_%s" % glyph)
        continue
    # checking each feature and separating them into different color sets to make reading the
    # genome map easier
    if len(gd_feature_set) % 2 == 0:
        color = colors.turquoise
    else:
        color = colors.lightgreen
    #setting each feature to the color assigned and adding a label
    gd_feature_set.add_feature(feature, color=color, label=True)

# Label restriction sites: every occurrence of each recognition sequence in the
# record is added to the feature set, named after its enzyme and drawn in the
# enzyme's colour.
for site, name, color in (
    ("GAATTC", "EcoRI", colors.blue),
    ("GGATCC", "BamHI", colors.red),
):
    index = record.seq.find(site, start=0)
    while index != -1:
        feature = SeqFeature(FeatureLocation(index, index + len(site)))
        gd_feature_set.add_feature(
            feature,
            color=color,
            name=name,
            label=True,
            label_size=10,
            label_color=color,
        )
        # Resume the search immediately after the site just labelled
        index = record.seq.find(site, start=index + len(site))

# Render the diagram as a circular map and write it out as a PNG
gd_diagram.draw(
    format="circular",
    circular=True,
    pagesize=(20 * cm, 20 * cm),
    start=0,
    end=len(record),
    circle_core=0.5,
)
gd_diagram.write("ToCSV.png", "PNG")

# Add a title with the Python Imaging Library: reopen the PNG, write the name
# of the virus near the top in black, then save over the same file.
img = Image.open("ToCSV.png")
draw = ImageDraw.Draw(img)
draw.text((200, 10), "Tomato Curly Stunt Virus", fill=(0, 0, 0))
img.save("ToCSV.png")
Пример #36
0
 def setUp(self):
     self.record = Record(Seq("A" * 1000))
     self.subregion = SubRegion(FeatureLocation(100, 200), tool="test")