def get_locations(CDSs, start, end, strand):
    """Return the CDS and mRNA locations of a gene model.

    The CDS location has exact boundaries. The mRNA location covers the same
    segments, but its outermost boundaries are open-ended (``BeforePosition``
    at the very start, ``AfterPosition`` at the very end).

    Input coordinates are 1-based (GFF / GenBank style); the returned
    locations are 0-based, hence the ``- 1`` applied to every start.

    Args:
        CDSs: sequence of ``(start, end)`` pairs, one per CDS segment.
        start: overall start, used only when ``CDSs`` has at most one segment.
        end: overall end, used only when ``CDSs`` has at most one segment.
        strand: strand passed on to every created location.

    Returns:
        tuple: ``(cdsloc, mrnaloc)``.
    """
    if len(CDSs) > 1:
        last = len(CDSs) - 1
        parts, mrnaparts = [], []
        for cdsi, (s, e) in enumerate(CDSs):
            parts.append(FeatureLocation(s - 1, e, strand=strand))
            # Only the first segment's start and the last segment's end are
            # fuzzy; every inner boundary stays exact.
            mrna_start = BeforePosition(s - 1) if cdsi == 0 else s - 1
            mrna_end = AfterPosition(e) if cdsi == last else e
            mrnaparts.append(
                FeatureLocation(mrna_start, mrna_end, strand=strand))
        cdsloc = CompoundLocation(parts)
        # Fix: build the mRNA location from the fuzzy parts. The exact CDS
        # parts were used here before, which discarded the open boundaries.
        mrnaloc = CompoundLocation(mrnaparts)
    else:
        cdsloc = FeatureLocation(start - 1, end, strand=strand)
        mrnaloc = FeatureLocation(BeforePosition(start - 1),
                                  AfterPosition(end), strand=strand)
    return cdsloc, mrnaloc
def get_subsequence(self, resnums, new_id=None, copy_letter_annotations=True):
    """Get a subsequence as a new SeqProp object given a list of residue numbers.

    Args:
        resnums: residue numbers (1-based) to pull out of ``self.seq_record``.
        new_id: ID for the new object; defaults to ``'<self.id>_subseq'``.
        copy_letter_annotations: if true, carry over the letter annotations
            of the extracted record.

    Returns:
        SeqProp: the extracted subsequence, or ``None`` when ``resnums`` is
        empty.

    Raises:
        ValueError: if no sequence record is stored on this object.
    """
    if not self.seq_record:
        raise ValueError('No chain sequence stored')

    # One single-residue location per requested residue number.
    residue_locations = [
        FeatureLocation(resnum - 1, resnum) for resnum in resnums
    ]

    if len(residue_locations) == 0:
        log.info('Zero length subsequences')
        return None

    if len(residue_locations) == 1:
        log.debug('Subsequence only one residue long')
        location = residue_locations[0]
    else:
        location = CompoundLocation(residue_locations)

    extracted = location.extract(self.seq_record)
    subseq_id = new_id if new_id else '{}_subseq'.format(self.id)
    new_sp = SeqProp(id=subseq_id, seq=extracted)
    if copy_letter_annotations:
        new_sp.letter_annotations = extracted.letter_annotations
    return new_sp
def gb(self):
    """Render this sequence and its features as GenBank-formatted text.

    Features whose end runs past the sequence length on a circular
    (``shape == 'c'``) sequence are written as a two-part compound location
    split at the origin.
    """
    total_length = self.length()
    record = SeqRecord(
        Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
        id=self.name[0:8],
        name=self.name[0:8],
        description=self.description,
    )
    record.features = []

    for f in self.features():
        strand = 1 if f.direction == 'f' else -1

        if self.shape == 'c' and f.end > total_length:
            up_to_origin = FeatureLocation(
                ExactPosition(f.start), ExactPosition(total_length), strand)
            past_origin = FeatureLocation(
                ExactPosition(0), ExactPosition(f.end - total_length), strand)
            # Part order follows the reading direction of the feature.
            if strand == 1:
                ordered_parts = [up_to_origin, past_origin]
            else:
                ordered_parts = [past_origin, up_to_origin]
            floc = CompoundLocation(ordered_parts)
        else:
            floc = FeatureLocation(
                ExactPosition(f.start), ExactPosition(f.end), strand)

        qualifiers = {q.name: q.data for q in f.qualifiers.all()}
        record.features.append(SeqFeature(floc, f.type, qualifiers=qualifiers))

    return record.format('genbank')
def test_eq_not_identical(self):
    """Test two different locations are not equal."""

    def joined(strand):
        # Fresh two-part location built with the ``+`` operator.
        return FeatureLocation(12, 17, strand) + FeatureLocation(23, 42, strand)

    # Different number of parts.
    self.assertNotEqual(joined(1), joined(1) + FeatureLocation(50, 60, 1))

    # Same coordinates, opposite strand.
    self.assertNotEqual(joined(1), joined(-1))

    # Same parts, different operator.
    default_operator = CompoundLocation(
        [FeatureLocation(12, 17, 1), FeatureLocation(23, 42, 1)]
    )
    order_operator = CompoundLocation(
        [FeatureLocation(12, 17, 1), FeatureLocation(23, 42, 1)], "order"
    )
    self.assertNotEqual(default_operator, order_operator)

    # Not a location at all.
    self.assertNotEqual(joined(1), 5)
def get_deletions(gb_m1, l, e, nc_margin):
    """Define a list of deletions according to parameters L and E.

    A deletion is any region at least L long that does not contain a gene
    with essentiality score higher than E.

    Args:
        gb_m1 (Bio.SeqRecord.SeqRecord): GenBank annotation.
        l (int): minimum deletion length.
        e (float): gene essentiality threshold.
        nc_margin (int): non-coding margin around essential genes.

    Returns:
        deletions (Bio.SeqFeature.CompoundLocation): list of proposed
        deletions.

    Raises:
        SystemExit: if building the final compound location raises
            ``ValueError``.
    """
    genome_length = len(gb_m1)

    essential_genes = [
        feat for feat in gb_m1.features if is_essential(feat, e)
    ]
    print("Essential gene count with threshold %.3f: %d"
          % (e, len(essential_genes)))

    # Pad each essential gene by the margin, clamped to the record bounds.
    padded = []
    for gene in essential_genes:
        padded_start = max(gene.location.start - nc_margin, 0)
        padded_end = min(gene.location.end + nc_margin, genome_length)
        padded.append(FeatureLocation(padded_start, padded_end))

    essential_regions = merge_overlaps(CompoundLocation(padded))
    nonessential_regions = complementary_compoundloc(
        0, genome_length, essential_regions)

    try:
        deletions = CompoundLocation([
            candidate for candidate in nonessential_regions.parts
            if len(candidate) >= l
        ])
    except ValueError:
        raise SystemExit("No deletions defined! Try readjusting L and/or E.")
    return deletions
def build_location_from_others(
        locations: Sequence[Location]) -> FeatureLocation:
    """ Builds a new location from non-overlapping others.

        Where one location ends exactly where the next starts, the touching
        parts are fused into one. The result is a CompoundLocation if any
        input was compound or if the inputs are not contiguous.

        Arguments:
            locations: a sequence of FeatureLocations to merge

        Returns:
            a FeatureLocation if the locations are continuous, otherwise a
            CompoundLocation
    """
    if not locations:
        raise ValueError("at least one FeatureLocation required")

    merged = locations[0]
    for following in locations[1:]:
        if following.start == merged.end:
            # Fuse the last part so far with the first part of the newcomer.
            bridge = FeatureLocation(merged.parts[-1].start,
                                     following.parts[0].end,
                                     merged.strand)
            either_is_compound = (len(merged.parts) > 1
                                  or len(following.parts) > 1)
            if either_is_compound:
                merged = CompoundLocation(
                    merged.parts[:-1] + [bridge] + following.parts[1:])
            else:
                merged = bridge
        else:
            merged = CompoundLocation(merged.parts + following.parts)
    return merged
def test_other(self):
    # Same three segments on each strand; only the expected result differs.
    segments = [(5922, 6190), (5741, 5877), (4952, 5682)]
    expected_by_strand = [(1, (5243, 6064)), (-1, (5078, 5854))]

    for strand, expected in expected_by_strand:
        location = CompoundLocation([
            FeatureLocation(begin, finish, strand=strand)
            for begin, finish in segments
        ])
        assert self.func(97, 336, location) == expected
def _feature(self, seq, s, e, strand):
    """Build a location for ``s``..``e``, splitting it at the end of ``seq``
    when ``s`` is not smaller than ``e``.

    The split parts are ordered tail-then-head for strand ``1`` and
    head-then-tail for any other strand (built with strand ``-1``).
    """
    if s < e:
        return FeatureLocation(s, e, strand=strand)

    if strand == 1:
        parts = [
            FeatureLocation(s, len(seq.seq), strand=1),
            FeatureLocation(0, e, strand=1),
        ]
    else:
        parts = [
            FeatureLocation(0, e, strand=-1),
            FeatureLocation(s, len(seq.seq), strand=-1),
        ]
    return CompoundLocation(parts)
def check_sub(feature, sequence) -> List[SeqFeature]:
    """ Recursively checks a GFF feature for any subfeatures and generates
        any appropriate SeqFeature instances from them.

        Arguments:
            feature: the GFF feature whose ``sub_features`` are inspected
            sequence: the record the features belong to; ``sequence.seq`` is
                      used to build the translation

        Returns:
            a list of the SeqFeatures built (empty if nothing was found)
    """
    new_features = []
    locations = []  # type: List[FeatureLocation]
    trans_locations = []  # type: List[FeatureLocation]
    qualifiers = {}  # type: Dict[str, List[str]]
    mismatching_qualifiers = set()  # type: Set[str]
    for sub in feature.sub_features:
        if sub.sub_features:  # If there are sub_features, go deeper
            new_features.extend(check_sub(sub, sequence))
        elif sub.type == 'CDS':
            sub_mismatch = generate_details_from_subfeature(
                sub, qualifiers, locations, trans_locations)
            mismatching_qualifiers.update(sub_mismatch)

    # Qualifiers that differ between the parts cannot describe the whole.
    for qualifier in mismatching_qualifiers:
        del qualifiers[qualifier]
    if 'Parent' in qualifiers:
        del qualifiers['Parent']

    # if nothing to work on
    if not new_features and not locations:
        return []

    # Only works in tip of the tree, when there's no new_feature built yet.
    # If there is, it means the script just came out of a check_sub and it's
    # ready to return.
    if not new_features:
        new_loc = locations[0]
        # construct a compound location if required
        if len(locations) > 1:
            locations = sorted(locations, key=lambda x: x.start.real)
            trans_locations = sorted(trans_locations,
                                     key=lambda x: x.start.real)
            if locations[0].strand == 1:
                new_loc = CompoundLocation(locations)
            else:
                new_loc = CompoundLocation(list(reversed(locations)))
                trans_locations = list(reversed(trans_locations))
        # TODO: use new secmet features
        new_feature = SeqFeature(new_loc)
        new_feature.qualifiers = qualifiers
        new_feature.type = 'CDS'
        # Fix: go through str() instead of the private ``Seq._data``, which
        # is bytes on recent Biopython releases and cannot be joined as text.
        trans = ''.join(
            str(part.extract(sequence.seq).translate(stop_symbol=''))
            for part in trans_locations
        )
        new_feature.qualifiers['translation'] = [trans]
        new_features.append(new_feature)
    return new_features
def get_subsequence(self, resnums):
    """Get a subsequence as a new SeqProp object given a list of residue numbers.

    Args:
        resnums: residue numbers (1-based) to extract from this object.

    Returns:
        SeqProp: new object with ID ``'<self.id>_subseq'`` holding the
        extracted residues and their letter annotations.

    Raises:
        ValueError: if ``resnums`` is empty.
    """
    residue_locations = [
        FeatureLocation(resnum - 1, resnum) for resnum in resnums
    ]
    if not residue_locations:
        # Same exception type CompoundLocation raised for an empty list,
        # but with a message that names the actual problem.
        raise ValueError('Cannot build a subsequence from zero residue numbers')

    if len(residue_locations) == 1:
        # CompoundLocation requires at least two parts, so a single residue
        # has to be extracted through its plain FeatureLocation.
        sub_feature_location = residue_locations[0]
    else:
        sub_feature_location = CompoundLocation(residue_locations)

    sub_feature = sub_feature_location.extract(self)
    new_sp = SeqProp(id='{}_subseq'.format(self.id), seq=sub_feature)
    new_sp.letter_annotations = sub_feature.letter_annotations
    return new_sp
def test_position_conversion_nonzero_start_compound(self):
    # Forward strand.
    forward = CompoundLocation([FeatureLocation(6, 18, strand=1),
                                FeatureLocation(24, 27, strand=1)])
    assert len(forward) == 15
    forward_cases = [((0, 2), (6, 12)), ((1, 4), (9, 18)), ((3, 5), (15, 27))]
    for (first, last), expected in forward_cases:
        assert self.func(first, last, forward) == expected

    # Reverse strand.
    reverse = CompoundLocation([FeatureLocation(6, 15, strand=-1),
                                FeatureLocation(21, 27, strand=-1)])
    assert len(reverse) == 15
    reverse_cases = [((0, 2), (21, 27)), ((1, 4), (9, 24)), ((3, 5), (6, 12))]
    for (first, last), expected in reverse_cases:
        assert self.func(first, last, reverse) == expected
def get_subsequence_from_property(self, property_key, property_value,
                                  condition, return_resnums=False):
    """Get a subsequence as a new SeqProp object given a certain property you
    want to find in this chain's letter_annotation

    See documentation for
    :func:`ssbio.protein.sequence.seqprop.SeqProp.get_subsequence_from_property`

    Args:
        property_key (str): Property key in the ``letter_annotations``
            attribute that you want to filter using
        property_value (str): Property value that you want to filter by
        condition (str): ``<``, ``=``, ``>``, ``>=``, or ``<=`` to filter the
            values by
        return_resnums (bool): If resnums should be returned as well

    Returns:
        SeqProp: New SeqProp object that you can run computations on or just
        extract its properties. With ``return_resnums`` a tuple of that
        object and the matching 1-based residue numbers.

    Raises:
        ValueError: if no chain sequence is stored, or no residue satisfies
            the condition.
        KeyError: if ``property_key`` is not a letter annotation.
    """
    if not self.seq_record:
        raise ValueError('No chain sequence stored')

    if property_key not in self.seq_record.letter_annotations:
        raise KeyError(
            '{}: {} not contained in the letter annotations'.format(
                self.seq_record.id, property_key))

    subfeat_indices = list(
        locate(
            self.seq_record.letter_annotations[property_key],
            lambda x: ssbio.utils.check_condition(x, condition,
                                                  property_value)))

    if not subfeat_indices:
        # Same exception type CompoundLocation raised for an empty list,
        # but with a message that names the actual problem.
        raise ValueError(
            '{}: no residues where {} {} {}'.format(
                self.seq_record.id, property_key, condition, property_value))

    residue_locations = [
        FeatureLocation(idx, idx + 1) for idx in subfeat_indices
    ]
    if len(residue_locations) == 1:
        # CompoundLocation requires at least two parts, so a single match
        # has to be extracted through its plain FeatureLocation.
        sub_feature_location = residue_locations[0]
    else:
        sub_feature_location = CompoundLocation(residue_locations)

    sub_feature = sub_feature_location.extract(self.seq_record)
    new_sp = SeqProp(id='{}-{}_{}_{}_{}_extracted'.format(
        self.pdb_parent, self.id, property_key, condition, property_value),
        seq=sub_feature)
    new_sp.letter_annotations = sub_feature.letter_annotations

    if return_resnums:
        # Indices are 0-based; residue numbers are 1-based.
        return new_sp, [x + 1 for x in subfeat_indices]
    return new_sp
def nucleic_sequence(self, feature_type):
    """
    Extract the nucleic sequence covered by the features of one type.

    Returns None when ``_get_features`` yields nothing for that type.

    :type feature_type: str
    :rtype: Bio.Seq.Seq
    """
    locations = self._get_features(feature_type)
    if not locations:
        return None

    # Several locations are joined; a single one is used as it is.
    feature = CompoundLocation(locations) if len(locations) > 1 else locations[0]
    return feature.extract(self.chromosome.nucleic_sequence)
def test_position_conversion_compound_reverse(self):
    def reverse_compound(*bounds):
        # Reverse-strand compound location from (start, end) pairs.
        return CompoundLocation([
            FeatureLocation(begin, finish, strand=-1)
            for begin, finish in bounds
        ])

    two_parts = reverse_compound((0, 6), (9, 18))
    assert len(two_parts) == 15
    assert self.func(0, 4, two_parts) == (3, 18)
    assert self.func(1, 5, two_parts) == (0, 15)

    three_parts = reverse_compound((0, 6), (12, 15), (21, 27))
    assert len(three_parts) == 15
    assert self.func(0, 4, three_parts) == (3, 27)
    assert self.func(1, 5, three_parts) == (0, 24)
    assert self.func(2, 3, three_parts) == (12, 15)
def set_annotation(self, seq_ident, seq_rec, gb_file):
    """Add the stored repeat hits to ``seq_rec`` and write it as GenBank.

    Every annotation contributes two ``repeat_region`` features, one per
    copy; each carries a note with the position of the other copy. Nothing
    is done when there are no annotations.

    Args:
        seq_ident: ID assigned to the record before writing.
        seq_rec: record that receives the features (modified in place).
        gb_file: output target handed to ``SeqIO.write``.

    Returns:
        ``True`` after the record was written, ``None`` when there were no
        annotations.
    """
    if self._annotation:
        SeqIO = import_bio_seq_io()
        # If SeqIO exists than these should be installed!
        from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
        from Bio.Alphabet import DNAAlphabet

        def _seg_length(s, e):
            # Fix: a segment that wraps the origin covers [s, length) plus
            # [0, e), matching what _get_loc builds. The previous formula
            # (s + length - e) could exceed the sequence length.
            return (e - s) if e > s else (self._length - s + e)

        def _get_loc(s, e):
            if e > s:
                return FeatureLocation(s, e)
            return CompoundLocation([FeatureLocation(s, self._length),
                                     FeatureLocation(0, e)])

        for a in self._annotation:
            # Fix: compare both copies. The first copy used to be passed
            # twice, so the second copy's length was never considered.
            length = max(_seg_length(a.start_1, a.end_1),
                         _seg_length(a.start_2, a.end_2))
            # NOTE(review): this ratio is length / matched; confirm that is
            # the intended percentage (matched / length looks more likely).
            m = 100 if a.matched == length else round(
                100 * length / a.matched, 2)
            note_1 = f'repeat_hit position {a.start_2} - {a.end_2}, match {m}.%, length {a.matched}'
            note_2 = f'repeat_hit position {a.start_1} - {a.end_1}, match {m}.%, length {a.matched}'
            rpt = ('rpt_type', a.rpt_type)
            seq_rec.features.append(
                SeqFeature(_get_loc(a.start_1, a.end_1), type='repeat_region',
                           qualifiers=OrderedDict([rpt, ('note', note_1)])))
            seq_rec.features.append(
                SeqFeature(_get_loc(a.start_2, a.end_2), type='repeat_region',
                           qualifiers=OrderedDict([rpt, ('note', note_2)])))

        # Fix other data
        seq_rec.id = seq_ident
        seq_rec.seq.alphabet = DNAAlphabet()
        SeqIO.write([seq_rec], gb_file, 'genbank')
        return True
def save_genbank_m4(gb_m3, gb_m4):
    """Generate modified-IV GenBank file (reduced version of the genome).

    The sequence outside the deletion features is extracted into a new
    record, and features lying fully inside a kept region are shifted and
    copied over.

    Args:
        gb_m3 (Bio.SeqRecord.SeqRecord): modified-III GenBank annotation.
            Locations of its copied features are updated in place.
        gb_m4 (str): modified-IV GenBank file path.
    """
    deletions = CompoundLocation([
        candidate.location
        for candidate in gb_m3.features
        if is_deletion(candidate)
    ])
    non_deletions = complementary_compoundloc(0, len(gb_m3), deletions)

    reduced_annot = SeqRecord(
        seq=non_deletions.extract(gb_m3.seq),
        id=gb_m3.id,
        name=gb_m3.name,
        description=gb_m3.description,
        dbxrefs=gb_m3.dbxrefs,
        annotations=gb_m3.annotations,
    )

    # Shift features' positions according to deletions
    end = 0
    for nondel in non_deletions.parts:
        offset = nondel.start - end
        for feature in gb_m3.features:
            starts_inside = feature.location.start in nondel
            if starts_inside and feature.location.end in nondel:
                feature.location = feature.location + (-offset)
                reduced_annot.features.append(feature)
        end = nondel.end - offset + 2

    SeqIO.write(reduced_annot, gb_m4, "genbank")
def feature_data_to_seqfeature(coordinates, feature_type):
    """Converts coordinates received from feature table parsing to a SeqFeature.

    Each pair is turned into a 0-based location. A pair whose first value is
    larger than its second is read as a reverse-strand segment.

    :param coordinates: Start and end positions for the feature.
    :type coordinates: list[tuple(int, int)]
    :param feature_type: Label of the parsed feature.
    :type str
    :returns: Returns a Biopython SeqFeature loaded with given coordinates.
    :rtype: SeqFeature
    """
    locations = []
    for coordinate in coordinates:
        # Fix: convert before comparing. Values that arrive as strings were
        # compared lexicographically ("100" <= "99"), giving the wrong strand.
        first, second = int(coordinate[0]), int(coordinate[1])
        if first <= second:
            start, stop, strand = first - 1, second, 1
        else:
            start, stop, strand = second - 1, first, -1
        locations.append(FeatureLocation(start, stop, strand=strand))

    if len(locations) == 1:
        location = locations[0]
    else:
        location = CompoundLocation(locations)
    return SeqFeature(location, type=feature_type)
def create_3_part_seqfeature(start1=0, stop1=0, strand1=1,
                             start2=0, stop2=0, strand2=1,
                             start3=0, stop3=0, strand3=1,
                             type="", qualifiers=None):
    """Constructs a BioPython SeqFeature whose location is a three-part join.

    Start1, Start2, Start3 = int
    Stop1, Stop2, Stop3 = int
    Strand1, Strand2, Strand3 = int (-1, 1)
    Type = 'CDS', 'Source', 'tRNA', etc.
    Qualifiers = dictionary of feature descriptions."""
    segments = (
        (start1, stop1, strand1),
        (start2, stop2, strand2),
        (start3, stop3, strand3),
    )
    parts = [
        FeatureLocation(ExactPosition(begin), ExactPosition(finish),
                        strand=direction)
        for begin, finish, direction in segments
    ]
    location = CompoundLocation(parts, "join")
    seq_ftr = SeqFeature(location, type=type, location_operator="join",
                         qualifiers=qualifiers)
    return seq_ftr
def set_seqfeature(self):
    """
    Create a SeqFeature object with which to populate the
    `seqfeature` attribute.

    Genes with start > stop are treated as wrapping around the end of the
    genome and get a two-part compound location.

    :return:
    """
    # SeqFeature coordinates are 0-based half-open
    start, stop = basic.reformat_coordinates(self.start, self.stop,
                                             self.coordinate_format,
                                             "0_half_open")

    # SeqFeature orientation is (-1, 1) instead of ("R", "F")
    strand = basic.reformat_strand(self.orientation, "numeric")

    # The strand is set on the location(s) rather than passed to SeqFeature:
    # the SeqFeature argument is deprecated/removed in newer Biopython, and
    # on older releases it only forwarded the value to the location anyway.
    if self.start <= self.stop:
        # Standard genes will have start < stop
        location = FeatureLocation(start, stop, strand=strand)
    else:
        # Wrap-around genes will have stop < start
        location = CompoundLocation([
            FeatureLocation(start, self.genome_length, strand=strand),
            FeatureLocation(0, stop, strand=strand),
        ])
    self.seqfeature = SeqFeature(location, type=self.type)

    # Add feature qualifiers
    self.seqfeature.qualifiers = self.get_qualifiers()
def wraparound(feature):
    """Return the shifted copy of a feature that spans the shift point.

    The feature becomes a two-part join: the part after the shift point now
    starts at 0, and the part before it now ends at ``length``.
    Sub-features are shifted the same way, recursing into those that span
    the shift point themselves.

    NOTE(review): ``shift`` and ``length`` are not defined in this block;
    presumably they come from the enclosing scope (rotation offset and
    sequence length). Confirm against the caller.
    """
    new_start = length - (shift - feature.location.start)
    new_end = feature.location.end - shift

    c = SeqFeature(CompoundLocation([FeatureLocation(0, new_end),
                                     FeatureLocation(new_start, length)]),
                   type=feature.type,
                   location_operator="join",
                   strand=feature.strand,
                   id=feature.id,
                   qualifiers=feature.qualifiers)

    sub_features = []
    # Fix: every test and every copied attribute below now reads the
    # sub-feature ``sf``. The loop previously read the parent ``feature``,
    # so all sub-features were handled identically.
    for sf in feature.sub_features:
        if sf.location.end <= shift:
            # Entirely before the shift point: moved to the tail, using the
            # same mapping as ``new_start`` above.
            sub_features.append(
                SeqFeature(FeatureLocation(length - (shift - sf.location.start),
                                           length - (shift - sf.location.end)),
                           type=sf.type,
                           location_operator=sf.location_operator,
                           strand=sf.strand,
                           id=sf.id,
                           qualifiers=sf.qualifiers,
                           sub_features=None))
        elif sf.location.start >= shift:
            # Entirely after the shift point: moved towards the origin.
            sub_features.append(
                SeqFeature(FeatureLocation(sf.location.start - shift,
                                           sf.location.end - shift),
                           type=sf.type,
                           location_operator=sf.location_operator,
                           strand=sf.strand,
                           id=sf.id,
                           qualifiers=sf.qualifiers,
                           sub_features=None))
        else:
            # Fix: wraparound() returns one SeqFeature, so it is appended.
            # extend() would have iterated over the feature itself.
            sub_features.append(wraparound(sf))
    c.sub_features.extend(sub_features)
    return c
def new_compound_location(
    indices: List[Union[Tuple[int, int], Tuple[int, int, int]]], strand: int
) -> CompoundLocation:
    """Build a CompoundLocation from ``(start, end[, strand])`` tuples.

    Args:
        indices: one tuple per part. Two-element tuples use the ``strand``
            argument; three-element tuples carry their own strand.
        strand: default strand for two-element tuples.

    Returns:
        A CompoundLocation with one exact-position part per tuple.

    Raises:
        ValueError: if an element is not a tuple, has the wrong size, or
            contains a non-integer value.
    """
    locations = []
    for index in indices:
        if not isinstance(index, tuple):
            # Fix: report the class of the offending element. The message
            # previously showed the class of the whole container.
            raise ValueError(
                "Expects a tuple of integers size 2 or 3, not a {}".format(
                    index.__class__
                )
            )
        if len(index) not in (2, 3):
            raise ValueError("Expects a tuple of integers of size 2 or 3")

        if len(index) == 2:
            i, j = index
            s = strand
        else:
            i, j, s = index

        if not (isinstance(i, int) and isinstance(j, int) and isinstance(s, int)):
            raise ValueError(
                "Expects a tuple of integers of size 2 or 3. Found {}".format(index)
            )
        locations.append(FeatureLocation(ExactPosition(i), ExactPosition(j), strand=s))
    return CompoundLocation(locations)
def _adjust_location_by_offset(location: Location, offset: int) -> Location:
    """ Adjusts the given location to account for an offset (e.g. start_codon)

        Only the first part of a compound location is adjusted. On strand -1
        the end moves by the offset; otherwise the start does.
    """
    assert -2 <= offset <= 2, "invalid offset %d" % offset

    def adjust_single_location(part: FeatureLocation) -> FeatureLocation:
        """ only functions on FeatureLocation """
        assert not isinstance(part, CompoundLocation)
        start, end = part.start, part.end
        # type(...)(...) rebuilds the position with its original class.
        if part.strand == -1:
            end = type(end)(end + offset)
        else:
            start = type(start)(start + offset)
        return FeatureLocation(start, end, part.strand)

    if not isinstance(location, CompoundLocation):
        return adjust_single_location(location)

    leading = location.parts[0]
    # The first part must hold the biological start of the whole location.
    if location.strand == -1:
        assert leading.end == location.end
    else:
        assert leading.start == location.start
    return CompoundLocation(
        [adjust_single_location(leading)] + location.parts[1:])
def nucleic_coding_sequence(self):
    """
    Return the coding sequence, computing and caching it on first use.

    :rtype: Bio.Seq.Seq
    """
    if self._nucleic_coding_sequence:
        return self._nucleic_coding_sequence

    # Several CDS parts are joined; otherwise the first one is used as is.
    has_many_parts = self.cds and len(self.cds) > 1
    cds = CompoundLocation(self.cds) if has_many_parts else self.cds[0]

    extracted = cds.extract(self.chromosome.nucleic_sequence)
    self._nucleic_coding_sequence = extracted
    if self.location.strand == -1:
        self._nucleic_coding_sequence = Seq(extracted).reverse_complement()
    return self._nucleic_coding_sequence
def test_get_list_of_locations(self):
    # Decomposing a two-part location must give a list of FeatureLocations.
    parts = [
        FeatureLocation(10, 40, strand=+1),
        FeatureLocation(50, 59, strand=+1),
    ]
    location_list = hf.decompose_compound_location(CompoundLocation(parts))

    self.assertTrue(isinstance(location_list, list))
    for position in (0, 1):
        self.assertTrue(isinstance(location_list[position], FeatureLocation))
def check_sub(feature, sequence):
    """Recursively turn the CDS sub-features of a GFF feature into SeqFeatures.

    Sub-features that have children of their own are handled by recursion.
    Childless CDS sub-features at this level are merged into a single CDS
    feature with a ``translation`` qualifier.
    """
    new_features = []
    loc_list = []
    qual_list = {}
    topop = []
    for sub in feature.sub_features:
        if sub.sub_features:  # If there are sub_features, go deeper
            new_features.extend(check_sub(sub, sequence))
            continue
        if sub.type != 'CDS':
            continue

        loc_list.append(FeatureLocation(sub.location.start.real,
                                        sub.location.end.real,
                                        strand=sub.strand))
        # For split features (CDSs), the final feature will have the same
        # qualifiers as the children ONLY if they're the same, i.e.: all
        # children have the same "protein_ID" (key and value).
        for qual, value in sub.qualifiers.items():
            seen = qual_list.setdefault(qual, value)
            if not seen == value:
                topop.append(qual)

    for mismatched in topop:  # Pop mismatching qualifers over split features
        qual_list.pop(mismatched, None)
    qual_list.pop('Parent', None)  # Pop parent.

    # Only works in tip of the tree, when there's no new_feature built yet.
    # If there is, it means the script just came out of a check_sub and it's
    # ready to return.
    if new_features or len(loc_list) == 0:
        return new_features

    if len(loc_list) > 1:
        ordered = sorted(loc_list, key=lambda x: x.start.real)
        if ordered[0].strand == 1:
            new_loc = CompoundLocation(ordered)
        else:
            new_loc = CompoundLocation(list(reversed(ordered)))
    else:
        new_loc = loc_list[0]

    new_feature = SeqFeature(new_loc)
    new_feature.qualifiers = qual_list
    new_feature.type = 'CDS'
    trans = new_feature.extract(sequence.seq).translate(stop_symbol='')
    new_feature.qualifiers['translation'] = [str(trans)]
    new_features.append(new_feature)
    return new_features
def coordinatesToLocation(self, coordinates):
    """Parse a ``start,end,strand;start,end,strand;...`` string into a location.

    One segment gives a FeatureLocation, several give a CompoundLocation.
    """
    locationParts = []
    for segment in coordinates.split(';'):
        fields = segment.split(',')
        locationParts.append(
            FeatureLocation(int(fields[0]), int(fields[1]), int(fields[2])))

    if len(locationParts) == 1:
        return locationParts[0]
    if len(locationParts) > 1:
        return CompoundLocation(locationParts)
    return None
def test_eq_identical(self):
    """Test two identical locations are equal."""

    def summed():
        # Two-part location built with the ``+`` operator.
        return FeatureLocation(12, 17, 1) + FeatureLocation(23, 42, 1)

    def listed():
        # The same location built from an explicit list of parts.
        return CompoundLocation([FeatureLocation(12, 17, 1),
                                 FeatureLocation(23, 42, 1)])

    self.assertEqual(summed(), summed())
    self.assertEqual(summed(), listed())
def setUp(self):
    # The sequence with filler between the coding parts, and without it.
    self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
    self.magic = Seq("ATGGCAGGTATTTGT")
    self.translation = "MAGIC"

    # One forward-strand location per coding part.
    coding_bounds = [(0, 6), (12, 15), (21, 27)]
    self.sub_locations = [
        FeatureLocation(begin, finish, strand=1)
        for begin, finish in coding_bounds
    ]
    self.location = CompoundLocation(self.sub_locations)
    self.cds = CDSFeature(self.location, locus_tag="compound")
def new_feature_location(start: int, end: int, length: int, strand: int):
    """Makes a FeatureLocation, or a two-part CompoundLocation when
    ``start > end``.

    Raises ValueError if ``start > end`` and ``length`` is None.
    """
    if not start > end:
        return FeatureLocation(start, end, strand=strand)

    if length is None:
        raise ValueError(
            "A length must be provided to create a feature with start > end."
        )

    up_to_length = FeatureLocation(start, length, strand)
    # NOTE(review): this part starts at 1, not 0 - confirm the coordinate
    # convention expected by callers.
    from_origin = FeatureLocation(1, end, strand)
    if strand == -1:
        ordered = [from_origin, up_to_length]
    else:
        ordered = [up_to_length, from_origin]
    return CompoundLocation(ordered)
def find_repeat(self, contig, fn, st, ppno, extra_dna):
    """
    Find repeats in the DNA sequence

    :param self: The data object
    :param contig: the name of the contig we are searching on
    :param fn: the nuclotide sequence to search
    :param st: the start to find repeats at
    :param ppno: the prophage number
    :param extra_dna: the extra dna that flanks the sequence
    :return: a dict of repeat regions keyed by a running index; each value
        holds the offset-adjusted ``s1``/``e1``/``s2``/``e2`` positions
    """
    if len(fn) == 0:
        log_and_message("Len sequence is 0 so ignoring\n", c="RED", stderr=True, loglevel="WARNING")
        return {}

    try:
        # set the False parameter to True to enable debugging of repeat finder
        repeats = PhiSpyRepeatFinder.repeatFinder(fn, 3, self.min_repeat_len, ppno, False)
    except Exception as e:
        log_and_message(f"There was an error running repeatfinder for {fn}:{e}\n", c="RED", stderr=True,
                        loglevel="WARNING")
        return {}

    rep = {}
    index = 0
    flank = 3 * extra_dna
    for r in repeats:
        # Keep only pairs with one copy near each end of the searched region.
        near_left = r['first_start'] < flank
        near_right = r['second_start'] > (len(fn) - flank)
        if not (near_left and near_right):
            continue

        # check that start is always less than end
        # This always causes an off by one error, so we have to increment our ends
        if r['first_end'] < r['first_start']:
            r['first_start'], r['first_end'] = r['first_end'] + 1, r['first_start'] + 1
        if r['second_end'] < r['second_start']:
            r['second_start'], r['second_end'] = r['second_end'] + 1, r['second_start'] + 1

        hit = {
            's1': r['first_start'] + st,
            'e1': r['first_end'] + st,
            's2': r['second_start'] + st,
            'e2': r['second_end'] + st,
        }
        rep[index] = hit

        if self.include_all_repeats:
            # Also record the repeat pair as a feature on the contig.
            replen = max(hit['e1'] - hit['s1'], hit['e2'] - hit['s2'])
            r1loc = FeatureLocation(hit['s1'], hit['e1'], strand=+1)
            r2loc = FeatureLocation(hit['s2'], hit['e2'], strand=+1)
            rptsf = SeqFeature(
                CompoundLocation([r1loc, r2loc]),
                type="repeat_region",
                qualifiers={'note': f"{replen}bp repeat identified by PhiSpy v{version.__version__}"},
            )
            self.record.get_entry(contig).features.append(rptsf)
        index += 1
    return rep
def _join_sub_features(parts, feature_type, strand):
    """Collapse sub-features of one type into a single SeqFeature.

    ``parts`` is sorted in place by start. The id and qualifiers of the
    first part (after sorting) are reused for the result.
    """
    if len(parts) > 1:
        parts.sort(key=lambda part: part.location.start)
        locations = [part.location for part in parts]
        if strand == -1:
            # CompoundLocation.extract() concatenates the parts in the order
            # they are listed, so minus-strand features need that order
            # reversed.
            locations.reverse()
        location = CompoundLocation(locations)
    else:
        # A single part keeps its own location.
        location = parts[0].location

    return SeqFeature(
        id=parts[0].id,
        type=feature_type,
        strand=strand,
        qualifiers=parts[0].qualifiers,
        location=location,
    )


def subfeatures(feature):
    """ Flatten the sub_features that the BCBio GFF parser attaches.

    The BCBio GFF parser adds exons, mRNA, and CDS's to features as
    sub_features, which has since been deprecated in Biopython in favour of
    the CompoundLocation object. This function returns flat features whose
    locations can be used to extract sequences.

    Keyword arguments:
    feature -- A SeqFeature record with the _sub_features attribute.

    Returns:
    A list of features: the feature itself (its _sub_features emptied), its
    sub-features of other types, one merged "exon" feature, one merged "CDS"
    feature, and the flattened result for each mRNA sub-feature. A feature
    without _sub_features is returned alone and untouched.
    """
    if not hasattr(feature, "_sub_features"):
        return [feature]
    if feature._sub_features is None:
        return [feature]

    sub_features = feature._sub_features
    feature._sub_features = []
    new_features = [feature]

    sub_features_cds = [f for f in sub_features if f.type.lower() == 'cds']
    sub_features_exons = [f for f in sub_features if f.type.lower() == 'exon']
    sub_features_mrna = [f for f in sub_features if f.type.lower() == 'mrna']
    sub_features_others = [f for f in sub_features
                           if f.type.lower() not in {'cds', 'exon', 'mrna'}]

    new_features.extend(sub_features_others)
    strand = feature.strand

    # Fix: the exon branch used to test the number of CDS parts, so several
    # exons with a single CDS collapsed to the first exon, and a single exon
    # with several CDS parts produced an invalid one-part CompoundLocation.
    # The helper always decides on the list it is given.
    if len(sub_features_exons) > 0:
        new_features.append(
            _join_sub_features(sub_features_exons, "exon", strand))

    if len(sub_features_cds) > 0:
        new_features.append(
            _join_sub_features(sub_features_cds, "CDS", strand))

    for mrna in sub_features_mrna:
        new_features.extend(subfeatures(mrna))

    return new_features