def _extract_regions(gff_iterator):
    """Trim each record to its annotated span and re-base its features.

    Function added by KC, Jan 2020. For every record produced by
    ``gff_iterator`` the sequence is cut down to the region running from the
    first annotated position to the last annotated position, and the
    locations of all features (and of their ``sub_features``) are shifted so
    that they correspond to positions in the trimmed sequence.

    Records are modified in place and then yielded. A record without any
    features has no annotated span to trim to, so it is yielded unchanged.
    """

    def _shifted(location, offset):
        # Same span moved ``offset`` positions to the left, strand preserved.
        return SeqFeature.FeatureLocation(
            SeqFeature.ExactPosition(location.start - offset),
            SeqFeature.ExactPosition(location.end - offset),
            strand=location.strand)

    for rec in gff_iterator:
        if not rec.features:
            # min()/max() of an empty sequence would raise ValueError.
            yield rec
            continue
        loc = min(feature.location.start for feature in rec.features)
        endloc = max(feature.location.end for feature in rec.features)
        for feature in rec.features:
            feature.location = _shifted(feature.location, loc)
            for sub_feature in feature.sub_features:
                sub_feature.location = _shifted(sub_feature.location, loc)
        rec.seq = rec.seq[loc:endloc]
        yield rec
Пример #2
0
def translateFeatureLocation(location, region, translation=0):
    """Shift ``location`` by ``translation + 1`` and clip it to ``region``.

    Returns a new ``FeatureLocation``. When the shifted location ends before
    position 0 it is reported at debug level and replaced by an empty 0..0
    location with strand 0. Otherwise the start is raised to at least 0, the
    end is lowered to at most ``region.end`` and the shifted strand is kept.
    """
    shifted = location + translation + 1
    if not shifted.end < 0:
        clipped_start = max(0, shifted.start)
        clipped_end = min(shifted.end, region.end)
        return SeqFeature.FeatureLocation(start=clipped_start,
                                          end=clipped_end,
                                          strand=shifted.strand)
    logging.debug('Error-prone feature detected: {}'.format(shifted))
    return SeqFeature.FeatureLocation(start=0, end=0, strand=0)
Пример #3
0
 def _trans_loc(loc):
     """Build a FeatureLocation from a four-item ``loc`` record.

     ``loc[0]`` is compared with ``current_contig_id``, ``loc[2]`` is the
     strand symbol and ``loc[1]`` / ``loc[3]`` feed the coordinate
     arithmetic (presumably a 1-based anchor position and a length --
     confirm against the caller). ``loc[0]`` is overwritten with ``None``
     in the caller's list when it equals the current contig.
     """
     # Don't write the contig ID in the loc line unless it's trans-spliced
     if loc[0] == current_contig_id:
         loc[0] = None
     contig_ref, anchor, span = loc[0], loc[1], loc[3]
     if loc[2] != "-":
         return SeqFeature.FeatureLocation(anchor - 1, anchor + span - 1, 1,
                                           contig_ref)
     return SeqFeature.FeatureLocation(anchor - span, anchor, -1, contig_ref)
Пример #4
0
def ss_extract(genbank_in):
    """Extract the 5' and 3' splice-site (ss) windows of the first mRNA.

    Reads the single GenBank record in ``genbank_in``, takes the part
    boundaries of the first ``mRNA`` feature and cuts a fixed window around
    each boundary. A summary (file name up to the first dot, type of the
    selected feature, its ``db_xref`` qualifier and the boundary positions)
    is appended to the file ``seq_log`` in the current working directory.

    Args:
        genbank_in: name of the input GenBank file.

    Returns:
        ``(f_ss_seq, t_ss_seq)``: two lists of strings. ``f_ss_seq`` holds
        the windows from 3 bases before to 6 bases after every part end
        that lies beyond position 15 and before the end of the sequence;
        ``t_ss_seq`` holds the windows from 20 bases before to 3 bases
        after every part start at position 20 or later.
    """
    seq_record = SeqIO.read(genbank_in, 'genbank')
    f_loc = []
    t_loc = []

    # Only the first mRNA feature is used (coordinates are taken as-is,
    # i.e. as for the '+' strand).
    mrna = None
    for feature in seq_record.features:
        if feature.type == 'mRNA':
            mrna = feature
            break
    if mrna is not None:
        for exon_location in mrna.location.parts:
            t_loc.append(int(exon_location.start))
            f_loc.append(int(exon_location.end))

    # Log what was actually selected. Reading the loop variable after the
    # loop would report an unrelated feature when no mRNA exists, and fail
    # with NameError for a record without features.
    if mrna is None:
        feature_type, db_xref = 'no mRNA feature', None
    else:
        feature_type, db_xref = mrna.type, mrna.qualifiers.get("db_xref")
    with open('seq_log', 'a') as log:  # Adding data to log file
        log.write(genbank_in.split('.')[0])  # Gene name
        log.write(' ' + feature_type + ' ')  # Selected feature type
        log.write(str(db_xref) + '\n')  # GeneID and DBref feature
        log.write('5` intron end location:' + str(f_loc) +
                  '\n')  # 5' intron ends location
        log.write('3` intron end location:' + str(t_loc) +
                  '\n \n')  # 3' intron ends location

    sequence = seq_record.seq
    sequence_length = len(sequence)

    # 5' ss extraction
    f_ss_seq = [
        str(SeqFeature.FeatureLocation(n - 3, n + 6).extract(sequence))
        for n in f_loc if 15 < n < sequence_length
    ]

    # 3' ss extraction. The window starts 20 bases upstream, so a boundary
    # closer than that to the sequence start would give a negative start
    # coordinate and a meaningless slice; such boundaries are skipped.
    t_ss_seq = [
        str(SeqFeature.FeatureLocation(n - 20, n + 3).extract(sequence))
        for n in t_loc if n >= 20
    ]

    return f_ss_seq, t_ss_seq
Пример #5
0
 def make_start_fuzzy(self, location_object):
     ''' Make the start position of a location object fuzzy.

         The start is replaced by a `BeforePosition` at the same
         coordinate. Objects without a `parts` attribute are returned
         untouched. For a single-part location a new `FeatureLocation`
         is returned; for a multi-part location the first part is
         replaced in place and the same object is returned. The strand
         of the rewritten location (or part) is carried over, so fuzzing
         the start does not turn a stranded feature into an unstranded
         one.
     '''
     from Bio import SeqFeature
     if hasattr(location_object, 'parts'):
         if len(location_object.parts) == 1:
             location_object = SeqFeature.FeatureLocation(
                 SeqFeature.BeforePosition(location_object.start),
                 location_object.end,
                 strand=location_object.strand)
         elif len(location_object.parts) > 1:
             first_part = location_object.parts[0]
             location_object.parts[0] = SeqFeature.FeatureLocation(
                 SeqFeature.BeforePosition(first_part.start),
                 first_part.end,
                 strand=first_part.strand)
     return location_object
Пример #6
0
def annotate(my_plasmid, annot_dict):
    """Add the features of ``annot_dict`` whose sequence occurs in the plasmid.

    ``annot_dict`` maps a sequence string to a dictionary holding at least
    ``'label'``, ``'strand'`` and ``'type'`` (such as created by
    gb_annot_collector). Each sequence is searched for in
    ``my_plasmid.seq``; if it is absent, its reverse complement is tried and
    the strand is negated. Only the first occurrence is annotated. The
    annotation dictionary itself (not a copy) becomes the qualifiers of the
    new feature. Progress is reported with ``print``.

    Returns ``my_plasmid``, which is modified in place.
    """

    def _attach(start, length, strand, info):
        # Create the feature at start..start+length and add it to the plasmid.
        location = sf.FeatureLocation(start, start + length, strand=strand)
        feature = sf.SeqFeature(location, type=info['type'])
        feature.qualifiers = info
        my_plasmid.features.append(feature)

    for key, info in annot_dict.items():
        # feature label, used in every message below
        name = str(info['label'])

        hit = my_plasmid.seq.find(key)
        if hit > -1:
            _attach(hit, len(key), info['strand'], info)
            print('found ' + name)
            continue

        # not on the given strand: try the reverse complement
        print(name + ' not found, trying complementary')
        comp = Seq(key, generic_dna).reverse_complement()
        hit = my_plasmid.seq.find(comp)
        if hit > -1:
            _attach(hit, len(comp), -1 * info['strand'], info)
            print('found ' + name + ' in reverse complementary')
        else:
            print(name + ' not found')
    return my_plasmid
Пример #7
0
def convert_annotations(block, gb):
    """Append one feature to ``gb.features`` per annotation of ``block``.

    Annotations are read from ``block["sequence"]["annotations"]``. For each
    annotation dictionary:

    * the feature type is ``"unknown"``, overridden by a non-empty
      ``"role"`` and then by ``notes["genbank"]["type"]`` when present;
    * ``"color"`` is stored as the ``GC_Color`` qualifier;
    * every other entry of ``notes["genbank"]`` and every top-level key
      other than start/end/notes/strand/color/role is copied to the
      qualifiers;
    * if ``"start"`` is present the location is ``start .. end + 1`` on
      strand -1 when ``"strand"`` is -1 and on strand 1 otherwise.
    """
    # Keys that are consumed by the location / type logic, not qualifiers.
    consumed_keys = ("start", "end", "strand", "role")

    # Add My annotations as features
    for annotation in block["sequence"]["annotations"]:
        gb_annot = SeqFeature.SeqFeature()
        annotation_type = "unknown"

        if "role" in annotation and annotation["role"] != "":
            annotation_type = annotation["role"]

        # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
        for key, value in annotation.items():
            if key == "color":
                gb_annot.qualifiers["GC_Color"] = value
            elif key == "notes":
                if "genbank" in value:
                    for gb_key, gb_value in value["genbank"].items():
                        if gb_key == "type":
                            annotation_type = gb_value
                        else:
                            gb_annot.qualifiers[gb_key] = gb_value
            elif key not in consumed_keys:
                gb_annot.qualifiers[key] = value

        if "start" in annotation:
            strand = 1
            if "strand" in annotation and annotation["strand"] == -1:
                strand = -1
            gb_annot.location = SeqFeature.FeatureLocation(
                annotation["start"], annotation["end"] + 1, strand)

        gb_annot.type = annotation_type

        gb.features.append(gb_annot)
Пример #8
0
def create_feature_annot(loc_range, featuretype, s):
    """Create a new feature annotation at loc_range with featuretype on strand s.

    ``loc_range[0]`` and ``loc_range[1]`` become the exact start and end
    positions. The strand is stored on the location rather than passed to
    the ``SeqFeature`` constructor: the constructor's ``strand`` keyword is
    not accepted by newer Biopython releases, while a strand on the
    location yields the same feature on old and new versions.
    """
    location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(loc_range[0]),
        SeqFeature.ExactPosition(loc_range[1]),
        strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
Пример #9
0
 def extract_location_on_gene(self):
     """
     Parse the gene location from the ``coded_by`` qualifier of the CDS.

     ``None`` is returned when no GenPept record is loaded, when the record
     does not contain exactly one CDS feature, when that feature does not
     carry exactly one ``coded_by`` value, or when the value does not match
     the location pattern. If no accession number is known yet, the one
     found in the qualifier is remembered. The start and end are used as
     they appear in the qualifier.

     :return: location described by the qualifier, or ``None``
     :rtype: SeqFeature.FeatureLocation
     """
     if self.__genpept is None:
         return None
     # A list is enough here; building a set would require the features to
     # be hashable, which is not guaranteed for SeqFeature objects.
     cds_features = [
         feature for feature in self.__genpept.features
         if feature.type == 'CDS'
     ]
     if len(cds_features) != 1:
         return None
     coding_regions = cds_features[0].qualifiers.get('coded_by', [])
     if len(coding_regions) != 1:
         return None
     res = self.__location_format.search(coding_regions[0])
     if res is None:
         return None
     if self._accession_no is None:
         self._accession_no = res.group("accession")
     strand = self.COMPLEMENT if res.group("complement") else None
     start = int(res.group("start"))
     end = int(res.group("end"))
     return SeqFeature.FeatureLocation(start, end, strand)
Пример #10
0
def create_feature(hit, end):
    '''
    Creates a feature from the hit, setting colours
    based on orientation and noting whether it is a
    left or right end hit.

    ``hit[1]`` and ``hit[2]`` are converted to int and used as start and
    stop. ``end`` must be ``'five'`` (left end) or ``'three'`` (right end);
    any other value raises ``ValueError`` instead of failing later on an
    unbound local variable.
    '''
    # Set up coordinates
    start = int(hit[1])
    stop = int(hit[2])

    # end -> (colour, 'end' qualifier, feature type)
    # NOTE(review): the labels mix 'left_end' / 'left end' and
    # 'right end' / 'right_end'; they are kept exactly as they were because
    # downstream code may match on them -- confirm which spelling is meant.
    end_styles = {
        'five': ('2', 'left_end', 'left end'),
        'three': ('7', 'right end', 'right_end'),
    }
    if end not in end_styles:
        raise ValueError(
            "end must be 'five' or 'three', got {!r}".format(end))
    colour, end_label, feat_type = end_styles[end]

    # Annotate with end information and colour accordingly
    quals = {'colour': colour, 'end': end_label}
    location = SeqFeature.FeatureLocation(start, stop)
    # Create feature
    feature = SeqFeature.SeqFeature(location, type=feat_type, qualifiers=quals)

    return feature
Пример #11
0
def modify_genbank(gb_file, fasta_file):
    """Write a copy of ``gb_file`` with updated gene/CDS annotations.

    The final annotations are obtained from ``get_final_annotations`` for
    the genome read from ``fasta_file`` and are looked up by ``locus_tag``.
    For every ``gene`` or ``CDS`` feature with a known locus tag the start
    is replaced by the annotated start (minus one) while end and strand are
    kept; for ``CDS`` features the first ``product`` and ``translation``
    values are replaced as well. ``gene``/``CDS`` features with an unknown
    locus tag are dropped; all other features are kept unchanged.

    Args:
        gb_file: path of the GenBank file; it must look like
            ``.../users/.../uploads/<name>.<ext>``.
        fasta_file: path of the FASTA file holding the genome.

    Returns:
        Path of the written file, ``.../<name>_modified.<ext>``.

    Raises:
        ValueError: if ``gb_file`` does not match the expected path layout.
    """
    # The dot before the extension must be escaped: as a wildcard it lets
    # the extension group match nothing, producing names such as
    # "genome.g_modified." instead of "genome_modified.gb".
    gb_filename = re.search(r'(.*/users/.*/uploads/.*)\.(\w*)', gb_file)
    if gb_filename is None:
        raise ValueError(
            'unexpected GenBank file path: {!r}'.format(gb_file))
    out_file = gb_filename.group(1) + '_modified.' + gb_filename.group(2)

    genome = SeqIO.read(fasta_file, "fasta").seq
    final_annotations = get_final_annotations(genome)

    records = []
    with open(gb_file, "r") as gb_handle:
        for record in SeqIO.parse(gb_handle, "genbank"):
            # Collected per record so features never leak between records.
            final_features = []
            for feature in record.features:
                if feature.type in ("gene", "CDS"):
                    locus_tag = feature.qualifiers["locus_tag"][0]
                    if locus_tag not in final_annotations:
                        continue
                    annotation = final_annotations[locus_tag]
                    feature.location = SeqFeature.FeatureLocation(
                        SeqFeature.ExactPosition(annotation["start"] - 1),
                        SeqFeature.ExactPosition(int(feature.location.end)),
                        feature.location.strand)
                    if feature.type == "CDS":
                        feature.qualifiers["product"][0] = annotation[
                            "function"]
                        feature.qualifiers["translation"][0] = annotation[
                            "translation"]
                final_features.append(feature)  # Append final features
            record.features = final_features
            records.append(record)

    # Write once, after parsing: reopening the output in "w" mode for each
    # record would leave only the last record in the file.
    with open(out_file, "w") as new_gb:
        SeqIO.write(records, new_gb, "genbank")

    return out_file
Пример #12
0
 def add_feature(self, search_str, label):
     """Label a feature by literal string match, failing silently.
     Does not label features that wrap around circular sequence.
     Matching is case-insensitive. Every forward match is labelled on
     strand 1 and, unless the search string is its own reverse complement,
     every match of the reverse complement is labelled on strand -1. Each
     labelled feature takes the next colour from ``self.colors``. An empty
     search string labels nothing.
     Args:
         search_str (str): string representing feature
         label (str): feature name to display"""
     if not search_str:
         return
     sequence = str(self.seq)
     targets = [(search_str, 1)]
     if search_str.lower() != dna.revc(search_str.lower()):
         # Don't label palindromes twice
         targets.append((dna.revc(search_str), -1))
     for target, strand in targets:
         # re.escape keeps the match literal. The strand is taken from the
         # target itself so that every match, not only the first one, gets
         # the right strand.
         for m in re.finditer(re.escape(target), sequence, re.IGNORECASE):
             feature = SeqFeature.SeqFeature(
                 SeqFeature.FeatureLocation(m.start(), m.end(), strand),
                 'misc_feature')
             color = next(self.colors)
             feature.qualifiers = {
                 'label': [label],
                 'ApEinfo_fwdcolor': [color],
                 'ApEinfo_revcolor': [color],
                 'ApEinfo_graphicformat':
                 ['arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0']
             }
             self.features.append(feature)
     return
Пример #13
0
def createFEATUREannot(loc_range, featuretype, s):
    """Creates a new SeqFeature with ExactPositions based on range.

    ``loc_range[0]`` and ``loc_range[1]`` are the start and end, and
    ``featuretype`` is the feature type. The strand ``s`` is stored on the
    location rather than passed to the ``SeqFeature`` constructor: the
    constructor's ``strand`` keyword is not accepted by newer Biopython
    releases, while a strand on the location yields the same feature on old
    and new versions.
    """
    location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(loc_range[0]),
        SeqFeature.ExactPosition(loc_range[1]),
        strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
Пример #14
0
def read_reference(fname, genemap):
    """Load a reference sequence and translate the genes listed in a gene map.

    Args:
        fname: path of the reference. It is read as a single-record FASTA
            file; if that fails, the stripped lines of the file are joined
            and used as the sequence.
        genemap: path of a tab-separated annotation file. Columns 4 and 5
            are the 1-based inclusive start and end, column 7 the strand
            (``'-'`` means reverse, anything else forward) and column 9 a
            ``;``-separated list of ``key value`` attributes. Lines
            starting with ``#`` and blank lines are skipped.

    Returns:
        ``{"nuc": <sequence string>, "translations": {name: protein}}``
        where ``name`` is the ``gene_name`` attribute without surrounding
        double quotes, or ``None`` when the attribute is missing (such
        entries overwrite each other).
    """
    try:
        ref = str(SeqIO.read(fname, 'fasta').seq)
    except Exception:
        # Not readable as a single FASTA record: treat the file as raw text.
        with open(fname, 'r') as fh:
            ref = "".join([x.strip() for x in fh])

    translations = {}
    with open(genemap, 'r') as fh:
        for line in fh:
            if line[0] == '#' or not line.strip():
                continue
            entries = [x.strip() for x in line.strip().split('\t')]
            start = int(entries[3])
            end = int(entries[4])
            strand = entries[6]
            attributes = {}
            for attribute in entries[8].split(';'):
                fields = attribute.split()
                # A trailing ';' leaves an empty chunk behind; skip it.
                if fields:
                    attributes[fields[0]] = ' '.join(fields[1:])
            if 'gene_name' in attributes:
                name = attributes['gene_name'].strip('"')
            else:
                name = None
            translation = Seq.translate(
                SeqFeature.SeqFeature(
                    SeqFeature.FeatureLocation(
                        start - 1, end,
                        strand=-1 if strand == '-' else 1)).extract(ref))
            translations[name] = str(translation)

    return {"nuc": ref, "translations": translations}
Пример #15
0
 def _parse_feature(element):
     """Convert a feature XML element into a SeqFeature and store it.

     All XML attributes become qualifiers; ``type`` and ``id`` additionally
     set the feature's type and id. A ``location`` child gives the feature
     location, either from a single ``position`` element or from a
     ``begin``/``end`` pair (the start is parsed with an offset of -1).
     The text of every other child is stored as a qualifier named after
     its tag with the namespace removed. The feature is appended to
     ``self.ParsedSeqRecord.features``.
     """
     feature = SeqFeature.SeqFeature()
     for k, v in element.attrib.items():
         feature.qualifiers[k] = v
     feature.type = element.attrib.get('type', '')
     if 'id' in element.attrib:
         feature.id = element.attrib['id']
     # Iterate over the element itself: Element.getchildren() no longer
     # exists in Python 3.9 and later.
     for feature_element in element:
         if feature_element.tag == NS + 'location':
             position_elements = feature_element.findall(NS +
                                                         'position')
             if position_elements:
                 position_element = position_elements[0]
                 start_position = _parse_position(position_element, -1)
                 end_position = _parse_position(position_element)
             else:
                 begin_element = feature_element.findall(NS + 'begin')[0]
                 start_position = _parse_position(begin_element, -1)
                 end_element = feature_element.findall(NS + 'end')[0]
                 end_position = _parse_position(end_element)
             feature.location = SeqFeature.FeatureLocation(
                 start_position, end_position)
         else:
             try:
                 feature.qualifiers[feature_element.tag.replace(
                     NS, '')] = feature_element.text
             except Exception:
                 pass  # skip unparsable tag
     self.ParsedSeqRecord.features.append(feature)
Пример #16
0
def _retrieve_reference(adaptor, primary_id):
    """Fetch the references attached to a bioentry.

    Runs one query through ``adaptor.execute_and_fetchall`` for the bioentry
    ``primary_id`` and builds a ``SeqFeature.Reference`` per returned row,
    in rank order.

    Returns ``{'references': [...]}`` when at least one reference exists and
    an empty dictionary otherwise.
    """
    # XXX dbxref_qualifier_value

    rows = adaptor.execute_and_fetchall(
        "SELECT start_pos, end_pos, "
        " location, title, authors,"
        " dbname, accession"
        " FROM bioentry_reference"
        " JOIN reference USING (reference_id)"
        " LEFT JOIN dbxref USING (dbxref_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))

    references = []
    for start, end, journal, title, authors, dbname, accession in rows:
        reference = SeqFeature.Reference()
        # With both ends missing, reference.location stays an empty list.
        if not (start is None and end is None):
            # python counting
            python_start = start if start is None else start - 1
            reference.location = [
                SeqFeature.FeatureLocation(python_start, end)
            ]
        # Keep the default "" instead of storing None.
        if authors:
            reference.authors = authors
        if title:
            reference.title = title
        reference.journal = journal
        if dbname == 'PUBMED':
            reference.pubmed_id = accession
        elif dbname == 'MEDLINE':
            reference.medline_id = accession
        references.append(reference)

    return {'references': references} if references else {}
Пример #17
0
 def _parse_feature(element):
     """Convert a feature XML element into a SeqFeature and store it.

     XML attributes are copied to the qualifiers; ``type`` and ``id`` also
     set the feature's type and id. A ``location`` child defines the
     location (from one ``position`` element, or from ``begin`` and
     ``end``; the start is parsed with an offset of -1). The text of any
     other child becomes a qualifier named after its namespace-free tag.
     The result is appended to ``self.ParsedSeqRecord.features``.
     """
     feature = SeqFeature.SeqFeature()
     feature.qualifiers.update(element.attrib)
     feature.type = element.attrib.get("type", "")
     if "id" in element.attrib:
         feature.id = element.attrib["id"]

     for child in element:
         if child.tag != NS + "location":
             try:
                 feature.qualifiers[child.tag.replace(NS, "")] = child.text
             except Exception:  # TODO - Which exceptions?
                 pass  # skip unparsable tag
             continue

         positions = child.findall(NS + "position")
         if positions:
             start_position = _parse_position(positions[0], -1)
             end_position = _parse_position(positions[0])
         else:
             start_position = _parse_position(
                 child.findall(NS + "begin")[0], -1
             )
             end_position = _parse_position(child.findall(NS + "end")[0])
         feature.location = SeqFeature.FeatureLocation(
             start_position, end_position
         )

     self.ParsedSeqRecord.features.append(feature)
Пример #18
0
def _make_seqfeature(name, from_res, to_res, description, ft_id):
    """Build a SeqFeature from parsed feature data (PRIVATE).

    ``from_res`` and ``to_res`` are turned into positions with
    ``_make_position`` (offsets -1 and 0). ``name`` becomes the feature
    type, ``description`` the only qualifier, and an empty ``ft_id`` is
    replaced by ``"<unknown id>"``.
    """
    start = _make_position(from_res, -1)
    end = _make_position(to_res, 0)
    # "<unknown id>" is the default in the SeqFeature object
    feature_id = ft_id if ft_id else "<unknown id>"
    return SeqFeature.SeqFeature(
        SeqFeature.FeatureLocation(start, end),
        type=name,
        id=feature_id,
        qualifiers={"description": description})
Пример #19
0
def write_gbk(sequence, matched_orfs, filename, organism):
    """Write ``sequence`` and its ORFs to ``filename`` in GenBank format.

    Each item of ``matched_orfs`` is indexed as ``(a, b, x, y)``: ``a`` and
    ``b`` are the two ORF coordinates. When ``a < b`` the feature spans
    ``a - 1 .. b`` with strand 0, otherwise it spans ``b - 1 .. a`` with
    strand -1. If both ``x`` and ``y`` are truthy the feature is a
    ``misc_feature`` with the note ``y + " family " + x``; otherwise it is a
    plain ``CDS``. ``organism`` is only used in the record description.
    """
    date = strftime("%d-%b-%Y").upper()

    gbk_record = SeqRecord(Seq(sequence, IUPAC.unambiguous_dna),
                           description=organism + " predicted Tn3 sequence",
                           annotations={
                               "accession": '.',
                               "version": '.',
                               "organism": '.',
                               "date": date,
                               "data_file_division": "BCT"
                           })

    for orf in matched_orfs:
        low, high = orf[0], orf[1]
        if low < high:
            strand = 0
        else:
            # coordinates given in descending order: reverse strand
            low, high, strand = high, low, -1

        location = SeqFeature.FeatureLocation(low - 1, high, strand=strand)
        if orf[2] and orf[3]:
            feature = SeqFeature.SeqFeature(
                location,
                type="misc_feature",
                qualifiers={"note": orf[3] + " family " + orf[2]})
        else:
            feature = SeqFeature.SeqFeature(location, type="CDS")
        gbk_record.features.append(feature)

    SeqIO.write(gbk_record, filename, "gb")
0
 def _get_feature(self, feature_dict):
     """Retrieve a Biopython feature from our dictionary representation.

     ``feature_dict`` provides ``'location'`` (the positional arguments of
     ``FeatureLocation``), ``'type'``, ``'id'``, ``'strand'`` and
     ``'quals'``. A strand that is not ``None`` is stored on the location;
     it is not passed to the ``SeqFeature`` constructor because newer
     Biopython releases no longer accept a ``strand`` keyword there.
     """
     location = SeqFeature.FeatureLocation(*feature_dict['location'])
     strand = feature_dict['strand']
     if strand is not None:
         location.strand = strand
     new_feature = SeqFeature.SeqFeature(location, feature_dict['type'],
             id=feature_dict['id'])
     new_feature.qualifiers = feature_dict['quals']
     return new_feature
Пример #21
0
def create_feature(sequence, name, start, end, strand=+1):
    """Append a feature of type ``name`` spanning ``start..end`` to ``sequence``.

    Nothing is added when ``str(name)`` is empty or when ``end`` converts to
    0. ``start`` and ``end`` must be convertible with ``int()`` (otherwise
    the conversion error propagates), but a start of 0 is accepted: it is a
    valid first position and must not be mistaken for a missing value.
    """
    if not str(name):
        return
    int(start)  # validation only; 0 is a legitimate start
    if not int(end):
        return
    my_feature_location = SeqFeature.FeatureLocation(start,
                                                     end,
                                                     strand=strand)
    my_feature = SeqFeature.SeqFeature(my_feature_location, type=name)
    sequence.features.append(my_feature)
Пример #22
0
def add_features(block, allblocks, gb, start):
    """Add ``block`` and, recursively, its components to ``gb`` as features.

    Args:
        block: block dictionary to convert.
        allblocks: list of all block dictionaries, searched by ``"id"``.
        gb: record whose ``features`` list receives the new features.
        start: position at which this block starts.

    Returns:
        The position at which the block ends. Filler blocks only advance
        the position by their sequence length; list blocks are replaced by
        their ``"current_option"`` block.
    """
    # Disregard fillers... don't create features for them
    if is_filler(block):
        return start + block["sequence"]["length"]

    # For handling list blocks!
    if "current_option" in block:
        option = [b for b in allblocks if b["id"] == block["current_option"]][0]
        return add_features(option, allblocks, gb, start)

    # Add Myself as a feature
    feature = SeqFeature.SeqFeature()
    metadata = block["metadata"]
    # Set the type based on the original type or the role type
    if "genbank" in metadata and "type" in metadata["genbank"]:
        feature.type = metadata["genbank"]["type"]
    elif ("rules" in block and "role" in block["rules"]
          and block["rules"]["role"] is not None
          and block["rules"]["role"] != ""):
        feature.type = block["rules"]["role"]
    else:
        feature.type = "misc_feature"

    # Set up the location of the feature
    feature_strand = 1
    if "strand" in metadata:
        feature_strand = metadata["strand"]

    # And copy all the other qualifiers that came originally from genbank
    # (dict.items() works on Python 2 and 3; iteritems() is Python 2 only).
    if "genbank" in metadata:
        for annot_key, annot_value in metadata["genbank"].items():
            if annot_key not in ["name_source", "note"]:
                feature.qualifiers[annot_key] = annot_value

    convert_block_name(feature, block)

    add_GC_info(feature, block, allblocks)

    convert_annotations(block, gb, start)

    # Add my children as features
    child_start = start
    for block_id in block["components"]:
        child = [b for b in allblocks if b["id"] == block_id][0]
        child_start = add_features(child, allblocks, gb, child_start)

    if child_start != start:
        # The end is where the last child ended...
        end = child_start
    elif "sequence" in block:
        # No children, look at the block's length
        end = start + block["sequence"]["length"]
    else:
        end = start

    feature.location = SeqFeature.FeatureLocation(start, end,
                                                  strand=feature_strand)
    gb.features.append(feature)

    return end
Пример #23
0
def write_gb(main_record_file, add="", destination=""):
    """Build the ePet-cre construct record and write it as GenBank and FASTA.

    The single FASTA record in ``main_record_file`` is joined with the
    single FASTA record in ``add`` using
    ``concatenate_overlapping_sequences``. Two features are annotated: a
    ``CDS`` (gene ``Cre``) covering the original main sequence and a
    ``PolyA`` feature (``SV40-PolyA``) covering the 118 positions that
    follow it. ``~`` is expanded in all three paths.

    Output files are ``destination + ".gb"`` and ``destination + ".fastas"``.
    """
    destination = expanduser(destination)
    main_record = SeqIO.read(expanduser(main_record_file), "fasta")
    add = SeqIO.read(expanduser(add), "fasta")

    def _forward_feature(start, end, feature_type, qualifier, value):
        # Forward-strand feature start..end carrying a single qualifier.
        location = SeqFeature.FeatureLocation(SeqFeature.ExactPosition(start),
                                              SeqFeature.ExactPosition(end),
                                              strand=1)
        feature = SeqFeature.SeqFeature(location, type=feature_type)
        feature.qualifiers[qualifier] = value
        return feature

    cre_end = len(main_record.seq)
    # str() is the portable way to get the sequence text; Seq.tostring()
    # is an obsolete alias of it.
    main_record.seq = Seq(
        concatenate_overlapping_sequences(str(main_record.seq),
                                          str(add.seq)),
        IUPACAmbiguousDNA())
    main_record.name = "ePet-cre"
    main_record.id = "ePet-cre"
    main_record.description = "ePet-cre construct from doi:10.1038/nn.2623"

    main_record.features.append(
        _forward_feature(0, cre_end, "CDS", "gene", "Cre"))

    # Add SV40 PolyA
    main_record.features.append(
        _forward_feature(cre_end, cre_end + 118, "PolyA", "PolyA",
                         "SV40-PolyA"))

    SeqIO.write(main_record, destination + ".gb", "genbank")
    # NOTE(review): ".fastas" looks like a typo for ".fasta"; kept because
    # other tooling may already expect this file name -- confirm.
    SeqIO.write(main_record, destination + ".fastas", "fasta")
Пример #24
0
 def test_GenerateFeatLoc__make_start_fuzzy__1(self):
     ''' Test of function `make_start_fuzzy` of class `GenerateFeatLoc`.
         An exact 5..9 location must come back as a FeatureLocation whose
         start is a BeforePosition. '''
     from Bio import SeqFeature
     exact_location = SeqFeature.FeatureLocation(
         SeqFeature.ExactPosition(5), SeqFeature.ExactPosition(9))
     out = GnOps.GenerateFeatLoc().make_start_fuzzy(exact_location)
     # Still a FeatureLocation ...
     self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
     # ... but with a fuzzy start.
     self.assertIsInstance(out.start, Bio.SeqFeature.BeforePosition)
Пример #25
0
def contig_info(contig_id, contig_seq, species_informations):
    """Build the SeqRecord of one contig, including its ``source`` feature.

    ``contig_id`` is used as record id, name, ``accessions`` annotation and
    ``scaffold`` qualifier. ``species_informations`` must contain
    ``'description'``; the optional keys ``data_file_division``,
    ``topology``, ``organism``, ``taxonomy``, ``keywords`` and ``source``
    are copied to the record annotations, and ``isolate``, ``db_xref``,
    ``cell_type``, ``dev_stage`` and ``mol_type`` to the qualifiers of the
    ``source`` feature, which spans the whole contig. The ``date``
    annotation is today's date.
    """
    record = SeqRecord(contig_seq,
                       id=contig_id,
                       name=contig_id,
                       description=species_informations['description'],
                       annotations={"molecule_type": "DNA"})

    if IUPAC:
        record.seq.alphabet = IUPAC.ambiguous_dna

    def _copy_known(keys, target):
        # Copy each key that the species information provides into target.
        for key in keys:
            if key in species_informations:
                target[key] = species_informations[key]

    _copy_known(('data_file_division',), record.annotations)
    today = datetime.date.today()
    record.annotations['date'] = today.strftime('%d-%b-%Y').upper()
    _copy_known(('topology',), record.annotations)
    record.annotations['accessions'] = contig_id
    _copy_known(('organism', 'taxonomy', 'keywords', 'source'),
                record.annotations)

    whole_contig = sf.FeatureLocation(0, len(contig_seq))
    new_feature_source = sf.SeqFeature(whole_contig, type="source")
    new_feature_source.qualifiers['scaffold'] = contig_id
    # db_xref corresponds to the taxon NCBI ID.
    # Important if you want to use Pathway Tools after.
    _copy_known(('isolate', 'db_xref', 'cell_type', 'dev_stage', 'mol_type'),
                new_feature_source.qualifiers)

    record.features.append(new_feature_source)

    return record
Пример #26
0
        def _parse_dbReference(element):
            """Record the database cross-references of a dbReference element.

            ``"<type>:<id>"`` is always appended to the record's dbxrefs.
            For EMBL references every ``protein sequence ID`` property is
            additionally stored as ``"EMBL-CDS:<value>"``. For PDB
            references the method, resolution and chain ranges are parsed
            into features, but those features are currently not attached to
            the record.
            """
            db_type = element.attrib["type"]
            self.ParsedSeqRecord.dbxrefs.append(db_type + ":" +
                                                element.attrib["id"])

            if db_type == "EMBL":
                # <dbReference type="EMBL" id="U96180">
                #    <property type="protein sequence ID" value="AAB66902.1"/>
                #    <property type="molecule type" value="mRNA"/>
                # </dbReference>
                for ref_element in element:
                    properties = ref_element.attrib
                    if (properties.get("type") == "protein sequence ID"
                            and "value" in properties):
                        self.ParsedSeqRecord.dbxrefs.append(
                            "EMBL-CDS:" + properties["value"])
            elif db_type == "PDB":
                # e.g.
                # <dbReference type="PDB" key="11" id="2GEZ">
                #   <property value="X-ray" type="method"/>
                #   <property value="2.60 A" type="resolution"/>
                #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
                # </dbReference>
                method = ""
                resolution = ""
                for ref_element in element:
                    if ref_element.tag != NS + "property":
                        continue
                    dat_type = ref_element.attrib["type"]
                    if dat_type == "method":
                        method = ref_element.attrib["value"]
                    elif dat_type == "resolution":
                        resolution = ref_element.attrib["value"]
                    elif dat_type == "chains":
                        for chain_spec in ref_element.attrib["value"].split(
                                ","):
                            pair = chain_spec.strip().split("=")
                            if pair[1] == "-":
                                continue
                            # TODO - How best to store these, do SeqFeatures make sense?
                            feature = SeqFeature.SeqFeature()
                            feature.type = db_type
                            feature.qualifiers["name"] = element.attrib["id"]
                            feature.qualifiers["method"] = method
                            feature.qualifiers["resolution"] = resolution
                            feature.qualifiers["chains"] = pair[0].split("/")
                            bounds = pair[1].split("-")
                            feature.location = SeqFeature.FeatureLocation(
                                int(bounds[0]) - 1, int(bounds[1]))
                            # The feature is deliberately not appended to
                            # self.ParsedSeqRecord.features (see TODO above).

            # Remaining <property> children are ignored: this data cannot be
            # fitted in a seqrecord object with a simple list. However, at
            # least ensembl and EMBL parsing can be improved to add entries
            # in dbxrefs.
Пример #27
0
 def _get_feature(self, feature_dict):
     """Retrieve a Biopython feature from our dictionary representation.

     ``feature_dict`` provides ``"location"`` (the positional arguments of
     ``FeatureLocation``), ``"type"``, ``"id"``, ``"strand"`` and
     ``"quals"``. A strand that is not ``None`` is stored on the location;
     it is not passed to the ``SeqFeature`` constructor because newer
     Biopython releases no longer accept a ``strand`` keyword there.
     """
     location = SeqFeature.FeatureLocation(*feature_dict["location"])
     strand = feature_dict["strand"]
     if strand is not None:
         location.strand = strand
     new_feature = SeqFeature.SeqFeature(
         location,
         feature_dict["type"],
         id=feature_dict["id"],
     )
     new_feature.qualifiers = feature_dict["quals"]
     return new_feature
Пример #28
0
 def test_GenerateFeatLoc__make_start_fuzzy__3(self):
     ''' Test of function `make_end_fuzzy` of class `GenerateFeatLoc`.
         An exact 5..9 location must come back as a FeatureLocation whose
         end is an AfterPosition. '''
     from Bio import SeqFeature
     exact_location = SeqFeature.FeatureLocation(
         SeqFeature.ExactPosition(5), SeqFeature.ExactPosition(9))
     out = GnOps.GenerateFeatLoc().make_end_fuzzy(exact_location)
     # Still a FeatureLocation ...
     self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
     # ... but with a fuzzy end.
     self.assertIsInstance(out.end, Bio.SeqFeature.AfterPosition)
Пример #29
0
def create_genbank(gene_nucleic_seqs, gene_protein_seqs, annot, go_namespaces,
                   go_alternatives, output_path, species_informations):
    """ Create genbank file from nucleic and protein fasta plus eggnog mapper annotation file.

    One record is built per entry of gene_nucleic_seqs (in sorted id order).
    Each record receives a "gene" feature covering the whole sequence and a
    CDS feature built by create_cds_feature from the same positions.

    Args:
        gene_nucleic_seqs (dict): dictionary of nucleic sequences (key: sequence id, value: sequence)
        gene_protein_seqs (dict): dictionary of protein sequences (key: sequence id, value: sequence)
        annot (dict): dictionary of eggnog-mapper annotation (key: gene_id, value: ['GOs','EC', 'Preferred_name'])
        go_namespaces (dict): dictionary of GO terms namespace (key: GO Term ID, value: namespace associated to GO Term)
        go_alternatives (dict): dictionary of GO terms alternatives ID (key: GO Term ID, value: alternatives GO Term associated to GO Term)
        output_path (str): output file or directory
        species_informations (dict): dictionary containing information about species
    """
    # All SeqRecord objects are collected here and handed to the SeqIO writer at the end.
    records = []

    # Iterate through each contig/gene.
    for gene_nucleic_id in sorted(gene_nucleic_seqs):
        gene_nucleic_seq = gene_nucleic_seqs[gene_nucleic_id]

        # Create a SeqRecord object using gene information.
        record = record_info(gene_nucleic_id, gene_nucleic_seq,
                             species_informations)

        # Derive the locus tag: prefix purely numeric ids, and keep only the
        # second "|"-separated field of pipe-delimited ids.
        if gene_nucleic_id.isnumeric():
            id_gene = f"gene_{gene_nucleic_id}"
        elif "|" in gene_nucleic_id:
            id_gene = gene_nucleic_id.split("|")[1]
        else:
            id_gene = gene_nucleic_id

        # Biopython locations are zero-based and end-exclusive, so the whole
        # sequence is [0, len(seq)). A start of 1 would drop the first base
        # (written as "2..N" in the genbank file).
        start_position = 0
        end_position = len(gene_nucleic_seq)
        strand = 0
        new_feature_gene = sf.SeqFeature(sf.FeatureLocation(
            start_position, end_position, strand),
                                         type="gene")
        new_feature_gene.qualifiers['locus_tag'] = id_gene

        # Add gene information to contig record.
        record.features.append(new_feature_gene)

        # NOTE(review): assumes create_cds_feature uses these positions as a
        # FeatureLocation start/end just like the gene feature above - confirm.
        new_cds_feature = create_cds_feature(id_gene, start_position,
                                             end_position, strand, annot,
                                             go_namespaces, go_alternatives,
                                             gene_protein_seqs)
        new_cds_feature.qualifiers['locus_tag'] = id_gene

        # Add CDS information to contig record
        record.features.append(new_cds_feature)

        records.append(record)

    # Create Genbank with the list of SeqRecord.
    SeqIO.write(records, output_path, 'genbank')
Пример #30
0
 def _get_feature(self, feature_dict):
     """Rebuild a Biopython SeqFeature from its dictionary representation.

     Reads the 'location', 'type', 'id', 'strand' and 'quals' entries of
     the dictionary; 'location' is unpacked as FeatureLocation arguments.
     """
     feature_location = SeqFeature.FeatureLocation(*feature_dict['location'])
     rebuilt = SeqFeature.SeqFeature(
         feature_location,
         feature_dict['type'],
         id=feature_dict['id'],
         strand=feature_dict['strand'],
     )
     rebuilt.qualifiers = feature_dict['quals']
     # Biopython 1.68 and above removed sub_features; provide an empty list
     # so the attribute is always present on the returned feature.
     if not hasattr(rebuilt, "sub_features"):
         rebuilt.sub_features = []
     return rebuilt