def rebuild_references(annotations: Dict) -> Dict[str, List[Reference]]:
    """Rebuild the SeqRecord 'references' annotation from JSON.

    Every entry of ``annotations["references"]`` is read as a mapping of
    attribute names to values; its ``"location"`` entry is an iterable of
    strings that are each converted with ``location_from_string``.

    The ``"references"`` entry of ``annotations`` is replaced in place by the
    rebuilt ``Reference`` objects and the same ``annotations`` dict is
    returned. The individual JSON entries themselves are left unmodified.

    Raises:
        KeyError: if ``"references"`` is missing from ``annotations`` or a
            reference entry has no ``"location"`` key.
    """
    refs = []
    for ref in annotations["references"]:
        new_reference = Reference()
        # Give the Reference its own attribute dict: rebinding ``__dict__`` to
        # ``ref`` itself would alias the caller's JSON data, and the
        # ``location`` assignment below would then overwrite ref["location"].
        new_reference.__dict__ = dict(ref)
        new_reference.location = [location_from_string(loc) for loc in ref["location"]]
        refs.append(new_reference)
    annotations["references"] = refs
    return annotations
def start_li(self, attrs):
    """Begin a new reference entry when a list item opens in the references section.

    Does nothing unless the parser state is 'references'. Otherwise the
    reference sub-state is switched to 'pubmed_id', pending text is flushed,
    the reference collected so far is stored and a fresh one is started.
    ``attrs`` is accepted but not used.
    """
    if self._state != 'references':
        return
    self._reference_state = 'pubmed_id'
    self._flush_text()
    finished = self._current_reference
    if finished != '':
        self._references.append(finished)
    # NOTE(review): the collapsed source does not show whether this assignment
    # was nested under the check above; it is kept unconditional here - confirm.
    self._current_reference = Reference()
def reset(self):
    """Reset the SGML parser base class and reinitialise the parsing state."""
    sgmllib.SGMLParser.reset(self)
    # Text buffers start out empty.
    self.text = self._space_group = ''
    # Initial positions of the two state variables.
    self._state, self._reference_state = 'id', 'authors'
    # Fresh result containers.
    self.ndb_dict = Record()
    self._current_reference = Reference()
def __init__(self):
    """Fill the record with an empty default for every known key."""
    defaults = (
        ('Id', ''),
        ('Features', ''),
        ('Name', ''),
        ('Sequence', Crystal({})),
        ('Citation', Reference()),
        ('Space Group', ''),
        ('Cell Constants', {}),
        ('Crystallization Conditions', []),
        ('Refinement', ''),
        ('Coordinates', ''),
    )
    # Assign through item access, in the listed order.
    for key, value in defaults:
        self[key] = value
def reference_number(self, line):
    """RN line, reference number (start of new reference).

    The reference being built, if any, is appended to the record's
    'references' annotation; when there is none, that annotation is
    (re)initialised to an empty list. A new empty Reference then becomes
    the current one. ``line`` itself is not inspected.
    """
    from Bio.SeqFeature import Reference

    annotations = self.data.annotations
    pending = self._current_ref
    if pending is None:
        # No reference in progress: start with an empty list.
        annotations['references'] = []
    else:
        # Store the reference that has not been added to the list yet.
        annotations['references'].append(pending)
    self._current_ref = Reference()
def reference_number(self, line):
    """Start a new reference from an RN line.

    The text after the first five characters of ``line`` must be a
    bracketed integer (e.g. ``[1]``). A new Reference carrying that number
    is appended to ``self.data.references``.

    Raises:
        AssertionError: if the value is empty or not enclosed in brackets.
        ValueError: if the text between the brackets is not an integer.
    """
    rn = line[5:].rstrip()
    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped under ``python -O``; startswith/endswith also handle an empty
    # value, which previously failed with an IndexError on rn[0].
    if not (rn.startswith('[') and rn.endswith(']')):
        raise AssertionError("Missing brackets %s" % rn)
    ref = Reference()
    ref.number = int(rn[1:-1])
    self.data.references.append(ref)
f.qualifiers["label"].remove(color) f.qualifiers["label"].append("color: #ff8eff") # sort features by start location, source always first gb_archive.features.sort( key=lambda f: (-len(gb.seq)) * (f.type == "source") + f.location.start ) # translate color from notes to ApEinfo for feature in gb_archive.features: translate_color(feature) # Fix the direct submission reference if gb_archive.annotations["references"][-1].title == "Direct Submission": ref = gb_archive.annotations["references"][-1] else: ref = Reference() ref.title = "Direct Submission" gb_archive.annotations.append(ref) ref.authors = "Larralde M" ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo" # write the final record dst_dir = os.path.abspath( os.path.join(__file__, "..", "..", "moclo-plant", "registry", "plant") ) with fs.open_fs(os.path.join(__file__, "..", ".."), create=True) as dst_fs: dir_fs = dst_fs.makedirs(fs.path.join("moclo-plant", "registry", "plant"), recreate=True) with dir_fs.open("{}.gb".format(info["id"]), "w") as dst_file: write(gb_archive, dst_file, "gb")
def to_seq_record(self) -> SeqRecord:
    """Convert the cluster to a single record.

    Annotations of the source sequence are kept intact if they don't
    overlap with the cluster boundaries. Component genes are added on the
    record as *CDS* features. Annotated protein domains are added as
    *misc_feature*.

    The reference list and structured comment of the source record are
    copied before being extended, so calling this method does not modify
    the annotations of ``self.source``.
    """
    # store time of record creation
    now = datetime.datetime.now()

    # NB(@althonos): we use inclusive 1-based ranges in the data model
    # but slicing expects 0-based ranges with exclusive ends
    bgc = self.source[self.start - 1:self.end]
    bgc.id = bgc.name = self.id

    # copy sequence annotations
    bgc.annotations = self.source.annotations.copy()
    bgc.annotations["topology"] = "linear"
    bgc.annotations["molecule_type"] = "DNA"
    with patch_locale("C"):
        bgc.annotations['date'] = now.strftime("%d-%b-%Y").upper()

    # Compare only the leading numeric components of the version, so that
    # a suffix such as "dev0" does not make the int() conversion fail.
    version_parts = []
    for part in Bio.__version__.split("."):
        if not part.isdigit():
            break
        version_parts.append(int(part))
    if tuple(version_parts) < (1, 77):
        from Bio import Alphabet
        bgc.seq.alphabet = Alphabet.generic_dna

    # add GECCO preprint as a reference
    ref = Reference()
    ref.title = "Accurate de novo identification of biosynthetic gene clusters with GECCO"
    ref.journal = "bioRxiv (2021.05.03.442509)"
    ref.comment = "doi:10.1101/2021.05.03.442509"
    ref.authors = ", ".join([
        "Laura M Carroll",
        "Martin Larralde",
        "Jonas Simon Fleck",
        "Ruby Ponnudurai",
        "Alessio Milanese",
        "Elisa Cappio Barazzone",
        "Georg Zeller"
    ])
    # The annotation values are still shared with the source record after
    # the copy above, so extend a new list rather than appending in place.
    references = list(bgc.annotations.get("references", []))
    references.append(ref)
    bgc.annotations["references"] = references

    # add GECCO-specific annotations as a structured comment
    # (written to a copy, for the same reason as the reference list)
    inherited_comment = bgc.annotations.get("structured_comment")
    if inherited_comment is None:
        structured_comment = OrderedDict()
    else:
        structured_comment = inherited_comment.copy()
    bgc.annotations["structured_comment"] = structured_comment
    structured_comment['GECCO-Data'] = {
        "version": f"GECCO v{__version__}",
        "creation_date": now.isoformat(),
        "biosyn_class": ",".join(ty.name for ty in self.type.unpack()),
        "alkaloid_probability": self.type_probabilities.get(ProductType.Alkaloid, 0.0),
        "polyketide_probability": self.type_probabilities.get(ProductType.Polyketide, 0.0),
        "ripp_probability": self.type_probabilities.get(ProductType.RiPP, 0.0),
        "saccharide_probability": self.type_probabilities.get(ProductType.Saccharide, 0.0),
        "terpene_probability": self.type_probabilities.get(ProductType.Terpene, 0.0),
        "nrp_probability": self.type_probabilities.get(ProductType.NRP, 0.0),
        "other_probability": self.type_probabilities.get(ProductType.Other, 0.0),
    }

    # add proteins as CDS features
    for gene in self.genes:
        # write gene as a /cds GenBank record
        cds = gene.to_seq_feature()
        cds.location += -self.start
        bgc.features.append(cds)
        # write domains as /misc_feature annotations
        for domain in gene.protein.domains:
            misc = domain.to_seq_feature(protein_coordinates=False)
            misc.location += cds.location.start
            bgc.features.append(misc)

    # return the complete BGC
    return bgc
RN [1] ok RA Submitter, A.; ok RT "Bacullis sp. strain XYZ genome annotated using Prokka."; ok RL Submitted (18-Apr-2016) to the INSDC. ok XX ''' record.id = "XXX" record.name = 'XXX' contig_name = record.description.split('Contig ')[1].split(' ')[0] contig_list.append(contig_name) record.description = args.description record.dbxrefs.append("Project:%s" % args.project) record.annotations['accessions'] = ['XXX', 'contig'] record.annotations["data_file_division"] = 'XXX' record.annotations["references"] = [Reference()] record.annotations["references"][0].authors = 'XXX' record.annotations["references"][0].location = [ FeatureLocation(0, len(record)) ] record.annotations["references"][0].title = '' record.annotations["references"][ 0].journal = 'Submitted (%s) to the INSDC.' % today.strftime( '%d-%b-%Y') new_features = [] for i in range(0, len(record.features)): type_list.append(record.features[i].type) if record.features[i].type == 'source': del record.features[i].qualifiers['project'] del record.features[i].qualifiers['genome_md5'] del record.features[i].qualifiers['genome_id']
nargs='?', type=argparse.FileType('w'), default=sys.stdout) args = parser.parse_args() # First get the lineage (and fail now if not found) lineage = get_lineage(args.species, args.email) if not lineage: raise RuntimeError( "Could not find lineage information on NCBI for species '%s'" % args.species) print('Found lineage: %s' % lineage) # Prepare the bibliographic reference ref = Reference() if args.ref_pubmed_id: ref.pubmed_id = args.ref_pubmed_id if args.ref_consortium: ref.consrtm = args.ref_consortium if args.ref_authors: ref.authors = args.ref_authors if args.ref_title: ref.title = args.ref_title if args.ref_journal: ref.journal = args.ref_journal else: now = datetime.datetime.now() ref_date = now.strftime("%m-%b-%Y").upper() # Temp switch to C to get english month abbr saved = locale.setlocale(locale.LC_TIME)
def create_reference(author_string=None):
    """Build a mock Reference whose ``authors`` attribute is ``author_string``."""
    mock_reference = Reference()
    mock_reference.authors = author_string
    return mock_reference
def doConvert(embl_file, dep_file, contact, project, genome_project_id, organism_name, strain, locus_tag, taxon_id, dna_source, authors, comment, ac, clean=False):
    """Rewrite an EMBL record with a submission header and optional feature cleaning.

    The single record in ``embl_file`` is read, its header annotations
    (accession, ID, project, organism, description, references, comment) are
    replaced, and the result is written to ``dep_file`` in EMBL format.

    With ``clean`` set, the features are additionally cleaned: gap features
    are created for runs of N longer than 1 bp, /note qualifiers are
    removed, locus tags and EC numbers are rewritten through the module
    helpers, 'gene' features and duplicate 'source' features are dropped,
    and CDS features that fail validation are removed.

    Parameters as used by the code:
        contact: key into the module-level ``CONTACTS`` mapping.
        project: 'metahit' selects the draft description and consortium.
        dna_source: one of the known centre codes (GHP, INRA, HCIR, DSMZ,
            NCTC, DPM), which also overrides ``authors``; any other value
            is used verbatim together with the given ``authors``.
        ac: accession stored as the single element of the accession list.
    """
    with open(embl_file) as embl_handle:
        record = SeqIO.read(embl_handle, "embl")

    # ----------------------------------------
    # HEADER
    # ----------------------------------------

    # remove accession
    if 'accession' in record.annotations:
        del record.annotations['accession']
    record.annotations['accession'] = [ac]

    # ID line
    record.id = "XXX"
    record.name = "XXX"
    record.annotations['data_file_division'] = 'PRO'
    record.annotations['data_file_class'] = 'WGS'

    # PR line
    record.dbxrefs = ["Project:%s" % genome_project_id]

    # OS line
    record.annotations["organism"] = "%s %s" % (organism_name, strain)

    # DE line
    if project == 'metahit':
        record.description = "%s %s draft genome." % (organism_name, strain)
    else:
        record.description = "%s %s genome." % (organism_name, strain)

    # RN & RL lines
    if dna_source == 'GHP':
        dna_source = 'Rowett Institute of Nutrition and Health, University of Aberdeen -- http://www.rowett.ac.uk/divisions/ghp/'
        authors = 'Pajon A., Turner K., Parkhill J., Duncan S., Flint H.'
    elif dna_source == 'INRA':
        dna_source = 'INRA Clermont-Ferrand-Theix -- http://www.clermont.inra.fr/'
        authors = 'Pajon A., Turner K., Parkhill J., Bernalier A.'
    elif dna_source == 'HCIR':
        dna_source = 'Helmholtz Centre for Infection Research -- http://www.helmholtz-hzi.de/'
        authors = 'Pajon A., Turner K., Parkhill J., Timmis K., Oxley A., Wurdemann D.'
    elif dna_source == 'DSMZ':
        dna_source = 'German Collection of Microorganisms and Cell Cultures -- http://www.dsmz.de/'
        authors = 'Pajon A., Turner K., Parkhill J.'
    elif dna_source == 'NCTC':
        dna_source = 'Health Protection Agency\'s National Collection of Type Cultures -- http://www.hpacultures.org.uk/'
        authors = 'Pajon A., Turner K., Parkhill J.'
    elif dna_source == 'DPM':
        dna_source = 'Departments of Periodontology and Microbiology, King\'s College London -- http://www.kcl.ac.uk/'
        authors = 'Pajon A., Turner K., Parkhill J., Wade W., Vartoukian S.'
    # any other value: dna_source and authors are used as given

    ref_journal = Reference()
    ref_journal.journal = 'Unpublished.'
    if project == 'metahit':
        ref_journal.consrtm = "metaHIT consortium -- http://www.metahit.eu/"
    ref_journal.title = 'The genome sequence of %s %s' % (organism_name, strain)
    ref_journal.authors = authors

    ref_dep = Reference()
    ref_dep.authors = CONTACTS[contact]['author']
    today = date.today()
    ref_dep.journal = "Submitted (%s) to the EMBL/GenBank/DDBJ databases. Sanger Institute, Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SA, United Kingdom." % today.strftime("%d-%b-%Y")
    ref_dep.title = 'Direct submission'
    record.annotations['references'] = [ref_journal, ref_dep]

    # CC line
    record.annotations['comment'] = ['Data release policy http://www.sanger.ac.uk/legal/#t_2', 'DNA source: %s' % dna_source, '%s' % comment]

    # ----------------------------------------
    # GAP FEATURE (only with clean option)
    # ----------------------------------------

    # Add FT gap
    seq = record.seq
    in_N = False
    gap_features = []
    if clean:
        seq_length = len(seq)
        # Warn when the sequence really does end with N (the original test
        # was inverted and warned for every sequence NOT ending with N).
        if seq_length and seq[-1] == "N":
            print("WARNING: sequence ends with N")
        for i in range(seq_length):
            if seq[i] == 'N' and not in_N:
                start_N = i
                in_N = True
            # A run of N ends before the next non-N base, or at the last base
            # of the sequence (looking at seq[i+1] there raised IndexError).
            if in_N and (i + 1 == seq_length or seq[i + 1] != 'N'):
                end_N = i + 1
                length = end_N - start_N
                assert length > 0
                assert str(seq[start_N:end_N]) == "N" * length
                # do not create FT for 1bp gap
                if length > 1:
                    gap_feature = SeqFeature(FeatureLocation(start_N, end_N), strand=1, type="gap")
                    gap_feature.qualifiers['estimated_length'] = [length]
                    gap_features.append(gap_feature)
                in_N = False

    # ----------------------------------------
    # OTHER FEATURE (only with clean option)
    # ----------------------------------------
    new_features = []
    first_source = True
    has_source = False
    removed_cds = 0
    for feature in record.features:

        # Add strain into FT source
        if feature.type == 'source' and first_source:
            has_source = True
            feature.location.end.position = len(record.seq)
            feature.qualifiers['organism'] = ["%s %s" % (organism_name, strain)]
            feature.qualifiers['strain'] = [strain]

        # Remove qualifier /note & /translation
        if clean:
            if 'note' in feature.qualifiers:
                del feature.qualifiers['note']

        # Rename locus_tag
        if clean:
            if 'locus_tag' in feature.qualifiers:
                feature.qualifiers['locus_tag'] = [getLocusTag(feature.qualifiers['locus_tag'][0], locus_tag, feature.type)]

        # Check /EC_number="5.3.1.25" or /EC_number="1.1.2.-"
        if clean:
            if 'EC_number' in feature.qualifiers:
                for j in range(len(feature.qualifiers['EC_number'])):
                    feature.qualifiers['EC_number'][j] = getEcNumber(feature.qualifiers['EC_number'][j])

            # Remove (EC 2.1.2.3) in /product and /function
            # NOTE(review): the collapsed source does not show whether the
            # /product and /function handling was guarded by `clean`; it is
            # kept under it here, matching the section heading - confirm.
            if 'product' in feature.qualifiers:
                for j in range(len(feature.qualifiers['product'])):
                    (feature.qualifiers['product'][j], ec_list) = getValueWithoutEc(feature.qualifiers['product'][j], feature)
                    if ec_list:
                        for ec in ec_list:
                            if 'EC_number' not in feature.qualifiers:
                                feature.qualifiers['EC_number'] = [ec]
                            else:
                                feature.qualifiers['EC_number'].append(ec)

        # Remove tRNA /product (not only when containing ???)
        if clean:
            if feature.type == 'tRNA' and 'product' in feature.qualifiers:
                del feature.qualifiers['product']
            if 'function' in feature.qualifiers:
                for j in range(len(feature.qualifiers['function'])):
                    (feature.qualifiers['function'][j], ec_list) = getValueWithoutEc(feature.qualifiers['function'][j], feature)
                    if ec_list:
                        for ec in ec_list:
                            if 'EC_number' not in feature.qualifiers:
                                feature.qualifiers['EC_number'] = [ec]
                            else:
                                feature.qualifiers['EC_number'].append(ec)

        # Remove FT gene & keep only one FT source per record & remove some CDS
        if clean:
            if not feature.type == 'gene':
                if feature.type == 'source':
                    if first_source:
                        new_features.append(feature)
                        first_source = False
                # Remove CDS that are not valid
                # CDS -- translation must start with M, nucleotide sequence without N's & no overlap with gap feature
                # CDS -- must not have internal stop codons
                # CDS -- must end with stop codons (TAG, TAA, or TGA) or add '<' or '>' e.g. complement(<1..174); 1399953..>1401221
                elif feature.type == 'CDS':
                    if 'transl_table' not in feature.qualifiers:
                        feature.qualifiers['transl_table'] = 11
                    if feature.strand == 1:
                        stop_codon = record.seq[feature.location.nofuzzy_end-3:feature.location.nofuzzy_end]
                        if str(stop_codon) not in ['TAG', 'TAA', 'TGA']:
                            feature.location = FeatureLocation(ExactPosition(feature.location.nofuzzy_start), AfterPosition(feature.location.nofuzzy_end))
                    if feature.strand == -1:
                        stop_codon = record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_start+3]
                        if str(stop_codon) not in ['CTA', 'TTA', 'TCA']:
                            feature.location = FeatureLocation(BeforePosition(feature.location.nofuzzy_start), ExactPosition(feature.location.nofuzzy_end))
                    translation = feature.extract(record.seq).translate(table=11)
                    # Compare the computed translation with the annotated one
                    # (ignoring a terminal stop) and report any difference.
                    if 'translation' in feature.qualifiers:
                        if translation[-1] == '*':
                            if not len(translation) - 1 == len(feature.qualifiers['translation'][0]):
                                print('WARNING: CDS %s translation length of different size' % feature.location)
                                print(translation)
                                print(feature.qualifiers['translation'][0])
                            else:
                                if not str(translation[:-1]) == str(feature.qualifiers['translation'][0]):
                                    print('WARNING: CDS %s translation not identical' % feature.location)
                                    print(translation[:-1])
                                    print(feature.qualifiers['translation'][0])
                        else:
                            if not len(translation) == len(feature.qualifiers['translation'][0]):
                                # was `feature.translation`, an attribute that
                                # is not set anywhere in this function
                                print('WARNING: CDS %s translation length of different size' % feature.location)
                                print(translation)
                                print(feature.qualifiers['translation'][0])
                            else:
                                if not str(translation) == str(feature.qualifiers['translation'][0]):
                                    print('WARNING: CDS %s translation not identical' % feature.location)
                                    print(translation)
                                    print(feature.qualifiers['translation'][0])
                    if translation.startswith('M') and record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].count('N') == 0:
                        if translation[:-1].count('*') >= 1:
                            print('WARNING: CDS %s with internal stop codon' % feature.location)
                            print(translation)
                        else:
                            new_features.append(feature)
                    else:
                        print('WARNING: CDS %s does not start with M' % feature.location)
                        print(translation)
                        # NOTE(review): only this branch increments the
                        # counter in the reconstructed nesting - confirm.
                        removed_cds = removed_cds + 1
                else:
                    new_features.append(feature)
        if not clean:
            new_features.append(feature)

    # Add source feature
    if not has_source:
        feature = SeqFeature(FeatureLocation(0, len(record.seq)), type="source")
        feature.qualifiers['organism'] = ["%s %s" % (organism_name, strain)]
        feature.qualifiers['strain'] = [strain]
        feature.qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
        feature.qualifiers['mol_type'] = getMolType(embl_file)
        new_features.append(feature)

    if clean:
        print('WARNING: %s CDSs have been removed' % removed_cds)
    else:
        print("Only adding header, use '--clean' for cleaning features")

    new_features.extend(gap_features)
    # NOTE: passing a comparison function positionally is Python 2 only.
    new_features.sort(feature_compare)
    record.features = new_features

    # Write out new embl file
    with open(dep_file, "w") as dep_handle:
        SeqIO.write([record], dep_handle, "embl")
# if any(f.location is None for f in gb.features): # continue for f in gb.features: if f.location is None: print(gb, f) # sort features by start location, source always first gb.features.sort(key=lambda f: (-len(gb.seq)) * (f.type == "source") + f.location.start) # translate color from notes to ApEinfo for feature in gb.features: translate_color(feature) # Add an EcoFlex article reference ref = Reference() ref.authors = 'Moore SJ, Lai HE, Kelwick RJ, Chee SM, Bell DJ, Polizzi KM, Freemont PS.' ref.title = 'EcoFlex: A Multifunctional MoClo Kit for E. coli Synthetic Biology.' ref.journal = 'ACS Synth Biol 2016;5:1059-1069.' ref.pubmed_id = '27096716' gb.annotations['references'].append(ref) # Fix the direct submission reference ref = Reference() # ref = gb.annotations["references"][-1] ref.authors = "Larralde M" ref.title = "Direct Submission" ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo" gb.annotations['references'].append(ref) # write the final record
def reformat_gbk(gbk_file, study, publication_title, publication_authors, publication_journal, locus_tag_prefix, taxon_id, scaffold_prefix, strain, plasmid=False, locus_count_start=1):
    '''
    Reformat the records of a GenBank file for submission.

    - each input record becomes one contig named ``<scaffold_prefix>_NN``
      (records are not split at assembly gaps)
    - source/taxonomy/organism annotations come from ``taxon_id2taxonomy``
    - the first feature is made a 'source' feature (one is inserted when
      missing) carrying db_xref, mol_type, organism and strain qualifiers
    - a single publication reference replaces the existing references
    - protein_id qualifiers are removed
    - every 'gene' feature gets a new locus_tag, which is also written to
      the feature that immediately follows it

    :param gbk_file: path of the GenBank file to read
    :param study: accession written as ``BioProject:<study>`` in dbxrefs
    :param publication_title: title of the reference
    :param publication_authors: authors of the reference
    :param publication_journal: journal of the reference
    :param locus_tag_prefix: prefix of the generated locus tags
    :param taxon_id: taxon identifier passed to ``taxon_id2taxonomy``
    :param scaffold_prefix: prefix of the contig identifiers
    :param strain: strain name
    :param plasmid: when true, add a ``plasmid`` qualifier to the source
    :param locus_count_start: number used for the first locus tag
    :return: list of reformatted records
    '''
    import copy

    from Bio import SeqIO
    from Bio.SeqFeature import FeatureLocation
    from Bio.SeqFeature import Reference

    source, taxonomy, organism = taxon_id2taxonomy(taxon_id)
    print(source)
    print()
    print(taxonomy)
    print()

    with open(gbk_file, 'r') as f:
        records = list(SeqIO.parse(f, 'genbank'))

    # One contig per input record, numbered from 1.
    contig_records = []
    contig_count = 1
    for new_record in records:
        print(dir(new_record))
        contig = new_record[0:len(new_record.seq)]
        contig.id = "%s_%02d" % (scaffold_prefix, contig_count)
        contig.name = "%s_%02d" % (scaffold_prefix, contig_count)
        contig_records.append(contig)
        contig_count += 1

    new_records = []
    for n, record in enumerate(contig_records):
        ref = Reference()
        ref.authors = publication_authors
        ref.journal = publication_journal
        ref.title = publication_title

        record.annotations['source'] = source
        record.annotations['taxonomy'] = taxonomy
        record.annotations['organism'] = organism
        record.description = '%s %s scaffold_%s' % (organism, strain, n + 1)

        if record.features[0].type != 'source':
            print('NOT SOURCE-------------------')
            # Insert a copy of the first feature and turn it into the source
            # feature spanning the whole record.
            record.features = [copy.copy(record.features[0])] + record.features
            record.features[0].qualifiers = {}
            record.features[0].type = 'source'
            record.features[0].location = FeatureLocation(0, len(record.seq))
        else:
            print('SOURCE!!!!!!!!!!!!!!!!')
        record.features[0].qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
        record.features[0].qualifiers['mol_type'] = ["genomic DNA"]
        record.features[0].qualifiers['organism'] = ["%s" % organism]
        record.features[0].qualifiers['strain'] = ["%s" % strain]
        if plasmid:
            record.features[0].qualifiers['plasmid'] = ["p%s" % strain]
        # NOTE(review): the collapsed source does not show whether this
        # annotation was set only for plasmids; kept unconditional - confirm.
        record.annotations['mol_type'] = ["genomic DNA"]

        ref.location = [record.features[0].location]
        record.annotations['references'] = [ref]
        record.dbxrefs = ['BioProject:%s' % study]

        for i, feature in enumerate(record.features):
            if "protein_id" in feature.qualifiers:
                del feature.qualifiers['protein_id']
            if feature.type == 'gene':
                locus = "%s_%05d" % (locus_tag_prefix, locus_count_start)
                locus_count_start += 1
                feature.qualifiers['locus_tag'] = locus
                # The feature after a gene receives the same tag; a gene that
                # is the last feature has no successor (this used to raise
                # IndexError).
                if i + 1 < len(record.features):
                    record.features[i + 1].qualifiers['locus_tag'] = locus
        new_records.append(record)
    return new_records