Пример #1
                f.qualifiers["label"].append("color: #ff8eff")

        # sort features by start location, source always first
            key=lambda f: (-len(gb.seq)) * (f.type == "source") + f.location.start

        # translate color from notes to ApEinfo
        for feature in gb_archive.features:

        # Fix the direct submission reference
        if gb_archive.annotations["references"][-1].title == "Direct Submission":
            ref = gb_archive.annotations["references"][-1]
            ref = Reference()
            ref.title = "Direct Submission"
        ref.authors = "Larralde M"
        ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo"

        # write the final record
        dst_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "moclo-plant", "registry", "plant")
        with fs.open_fs(os.path.join(__file__, "..", ".."), create=True) as dst_fs:
            dir_fs = dst_fs.makedirs(fs.path.join("moclo-plant", "registry", "plant"), recreate=True)
            with dir_fs.open("{}.gb".format(info["id"]), "w") as dst_file:
                write(gb_archive, dst_file, "gb")
Пример #2
    def to_seq_record(self) -> SeqRecord:
        """Convert the cluster to a single record.

        Annotations of the source sequence are kept intact if they don't
        overlap with the cluster boundaries. Component genes are added on the
        record as *CDS* features. Annotated protein domains are added as
        *misc_feature* features.

        Returns:
            SeqRecord: the slice of ``self.source`` spanning the cluster,
            with a GECCO reference, a ``GECCO-Data`` structured comment,
            and one feature per gene and per protein domain.

        """
        # store time of record creation
        now = datetime.datetime.now()

        # NB(@althonos): we use inclusive 1-based ranges in the data model
        # but slicing expects 0-based ranges with exclusive ends
        bgc = self.source[self.start - 1:self.end]
        bgc.id = bgc.name = self.id

        # copy sequence annotations
        bgc.annotations = self.source.annotations.copy()
        bgc.annotations["topology"] = "linear"
        bgc.annotations["molecule_type"] = "DNA"
        # format the date under the "C" locale so that the month
        # abbreviation is always the English one
        with patch_locale("C"):
            bgc.annotations['date'] = now.strftime("%d-%b-%Y").upper()

        # older Biopython releases still require an explicit alphabet
        biopython_version = tuple(map(int, Bio.__version__.split(".")))
        if biopython_version < (1, 77):
            from Bio import Alphabet

            bgc.seq.alphabet = Alphabet.generic_dna

        # add GECCO preprint as a reference
        ref = Reference()
        ref.title = "Accurate de novo identification of biosynthetic gene clusters with GECCO"
        ref.journal = "bioRxiv (2021.05.03.442509)"
        ref.comment = "doi:10.1101/2021.05.03.442509"
        ref.authors = ", ".join([
            "Laura M Carroll", "Martin Larralde", "Jonas Simon Fleck",
            "Ruby Ponnudurai", "Alessio Milanese", "Elisa Cappio Barazzone",
            "Georg Zeller",
        ])
        bgc.annotations.setdefault("references", []).append(ref)

        # add GECCO-specific annotations as a structured comment
        # NOTE(review): the key names below were restored to match the
        # upstream GECCO layout (the values are unchanged) -- confirm them
        # against the consumers of the `GECCO-Data` comment.
        structured_comment = bgc.annotations.setdefault(
            "structured_comment", OrderedDict())
        structured_comment['GECCO-Data'] = {
            "version": f"GECCO v{__version__}",
            "biosyn_class": ",".join(ty.name for ty in self.type.unpack()),
            "alkaloid_probability": self.type_probabilities.get(ProductType.Alkaloid, 0.0),
            "polyketide_probability": self.type_probabilities.get(ProductType.Polyketide, 0.0),
            "ripp_probability": self.type_probabilities.get(ProductType.RiPP, 0.0),
            "saccharide_probability": self.type_probabilities.get(ProductType.Saccharide, 0.0),
            "terpene_probability": self.type_probabilities.get(ProductType.Terpene, 0.0),
            "nrp_probability": self.type_probabilities.get(ProductType.NRP, 0.0),
            "other_probability": self.type_probabilities.get(ProductType.Other, 0.0),
        }

        # add proteins as CDS features
        for gene in self.genes:
            # write gene as a /cds GenBank record, shifted so that its
            # coordinates are relative to the cluster instead of the source
            cds = gene.to_seq_feature()
            cds.location += -self.start
            bgc.features.append(cds)
            # write domains as /misc_feature annotations, placed relative
            # to the start of the gene they belong to
            for domain in gene.protein.domains:
                misc = domain.to_seq_feature(protein_coordinates=False)
                misc.location += cds.location.start
                bgc.features.append(misc)

        # return the complete BGC
        return bgc
Пример #3

print('Found lineage: %s' % lineage)

# Prepare the bibliographic reference from the command-line options;
# every field is optional and only set when a value was supplied.
ref = Reference()
if args.ref_pubmed_id:
    ref.pubmed_id = args.ref_pubmed_id
if args.ref_consortium:
    ref.consrtm = args.ref_consortium
if args.ref_authors:
    ref.authors = args.ref_authors
if args.ref_title:
    ref.title = args.ref_title
if args.ref_journal:
    ref.journal = args.ref_journal
else:
    # No journal given: fall back to a submission line dated today.
    now = datetime.datetime.now()
    # Temporarily switch LC_TIME to "C" so that %b yields the English
    # month abbreviation; the previous setting is always restored.
    saved = locale.setlocale(locale.LC_TIME)
    try:
        locale.setlocale(locale.LC_TIME, "C")
        # DD-MON-YYYY: %d is the day of the month (%m would repeat the
        # month number in the day slot).
        ref_date = now.strftime("%d-%b-%Y").upper()
    finally:
        locale.setlocale(locale.LC_TIME, saved)
    ref.journal = "Submitted (" + ref_date + ") to the INSDC."

print('Loading input GFF and fasta files...')

seq_dict = SeqIO.to_dict(
Пример #4
def doConvert(embl_file, dep_file, contact, project, genome_project_id, organism_name, strain, locus_tag, taxon_id, dna_source, authors, comment, ac, clean=False):
    """Rewrite the header of an EMBL record and write it to a new file.

    The record is read from *embl_file* with ``SeqIO.read``, its header
    annotations (accession, ID-line fields, project cross-reference,
    organism, description, references and comment) are replaced, and the
    result is written to *dep_file* in EMBL format.

    Args:
        embl_file: path of the EMBL file to read.
        dep_file: path of the EMBL file to write.
        contact: key into the module-level ``CONTACTS`` mapping; its
            ``'author'`` entry is used for the direct-submission reference.
        project: project name; ``'metahit'`` adds a consortium to the
            genome reference.
        genome_project_id: identifier stored as ``Project:<id>`` in dbxrefs.
        organism_name: organism name, combined with *strain*.
        strain: strain name.
        locus_tag: value handed to ``getLocusTag`` when *clean* is set.
        taxon_id: taxon identifier used for the ``db_xref`` qualifier of a
            synthesized source feature.
        dna_source: either a known short code (GHP, INRA, HCIR, DSMZ, NCTC,
            DPM), which is expanded to a full description, or free text.
        authors: authors of the genome reference; overridden when
            *dna_source* is one of the known short codes.
        comment: free text added to the record comment.
        ac: accession stored in the record annotations.
        clean: when true, the per-feature cleaning steps are run.

    NOTE(review): this function uses Python 2 ``print`` statements, and
    several statements look like they lost a sibling line (``else:``
    branches, ``append`` calls). Those places are flagged inline; confirm
    against the upstream script before relying on this code.
    """
    # NOTE(review): the input file handle is never closed explicitly.
    record = SeqIO.read(open(embl_file), "embl")

    # ----------------------------------------
    # HEADER
    # ----------------------------------------
    # remove accession
    if 'accession' in record.annotations.keys():
        del record.annotations['accession']
    record.annotations['accession'] = [ac]
    # ID line
    record.id = "XXX"
    record.name = "XXX"
    record.annotations['data_file_division'] = 'PRO'
    record.annotations['data_file_class'] = 'WGS'
    # PR line
    record.dbxrefs = ["Project:%s" % genome_project_id]
    # OS line
    record.annotations["organism"] = "%s %s" % (organism_name, strain)
    # DE line
    # NOTE(review): both assignments sit in the same branch, so the second
    # always wins and the description is only set for 'metahit' -- an
    # `else:` between them was probably lost; confirm.
    if project == 'metahit':
        record.description = "%s %s draft genome." % (organism_name, strain)
        record.description = "%s %s genome." % (organism_name, strain)
    # RN & RL lines
    # Known DNA source codes are expanded to a description and come with a
    # fixed author list that replaces the `authors` argument.
    if dna_source == 'GHP':
        dna_source = 'Rowett Institute of Nutrition and Health, University of Aberdeen -- http://www.rowett.ac.uk/divisions/ghp/'
        authors = 'Pajon A., Turner K., Parkhill J., Duncan S., Flint H.'
    elif dna_source == 'INRA':
        dna_source = 'INRA Clermont-Ferrand-Theix -- http://www.clermont.inra.fr/'
        authors = 'Pajon A., Turner K., Parkhill J., Bernalier A.'
    elif dna_source == 'HCIR':
        dna_source = 'Helmholtz Centre for Infection Research -- http://www.helmholtz-hzi.de/'
        authors = 'Pajon A., Turner K., Parkhill J., Timmis K., Oxley A., Wurdemann D.'
    elif dna_source == 'DSMZ':
        dna_source = 'German Collection of Microorganisms and Cell Cultures -- http://www.dsmz.de/'
        authors = 'Pajon A., Turner K., Parkhill J.'
    elif dna_source == 'NCTC':
        dna_source = 'Health Protection Agency\'s National Collection of Type Cultures -- http://www.hpacultures.org.uk/'
        authors = 'Pajon A., Turner K., Parkhill J.'
    elif dna_source == 'DPM':
        dna_source = 'Departments of Periodontology and Microbiology, King\'s College London -- http://www.kcl.ac.uk/'
        authors = 'Pajon A., Turner K., Parkhill J., Wade W., Vartoukian S.'
        # NOTE(review): the two self-assignments below are no-ops inside the
        # 'DPM' branch; they look like the body of a lost `else:`.
        dna_source = dna_source
        authors = authors
    # First reference: the (unpublished) genome paper.
    ref_journal = Reference()
    ref_journal.journal = 'Unpublished.'
    if project == 'metahit':
        ref_journal.consrtm = "metaHIT consortium -- http://www.metahit.eu/"
    ref_journal.title = 'The genome sequence of %s %s' % (organism_name, strain)
    ref_journal.authors = authors
    # Second reference: the direct submission, dated today.
    ref_dep = Reference()
    ref_dep.authors = CONTACTS[contact]['author']
    today = date.today()
    # NOTE(review): %b depends on the current locale -- confirm the month
    # abbreviation is always wanted in English.
    ref_dep.journal = "Submitted (%s) to the EMBL/GenBank/DDBJ databases. Sanger Institute, Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SA, United Kingdom." % today.strftime("%d-%b-%Y")
    ref_dep.title = 'Direct submission'
    record.annotations['references'] = [ref_journal, ref_dep]
    # CC line
    record.annotations['comment'] = ['Data release policy http://www.sanger.ac.uk/legal/#t_2',
                                     'DNA source: %s' % dna_source,
                                     '%s' % comment]
    # ----------------------------------------
    # GAP FEATURE (only with clean option)
    # ----------------------------------------
    # Add FT gap
    seq = record.seq
    in_N = False
    gap_features = []
    if clean:
        # TODO - Cope with a sequence which ends with N
        # NOTE(review): the condition (`!=`) and the message disagree; the
        # warning is printed when the sequence does NOT end with N.
        if seq[-1] != "N":
            print "WARNING: sequence ends with N"
        # Scan for runs of N: start_N marks where a run opens, and the run
        # is closed at the first position whose successor is not N.
        for i in range(len(seq)):
            if seq[i] == 'N' and not in_N:
                start_N = i
                in_N = True
            # NOTE(review): seq[i+1] is out of range on the last position
            # when a run of N reaches the end of the sequence (see TODO).
            if in_N and not seq[i+1] == 'N':
                end_N = i + 1
                length = end_N - start_N
                assert length > 0
                assert str(seq[start_N:end_N]) == "N"*length
                # do not create FT for 1bp gap
                if length > 1:
                    gap_feature = SeqFeature(FeatureLocation(start_N,end_N), strand=1, type="gap")
                    gap_feature.qualifiers['estimated_length'] = [length]
                    # NOTE(review): gap_feature is never added to
                    # gap_features in the visible code; confirm.
                in_N = False
    # ----------------------------------------
    # OTHER FEATURE (only with clean option)
    # ----------------------------------------
    new_features = []
    first_source = True
    has_source = False
    removed_cds = 0
    # NOTE(review): the inner loops below reuse `i` as their index; this is
    # harmless here only because `feature` is bound before they run.
    for i in range(len(record.features)):
        feature = record.features[i]

        # Add strain into FT source
        if feature.type == 'source' and first_source:
            has_source = True
            # Stretch the source feature to the full sequence length.
            feature.location.end.position = len(record.seq)
            feature.qualifiers['organism'] = ["%s %s" % (organism_name, strain)]
            feature.qualifiers['strain'] = [strain]

        # Remove qualifier /note & /translation
        if clean:
            if 'note' in feature.qualifiers.keys():
                del feature.qualifiers['note']
            #if 'translation' in feature.qualifiers.keys():
            #    del feature.qualifiers['translation']

        # Rename locus_tag
        if clean:
            if 'locus_tag' in feature.qualifiers.keys():
                feature.qualifiers['locus_tag'] = [getLocusTag(feature.qualifiers['locus_tag'][0], locus_tag, feature.type)]

        # Check /EC_number="" or /EC_number="1.1.2.-"
        if clean:
            if 'EC_number' in feature.qualifiers.keys():
                for i in range(len(feature.qualifiers['EC_number'])):
                    feature.qualifiers['EC_number'][i] = getEcNumber(feature.qualifiers['EC_number'][i])
            # Remove (EC in /product and /function
            if 'product' in feature.qualifiers.keys():
                for i in range(len(feature.qualifiers['product'])):
                    (feature.qualifiers['product'][i], ec_list) = getValueWithoutEc(feature.qualifiers['product'][i], feature)
                    if ec_list:
                        for ec in ec_list:
                            # Only the first extracted EC number is kept when
                            # the feature has no EC_number qualifier yet.
                            if 'EC_number' not in feature.qualifiers.keys():
                                feature.qualifiers['EC_number'] = [ec]

        # Remove tRNA /product (not only when containing ???)
        if clean:
            if feature.type == 'tRNA' and 'product' in feature.qualifiers.keys():
                del feature.qualifiers['product']
                #for i in range(len(feature.qualifiers['product'])):
                #    if feature.qualifiers['product'][i].count('?') > 1:
                #        del feature.qualifiers['product'][i]
            # Same EC extraction as for /product, applied to /function.
            if 'function' in feature.qualifiers.keys():
                for i in range(len(feature.qualifiers['function'])):
                    (feature.qualifiers['function'][i], ec_list) = getValueWithoutEc(feature.qualifiers['function'][i], feature)
                    if ec_list:
                        for ec in ec_list:
                            if 'EC_number' not in feature.qualifiers.keys():
                                feature.qualifiers['EC_number'] = [ec]

        # Remove FT gene & keep only one FT source per record & remove some CDS
        # NOTE(review): nothing in this branch adds `feature` to
        # `new_features`, so the "keep" paths appear to be missing; confirm.
        if clean:
            if not feature.type == 'gene':
                if feature.type == 'source':
                    if first_source:
                        first_source = False
                # Remove CDS that are not valid
                # CDS -- translation must start with M, nucleotide sequence without N's & no overlap with gap feature
                # CDS -- must not have internal stop codons
                # CDS -- must end with stop codons (TAG, TAA, or TGA) or add '<' or '>' e.g. complement(<1..174); 1399953..>1401221
                elif feature.type == 'CDS':
                    # NOTE(review): other qualifiers are stored as lists;
                    # a bare int is stored here -- confirm this is intended.
                    if not 'transl_table' in feature.qualifiers.keys():
                        feature.qualifiers['transl_table'] = 11
                    # Forward strand: the last three bases must be a stop
                    # codon, otherwise the end is marked as open ('>').
                    if feature.strand == 1:
                        stop_codon = record.seq[feature.location.nofuzzy_end-3:feature.location.nofuzzy_end]
                        if not str(stop_codon) in ['TAG', 'TAA', 'TGA']:
                            feature.location = FeatureLocation(ExactPosition(feature.location.nofuzzy_start), AfterPosition(feature.location.nofuzzy_end))
                    # Reverse strand: the first three bases must be a
                    # reverse-complemented stop codon, otherwise the start
                    # is marked as open ('<').
                    if feature.strand == -1:
                        stop_codon = record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_start+3]
                        if not str(stop_codon) in ['CTA', 'TTA', 'TCA']:
                            feature.location = FeatureLocation(BeforePosition(feature.location.nofuzzy_start), ExactPosition(feature.location.nofuzzy_end))
                    translation = feature.extract(record.seq).translate(table=11)
                    # Compare the computed translation with the stored one,
                    # ignoring a trailing stop ('*') when there is one.
                    # NOTE(review): the second length check below is nested
                    # under the trailing-'*' branch as written; it reads like
                    # the body of a lost `else:`.
                    if 'translation' in feature.qualifiers.keys():
                        if translation[-1] == '*':
                            if not len(translation) - 1 == len(feature.qualifiers['translation'][0]):
                                print 'WARNING: CDS %s translation length of different size' % feature.location
                                print translation
                                print feature.qualifiers['translation'][0]
                                if not str(translation[:-1]) == str(feature.qualifiers['translation'][0]):
                                    print 'WARNING: CDS %s translation not identical' % feature.location
                                    print translation[:-1]
                                    print feature.qualifiers['translation'][0]
                            if not len(translation) == len(feature.qualifiers['translation'][0]):
                                # NOTE(review): every other message uses
                                # feature.location; `feature.translation` is
                                # not set anywhere in this function.
                                print 'WARNING: CDS %s translation length of different size' % feature.translation
                                print translation
                                print feature.qualifiers['translation'][0]
                                if not str(translation) == str(feature.qualifiers['translation'][0]):
                                    print 'WARNING: CDS %s translation not identical' % feature.location
                                    print translation
                                    print feature.qualifiers['translation'][0]
                    #feature.qualifiers['translation'] = [translation]
                    # NOTE(review): as written, the "does not start with M"
                    # warning and the removed_cds increment run when the
                    # translation DOES start with M -- `else:` branches seem
                    # to be missing here; confirm the intended keep/remove logic.
                    if translation.startswith('M') and record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].count('N') == 0:
                        if translation[:-1].count('*') >= 1:
                            print 'WARNING: CDS %s with internal stop codon' % feature.location
                            print translation
                        print 'WARNING: CDS %s does not start with M' % feature.location
                        print translation
                        removed_cds = removed_cds + 1

        # NOTE(review): this `if` has no body in the visible code, which is
        # a syntax error as written; the statement it guarded is missing.
        if not clean:

    # Add source feature
    if not has_source:
        feature = SeqFeature(FeatureLocation(0,len(record.seq)), type="source")
        feature.qualifiers['organism'] = ["%s %s" % (organism_name, strain)]
        feature.qualifiers['strain'] = [strain]
        feature.qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
        feature.qualifiers['mol_type'] = getMolType(embl_file) 
        # NOTE(review): the new source feature is not added to
        # `new_features` in the visible code; confirm.

    # NOTE(review): both messages are printed when `clean` is set; the
    # second one reads like the body of a lost `else:`.
    if clean:
        print 'WARNING: %s CDSs have been removed' % removed_cds
        print "Only adding header, use '--clean' for cleaning features"
    # NOTE(review): `new_features` is never appended to in the visible
    # code, so the record is written without any feature.
    record.features = new_features
    # Write out new embl file
    # NOTE(review): the output file handle is never closed explicitly.
    SeqIO.write([record], open(dep_file, "w"), "embl")
Пример #5
            if f.location is None:
                print(gb, f)

        # sort features by start location, source always first
        gb.features.sort(key=lambda f: (-len(gb.seq)) *
                         (f.type == "source") + f.location.start)

        # translate color from notes to ApEinfo
        for feature in gb.features:

        # Add an EcoFlex article reference
        ref = Reference()
        ref.authors = 'Moore SJ, Lai HE, Kelwick RJ, Chee SM, Bell DJ, Polizzi KM, Freemont PS.'
        ref.title = 'EcoFlex: A Multifunctional MoClo Kit for E. coli Synthetic Biology.'
        ref.journal = 'ACS Synth Biol 2016;5:1059-1069.'
        ref.pubmed_id = '27096716'

        # Fix the direct submission reference
        ref = Reference()
        # ref = gb.annotations["references"][-1]
        ref.authors = "Larralde M"
        ref.title = "Direct Submission"
        ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo"

        # write the final record
        dst_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "moclo-ecoflex", "registry",
Пример #6
# NOTE(review): the rest of the parameter list and the docstring quotes are
# missing from this definition. The body reads names such as `taxon_id`,
# `strain`, `study`, `plasmid`, `scaffold_prefix`, `locus_tag_prefix`,
# `locus_count_start`, `publication_authors`, `publication_journal` and
# `publication_title` that are presumably parameters -- confirm against the
# original signature.
def reformat_gbk(gbk_file,

    - remove protein_id
    - split scaffolds into contigs ==> name contigs contig_XXX
    - generate agp file

    :param gbk_file:
    :param study:
    :param publication:
    :param locus_tag_prefix:
    :param plasmid:

    # Resolve the taxon identifier into the source/taxonomy/organism values
    # stored in each record's annotations further down.
    source, taxonomy, organism = taxon_id2taxonomy(taxon_id)


    new_records = []
    from Bio import SeqIO
    import copy
    # NOTE(review): duplicate of the import on the previous line.
    import copy
    from Bio.SeqFeature import Reference
    from Bio.SeqFeature import FeatureLocation
    with open(gbk_file, 'r') as f:

        records = [i for i in SeqIO.parse(f, 'genbank')]

        contig_records = []
        contig_count = 1

        # Split every scaffold at its assembly_gap features: each slice
        # between two gaps becomes one contig.
        # NOTE(review): the contig slices are never added to
        # `contig_records` in the visible code, so the loop over
        # `contig_records` below has nothing to process; confirm.
        for new_record in records:
            start = 0
            end = len(new_record.seq)
            for feature in new_record.features:
                if feature.type == 'assembly_gap':
                    print 'GAP-------'
                    print feature
                    contig = new_record[start:int(feature.location.start)]
                    # update start location
                    start = int(feature.location.end)

                    # rename contig record LOCUS
                    # NOTE(review): naming here ("contig_N") differs from
                    # the trailing contig below ("<scaffold_prefix>_NN").

                    contig.id = "contig_%s" % contig_count
                    contig.name = "contig_%s" % contig_count

                    contig_count += 1
            # Trailing slice after the last gap (or the whole record when
            # there is no gap).
            contig = new_record[start:end]

            contig.id = "%s_%02d" % (scaffold_prefix, contig_count)
            contig.name = "%s_%02d" % (scaffold_prefix, contig_count)
            contig_count += 1

        for n, record in enumerate(contig_records):

            ref = Reference()
            ref.authors = publication_authors
            ref.journal = publication_journal
            ref.title = publication_title
            # NOTE(review): `Refserence` is not defined (typo of
            # `Reference`?), and the next two lines overwrite the fields of
            # `ref` instead of filling `ref_seq`, which is never used.
            ref_seq = Refserence()
            ref.authors = "Trestan Pillonel"
            ref.journal = "RL   Submitted (09-APRIL-2019) to the INSDC."

            #print record
            #print dir(record)
            #print "id", record.id
            #print "name", record.name
            #print record.annotations
            #print record.description
            #print record.dbxrefs
            #record.id = ''
            record.annotations['source'] = source
            record.annotations['taxonomy'] = taxonomy
            record.annotations['organism'] = organism
            record.description = '%s %s scaffold_%s' % (organism, strain,
                                                        n + 1)

            # Make sure the first feature is a `source` feature spanning the
            # whole record; one is synthesized from a copy of the first
            # feature when it is missing.
            # NOTE(review): record.features[0] raises IndexError for a
            # record without any feature.
            if record.features[0].type != 'source':

                print('NOT SOURCE-------------------')
                record.features = [copy.copy(record.features[0])
                                   ] + record.features
                record.features[0].qualifiers = {}
                record.features[0].type = 'source'
                record.features[0].location = FeatureLocation(
                    0, len(record.seq))
            record.features[0].qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
            record.features[0].qualifiers['mol_type'] = ["genomic DNA"]
            record.features[0].qualifiers['organism'] = ["%s" % organism]
            record.features[0].qualifiers['strain'] = ["%s" % strain]

            if plasmid:
                #     /mol_type="genomic DNA"
                #     /organism="Klebsiella pneumoniae"
                #     /strain="KpGe"
                #record.features[0].type = "source"
                #record.features[0].qualifiers['organism'] = ["Klebsiella pneumoniae"]
                #record.features[0].qualifiers['strain'] = ["KpGe"]
                record.features[0].qualifiers['plasmid'] = ["p%s" % strain]

            record.annotations['mol_type'] = ["genomic DNA"]
            # The reference covers the same span as the source feature.
            ref.location = [record.features[0].location]
            #print 'location!', ref.location
            record.annotations['references'] = [ref]
            record.dbxrefs = ['BioProject:%s' % study]
            for i, feature in enumerate(record.features):
                if "protein_id" in feature.qualifiers:
                    del feature.qualifiers['protein_id']
                if feature.type == 'gene':
                    # NOTE(review): `locus` is assigned three times and only
                    # the last assignment takes effect; `locus_count` is not
                    # defined in the visible code. The plasmid-specific
                    # format ("_p%04d") looks like the body of a lost
                    # `else:`; confirm.
                    if not plasmid:
                        locus = "%s_%05d" % (locus_tag_prefix, locus_count)
                        print 'rename locus!', locus_tag_prefix
                        locus = "%s_p%04d" % (locus_tag_prefix, locus_count)
                    locus = "%s_%05d" % (locus_tag_prefix, locus_count_start)
                    locus_count_start += 1
                    # NOTE(review): stored as a bare string while the other
                    # qualifiers in this function are lists.
                    feature.qualifiers['locus_tag'] = locus
                    # presumably the feature following a gene is its
                    # CDS/RNA -- verify; this raises IndexError when the
                    # gene is the last feature of the record.
                    record.features[i + 1].qualifiers['locus_tag'] = locus

    # NOTE(review): `new_records` is never appended to in the visible code.
    return new_records