Пример #1
0
def write_format(file):
    """Write a GenBank record from a GenBank file and compare them.

    The file is opened twice: one handle is parsed with a ``RecordParser``
    and the resulting record is converted back to text with ``str()``; the
    other handle supplies the corresponding raw record, and the pair is
    passed to ``do_comparison``.  Assertion failures from the comparison are
    printed rather than propagated.

    Args:
        file: Path to a GenBank file; paths containing ".gz" are opened
            with ``gzip.open``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))
    # be able to handle gzipped files
    opener = gzip.open if ".gz" in file else open
    # NOTE(review): mode "r" on gzip.open is binary on Python 3 - confirm
    # whether text mode ("rt") is wanted for gzipped inputs.

    # The context managers guarantee both handles are closed even when the
    # parser or the comparison raises something other than AssertionError.
    with opener(file, "r") as cur_handle, opener(file, "r") as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # Stop as soon as either iterator hands back None.
            if cur_record is None or compare_record is None:
                break

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                # Report the mismatch and carry on with the next record.
                print("\tTesting for %s" % cur_record.version)
                print(msg)
Пример #2
0
def t_ensembl_locus():
    """Feed sample LOCUS lines to the scanner and check name and size.

    Every sample gets a fresh scanner/consumer pair; the consumer's parsed
    record name and expected sequence size are asserted afterwards.
    """
    samples = (
        ("LOCUS       HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n",
         "HG531_PATCH", 1000000),
        ("LOCUS       HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n",
         "HG531_PATCH", 759984),
        ("LOCUS       HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n",
         "HG506_HG1000_1_PATCH", 814959),
        ("LOCUS       HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n",
         "HG506_HG1000_1_PATCH", 1219964),
    )

    for line, expected_name, expected_size in samples:
        scanner = GenBank.Scanner.GenBankScanner()
        consumer = GenBank._FeatureConsumer(True)
        scanner._feed_first_line(consumer, line)
        assert consumer.data.name == expected_name, consumer.data.name
        assert consumer._expected_size == expected_size, consumer._expected_size

    print("Done")
Пример #3
0
def write_format(file):
    """Round-trip the records of a GenBank file and compare with the raw text.

    One handle is parsed with a ``RecordParser`` and each record is turned
    back into text with ``str()``; a second handle on the same file supplies
    the matching raw record, and both are handed to ``do_comparison``.
    Assertion failures are printed instead of being propagated.

    Args:
        file: Path to a GenBank file; paths containing '.gz' are opened
            with ``gzip.open``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))
    # be able to handle gzipped files
    opener = gzip.open if '.gz' in file else open

    # Both handles used to be left open; the with-block closes them on
    # every exit path.
    with opener(file, "r") as cur_handle, opener(file, "r") as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            # next() works on both Python 2 and Python 3 iterators.
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # Stop as soon as either iterator hands back None.
            if cur_record is None or compare_record is None:
                break

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                print("\tTesting for %s" % cur_record.version)
                print(msg)
Пример #4
0
def t_ensembl_locus():
    """Check the name and size parsed from a set of sample LOCUS lines."""

    def check(line, name, size):
        # A new scanner and consumer are used for every line, so no state
        # is shared between the samples.
        s = GenBank.Scanner.GenBankScanner()
        c = GenBank._FeatureConsumer(True)
        s._feed_first_line(c, line)
        assert c.data.name == name, c.data.name
        assert c._expected_size == size, c._expected_size

    check("LOCUS       HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n",
          "HG531_PATCH", 1000000)
    check("LOCUS       HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n",
          "HG531_PATCH", 759984)
    check("LOCUS       HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n",
          "HG506_HG1000_1_PATCH", 814959)
    check("LOCUS       HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n",
          "HG506_HG1000_1_PATCH", 1219964)

    print("Done")
Пример #5
0
    def setUp(self):
        """Create a fresh "biosql-test" namespace and a GenBank record iterator.

        Stores the new namespace on ``self.db`` and an iterator over
        GenBank/cor6_6.gb (relative to the current directory) on
        ``self.iterator``.
        """
        # create TESTDB
        create_database()

        namespace = "biosql-test"
        connection = dict(driver=DBDRIVER,
                          user=DBUSER,
                          passwd=DBPASSWD,
                          host=DBHOST,
                          db=TESTDB)
        server = BioSeqDatabase.open_database(**connection)

        # Drop a namespace left behind by an earlier run; a KeyError means
        # there is nothing to remove.
        try:
            server[namespace]
            server.remove_database(namespace)
        except KeyError:
            pass

        self.db = server.new_database(namespace)

        # The handle is kept open on purpose: the iterator reads from it
        # after this method returns.
        gb_path = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
        self.iterator = GenBank.Iterator(open(gb_path, "r"),
                                         GenBank.FeatureParser())
Пример #6
0
    def search(self):
        """Search PubMed and store one ``DBItem`` per hit in ``self.items``.

        For every id returned by ``PubMed.search_for`` the Medline record and
        the GenBank record are looked up, and a ``DBItem`` carrying the
        sequence, description, id and Medline record is stored under that id.

        Only ``self.database == 'PubMed'`` is handled.  The required modules
        used to be imported only in that case while the rest of the method
        ran unconditionally, so any other value failed with an unbound-name
        error; such values are now a no-op.
        """
        if self.database != 'PubMed':
            return

        from Bio import GenBank
        from Bio import Medline
        from Bio import PubMed

        searchIds = PubMed.search_for(self.searchTerm, max_ids=self.maxResults)

        GBrecParser = GenBank.FeatureParser()
        ncbiDict = GenBank.NCBIDictionary(self.type,
                                          'genbank',
                                          parser=GBrecParser)

        MLrecParser = Medline.RecordParser()
        medlineDict = PubMed.Dictionary(delay=1.0, parser=MLrecParser)
        # record_id instead of id, which would shadow the builtin.
        for record_id in searchIds:
            MLrecord = medlineDict[record_id]
            GBrecord = ncbiDict[record_id]
            self.items[record_id] = DBItem(self.project,
                                           seq=GBrecord.seq,
                                           descript=GBrecord.description,
                                           id=record_id,
                                           record=MLrecord)
Пример #7
0
def t_write_format():
    """Round-trip every file in ``write_format_files`` and compare the text.

    Each file under the "GenBank" directory is opened twice: one handle is
    parsed with a ``RecordParser`` and the record turned back into text with
    ``str()``; the other supplies the matching raw record.  Both are handed
    to ``do_comparison``, whose exceptions propagate to the caller.
    """
    record_parser = GenBank.RecordParser(debug_level=0)

    for file in write_format_files:
        print("Testing GenBank writing for %s..." % os.path.basename(file))
        gb_path = os.path.join("GenBank", file)

        # The with-block closes both handles even when do_comparison raises.
        with open(gb_path, "r") as cur_handle, \
                open(gb_path, "r") as compare_handle:
            iterator = GenBank.Iterator(cur_handle, record_parser)
            compare_iterator = GenBank.Iterator(compare_handle)

            while True:
                cur_record = next(iterator)
                compare_record = next(compare_iterator)

                # Stop as soon as either iterator hands back None.
                if cur_record is None or compare_record is None:
                    break

                print("\tTesting for %s" % cur_record.version)

                output_record = str(cur_record) + "\n"
                do_comparison(compare_record, output_record)
Пример #8
0
def fetch_refseq(path, strain_lst, species_to_search='Mycoplasma genitalium'):
    """Download RefSeq GenBank files for one species and rename them by accession.

    Steps, all driven through shell commands via ``os.system``:
      1. fetch the RefSeq bacteria ``assembly_summary.txt`` with wget;
      2. write one download pattern per matching row to
         ``<path>downloadlink.txt``;
      3. download the listed files into ``<path>input_GenBank/`` and gunzip
         them;
      4. rename every ``*gbff*`` file to ``<first accession>.gbk``.

    Args:
        path: Output directory prefix; it is concatenated directly with file
            names, so it is expected to end with a path separator.
        strain_lst: Not used anywhere in this function body.
        species_to_search: Substring looked for in column 7 of each summary
            row.

    NOTE(review): ``csv_reader.next()`` and the bare ``print handle`` below
    are Python 2 constructs.
    """
    import os, sys, time, glob, csv
    from Bio import GenBank
    from sf_miscellaneous import write_pickle
    #species_to_search
    ## fetch the newest refseq assembly_summary file
    # NOTE(review): stdout is redirected to <path>assembly_summary.txt, but the
    # file opened below is 'assembly_summary.txt' in the current directory -
    # confirm which location is intended.
    os.system('wget -c ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt > %sassembly_summary.txt'%path)
    with open('assembly_summary.txt','rb') as csvfile:
        outfile='downloadlink.txt'
        with open(path+outfile,'wb') as output:
            csv_reader = csv.reader(csvfile, delimiter='\t')
            # consume the first row so that it is not treated as data
            headers = csv_reader.next()
            for icsv_line in csv_reader:
                # species name and complete
                if species_to_search in icsv_line[7] and 'Complete' in icsv_line[11]:
                    #os.system('wget -c %s/%s'%(icsv_line[19],'*_genomic.gbff.gz -P ./Refseq/Mt'))
                    # column 19 is used as the base location of the download pattern
                    output.write('%s/%s\n'%(icsv_line[19],'*_genomic.gbff.gz'))

    # download everything listed in downloadlink.txt, then decompress in place
    gbk_path='%sinput_GenBank/'%path
    command_download='wget -c --input %sdownloadlink.txt -P %s'%(path,gbk_path)
    os.system(command_download)
    command_gunzip='gunzip %s*.gz'%gbk_path
    os.system(command_gunzip)
    for each_gbk_path in glob.iglob('%s*gbff*'%gbk_path):
        with open(each_gbk_path) as gbk_file:
            # only the first record is needed: its first accession names the file
            for record in GenBank.parse(gbk_file):
                print(each_gbk_path,record.accession[0])
                break
            # NOTE(review): relies on `record` left over from the loop above, so
            # a file without any record would reuse a stale value or fail.
            os.system('mv %s %s%s.gbk'%(each_gbk_path, gbk_path, record.accession[0]))

    # Disabled code path: the condition is the constant 0, so nothing below
    # ever runs.
    if 0:
        os.chdir(path)
        species=glob.glob('*txt')[0].split('_list.')[0]
        os.system('rm *txt; rm *sh')
        os.system('gunzip *')
        # poll until no .gz file is left in the directory
        while len(glob.glob('*.gz'))!=0:
            time.sleep(5)
        # rename gbk file
        for each_gbk_path in glob.iglob('*gbff*'):
            with open(each_gbk_path) as handle:
                print handle
                for record in GenBank.parse(handle):
                    print(each_gbk_path,record.accession[0])
                    break
            os.system('mv %s %s'%(each_gbk_path, record.accession[0]))
        for each_gbk_path in glob.iglob('*'):
            os.system('mv %s %s.gbk'%(each_gbk_path, each_gbk_path))
            #os.system('mv %s %s'%(each_gbk_path, each_gbk_path.split('.')[0]))
        os.system('ls *gbk > %s-RefSeq.txt; sed -i -- "s/.gbk//g" *txt'%species)
        os.system('wc -l *txt ; ls *gbk |wc -l')
        path='../../pan-genome-analysis/'
        os.system('cp %srun-TestSet-template.sh %srun-%s.sh; sed -i -- "s/TestSet/%s/g" %srun-%s.sh'%(path,path,species,species,path,species))
        os.system('mv ../%s/ %sdata/'%(species,path))
Пример #9
0
 def loadData(self, data, dbtype):
     """Load ``data`` into the database and report any failure as text.

     Args:
         data: Handle passed to ``GenBank.Iterator``.
         dbtype: Format name; only "GenBank" is supported.

     Returns:
         "" when the load was committed, otherwise the formatted traceback
         of the error (after the transaction has been rolled back).

     Raises:
         ValueError: if ``dbtype`` is not "GenBank".
     """
     if dbtype != "GenBank":
         # A plain string is not a valid exception object, so the original
         # ``raise "..."`` could never deliver this message.
         raise ValueError("Unknown dbtype: %r" % (dbtype,))

     # get the GenBank file we are going to put into it
     parser = GenBank.FeatureParser()
     iterator = GenBank.Iterator(data, parser)
     connection = self.getBioSQLRoot().getDBServer().adaptor.conn
     # finally put it in the database
     try:
         self.getDatabase().load(iterator)
     except Exception:
         # Undo the partial load and hand the traceback back as text.
         connection.rollback()
         return traceback.format_exc()
     connection.commit()
     return ""
Пример #10
0
 def from_genbank(cls, filepath: str):
     """Insert a row built from the GenBank file at ``filepath``.

     Returns the result of ``cls.insert``.  Any exception is logged as a
     warning and swallowed, in which case ``None`` is returned implicitly.
     """
     try:
         parsed = GenBank.read(file=filepath)
         source = GenBank.get_source_data(parsed)
         row = {
             "accession": parsed.accession[0],
             "organism": parsed.organism,
             "date_released": GenBank.format_date(parsed.date),
             "host": source.get('host'),
             "date_collected": source.get('collection_date'),
             "country": source.get('country'),
         }
         return cls.insert(**row)
     except Exception as e:
         logging.warning(
             f"Error inserting {filepath} to {cls.__name__}: {e}")
Пример #11
0
def plot_unique_genome_diagram(gbk, unique_loci):
    """Draw a circular diagram of the CDS features in a GenBank file.

    CDS features whose first ``locus_tag`` is in ``unique_loci`` get one
    colour, all other CDS features another.

    Args:
        gbk: Path of the GenBank file; also used as the diagram name.
        unique_loci: Container of locus tags to highlight.

    Returns:
        The drawn ``GenomeDiagram.Diagram``.
    """
    parser = GenBank.FeatureParser()
    # The with-block closes the file even when parsing fails.
    with open(gbk, 'r') as fhandle:
        genbank_entry = parser.parse(fhandle)

    gdd = GenomeDiagram.Diagram(gbk)
    gd_track_for_features = gdd.new_track(1,
                                          name="CDS",
                                          scale_smalltick_interval=100000)
    gdfs = gd_track_for_features.new_set()
    for feature in genbank_entry.features:
        if feature.type != 'CDS':
            continue
        # Every CDS is placed on strand 1, whatever the file says.
        feature.strand = 1
        is_unique = feature.qualifiers['locus_tag'][0] in unique_loci
        hex_code = "#93341F" if is_unique else "#058F45"
        gdfs.add_feature(feature, color=rcolors.HexColor(hex_code))
    gdd.draw(format='circular',
             orientation='landscape',
             tracklines=0,
             pagesize='A5',
             fragments=5,
             circular=1)
    return gdd
Пример #12
0
def pLonk(plasmids):
    """Measure the sequence length of every plasmid file.

    Args:
        plasmids: Iterable of ``(pName, seq_infile, offset, order)`` tuples.

    Returns:
        ``(pLenks, pLen_MAX, pLasmids)``: the lengths sorted in descending
        order, the largest length (0 when no plasmid was processed), and a
        list of ``(pName, seq_infile, length, offset)`` tuples in input
        order.  Processing stops at the first file whose format is not
        recognised.

    Raises:
        ValueError: if a FASTA file yields no record; previously the length
            of the preceding plasmid was silently reused.
    """
    pLenks = []
    pLasmids = []
    for (pName, seq_infile, offset, order) in plasmids:
        # The with-block closes the sequence file on every exit path,
        # including the early break below.
        with open(seq_infile, 'r') as fhandle:
            # evaluate file name to detect format using Quixote [ filename ]
            seq_format = Quixote(seq_infile)
            pLen = None
            if seq_format == 'genbank':
                parser = GenBank.FeatureParser()
                gb_entry = parser.parse(fhandle)
                pLen = len(gb_entry.seq)  # read in length of plasmid sequence
                print("%s %s" % (pName, pLen))
            elif seq_format == 'fasta' or seq_format == 'seq':
                # With several records the last one determines the length.
                for fa_entry in SeqIO.parse(fhandle, "fasta"):
                    pLen = len(fa_entry.seq)
            else:
                print("TERMINAL ERROR : file format not recognized for " + pName + " !!!")
                break
        if pLen is None:
            raise ValueError("no sequence record found in " + str(seq_infile))
        pLenks.append(pLen)
        pLasmids.append((pName, seq_infile, int(pLen), int(offset)))
    # Sorting once after the loop gives the same result as re-sorting on
    # every iteration.
    pLenks.sort(reverse=True)
    pLen_MAX = pLenks[0] if pLenks else 0
    return pLenks, pLen_MAX, pLasmids
Пример #13
0
def main():
    """Group mined items from RDF and GenBank records and append a summary.

    Literals found in the graphs loaded from ``shb_records`` (restricted to
    predicates in ``predicate_whitelist``) and in the graphs generated for
    the GenBank records under ``gbk_records`` are passed to
    ``mine_sentences``.  Element 0 of every returned item is grouped under
    its element 1.  Each group is printed, de-duplicated, and appended to
    "summary.txt".
    """
    sorted_pos_tagged = {}

    def _collect(literal):
        # Shared by both loops: group element 0 of each mined item under
        # its element 1.
        for tagged in mine_sentences(literal):
            sorted_pos_tagged.setdefault(tagged[1], []).append(tagged[0])

    db_handler = DatabaseHandler()
    for entry in os.scandir(shb_records):
        graph = rdflib.Graph()
        graph.load(entry.path)
        for _subject, predicate, obj in graph:
            if predicate in predicate_whitelist and isinstance(obj, rdflib.Literal):
                _collect(obj)

    for entry in os.scandir(gbk_records):
        with open(entry) as handle:
            for record in GenBank.parse(handle):
                graph = db_handler._db_util.db_mapping_calls[
                    "genbank"].generalise_get_results(record)
                for _subject, _predicate, obj in graph:
                    if isinstance(obj, rdflib.Literal):
                        _collect(obj)

    # The with-block flushes and closes the summary file; it used to be
    # left open.
    with open("summary.txt", "a+") as f:
        for k, v in sorted_pos_tagged.items():
            v = list(set(v))
            print(k, v)
            print("\n")
            f.write(f'{k} - {"-".join(v)}\n')
Пример #14
0
def t_cleaning_features():
    """Test the ability to clean up feature values.

    Parses the first record of GenBank/arab1.gb with a ``FeatureParser``
    that uses a ``FeatureValueCleaner`` and checks that the translation
    qualifier of the second feature contains no spaces or newlines.
    """
    gb_parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # Only the first record is needed; the with-block closes the file even
    # when one of the assertions below would fail.
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, gb_parser)
        first_record = next(iterator)

    # test for cleaning of translation
    translation_feature = first_record.features[1]
    test_trans = translation_feature.qualifiers["translation"][0]
    assert " " not in test_trans, "Did not clean spaces out of the translation"
    assert "\012" not in test_trans, "Did not clean newlines out of the translation"
Пример #15
0
 def test_topology_genbank(self):
     """Check GenBank LOCUS line parsing."""
     # Deliberately low level: only the LOCUS line is passed to the scanner.
     # Each entry is (LOCUS line, topology, molecule type, division).
     tests = [
         ("LOCUS       U00096", None, None, None),
         # This example is actually fungal, accession U49845 from Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN"),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT"),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT"),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV"),
     ]
     for (line, topo, mol_type, div) in tests:
         scanner = Scanner.GenBankScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, line)
         annotations = consumer.data.annotations
         # (annotation key, expected value, failure message template)
         checks = (
             ('topology', topo, "Wrong topology %r not %r from %r"),
             ('molecule_type', mol_type,
              "Wrong molecule_type %r not %r from %r"),
             ('data_file_division', div, "Wrong division %r not %r from %r"),
         )
         for key, expected, template in checks:
             found = annotations.get(key, None)
             self.assertEqual(
                 found, expected, template % (found, expected, line))
Пример #16
0
def load_genbank(seqfile):
    """Load single-record GenBank file.

    Args:
        seqfile: Path of the GenBank file.

    Returns:
        The record produced by ``GenBank.FeatureParser().parse``.
    """
    parser = GenBank.FeatureParser()
    # Plain 'r' instead of 'rU': the 'U' flag is rejected by open() on
    # Python 3.11+, and text mode already translates newlines on Python 3.
    # The with-block also closes the file when parsing raises.
    with open(seqfile, 'r') as input_handle:
        return parser.parse(input_handle)
Пример #17
0
 def test_topology_genbank(self):
     """Check GenBank LOCUS line parsing."""
     # Deliberately low level: only the LOCUS line is passed to the scanner.
     tests = [
         ("LOCUS       U00096",
          None, None, None),
         # This example is actually fungal, accession U49845 from Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN"),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT"),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT"),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV"),
     ]
     # Annotation keys and message templates, in the order the expected
     # values appear after the LOCUS line in each test tuple.
     keys = ('topology', 'molecule_type', 'data_file_division')
     templates = ("Wrong topology %r not %r from %r",
                  "Wrong molecule_type %r not %r from %r",
                  "Wrong division %r not %r from %r")
     for test in tests:
         line = test[0]
         scanner = Scanner.GenBankScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, line)
         for key, wanted, template in zip(keys, test[1:], templates):
             got = consumer.data.annotations.get(key, None)
             self.assertEqual(got, wanted, template % (got, wanted, line))
Пример #18
0
def t_cleaning_features():
    """Test the ability to clean up feature values.

    Parses the first record of GenBank/arab1.gb with a ``FeatureParser``
    that uses a ``FeatureValueCleaner`` and checks that the translation
    qualifier of the second feature contains no spaces or newlines.
    """
    parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # The handle used to be left open; the with-block closes it.
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, parser)
        # next() works on both Python 2 and Python 3 iterators, unlike the
        # Python 2 only ``iterator.next()``.
        first_record = next(iterator)

    # test for cleaning of translation
    translation_feature = first_record.features[1]
    test_trans = translation_feature.qualifiers["translation"][0]
    assert " " not in test_trans, \
        "Did not clean spaces out of the translation"
    assert "\012" not in test_trans, \
        "Did not clean newlines out of the translation"
Пример #19
0
 def test_genbank_bad_loc_wrap_parsing(self):
     """Read bad_loc_wrap.gb with parser warnings ignored and check its location."""
     expected_location = "join(3462..3615,3698..3978,4077..4307,4408..4797,4876..5028,5141..5332)"
     gb_path = path.join("GenBank", "bad_loc_wrap.gb")
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", BiopythonParserWarning)
         with open(gb_path) as handle:
             record = GenBank.read(handle)
             features = record.features
             self.assertEqual(1, len(features))
             self.assertEqual(features[0].location, expected_location)
Пример #20
0
    def __processGffFilesNotNew(self, changed):
        """Re-process GFF files that changed but are already registered.

        For every path in ``changed`` the GFF is rewritten (unknown CV terms
        and colours added), loaded into a SQLite database next to the GFF,
        the GBrowse entry is edited, and the GFF is bulk-loaded into the
        database named in ``settings.DATABASES['default']``.

        Args:
            changed: Iterable of GFF file paths.
        """
        default_db = settings.DATABASES['default']
        for gff in changed:
            loc = os.path.dirname(gff)
            dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
            dbName = os.path.join(loc, dbName)

            # NOTE(review): genbank_id is not defined in this method; it has
            # to come from an enclosing scope - confirm.
            gffRewriter = GFFRewriter(filename=gff,
                                      outfile=gff + ".sorted.prepared",
                                      accession=genbank_id)

            gffRewriter.addUnknownCvTerms({
                'user': default_db['USER'],
                'password': default_db['PASSWORD'],
                'db': default_db['NAME']
            })

            gffRewriter.addColor({
                'user': default_db['USER'],
                'password': default_db['PASSWORD'],
                'db': 'go'
            })

            # The result is not used here; the call is kept in case it has
            # side effects.
            gffRewriter.getError()

            # run the sqlite database loader to be able to add it to GBrowse
            # since the name should be preserved, no changes need to be made
            # to the GBrowse configuration file
            args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
            runProgram('bp_seqfeature_load.pl', args)

            # The GenBank file sits next to the GFF with the extension
            # swapped.  os.path.join(stem, '.gbk') produced "<stem>/.gbk",
            # so the extension is appended instead.
            gbk = os.path.splitext(gff)[0] + '.gbk'
            parser = GenBank.RecordParser()
            with open(gbk) as gbk_handle:
                record = parser.parse(gbk_handle)
            organismName = record.organism
            organismDir = os.path.basename(loc)

            GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir,
                                          organismName)

            # now edit the record in Chado
            args = [
                '--organism', organismName, "--gfffile", gff, "--dbname",
                default_db['NAME'], "--dbuser",
                default_db['USER'], "--dbpass",
                default_db['PASSWORD'], "--random_tmp_dir"
            ]
            runProgram('gmod_bulk_load_gff3.pl', args)
Пример #21
0
def main():
	"""Write one FASTA file per PubMed id found in INFILE.

	Records are grouped by the ``pubmed_id`` of their first reference;
	records whose id is the empty string are skipped.  Each group is
	handed to ``qfas`` together with ``OUTDIR/<pubmed_id>.fasta``.
	"""
	try:
		os.mkdir(OUTDIR)
	except OSError:
		print('Using existing directory %s' % OUTDIR)
	# Group in a single pass instead of re-scanning every record once per
	# publication.
	seqdict = {}
	with open(INFILE) as handle:
		for r in GenBank.parse(handle):
			pub = r.references[0].pubmed_id
			if pub != '':
				seqdict.setdefault(pub, []).append(r)
	for pub in seqdict:
		qfas(seqdict[pub], OUTDIR+'/'+pub+'.fasta')
Пример #22
0
 def test_000_genbank_bad_loc_wrap_warning(self):
     """With parser warnings turned into errors, reading bad_loc_wrap.gb must raise."""
     expected_message = "Non-standard feature line wrapping (didn't break on comma)?"
     gb_path = path.join("GenBank", "bad_loc_wrap.gb")
     with warnings.catch_warnings():
         warnings.simplefilter("error", BiopythonParserWarning)
         with open(gb_path) as handle:
             caught = None
             try:
                 GenBank.read(handle)
             except BiopythonParserWarning as e:
                 caught = e
             if caught is None:
                 self.assertTrue(False, "Expected specified BiopythonParserWarning here.")
             else:
                 self.assertEqual(str(caught), expected_message)
Пример #23
0
def load_samples(sequences):
    """Load protein sequences from data/genbank_sequences.gb into ``sequences``.

    For every non-partial record a sample id is built from the accession,
    prefixed with the normalised collection date and a truncated country
    name when the source feature provides them.  For every CDS whose gene
    name resolves to an entry of ``validList``, the translation is stored as
    ``"<id>|<sequence>"`` under that gene name; ORF1AB translations are
    passed to ``loadOrf1AB`` instead.

    Args:
        sequences: Mapping from gene name to a list, appended to in place.
    """
    with open("data/genbank_sequences.gb") as handle:
        # Use biopython to parse the GenBank records
        for record in GenBank.parse(handle):
            # skip partial sequence records
            if 'partial' in record.definition:
                continue

            # For now the id of a sample includes a truncated version of the
            # country of origin, the collection date and the accession number.
            accession = record.accession[0]

            # Reset per record: without a source feature these names used to
            # be unbound (first record) or carried over from the previous
            # record.
            country = None
            col_date = None
            source = findFeature(record, 'source')
            if source is not None:
                country = findItem(source, '/country=')
                col_date = findItem(source, '/collection_date=')

            sample_id = accession
            if col_date is not None:
                # Time formatting is not consistent
                dt = dateutil.parser.parse(col_date)
                norm_date = dt.strftime(r'%Y-%m-%d')
                sample_id = norm_date + '-' + sample_id
            if country is not None:
                country = country.replace(':', ' ')
                sample_id = country.split()[0][:7].strip() + '-' + sample_id

            # For each CDS record
            for gene in findAllFeature(record, 'CDS'):
                # First figure out the gene / protein name.  Try both the
                # product tag and the gene tag.
                # NOTE: We are not interested in the post translation non
                #       structural protein products.  We just process the
                #       orf1ab gene that has all of them embedded in it.
                # Reset per CDS so that a missing product tag cannot reuse
                # the name resolved for the previous CDS.
                gene_name = None
                product = findItem(gene, '/product=')
                if product is not None:
                    gene_name = product.split()[0].upper()
                    # A few proteins have aliases, map them to the standard form.
                    if gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]

                if gene_name is None or gene_name not in validList:
                    gene_name = findItem(gene, '/gene=')
                    if gene_name is not None and gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]

                if gene_name not in validList:
                    continue

                sequence = findItem(gene, '/translation=')
                if sequence is not None:
                    if gene_name == 'ORF1AB':
                        loadOrf1AB(sequences, sample_id, sequence)
                    else:
                        sequences[gene_name].append(sample_id + '|' + sequence)
Пример #24
0
def parse(path='./flat_files/'):
    """Read every file in ``path`` as a single GenBank record.

    Files that cannot be read or parsed are reported on stdout and skipped.

    Args:
        path: Directory holding the flat files.

    Returns:
        List of the records that were parsed successfully.
    """
    path = Path(path)
    print("parsing records at {}".format(path.absolute()))

    records = []
    for p in path.listdir():
        try:
            # The with-block closes each file; the handles used to be left
            # open.
            with open(p) as handle:
                records.append(GenBank.read(handle))
        except Exception:
            # Best effort: report the file and continue.  Unlike a bare
            # except, KeyboardInterrupt and SystemExit are not swallowed.
            print('error with file %s' % (p,))
    print("parsed %s records.." % len(records))

    return records
Пример #25
0
def parse(path='./flat_files/'):
    """Parse each file found in ``path`` as one GenBank record.

    A file that fails to open or parse is reported on stdout and skipped.

    Args:
        path: Directory holding the flat files.

    Returns:
        List of the successfully parsed records.
    """
    path = Path(path)
    print("parsing records at {}".format(path.absolute()))

    records = []
    for p in path.listdir():
        try:
            # Close every file after reading instead of leaking the handle.
            with open(p) as handle:
                gbr = GenBank.read(handle)
        except Exception:
            # Only ordinary errors are treated as "bad file"; interpreter
            # exit requests and Ctrl-C still propagate.
            print('error with file %s' % (p,))
        else:
            records.append(gbr)
    print("parsed %s records.." % len(records))

    return records
Пример #26
0
    def fetch(id: str):
        """Download a GenBank entry and register it unless it is already stored.

        Returns the target file path when the download succeeded, otherwise
        ``None`` (after printing a warning).
        """
        logging.info(f"Fetching GenBank id={id}")

        response = eutils.fetch(db='nuccore', id=id, rettype='gb')
        if not response.ok:
            print(f"[WARN] Could not download '{id}'")
            return None

        gb = GenBank.read(string=response.text)
        accession = gb.accession[0]
        filename = f"{accession}{'.' + config.taxon if config.taxon else ''}.gb"
        file_out = os.path.join(FileSystem.dir['genbank'], filename)
        record = GenBank.query.filter_by(version=gb.version).first()

        # A stored record only counts when its file still exists on disk.
        already_stored = record and os.path.exists(record.filepath)
        if already_stored:
            logging.info(
                f"{accession} already exists as file='{record.filepath}' and GenBank.id={record.id}"
            )
            return file_out

        with open(file_out, 'w') as fh:
            fh.write(response.text)
        GenBank.add_file(file_out)
        print(f"[INFO] Fetched file: {file_out}")
        return file_out
Пример #27
0
def BaseDraw(plasmids):
    """Draw the baseline, label and annotation features of every plasmid.

    Args:
        plasmids: Iterable of ``(pName, seq_infile, pLen, offset)`` tuples;
            ``pLen`` and ``offset`` are converted with ``int()``.

    Features are drawn only for files that ``Quixote`` reports as
    'genbank': CDS features always, SNP and IR features when the
    module-level ``SFX`` switch is 'on'.
    """
    ordN = 0
    for (pName, seq_infile, pLen, offset) in plasmids:
        # set Y axis once and for all for the plasmid being processed
        y0 = (pNs - ordN) * dBL  # starts from the top
        pLeni = int(pLen)
        print('offset %s' % (offset,))
        offset = int(offset)
        # draw plasmid baseline
        BaseL(ordN, pName, pLeni, y0, canvas_main)
        # label the baseline with plasmid name and size
        LabeL(ordN, pName, pLeni, y0, canvas_main)
        # evaluate file name to detect format using Quixote [filename]
        seq_format = Quixote(seq_infile)
        # mark up sequence origin if there is an offset
        if offset < -1 or offset > 1:
            # The second value returned by Off7 is not needed here.
            Zs, _unused = Off7(1, pLeni, offset)
            xs = Zs * u
            canvas_main.setFont(bFont, NfSize)
            canvas_main.drawString(xs, y0 + da / 2, osym)
        # Reset for every plasmid: the summary line at the end of the loop
        # reads this counter for non-GenBank inputs too, where it used to be
        # unbound or left over from the previous plasmid.
        ORFcnt = 0
        # filter and draw annotation features
        if seq_format == 'genbank':
            # load GB file to filter features
            parser = GenBank.FeatureParser()
            # The with-block closes the file even when parsing fails.
            with open(seq_infile, 'r') as fhandle:
                gb_entry = parser.parse(fhandle)
            for feature in gb_entry.features:
                if feature.type == 'CDS' or feature.type == 'cds':
                    # draw CDS using ORFeus
                    ORFcnt += 1
                    ORFeus(feature, pLeni, offset, y0, ORFcnt)
                elif SFX == 'on':
                    if feature.type == 'SNP':
                        # draw asterisk at feature location
                        Snippit(feature, pLeni, offset, y0)
                    if feature.type == 'IR':
                        # draw flag at feature location
                        IRFlag(feature, pLeni, offset, y0)
                # need other functions for other features ( with conditional, default switch off)
            print("    got a GenBank-style file for " + pName + " with " +
                  str(ORFcnt) + " ORFs")
        else:
            # no features so just skip this step
            print("    got a non-genbank-style file for " + pName +
                  "; no features to draw")
        # increment plasmid ordinal count
        ordN = ordN + 1
        print("    " + pName + " (" + str(pLeni) + " bp) drawn with " +
              str(ORFcnt) + " ORFs")
    print("    OK")
Пример #28
0
def load_database(gb_handle):
    """Load a GenBank file into a BioSQL database.

    This is useful for running tests against a newly created database.

    Args:
        gb_handle: Open handle on the GenBank file to load.

    The records are committed only when loading succeeds; the connection is
    closed in either case.
    """
    create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    try:
        db = server.new_database(db_name)

        # get the GenBank file we are going to put into it
        parser = GenBank.FeatureParser()
        iterator = GenBank.Iterator(gb_handle, parser)
        # finally put it in the database
        db.load(iterator)
        server.adaptor.conn.commit()
    finally:
        # Close the connection on failure as well, so a broken load does
        # not leave it open.
        server.adaptor.conn.close()
Пример #29
0
def fetch_gb(id, taxon=None):
    """Download one GenBank entry into ``config.genbank_dir``.

    The file is named after the record's locus, with ``.<taxon>`` inserted
    before the extension when ``taxon`` is truthy.  An existing file is left
    untouched; the outcome is printed in every case.
    """
    response = eutils.fetch(db='nuccore', id=id, rettype='gb')
    if not response.ok:
        print(f"[WARN] Could not download '{id}'")
        return

    gb = GenBank.read(StringIO(response.text))
    filename = f"{gb.locus}{'.' + taxon if taxon else ''}.gb"
    file_out = os.path.join(config.genbank_dir, filename)
    if os.path.exists(file_out):
        print(f"[WARN] {file_out} already exists")
        return

    with open(file_out, 'w') as fh:
        fh.write(response.text)
    print(f"[INFO] Fetched file: {file_out}")
Пример #30
0
 def _get_organella(self, gb_file):
     """
     Retrieve the organelle of each record in a GenBank file.

     The specific GenBank record parser is used because SeqIO does not
     support this field.

     Args:
         gb_file: Path of the GenBank file.

     Returns:
         Dict mapping each record's version to the value of the
         "/organelle=" qualifier of its first feature, with double quotes
         removed.  Records without that qualifier, or without any feature,
         are left out.
     """
     organella = {}
     with open(gb_file, "r") as gbh:
         for record in GenBank.parse(gbh):
             # Only the first feature is inspected; a record without
             # features used to raise IndexError here.
             if not record.features:
                 continue
             for q in record.features[0].qualifiers:
                 if q.key == "/organelle=":
                     organella[record.version] = q.value.replace('"', '')
     return organella
Пример #31
0
 def parse_genebank_file(self):
     """Print details of every record in ``self.gene_bank_file`` and store the accession.

     ``self.Acc`` is set to the first accession of each record in turn, so
     the last record wins.  Empty or missing ``self.GI`` / ``self.Acc``
     values are replaced by "NA".
     """
     # Plain "r" instead of "rU": the "U" flag is rejected by open() on
     # Python 3.11+, and text mode already translates newlines on Python 3.
     with open(self.gene_bank_file, "r") as input_handle:
         for record in GenBank.parse(input_handle):
             print(record.features)
             print(record.accession)
             print("----")
             print(record.gi)
             print("----")
             self.Acc = record.accession[0]

             # NOTE(review): self.GI is read but never assigned in this
             # method - confirm whether it should be taken from record.gi.
             if self.GI is None or len(self.GI) == 0:
                 self.GI = "NA"
             if self.Acc is None or len(self.Acc) == 0:
                 self.Acc = "NA"
Пример #32
0
def genbank_single(filename):
    """
    Parse ``filename`` with ``GenBank.RecordParser`` and return the result.

    The file is closed before returning.

    >>> record = genbank_single("GFF/NC_001422.gbk")
    >>> record.taxonomy
    ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
    >>> cds = record.features[-4]
    >>> cds.key
    'CDS'
    >>> location = LocationFromString(cds.location)
    >>> print location
    2931..3917
    >>> subseq = record_subseq(record, location)
    >>> subseq[0:20]
    Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
    """
    parser = GenBank.RecordParser()
    # Use a context manager so the handle is not leaked.
    with open(filename) as handle:
        return parser.parse(handle)
Пример #33
0
 def add_file(cls, filepath: str):
     """Insert a row describing the GenBank file at ``filepath``.

     The file is read with ``GenBank.read`` and its first accession, version,
     feature count and length are passed to ``cls.insert`` together with the
     download time and current user.

     Returns the value of ``cls.insert``. Any exception is logged at debug
     level and ``None`` is returned instead.

     TODO(seanbeagle): Create scraping tool for genbank data similar to get_source_data()
     """
     try:
         parsed = GenBank.read(file=filepath)
         accession = parsed.accession[0]
         logging.debug(f"Adding {accession} to {cls}...")
         fields = {
             "accession": accession,
             "version": parsed.version,
             "filepath": filepath,  # TODO: Ensure this is absolute filepath
             "date_downloaded": now(),
             "downloaded_by": getpass.getuser(),
             "num_features": len(parsed.features),
             "length": len(parsed),
         }
         return cls.insert(**fields)
     except Exception as err:
         logging.debug(f"Could not insert GenBank record: {err}")
         return None
Пример #34
0
 def test_topology_genbank(self):
     """Verify the topology annotation read from GenBank LOCUS lines."""
     # Low-level check: only the LOCUS line is fed to the scanner.
     # Each case is (LOCUS line, expected 'topology' annotation).
     cases = (
         ("LOCUS       U00096", None),
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999", None),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001", "linear"),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008", "circular"),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001", "linear"),
     )
     for locus_line, expected in cases:
         scanner = Scanner.GenBankScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, locus_line)
         found = consumer.data.annotations.get('topology')
         failure = "Wrong topology %r not %r from %r" % (found, expected, locus_line)
         self.assertEqual(found, expected, failure)
Пример #35
0
 def test_topology_embl(self):
     """Verify the topology annotation read from EMBL ID lines."""
     # Low-level check: only the ID line is fed to the scanner.
     # Each case is (ID line, expected 'topology' annotation).
     cases = (
         ("ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.", "linear"),
         ("ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.", "linear"),
         ("ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.", "circular"),
         ("ID   SC10H5 standard; DNA; PRO; 4870 BP.", None),
         ("ID   NRP_AX000635; PRT; NR1; 15 SQ", None),
         ("ID   NRP0000016E; PRT; NR2; 5 SQ", None),
     )
     for id_line, expected in cases:
         scanner = Scanner.EmblScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, id_line)
         found = consumer.data.annotations.get('topology')
         failure = "Wrong topology %r not %r from %r" % (found, expected, id_line)
         self.assertEqual(found, expected, failure)
Пример #36
0
def get_record_list():
    """
    Build one dictionary per GenBank record found in INPUT_PATH.

    Each dictionary holds the record's "locus" and a list copy of its
    "features". The collected list is handed to format_record_list(), whose
    result is returned.
    """
    with open(INPUT_PATH) as handle:
        collected = [
            {"locus": entry.locus, "features": list(entry.features)}
            for entry in GenBank.parse(handle)
        ]

    return format_record_list(collected)
Пример #37
0
 def test_topology_genbank(self):
     """Verify annotations and warnings produced by GenBank LOCUS lines."""
     # Low-level check: only the LOCUS line is fed to the scanner.
     # Each case is (LOCUS line, topology, molecule_type, division,
     # expected warning categories or None when no warning is expected).
     cases = (
         ("LOCUS       U00096",
          None, None, None, None),
         # This example is actually fungal, accession U49845 from Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN", None),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT", None),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT", None),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV", None),
         ("LOCUS       pEH010                  5743 bp    DNA     circular",
          "circular", "DNA", None, [BiopythonParserWarning]),
         # This is a test of the format > 80 chars long
         ("LOCUS       AZZZAA02123456789 1000000000 bp    DNA     linear   PRI 15-OCT-2018",
          "linear", "DNA", "PRI", None),
     )
     for locus_line, topo, mol_type, div, warning_list in cases:
         with warnings.catch_warnings(record=True) as caught:
             warnings.simplefilter("always")
             scanner = Scanner.GenBankScanner()
             consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
             scanner._feed_first_line(consumer, locus_line)

             found_topo = consumer.data.annotations.get('topology')
             self.assertEqual(
                 found_topo, topo,
                 "Wrong topology %r not %r from %r" % (found_topo, topo, locus_line))

             found_mol = consumer.data.annotations.get('molecule_type')
             self.assertEqual(
                 found_mol, mol_type,
                 "Wrong molecule_type %r not %r from %r" %
                 (found_mol, mol_type, locus_line))

             found_div = consumer.data.annotations.get('data_file_division')
             self.assertEqual(
                 found_div, div,
                 "Wrong division %r not %r from %r" % (found_div, div, locus_line))

             # None means "no warnings expected", i.e. an empty list.
             expected_warnings = [] if warning_list is None else warning_list
             self.assertEqual(len(caught), len(expected_warnings))
             for seen, warning_class in zip(caught, expected_warnings):
                 self.assertEqual(seen.category, warning_class)
Пример #38
0
 def test_topology_embl(self):
     """Verify annotations read from EMBL ID lines."""
     # Low-level check: only the ID line is fed to the scanner.
     # Each case is (ID line, topology, molecule_type, division).
     cases = (
         # Modern examples with sequence version
         ("ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.",
          "linear", "mRNA", "PLN"),
         ("ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.",
          "linear", "genomic DNA", "MAM"),
         # Example to match GenBank example used above:
         ("ID   U49845; SV 1; linear; genomic DNA; STD; FUN; 5028 BP.",
          "linear", "genomic DNA", "FUN"),
         # Old examples:
         ("ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.",
          "circular", "DNA", "PRO"),
         ("ID   SC10H5 standard; DNA; PRO; 4870 BP.",
          None, "DNA", "PRO"),
         # Patent example from 2016-06-10
         # ftp://ftp.ebi.ac.uk/pub/databases/embl/patent/
         ("ID   A01679; SV 1; linear; unassigned DNA; PAT; MUS; 12 BP.",
          "linear", "unassigned DNA", "MUS"),
         # Old patent examples
         ("ID   NRP_AX000635; PRT; NR1; 15 SQ", None, None, "NR1"),
         ("ID   NRP0000016E; PRT; NR2; 5 SQ", None, None, "NR2"),
         # KIPO patent examples
         ("ID   DI500001       STANDARD;      PRT;   111 AA.", None, None, None),
         ("ID   DI644510   standard; PRT;  1852 AA.", None, None, None),
     )
     for id_line, topo, mol_type, div in cases:
         scanner = Scanner.EmblScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, id_line)

         found_topo = consumer.data.annotations.get('topology')
         self.assertEqual(
             found_topo, topo,
             "Wrong topology %r not %r from %r" % (found_topo, topo, id_line))

         found_mol = consumer.data.annotations.get('molecule_type')
         self.assertEqual(
             found_mol, mol_type,
             "Wrong molecule_type %r not %r from %r" %
             (found_mol, mol_type, id_line))

         found_div = consumer.data.annotations.get('data_file_division')
         self.assertEqual(
             found_div, div,
             "Wrong division %r not %r from %r" % (found_div, div, id_line))
Пример #39
0
	def search(self,start,end):
		"""Fetch protein records for a slice of ``self.accs`` and print them.

		``start`` and ``end`` are 1-based and inclusive. ``start`` below 1 is
		raised to 1; ``end`` equal to 0 or beyond the list is set to
		``len(self.accs)``. For every accession a progress line goes to
		stderr, the accession is looked up with ``GenBank.search_for`` and
		each hit is written to stdout after a ``>accession`` header line.
		A failure on one hit is reported on stderr and the loop continues.
		"""
		ncbi_dict = GenBank.NCBIDictionary()
		if start < 1:
			start = 1
		if end > len(self.accs) or end == 0:
			end = len(self.accs)

		# j is the running counter shown in the progress message (from 1).
		for j, k in enumerate(range(start - 1, end), 1):
			acc = self.accs[k]
			sys.stderr.write("No " + repr(j) + ": " + acc + '\n')
			gi_list = GenBank.search_for(acc, database='protein')

			for gi in gi_list:
				try:
					gb_record = ncbi_dict[gi]
					sys.stdout.write('>' + acc + '\n')
					sys.stdout.write(gb_record)
				except Exception:
					# Not a bare except: Ctrl-C and SystemExit must still
					# be able to stop a long batch.
					sys.stderr.write(acc + " fetching error \n")
Пример #40
0
 def test_first_line_imgt(self):
     """Verify annotations read from IMGT ID lines."""
     # Low-level check: only the ID line is fed to the scanner.
     # Each case is (ID line, topology, molecule_type, division).
     cases = (
         ("ID   HLA00001   standard; DNA; HUM; 3503 BP.",
          None, "DNA", "HUM"),
         ("ID   HLA00001; SV 1; standard; DNA; HUM; 3503 BP.",
          None, "DNA", "HUM"),
     )
     for id_line, topo, mol_type, div in cases:
         scanner = Scanner._ImgtScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, id_line)

         found_topo = consumer.data.annotations.get('topology')
         self.assertEqual(
             found_topo, topo,
             "Wrong topology %r not %r from %r" % (found_topo, topo, id_line))

         found_mol = consumer.data.annotations.get('molecule_type')
         self.assertEqual(
             found_mol, mol_type,
             "Wrong molecule_type %r not %r from %r" %
             (found_mol, mol_type, id_line))

         found_div = consumer.data.annotations.get('data_file_division')
         self.assertEqual(
             found_div, div,
             "Wrong division %r not %r from %r" % (found_div, div, id_line))
Пример #41
0
class RecordReceiver:
    """Callable that writes every record it receives to a handle."""

    def __init__(self, handle):
        # Destination object; only its write() method is used.
        self.handle = handle

    def __call__(self, id, rec):
        """Write ``rec`` to the handle; ``id`` is accepted but not used."""
        sink = self.handle
        sink.write(rec)

# Functor that deals with bad records - prints an error message to HANDLE
class BadRecordReceiver:
    """Callable that remembers bad IDs and reports each one on a handle."""

    def __init__(self, handle):
        self.handle = handle
        # Every ID reported so far, in call order.
        self.badIDs = []

    def __call__(self, badID):
        """Record ``badID`` and write a "Bad ID" line to the handle."""
        self.badIDs.append(badID)
        message = "Bad ID: %s\n" % badID
        self.handle.write(message)


# Form pattern for accession strings: "%%0%dd" % digits yields "%0<digits>d"
# (a zero-padded integer field), which is appended to the literal prefix.
# `prefix` and `digits` are defined outside this excerpt.
accessionPat = prefix + "%%0%dd" % digits

# Number of accession IDs requested per download_many() call.
batchSize = 500

# Walk the inclusive range [start, end] in steps of batchSize.
# NOTE: xrange exists only in Python 2.
for curr in xrange(start, end + 1, batchSize):
    # Generate accession strings for this batch; min() clips the last batch
    # so that it never goes past `end`.
    ids = [accessionPat % num for num in range(curr, min(curr + batchSize, end + 1))]
    # Records are written to stdout by RecordReceiver; BadRecordReceiver is
    # passed as broken_fn and writes to stderr.
    GenBank.download_many(
        ids, RecordReceiver(sys.stdout), database="nucleotide", broken_fn=BadRecordReceiver(sys.stderr), faildelay=5.0
    )
Пример #42
0
    body = '\n'.join(textwrap.wrap(rec.seq.data, width=80))
    return head, body
    
if __name__ == '__main__':
    
    mode  = sys.argv[1]
    text  = sys.argv[2]
    output_file = sys.argv[3]

    print 'Searching for %s <br>' % text
    
    # check if inputs are all numbers
    try:
        gi_list = text.split()
        tmp = map(int, gi_list)
    except ValueError:
        gi_list = GenBank.search_for(text, max_ids=10)
    
    fp = open(output_file, 'wt')
    record_parser = GenBank.FeatureParser()
    ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser = record_parser)
    for gid in gi_list:
        res = ncbi_dict[gid]
        head, body =  make_fasta(res)
        fp.write(head+body+'\n')
        print head
    fp.close()

   

Пример #43
0
 def test_genbank_read(self):
     with open(path.join("GenBank", "NC_000932.gb")) as handle:
         record = GenBank.read(handle)
     self.assertEqual(['NC_000932'], record.accession)