Пример #1
0
def load_genbank(seqfile):
    """Load a single-record GenBank file.

    Args:
        seqfile: Path of the GenBank file to read.

    Returns:
        The object produced by ``GenBank.FeatureParser().parse`` for the
        contents of ``seqfile``.
    """
    parser = GenBank.FeatureParser()
    # The context manager closes the handle even when parsing raises; the
    # explicit close() used before was skipped in that case.
    with open(seqfile, 'rU') as input_handle:
        return parser.parse(input_handle)
Пример #2
0
def pLonk(plasmids):
    """Collect the sequence length of every plasmid in ``plasmids``.

    Args:
        plasmids: Iterable of ``(pName, seq_infile, offset, order)`` tuples.
            ``order`` is unpacked but not used here.

    Returns:
        A ``(pLenks, pLen_MAX, pLasmids)`` tuple: the lengths sorted in
        descending order, the largest length (0 when no plasmid was
        loaded), and a list of ``(pName, seq_infile, pLen, offset)`` tuples
        in input order with ``pLen`` and ``offset`` converted to ``int``.

    Processing stops at the first file whose format is not recognized; the
    plasmids loaded up to that point are still returned.
    """
    pLenks = []
    pLasmids = []
    for (pName, seq_infile, offset, order) in plasmids:
        # The with-block closes the sequence file on every exit path,
        # including the early break and parser errors.
        with open(seq_infile, 'r') as fhandle:
            # evaluate file name to detect format using Quixote [ filename ]
            seq_format = Quixote(seq_infile)
            if seq_format == 'genbank':
                parser = GenBank.FeatureParser()
                gb_entry = parser.parse(fhandle)
                pLen = len(gb_entry.seq)  # length of plasmid sequence
                print("%s %s" % (pName, pLen))
            elif seq_format in ('fasta', 'seq'):
                # Start from 0 so an empty file cannot reuse the length of
                # the previous plasmid; with several records the last one
                # wins, as before.
                pLen = 0
                for fa_entry in SeqIO.parse(fhandle, "fasta"):
                    pLen = len(fa_entry.seq)  # length of plasmid sequence
            else:
                print("TERMINAL ERROR : file format not recognized for " + pName + " !!!")
                break
        pLenks.append(pLen)
        pLasmids.append((pName, seq_infile, int(pLen), int(offset)))
    # Sort once, after all lengths are known, instead of on every iteration.
    pLenks.sort(reverse=True)
    # Fall back to 0 so that empty input no longer hits an unbound variable.
    pLen_MAX = pLenks[0] if pLenks else 0
    return pLenks, pLen_MAX, pLasmids
Пример #3
0
    def setUp(self):
        """Prepare an empty "biosql-test" sub-database and a GenBank iterator.

        Stores the new sub-database in ``self.db`` and an iterator over
        ``GenBank/cor6_6.gb`` (relative to the current directory) in
        ``self.iterator``.
        """
        # create TESTDB
        create_database()

        # connect to the server that holds the test database
        connection_settings = dict(driver=DBDRIVER,
                                   user=DBUSER,
                                   passwd=DBPASSWD,
                                   host=DBHOST,
                                   db=TESTDB)
        server = BioSeqDatabase.open_database(**connection_settings)

        # drop a left-over sub-database of the same name; a KeyError means
        # there is nothing to remove
        namespace = "biosql-test"
        try:
            server[namespace]
            server.remove_database(namespace)
        except KeyError:
            pass

        self.db = server.new_database(namespace)

        # set up the iterator over the GenBank records to be loaded
        gb_path = os.path.join(os.getcwd(), "GenBank", "cor6_6.gb")
        gb_handle = open(gb_path, "r")
        feature_parser = GenBank.FeatureParser()
        self.iterator = GenBank.Iterator(gb_handle, feature_parser)
Пример #4
0
    def search(self):
        """Search PubMed for ``self.searchTerm`` and fill ``self.items``.

        For every id returned by the search, the Medline record and the
        GenBank record are fetched and wrapped in a ``DBItem`` that is
        stored in ``self.items`` under that id.

        Raises:
            ValueError: If ``self.database`` is not ``'PubMed'``, the only
                database this method can query.
        """
        if self.database != 'PubMed':
            # The imports used to be conditional on the database while the
            # rest of the method used them unconditionally, so any other
            # database ended in an UnboundLocalError.  Fail clearly instead.
            raise ValueError("Unsupported database: %r" % (self.database,))

        from Bio import GenBank
        from Bio import Medline
        from Bio import PubMed

        searchIds = PubMed.search_for(self.searchTerm, max_ids=self.maxResults)

        GBrecParser = GenBank.FeatureParser()
        ncbiDict = GenBank.NCBIDictionary(self.type,
                                          'genbank',
                                          parser=GBrecParser)

        MLrecParser = Medline.RecordParser()
        medlineDict = PubMed.Dictionary(delay=1.0, parser=MLrecParser)
        for record_id in searchIds:
            MLrecord = medlineDict[record_id]
            GBrecord = ncbiDict[record_id]
            self.items[record_id] = DBItem(self.project,
                                           seq=GBrecord.seq,
                                           descript=GBrecord.description,
                                           id=record_id,
                                           record=MLrecord)
Пример #5
0
def plot_unique_genome_diagram(gbk, unique_loci):
    """Draw a circular diagram of the CDS features in a GenBank file.

    Args:
        gbk: Path of the GenBank file; also passed to
            ``GenomeDiagram.Diagram`` as its first argument.
        unique_loci: Container of locus tags.  A CDS whose first
            ``locus_tag`` qualifier is in it is drawn in ``#93341F``; every
            other CDS is drawn in ``#058F45``.

    Returns:
        The ``GenomeDiagram.Diagram`` after ``draw`` has been called on it.
    """
    parser = GenBank.FeatureParser()
    # Close the file even when parsing raises.
    with open(gbk, 'r') as fhandle:
        genbank_entry = parser.parse(fhandle)

    gdd = GenomeDiagram.Diagram(gbk)
    gd_track_for_features = gdd.new_track(1,
                                          name="CDS",
                                          scale_smalltick_interval=100000)
    gdfs = gd_track_for_features.new_set()

    # Build the two colours once instead of once per feature.
    unique_color = rcolors.HexColor("#93341F")
    other_color = rcolors.HexColor("#058F45")

    for feature in genbank_entry.features:
        if feature.type != 'CDS':
            continue
        # Every CDS has its strand overwritten with 1 before drawing.
        feature.strand = 1
        # A CDS without a locus_tag qualifier cannot be in unique_loci, so
        # draw it with the default colour instead of raising KeyError.
        locus_tags = feature.qualifiers.get('locus_tag')
        if locus_tags and locus_tags[0] in unique_loci:
            gdfs.add_feature(feature, color=unique_color)
        else:
            gdfs.add_feature(feature, color=other_color)

    gdd.draw(format='circular',
             orientation='landscape',
             tracklines=0,
             pagesize='A5',
             fragments=5,
             circular=1)
    return gdd
Пример #6
0
def BaseDraw(plasmids):
    """Draw baseline, label and annotation features for each plasmid.

    Args:
        plasmids: Iterable of ``(pName, seq_infile, pLen, offset)`` tuples;
            ``pLen`` and ``offset`` are converted with ``int``.

    Reads the module-level names ``pNs``, ``dBL``, ``u``, ``da``, ``bFont``,
    ``NfSize``, ``osym``, ``SFX`` and ``canvas_main`` and delegates the
    drawing to ``BaseL``, ``LabeL``, ``Off7``, ``ORFeus``, ``Snippit`` and
    ``IRFlag``.  Progress messages are printed to standard output.
    """
    ordN = 0
    for (pName, seq_infile, pLen, offset) in plasmids:
        # set Y axis once and for all for the plasmid being processed
        y0 = (pNs - ordN) * dBL  # starts from the top
        pLeni = int(pLen)
        print('offset %s' % (offset,))
        offset = int(offset)
        # draw plasmid baseline
        BaseL(ordN, pName, pLeni, y0, canvas_main)
        # label the baseline with plasmid name and size
        LabeL(ordN, pName, pLeni, y0, canvas_main)
        # evaluate file name to detect format using Quixote [filename]
        seq_format = Quixote(seq_infile)
        # mark up sequence origin if there is an offset
        if offset < -1 or offset > 1:
            # only the first value returned by Off7 is used here
            Zs, _direction = Off7(1, pLeni, offset)
            xs = Zs * u
            canvas_main.setFont(bFont, NfSize)
            canvas_main.drawString(xs, y0 + da / 2, osym)
        # Reset the counter for every plasmid: the summary line below is
        # printed for non-GenBank files too, where it used to be unbound
        # (first plasmid) or carry over the previous plasmid's count.
        ORFcnt = 0
        # filter and draw annotation features
        if seq_format == 'genbank':
            parser = GenBank.FeatureParser()
            # the with-block closes the GenBank file even if drawing fails
            with open(seq_infile, 'r') as fhandle:
                gb_entry = parser.parse(fhandle)
                for feature in gb_entry.features:
                    if feature.type in ('CDS', 'cds'):  # draw CDS using ORFeus
                        ORFcnt += 1
                        ORFeus(feature, pLeni, offset, y0, ORFcnt)
                    elif SFX == 'on':
                        if feature.type == 'SNP':
                            # draw asterisk at feature location
                            Snippit(feature, pLeni, offset, y0)
                        if feature.type == 'IR':
                            # draw flag at feature location
                            IRFlag(feature, pLeni, offset, y0)
                    # need other functions for other features
                    # (with conditional, default switch off)
            print("    got a GenBank-style file for " + pName + " with " +
                  str(ORFcnt) + " ORFs")
        else:
            # no features so just skip this step
            print("    got a non-genbank-style file for " + pName +
                  "; no features to draw")
        # increment plasmid ordinal count
        ordN = ordN + 1
        print("    " + pName + " (" + str(pLeni) + " bp) drawn with " +
              str(ORFcnt) + " ORFs")
    print("    OK")
0
 def loadData(self, data, dbtype):
     """Load GenBank records from ``data`` into the database.

     Args:
         data: Object handed to ``GenBank.Iterator`` as the record source.
         dbtype: Format of ``data``; only ``"GenBank"`` is supported.

     Returns:
         An empty string on success.  If loading fails, the transaction is
         rolled back and the formatted traceback is returned instead.

     Raises:
         ValueError: If ``dbtype`` is not ``"GenBank"``.
     """
     if dbtype != "GenBank":
         # Raising a plain string is itself a TypeError on modern
         # interpreters; raise a real exception carrying the same message.
         raise ValueError("Unknown dbtype: %r" % (dbtype,))

     # get the GenBank file we are going to put into it
     parser = GenBank.FeatureParser()
     iterator = GenBank.Iterator(data, parser)
     # finally put it in the database
     try:
         self.getDatabase().load(iterator)
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt and
         # SystemExit propagate instead of being reported as a load error.
         self.getBioSQLRoot().getDBServer().adaptor.conn.rollback()
         return traceback.format_exc()
     self.getBioSQLRoot().getDBServer().adaptor.conn.commit()
     return ""
Пример #8
0
def t_cleaning_features():
    """Test the ability to clean up feature values."""
    gb_parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # The with-block closes the file even when an assertion below fails;
    # the trailing close() used before was skipped in that case.
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, gb_parser)
        first_record = next(iterator)

        # test for cleaning of translation
        translation_feature = first_record.features[1]
        test_trans = translation_feature.qualifiers["translation"][0]
        assert " " not in test_trans, \
            "Did not clean spaces out of the translation"
        assert "\012" not in test_trans, \
            "Did not clean newlines out of the translation"
Пример #9
0
def t_cleaning_features():
    """Test the ability to clean up feature values."""
    parser = GenBank.FeatureParser(
        feature_cleaner=utils.FeatureValueCleaner())
    # Close the input file when done; it used to be left open.
    with open(os.path.join("GenBank", "arab1.gb")) as handle:
        iterator = GenBank.Iterator(handle, parser)

        # The next() builtin works on Python 2.6+ and Python 3, unlike the
        # Python-2-only iterator.next() method call.
        first_record = next(iterator)

        # test for cleaning of translation
        translation_feature = first_record.features[1]
        test_trans = translation_feature.qualifiers["translation"][0]
        assert " " not in test_trans, \
            "Did not clean spaces out of the translation"
        assert "\012" not in test_trans, \
            "Did not clean newlines out of the translation"
Пример #10
0
def load_database(gb_handle):
    """Load a GenBank file into a BioSQL database.

    This is useful for running tests against a newly created database.

    Args:
        gb_handle: Handle passed to ``GenBank.Iterator`` as the source of
            the records to load.

    The records go into a new sub-database named "biosql-test".  The
    connection is committed after a successful load and closed in every
    case.
    """
    create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER,
                                          passwd=DBPASSWD,
                                          host=DBHOST,
                                          db=TESTDB)
    try:
        db = server.new_database(db_name)

        # get the GenBank file we are going to put into it
        parser = GenBank.FeatureParser()
        iterator = GenBank.Iterator(gb_handle, parser)
        # finally put it in the database
        db.load(iterator)
        server.adaptor.conn.commit()
    finally:
        # Do not leak the connection when creating or loading fails.
        server.adaptor.conn.close()
Пример #11
0
# don't test dbsource_wrap because it is a junky RefSeq file

# full path of every test file inside the GenBank test directory
files_to_parse = [os.path.join(gb_file_dir, name) for name in test_files]

# parse the bioperl test files
# comment this out for now -- there are a bunch of junky records in here
# that no longer exist in GenBank -- do we really need to support those?
# files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')]

# parse the biojava test files
# files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')]

# test the parsers: one feature parser and one record parser, both created
# with debug_level=0
feature_parser, record_parser = (
    GenBank.FeatureParser(debug_level=0),
    GenBank.RecordParser(debug_level=0),
)
all_parsers = [feature_parser, record_parser]

print("Testing parsers...")
for parser in all_parsers:
    for filename in files_to_parse:
        if not os.path.isfile(filename):
            print("Missing test input file: %s" % filename)
            continue

        handle = open(filename, 'r')
        iterator = GenBank.Iterator(handle, parser)

        while True:
            with warnings.catch_warnings():
Пример #12
0
from Bio import GenBank
from BioSQL import BioSeqDatabase

# Connect to the BioSQL server used for the load-timing run.
server = BioSeqDatabase.open_database(host="192.168.0.192",
                                      user="******",
                                      passwd="",
                                      db="pythonloadtest")

# remove the database if it already exists
db_name = "testload"
try:
    server[db_name]
    server.remove_database(db_name)
except KeyError:
    pass
db = server.new_database(db_name)

input_file = "/home/hack/install/biopython/Tests/GenBank/cor6_6.gb"
parser = GenBank.FeatureParser()

# The with-block closes the GenBank file once loading has finished or failed.
with open(input_file, "r") as handle:
    iterator = GenBank.Iterator(handle, parser)

    # -- do the timing part
    start_time = time.time()
    num_records = db.load(iterator)
    end_time = time.time()

elapsed_time = end_time - start_time
# A coarse timer can report 0 elapsed seconds; avoid ZeroDivisionError.
if elapsed_time > 0:
    records_per_second = float(num_records) / float(elapsed_time)
else:
    records_per_second = float("inf")
print("Loading")
print("\tDid %s records in %s seconds for\n\t%f records per second" %
      (num_records, elapsed_time, records_per_second))
Пример #13
0
    org = rec.annotations.get('organism', '')
    date = rec.annotations.get('date', '')
    head = '>gi:%s, id:%s, org:%s, date:%s\n' % (gi, rec.id, org, date)
    body = '\n'.join(textwrap.wrap(rec.seq.data, width=80))
    return head, body


if __name__ == '__main__':
    # Command line: <mode> <text> <output_file>
    #   mode         first argument given to GenBank.NCBIDictionary
    #   text         whitespace-separated integer ids, or a search phrase
    #   output_file  file that receives one FASTA entry per record
    if len(sys.argv) < 4:
        sys.exit('Usage: %s <mode> <text> <output_file>' % sys.argv[0])
    mode, text, output_file = sys.argv[1:4]

    print('Searching for %s <br>' % text)

    # Use the input as a list of ids only when every token is an integer;
    # otherwise run a text search limited to 10 ids.
    gi_list = text.split()
    try:
        for token in gi_list:
            int(token)
    except ValueError:
        gi_list = GenBank.search_for(text, max_ids=10)

    # The with-block closes the output file even if a download fails.
    with open(output_file, 'wt') as fp:
        record_parser = GenBank.FeatureParser()
        ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank',
                                           parser=record_parser)
        for gid in gi_list:
            res = ncbi_dict[gid]
            head, body = make_fasta(res)
            fp.write(head + body + '\n')
            print(head)
Пример #14
0
    def run(self):
        """Search for ``self.query_string`` and store the hits.

        A query starting with ``GI:`` or ``gi:`` has that prefix stripped
        (``self.query_string`` is updated) and is searched as-is.  Any
        other query is first searched with a "mycobacterium phage" prefix
        and an author filter.  While a search comes back empty,
        progressively looser queries are tried.

        When ``self.allowRefSeqs`` is false every query carries
        ``NOT srcdb_refseq[prop]``; if nothing is found then,
        ``self.result`` is set to ``None``.

        The list returned by the last ``self.search`` call is always stored
        in ``self.results``.
        """
        author_filter = " AND Hatfull GF[AUTH]"
        no_refseq_filter = " NOT srcdb_refseq[prop]"

        # Strip the prefix once instead of in both branches.
        is_gi_query = self.query_string.startswith(('GI:', 'gi:'))
        if is_gi_query:
            self.query_string = self.query_string[3:]

        if not self.allowRefSeqs:
            print('NOT ALLOWING REFSEQS')
            if is_gi_query:
                gi_list = self.search(self.query_string)
            else:
                q = ("mycobacterium phage " + self.query_string +
                     author_filter + no_refseq_filter)
                print("search query: %s" % (q,))
                gi_list = self.search(q)
                print('gi_list: %s' % (gi_list,))
            if not gi_list:
                print('Got no results.  Changing search criteria')
                q = self.query_string + author_filter + no_refseq_filter
                print("search query: %s" % (q,))
                gi_list = self.search(q)
            if not gi_list:
                print('Got no results.  Changing search criteria')
                q = self.query_string + no_refseq_filter
                print("search query: %s" % (q,))
                gi_list = self.search(q)
            if gi_list:
                print('found GenBank Direct Submission(s)')
                print(gi_list)
            else:
                print('found no results other than refSeq(s), which you refused')
                self.result = None
                # Also publish the empty hit list so that self.results is
                # set on every path, not only on the successful ones.
                self.results = gi_list
                return
        else:  # allowing refSeqs
            print('ALLOWING REFSEQS')
            if is_gi_query:
                gi_list = self.search(self.query_string)
            else:
                q = "mycobacterium phage " + self.query_string + author_filter
                print("search query: %s" % (q,))
                gi_list = self.search(q)
            if not gi_list:
                gi_list = self.search(self.query_string + author_filter)
            if not gi_list:
                print('Got no results.  Changing search criteria')
                print('search query: %s' % (self.query_string,))
                gi_list = self.search(self.query_string)

            if not gi_list:
                print('no results found')

        # The interactive selection and record download that used to follow
        # sat behind an unconditional return and could never run, so that
        # dead code is gone.
        self.results = gi_list
Пример #15
0
def loadDB(catalog):
    """Load the non-plasmid records of several GenBank files into BioSQL.

    Args:
        catalog: Iterable of GenBank file paths.

    Prompts for the database user name and password, then loads every
    record whose description does not contain "plasmid" (case-insensitive).
    Each record goes into the sub-database named after its taxonomy entry
    at index 1, or index 2 when index 1 is "Proteobacteria"; missing
    sub-databases are created on demand.  Progress is printed to standard
    output.
    """
    from BioSQL import BioSeqDatabase

    username = raw_input("Please enter user name: ")
    # NOTE(review): raw_input echoes the password on the terminal; consider
    # getpass.getpass if interactive use is the only use.
    password = raw_input("and password: ")
    host = "dbpg-ifi-utv.uio.no"
    db_name = "rnammer"

    server = BioSeqDatabase.open_database(driver="psycopg2",
                                          user=username,
                                          passwd=password,
                                          host=host,
                                          db=db_name)

    gi_rep = 1  # running file number, only used for progress output

    for gbff in catalog:
        print(gi_rep)
        print(gbff)

        parser = GenBank.FeatureParser()
        # The with-block closes each GenBank file after its records are
        # loaded; the files used to be left open.
        with open(gbff) as gb_handle:
            for x in GenBank.Iterator(gb_handle, parser):
                if re.search("plasmid", x.description, re.IGNORECASE):
                    continue
                print("Record name:")
                print(x.id)

                taxonomy = x.annotations["taxonomy"]
                if "Proteobacteria" == taxonomy[1]:
                    print(taxonomy[1])
                    print(taxonomy[2])
                    biodb_name = taxonomy[2]
                else:
                    print(taxonomy[1])
                    biodb_name = taxonomy[1]

                # Create the sub-database on first use.  A single attempt
                # replaces the old retry loop, which could spin forever if
                # the lookup kept failing after creation.
                try:
                    db = server[biodb_name]
                except KeyError:
                    db = server.new_database(biodb_name)
                    server.commit()
                db.load([x])

        gi_rep = gi_rep + 1

    server.adaptor.commit()