def write_format(file):
    """Re-write every record of one GenBank file and compare with the input.

    The file is opened twice and read in lock-step: one ``GenBank.Iterator``
    uses a ``RecordParser`` (its records are turned back into text with
    ``str()``), the other is built without a parser.  Each pair is passed to
    ``do_comparison``; an ``AssertionError`` raised there is printed together
    with the record version instead of being propagated.

    Args:
        file: Path of the GenBank file.  A path containing ``.gz`` is opened
            through ``gzip``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))

    # be able to handle gzipped files
    opener = gzip.open if '.gz' in file else open

    # The with-statement guarantees both handles are closed even when the
    # parser (or anything other than the caught AssertionError) raises.
    with opener(file, "r") as cur_handle, opener(file, "r") as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            # Built-in next() works with both the Python 2 ``.next`` method
            # and the Python 3 ``__next__`` method.
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # A None from either iterator ends the loop.
            if cur_record is None or compare_record is None:
                break

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                # Report the mismatch but carry on with the next record.
                print("\tTesting for %s" % cur_record.version)
                print(msg)
def t_write_format():
    """Check GenBank round-tripping for every file in ``write_format_files``.

    Each file under the ``GenBank`` directory is opened twice and read in
    lock-step: once through a ``RecordParser`` and once through an iterator
    built without a parser.  ``do_comparison`` receives the parser-less
    record and the ``str()`` of the parsed one; whatever it raises
    propagates to the caller.
    """
    record_parser = GenBank.RecordParser(debug_level=0)

    for filename in write_format_files:
        print("Testing GenBank writing for %s..." % os.path.basename(filename))
        path = os.path.join("GenBank", filename)

        # Context managers close both handles even when do_comparison raises.
        with open(path, "r") as cur_handle, open(path, "r") as compare_handle:
            iterator = GenBank.Iterator(cur_handle, record_parser)
            compare_iterator = GenBank.Iterator(compare_handle)

            while True:
                # next() is portable across the Python 2 and 3 iterator
                # protocols, unlike calling the ``.next`` method directly.
                cur_record = next(iterator)
                compare_record = next(compare_iterator)

                # A None from either iterator ends the loop.
                if cur_record is None or compare_record is None:
                    break

                print("\tTesting for %s" % cur_record.version)

                output_record = str(cur_record) + "\n"
                do_comparison(compare_record, output_record)
def t_write_format():
    """Test writing records back out in GenBank format.

    For every name in ``write_format_files`` the file in the ``GenBank``
    directory is read twice in parallel, once with a ``RecordParser`` and
    once with a parser-less ``GenBank.Iterator``.  The ``str()`` of each
    parsed record is handed to ``do_comparison`` together with the matching
    parser-less record; any exception it raises is propagated.
    """
    record_parser = GenBank.RecordParser(debug_level=0)

    for next_file in write_format_files:
        print("Testing GenBank writing for %s..." % os.path.basename(next_file))
        path = os.path.join("GenBank", next_file)

        # Opening through ``with`` means a failing comparison no longer
        # leaves the two handles open.
        with open(path, "r") as cur_handle, open(path, "r") as compare_handle:
            iterator = GenBank.Iterator(cur_handle, record_parser)
            compare_iterator = GenBank.Iterator(compare_handle)

            while True:
                cur_rec = next(iterator)
                compare_record = next(compare_iterator)

                # A None from either iterator ends the loop.
                if cur_rec is None or compare_record is None:
                    break

                print("\tTesting for %s" % cur_rec.version)

                output_record = str(cur_rec) + "\n"
                do_comparison(compare_record, output_record)
def write_format(file):
    """Write a GenBank record from a GenBank file and compare them.

    The file is read twice in lock-step, once through a ``RecordParser`` and
    once through a ``GenBank.Iterator`` built without a parser.  The
    ``str()`` of every parsed record is passed to ``do_comparison`` together
    with its parser-less counterpart.  A failing comparison
    (``AssertionError``) is printed with the record version and does not stop
    the run.

    Args:
        file: Path of the GenBank file; a path containing ``.gz`` is read
            through ``gzip``.
    """
    record_parser = GenBank.RecordParser(debug_level=2)

    print("Testing GenBank writing for %s..." % os.path.basename(file))

    def _open_input():
        """Open ``file`` for reading, going through gzip when needed."""
        # be able to handle gzipped files
        if ".gz" in file:
            # NOTE(review): "rb" yields bytes on Python 3 - confirm the
            # iterator copes with that, or switch to text mode ("rt").
            return gzip.open(file, "rb")
        return open(file)

    # Both handles are released on every exit path, including exceptions
    # other than the AssertionError handled below.
    with _open_input() as cur_handle, _open_input() as compare_handle:
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)

        while True:
            cur_record = next(iterator)
            compare_record = next(compare_iterator)

            # A None from either iterator ends the loop.
            if cur_record is None or compare_record is None:
                break

            output_record = str(cur_record) + "\n"
            try:
                do_comparison(compare_record, output_record)
            except AssertionError as msg:
                print("\tTesting for %s" % cur_record.version)
                print(msg)
def __processGffFilesNotNew(self, changed):
    """Reload organisms whose GFF file changed into GBrowse and Chado.

    For each path in ``changed`` the GFF is run through ``GFFRewriter``
    (unknown CV terms and colours are added), loaded into a SQLite database
    that sits next to the GFF, the GBrowse entry is edited, and finally the
    GFF is bulk-loaded into Chado.

    Args:
        changed: Iterable of paths to the changed ``.gff`` files.  A
            ``.gbk`` file with the same stem is expected beside each one.
    """
    for gff in changed:
        loc = os.path.dirname(gff)
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(loc, dbName)

        # NOTE(review): ``genbank_id`` is not assigned anywhere in this
        # method; confirm it exists at module level, or derive the accession
        # from the GenBank record parsed further down.
        gffRewriter = GFFRewriter(filename=gff,
                                  outfile=gff + ".sorted.prepared",
                                  accession=genbank_id)

        default_db = settings.DATABASES['default']
        gffRewriter.addUnknownCvTerms({
            'user': default_db['USER'],
            'password': default_db['PASSWORD'],
            'db': default_db['NAME'],
        })
        gffRewriter.addColor({
            'user': default_db['USER'],
            'password': default_db['PASSWORD'],
            'db': 'go',
        })
        # NOTE(review): the rewriter's error is fetched but was never
        # inspected by the original code; decide whether it should be
        # logged or raised.
        gffRewriter.getError()

        # run the sqlite database loader to be able to add it to GBrowse
        # since the name should be preserved, no changes need to be made
        # to the GBrowse configuration file
        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)

        # The GenBank file shares the GFF's stem.  Append the extension;
        # os.path.join would yield "<stem>/.gbk", which is a different path.
        gbk = os.path.splitext(gff)[0] + '.gbk'
        parser = GenBank.RecordParser()
        with open(gbk) as gbkHandle:
            record = parser.parse(gbkHandle)

        organismName = record.organism
        organismDir = os.path.basename(loc)
        GenomeDBUtil.editGBrowseEntry(gff, dbName, organismDir, organismName)

        # now edit the record in Chado
        args = [
            '--organism', organismName,
            "--gfffile", gff,
            "--dbname", default_db['NAME'],
            "--dbuser", default_db['USER'],
            "--dbpass", default_db['PASSWORD'],
            "--random_tmp_dir",
        ]
        runProgram('gmod_bulk_load_gff3.pl', args)
def genbank_single(filename):
    """Parse the GenBank file *filename* and return the resulting record.

    >>> record = genbank_single("GFF/NC_001422.gbk")
    >>> record.taxonomy
    ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
    >>> cds = record.features[-4]
    >>> cds.key
    'CDS'
    >>> location = LocationFromString(cds.location)
    >>> print location
    2931..3917
    >>> subseq = record_subseq(record, location)
    >>> subseq[0:20]
    Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
    """
    # Close the handle as soon as parsing is done instead of leaving it to
    # the garbage collector.
    with open(filename) as handle:
        return GenBank.RecordParser().parse(handle)
files_to_parse = [] for file in test_files: files_to_parse.append(os.path.join(gb_file_dir, file)) # parse the bioperl test files # comment this out for now -- there are a bunch of junky records in here # that no longer exist in GenBank -- do we really need to support those? # files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')] # parse the biojava test files # files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')] # test the parsers feature_parser = GenBank.FeatureParser(debug_level=0) record_parser = GenBank.RecordParser(debug_level=0) all_parsers = [feature_parser, record_parser] print("Testing parsers...") for parser in all_parsers: for filename in files_to_parse: if not os.path.isfile(filename): print("Missing test input file: %s" % filename) continue handle = open(filename, 'r') iterator = GenBank.Iterator(handle, parser) while True: with warnings.catch_warnings(): warnings.simplefilter("ignore", BiopythonParserWarning)
def ntgenbank():
    """Parse the GenBank entries from ``readfile()`` into a CSV file.

    Every entry except the first (index 0 is empty) is written to
    ``genbank.txt``, parsed with ``GenBank.RecordParser`` and turned into one
    row of ``result_ntgenbank.csv``.  Fields are joined with plain commas and
    are not escaped.  Entries without a CDS feature produce no row.

    Missing source qualifiers, a missing translation or a missing protein id
    give empty fields instead of aborting the run.
    """
    # retrieving all GenBank entries as a list by calling another function
    nuc_genbank = readfile()
    print(len(nuc_genbank))
    print(nuc_genbank[0])
    print("\nParsing started")

    header = [
        'Name', 'NM', 'NM_version', 'Symbol', 'CDS_start', 'CDS_stop',
        'HGNC', 'MIM', 'EC_number', 'GeneID', 'NP', 'NP_version',
        'gene_synonym', 'AA_seq', 'AA_number', 'Chromosome',
        'Chromosome_map', 'NT_seq', 'Organism',
    ]

    def _qualifier_value(feature, index):
        """Return qualifier *index* of *feature* stripped of newlines, or ''."""
        try:
            return feature.qualifiers[index].value.strip('\n')
        except (IndexError, AttributeError):
            return ''

    # The with-statement closes the output file even if a record fails.
    with open('result_ntgenbank.csv', 'w') as output:
        # writing headings of the output file
        output.write(','.join(header) + '\n')

        # going through all the entries in the list; index 0 is empty
        for n in range(1, len(nuc_genbank)):
            print(n)

            # Put the LOCUS keyword back in front of the entry.
            # NOTE(review): presumably readfile() splits on it - confirm.
            test = 'LOCUS ' + nuc_genbank[n].lstrip('\n')

            # The parser is fed from a scratch file holding only this entry.
            with open('genbank.txt', 'w') as query:
                query.write(test)
            parser = GenBank.RecordParser()
            with open('genbank.txt') as handle:
                record = parser.parse(handle)

            nt_seq = record.sequence.strip('\n')  # nucleotide sequence
            nm_and_version = record.version.strip('\n')
            nm = nm_and_version.split('.')[0].strip('\n')
            nm_version = nm_and_version.split('.')[1].strip('\n')

            source = record.features[0]  # first feature, used as the source
            # Organism needs qualifiers 0 and 2; if either is missing the
            # whole field is left empty.
            try:
                organism = (source.qualifiers[0].value.strip('\n') + ':' +
                            source.qualifiers[2].value.strip('\n'))
            except (IndexError, AttributeError):
                organism = ''
            chrm = _qualifier_value(source, 3)      # chromosome number
            chrm_map = _qualifier_value(source, 4)  # chromosome map

            gene = record.features[1]  # second feature, used as the gene
            symbol = gene.qualifiers[0].value.strip('\n')  # symbol or gene

            # First feature whose key contains 'CDS', if any.
            cds = next((feature for feature in record.features
                        if 'CDS' in feature.key), None)
            if cds is None:
                # Without a CDS there is nothing to fill the row with.
                continue

            cds_start_stop = cds.location.strip('\n')
            cds_start = cds_start_stop.split('..')[0].strip('\n')
            cds_stop = cds_start_stop.split('..')[1].strip('\n')

            # Fields looked for in the CDS qualifiers; updated when present.
            cds_dict = {
                "HGNC": '',
                "MIM:": '',
                "EC_number": '',
                "GeneID": '',
                "product": '',
                "protein_id": '',
                "translation": '',
                "num_aa": '',
                "gene_synonym": ''
            }
            for qualifier in cds.qualifiers:
                # A qualifier fills the first field whose name occurs in its
                # key or in its value.
                for key in cds_dict:
                    if key in qualifier.key or key in qualifier.value:
                        cds_dict[key] = str(qualifier.value)
                        break

            hgnc = cds_dict["HGNC"]
            mim = cds_dict["MIM:"]
            geneid = cds_dict["GeneID"]
            name = cds_dict["product"]
            synonym = cds_dict["gene_synonym"]
            translation = cds_dict["translation"]

            # Split "<accession>.<version>".  The quote added to each half
            # re-balances the surrounding quotes.
            # NOTE(review): assumes qualifier values arrive wrapped in
            # double quotes - confirm.
            protein_id = cds_dict["protein_id"]
            if '.' in protein_id:
                id_parts = protein_id.split('.')
                np_id = id_parts[0] + '"'
                np_version = '"' + id_parts[1]
            else:
                # No version suffix (or no protein id at all).
                np_id = protein_id
                np_version = ''

            # Always bound, so a record without a translation can neither
            # crash nor inherit the previous record's length.
            num_aa = len(translation) if translation else ''

            if hgnc:
                hgnc = '"' + hgnc.split(':')[2]
            if mim:
                mim = '"' + mim.split(':')[1]
            if geneid:
                geneid = '"' + geneid.split(':')[1]

            row = [
                name, nm, nm_version, symbol, cds_start, cds_stop, hgnc,
                mim, cds_dict["EC_number"], geneid, np_id, np_version,
                synonym, translation, str(num_aa), str(chrm), chrm_map,
                nt_seq, organism,
            ]
            output.write(','.join(row) + '\n')

        print("Parsing completed")
def __processGffFilesNew(self, newOrganismDirs):
    """Set up BLAST, GBrowse and Chado data for newly added organisms.

    For every directory name in ``newOrganismDirs`` (resolved under
    ``NEW_GENOMIC_DATA_DIR``) the top-level files are scanned for ``.faa``,
    ``.ffn``, ``.gff`` and ``.gbk`` files.  ``formatdb`` is run on the FASTA
    files that exist.  When both a GFF and a GenBank file are present, the
    GFF is rewritten, loaded into a SQLite database for GBrowse, the
    organism is added to Chado and a new GBrowse entry is created.

    Args:
        newOrganismDirs: Iterable of directory names, relative to
            ``NEW_GENOMIC_DATA_DIR``.
    """
    for newOrganism in newOrganismDirs:
        # start by creating the BLAST database
        newOrganism = os.path.join(NEW_GENOMIC_DATA_DIR, newOrganism)
        print(newOrganism)

        # First os.walk() entry is (dirpath, dirnames, filenames) of the
        # top directory; built-in next() works on Python 2 and 3 alike.
        organismFiles = next(os.walk(newOrganism))[2]

        faa = None
        ffn = None
        gff = None
        gbk = None
        for organismFile in organismFiles:
            extension = os.path.splitext(organismFile)[1]
            if extension == '.ffn':
                ffn = organismFile
            elif extension == '.faa':
                faa = organismFile
            elif extension == '.gff':
                gff = organismFile
            elif extension == '.gbk':
                gbk = organismFile
            # Stop scanning once one file of each kind has been seen.
            if faa and ffn and gff and gbk:
                break

        if faa:
            GenomeDBUtil.runFormatDB(os.path.basename(faa), newOrganism,
                                     protein=True)
            self.report.addLogEntry('Ran formatdb successully on ' + faa)
        if ffn:
            GenomeDBUtil.runFormatDB(os.path.basename(ffn), newOrganism,
                                     protein=False)
            self.report.addLogEntry('Ran formatdb successully on ' + ffn)

        # process the gff and genbank files for creating the databases
        if not (gff and gbk):
            continue

        # create the sqlite database for GBrowse and create the
        # configuration file for GBrowse hook up
        dbName = os.path.splitext(os.path.basename(gff))[0] + '.db'
        dbName = os.path.join(newOrganism, dbName)
        gff = os.path.join(newOrganism, gff)

        parser = GenBank.RecordParser()
        gbk = os.path.join(newOrganism, gbk)
        # Close the GenBank handle as soon as the record has been parsed.
        with open(gbk) as gbkHandle:
            record = parser.parse(gbkHandle)

        organismName = record.organism
        accession = record.accession[0]
        self.report.addLogEntry('Found organism name ' + organismName)

        # create a brand new GBrowse configuration file
        examiner = GFFExaminer()
        with open(gff) as gffHandle:
            limits = examiner.available_limits(gffHandle)
        # First element of the first 'gff_id' key; list() keeps the indexing
        # valid where dict.keys() returns a view (Python 3).
        landmark = list(limits['gff_id'])[0][0]

        gffRewriter = GFFRewriter(filename=gff,
                                  outfile=gff + ".sorted.prepared",
                                  accession=accession)

        # Adding unknown CV terms is deliberately disabled here:
        # gffRewriter.addUnknownCvTerms({
        #     'user': settings.DATABASES['default']['USER'],
        #     'password': settings.DATABASES['default']['PASSWORD'],
        #     'db': settings.DATABASES['default']['NAME']
        # })

        gffRewriter.addColor({
            'user': settings.DATABASES['default']['USER'],
            'password': settings.DATABASES['default']['PASSWORD'],
            'db': 'MyGO',
        })

        error = gffRewriter.getError()
        print(error)

        # From here on work with the rewritten file.
        gff = gff + ".sorted.prepared"

        args = ['-a', 'DBI::SQLite', '-c', '-f', '-d', dbName, gff]
        runProgram('bp_seqfeature_load.pl', args)
        self.report.addLogEntry(
            'Successfully created sqlite database for ' + str(gff))

        organismDir = os.path.basename(newOrganism)
        self.report.addLogEntry('Added new GBrowse entry for ' + organismName)

        # now edit the record in Chado by first adding the organism and then
        # bulk loading the information from gff3
        organismId = GenomeDBUtil.addOrganismToChado(gff, organismName)
        GenomeDBUtil.createNewGBrowseEntry(landmark, dbName, organismDir,
                                           organismName, organismId)
errorfile = open(datapath+'/Parser_errr.out', 'w') genomeid = [] ############################################################################### # Parsing genbank files ############################################################################### print("\nParsing genbank files...") start = time.time() for file in glob.glob('*.gbk'): print("Parsing file: ",gbkpath+"/"+file) try: w = re.findall(r"[\w']+",file) parser = GenBank.RecordParser() record = parser.parse(open(gbkpath+"/"+file)) genomefile = open(genomepath+"/"+record.locus+".fasta", "w") genomefile.write(">" + record.locus + "\n") genomefile.write(record.sequence) genomefile.close() definition = record.definition.split(',') definition = definition[0] trest = re.sub('[^A-Za-z0-9]+', '_', str(definition)) organismlist = record.organism.split(" ") genuslist.append(organismlist[0]) specieslist.append(organismlist[1]) statsFile = open(statisticspath+'/'+record.locus + '.stats','w')