def test_get_genbank_summary(self):
    """Check get_genbank_summary against a hand-built GreengenesRecord.

    The summary produced for ``self.gb1`` must compare equal to the
    expected record assembled below.
    """
    # NOTE(review): self.gb1 is presumably a GenBank record prepared by the
    # test fixture, which is not visible from here -- confirm.
    expected_fields = {
        'ncbi_acc_w_ver': 'AGIY01000001.1',
        'ncbi_gi': '354825968',
        'gold_id': 'Gi05850',
        'decision': 'named_isolate',
        'isolation_source': 'anaerobic digested sludge',
        'organism': 'Methanolinea tarda NOBI-1',
        'strain': 'NOBI-1',
        'prokmsaname': 'Methanolinea tarda NOBI-1',
        'specific_host': (
            'Methanolinea tarda NOBI-1 ctg73, '
            'whole genome shotgun sequence.'
        ),
        # Adjacent literals are concatenated into one author string.
        'authors': (
            'Lucas,S., Han,J., Lapidus,A., Cheng,J.-F., Goodwin,L., '
            'Pitluck,S., Peters,L., Land,M.L., Hauser,L., Imachi,H., '
            'Sekiguchi,Y., Kamagata,Y., Cadillo-Quiroz,H., Zinder,S., '
            'Liu,W.T., Tamaki,H. and Woyke,T.J.'
        ),
        'title': 'The draft genome of Methanolinea tarda NOBI-1',
        'submit_date': '31-OCT-2011',
        'country': 'Japan: Nagaoka',
        # Disabled entry; this key is not part of the expected record:
        # 'NCBI_tax_id': '882090',
        'ncbi_tax_string': (
            'Archaea; Euryarchaeota; Methanomicrobia; '
            'Methanomicrobiales; Methanoregulaceae; Methanolinea'
        ),
    }
    expected = GreengenesRecord(expected_fields)

    observed = get_genbank_summary(self.gb1)

    self.assertEqual(observed, expected)
# verify the sequence is DNA. NCBI silently corrupts records # every once and a while leading to crap in the sequence. # Thanks NCBI. seq_chars = set(sequence) if not seq_chars.issubset(alpha): logline = log_f("Corrupt sequence, accession: %s" % (accession)) logger.write(logline) if verbose: stdout.write(logline) failure_count += 1 continue # gg_record contains gb summary data try: gg_record = get_genbank_summary(next_record) except KeyError, e: failure_count += 1 continue seen.add(accession) write_sequence(sequences, accession, sequence) write_gg_record(gg_records, gg_record) write_obs_record(obs_records, accession) if failure_count >= max_failures: logline = log_f("MAX FAILURES OF %d REACHED IN %s" % (max_failures, \ gb_fp)) logger.write(logline) stderr.write(logline) else: