Пример #1
0
 def test_get_genbank_summary(self):
     """get_genbank_summary(self.gb1) yields the expected GreengenesRecord.

     The expected record is built from a literal field mapping and compared
     for equality against the summary produced from the ``self.gb1``
     fixture (presumably a parsed GenBank record prepared outside this
     method -- confirm against the test case's setUp).
     """
     # Long values are split with implicit string concatenation purely for
     # readability; the resulting strings are unchanged.
     expected_fields = {
         'ncbi_acc_w_ver': 'AGIY01000001.1',
         'ncbi_gi': '354825968',
         'gold_id': 'Gi05850',
         'decision': 'named_isolate',
         'isolation_source': 'anaerobic digested sludge',
         'organism': 'Methanolinea tarda NOBI-1',
         'strain': 'NOBI-1',
         'prokmsaname': 'Methanolinea tarda NOBI-1',
         'specific_host': 'Methanolinea tarda NOBI-1 ctg73, '
                          'whole genome shotgun sequence.',
         'authors': 'Lucas,S., Han,J., Lapidus,A., Cheng,J.-F., Goodwin,L., '
                    'Pitluck,S., Peters,L., Land,M.L., Hauser,L., Imachi,H., '
                    'Sekiguchi,Y., Kamagata,Y., Cadillo-Quiroz,H., Zinder,S., '
                    'Liu,W.T., Tamaki,H. and Woyke,T.J.',
         'title': 'The draft genome of Methanolinea tarda NOBI-1',
         'submit_date': '31-OCT-2011',
         'country': 'Japan: Nagaoka',
         # NOTE(review): 'NCBI_tax_id' ('882090') is deliberately absent from
         # the expected fields here -- confirm whether it should be checked.
         'ncbi_tax_string': 'Archaea; Euryarchaeota; Methanomicrobia; '
                            'Methanomicrobiales; Methanoregulaceae; '
                            'Methanolinea',
     }
     expected = GreengenesRecord(expected_fields)

     observed = get_genbank_summary(self.gb1)

     self.assertEqual(observed, expected)
Пример #2
0
            # verify the sequence is DNA. NCBI silently corrupts records 
            # every once in a while, leading to crap in the sequence.
            # Thanks NCBI.
            seq_chars = set(sequence)
            if not seq_chars.issubset(alpha):
                logline = log_f("Corrupt sequence, accession: %s" % (accession))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                failure_count += 1
                continue

            # gg_record contains gb summary data  
            try:
                gg_record = get_genbank_summary(next_record)
            except KeyError, e:
                failure_count += 1
                continue

            seen.add(accession)
            write_sequence(sequences, accession, sequence)
            write_gg_record(gg_records, gg_record)
            write_obs_record(obs_records, accession)
            
        if failure_count >= max_failures:
            logline = log_f("MAX FAILURES OF %d REACHED IN %s" % (max_failures, \
                                                                  gb_fp))
            logger.write(logline)
            stderr.write(logline)
        else: