def test_get_sequence(self):
    """get_sequence on self.gb1 yields the expected start, end and length"""
    expected_head = "CCATGATTCGACCATTTTCAGAGAG"
    expected_tail = "CAACGGTCAGGCCAG"
    expected_length = 452612

    sequence = get_sequence(self.gb1)

    # Spot-check both ends and the total length instead of comparing the
    # whole sequence. Every observed value is computed up front, before any
    # assertion runs.
    checks = (
        (sequence[:25], expected_head),
        (sequence[-15:], expected_tail),
        (len(sequence), expected_length),
    )
    for observed, expected in checks:
        self.assertEqual(observed, expected)
# accession is str including version try: accession = get_accession(next_record) except: failure_count += 1 continue if accession in observed_records: continue if accession in seen: continue # accession added at the end of this while loop # sequence is just a str of sequence try: sequence = get_sequence(next_record) except NoSequenceError: # this isn't a failure, so no point in continuing but record # the accession so it isn't hit again write_obs_record(obs_records, accession) continue except: failure_count += 1 continue # verify the sequence is DNA. NCBI silently corrupts records # every once and a while leading to crap in the sequence. # Thanks NCBI. seq_chars = set(sequence) if not seq_chars.issubset(alpha): logline = log_f("Corrupt sequence, accession: %s" % (accession))