Пример #1
0
    def test_get_sequence(self):
        """get_sequence returns the expected sequence for the gb1 fixture.

        Instead of comparing the whole sequence verbatim, the test pins
        its first 25 characters, its last 15 characters and its total
        length.
        """
        expected_head = "CCATGATTCGACCATTTTCAGAGAG"
        expected_tail = "CAACGGTCAGGCCAG"
        expected_length = 452612

        sequence = get_sequence(self.gb1)

        # Pair every observed value with its expectation; the order
        # (head, tail, length) decides which mismatch is reported first.
        checks = (
            (sequence[:25], expected_head),
            (sequence[-15:], expected_tail),
            (len(sequence), expected_length),
        )
        for observed, expected in checks:
            self.assertEqual(observed, expected)
Пример #2
0
            # accession is str including version
            try:
                accession = get_accession(next_record)
            except:
                failure_count += 1
                continue
            if accession in observed_records:
                continue
            if accession in seen:
                continue
                # accession added at the end of this while loop

            # sequence is just a str of sequence
            try:
                sequence = get_sequence(next_record)
            except NoSequenceError:
                # this isn't a failure, so no point in continuing but record
                # the accession so it isn't hit again
                write_obs_record(obs_records, accession)
                continue
            except:
                failure_count += 1
                continue

            # verify the sequence is DNA. NCBI silently corrupts records
            # every once in a while, leading to crap in the sequence.
            # Thanks NCBI.
            seq_chars = set(sequence)
            if not seq_chars.issubset(alpha):
                logline = log_f("Corrupt sequence, accession: %s" % (accession))