Exemplo n.º 1
0
    def test_file_to_dict(self):
        '''file_to_dict() should fill the given dictionary with the records of the test file'''
        fasta_path = os.path.join(data_dir, 'sequences_test.fa')
        loaded = {}
        tasks.file_to_dict(fasta_path, loaded)

        # The test file is expected to hold records named '1'..'4', all 'ACGTA'
        names = [str(number) for number in range(1, 5)]
        wanted = {name: sequences.Fasta(name, 'ACGTA') for name in names}

        self.assertSequenceEqual(loaded.keys(), wanted.keys())
        for name in names:
            got, want = loaded[name], wanted[name]
            self.assertEqual(got.id, want.id)
            self.assertEqual(got.seq, want.seq)
Exemplo n.º 2
0
Arquivo: tasks.py Projeto: nds/Fastaq
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Split an interleaved sequence file into two files.

    Records are read from infile in pairs: the first of each pair is written
    to outfile_1 and the second to outfile_2.  With fasta_out=True every
    record is written as FASTA (built from its id and seq); otherwise the
    record is printed as it was read.

    Raises Error if the input ends after the first record of a pair.
    Both output files are closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)

    try:
        for seq in seq_reader:
            if fasta_out:
                print(sequences.Fasta(seq.id, seq.seq), file=f_1)
            else:
                print(seq, file=f_1)

            try:
                # Keep the record returned by next() so that the mate is
                # written correctly whether or not the reader reuses (and
                # mutates) the object it handed out for the first record.
                mate = next(seq_reader)
            except StopIteration:
                raise Error('Error getting mate for sequence. Cannot continue')

            if fasta_out:
                print(sequences.Fasta(mate.id, mate.seq), file=f_2)
            else:
                print(mate, file=f_2)
    finally:
        utils.close(f_1)
        utils.close(f_2)
Exemplo n.º 3
0
    def test_get_next_from_embl_file(self):
        '''Embl.get_next_from_file() should return the records of the EMBL test file in order'''
        embl_path = os.path.join(data_dir, 'sequences_test.embl')
        handle = utils.open_file_read(embl_path)
        record = sequences.Embl()
        index = 0

        # The same Embl object is refilled on each call; a false return ends the loop
        while record.get_next_from_file(handle):
            wanted = sequences.Fasta('seq' + str(index + 1), expected_embl[index])
            self.assertEqual(record, wanted)
            index += 1

        utils.close(handle)
Exemplo n.º 4
0
    def test_trim_Ns(self):
        '''trim_Ns() should reduce every test sequence to the same trimmed sequence'''
        trimmed = sequences.Fasta('ID', 'ANNANA')
        untrimmed = [
            'ANNANA',
            'NANNANA',
            'NANNANAN',
            'ANNANAN',
            'NNNNNNANNANAN',
            'NNANNANANn',
        ]

        for bases in untrimmed:
            candidate = sequences.Fasta('ID', bases)
            candidate.trim_Ns()
            self.assertEqual(trimmed, candidate)
Exemplo n.º 5
0
    def test_file_reader_phylip(self):
        '''file_reader() should read the phylip test files and yield the expected records'''
        def check_reader(filename, expected):
            # Records are compared one at a time as the reader produces them,
            # rather than being collected into a list first.
            seen = 0
            for seen, seq in enumerate(sequences.file_reader(filename), start=1):
                self.assertEqual(expected[seen - 1], seq)
            # Without this check a reader that yields too few records (or
            # none at all) would let the test pass unnoticed.
            self.assertEqual(len(expected), seen)

        test_files = [
            'sequences_test_phylip.interleaved',
            'sequences_test_phylip.interleaved2',
            'sequences_test_phylip.sequential'
        ]

        expected_seqs = [
            sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
            sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
            sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA')
        ]

        for fname in test_files:
            check_reader(os.path.join(data_dir, fname), expected_seqs)

        # files made by seaview are a little different in the first line.
        # Test one of these
        expected_seqs = [
            sequences.Fasta('seq1', 96 * 'G' + 'T'),
            sequences.Fasta('seq2', 94 * 'A' + 'G')
        ]

        check_reader(os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview'), expected_seqs)
Exemplo n.º 6
0
    def test_is_complete_orf(self):
        '''is_complete_orf() should return the expected answer for each test sequence'''
        cases = [
            ('TTT', False),
            ('TTTTAA', True),
            ('TTTTAATAA', False),
            ('TTGTAA', True),
            ('TTTAAC', True),
            ('TGA', False),
            ('TGAA', False),
        ]

        for bases, expected in cases:
            record = sequences.Fasta('ID', bases)
            self.assertEqual(record.is_complete_orf(), expected)
Exemplo n.º 7
0
    def get_next_from_file(self, f):
        '''Read the next CAF record from the open file handle f into this object.

        The object is reset by calling __init__() first.  The record is
        expected to consist of a "DNA : <name>" line, sequence lines ended
        by a blank line, a "BaseQuality : ..." line, one line of
        whitespace-separated integer qualities, a blank line, and then
        optional tag lines up to the next blank line or end of file.

        Returns True when a record was read, or None when there is nothing
        left to read (end of file, or only blank lines remaining).
        Raises Error if the record does not have the expected layout.
        '''
        self.__init__()
        line = f.readline()
        if not line:
            return None
        while line == '\n':
            line = f.readline()

        if not line:
            # Only blank lines were left before end of file: treat this as
            # end of input rather than as a malformed record.
            return None

        if not line.startswith('DNA : '):
            raise  Error("Error reading caf file. Expected line starting with 'DNA : ...'")

        self.id = line.rstrip().split()[2]

        line = f.readline()
        seq = []

        while line != '\n':
            if not line:
                # readline() returns '' forever at end of file, so without
                # this check a truncated file would loop without end.
                raise Error("Error reading caf file. Unexpected end of file while reading sequence")
            seq.append(line.rstrip())
            line = f.readline()

        self.seq = sequences.Fasta(self.id, ''.join(seq))

        line = f.readline()
        if not line.startswith('BaseQuality : '):
            raise  Error("Error reading caf file. Expected line starting with 'BaseQuality : ...'")

        quals = [int(x) for x in f.readline().rstrip().split()]
        self.seq = self.seq.to_Fastq(quals)

        line = f.readline()
        # Raised explicitly rather than asserted, so that the check is not
        # stripped when Python runs with -O.
        if line != '\n':
            raise Error("Error reading caf file. Expected blank line after base qualities")
        line = f.readline()

        while line not in ['', '\n']:
            a = line.rstrip().split()
            if a[0] == 'Insert_size':
                self.insert_min, self.insert_max = int(a[1]), int(a[2])
            elif a[0] == 'Ligation_no':
                self.ligation = a[1]
            elif a[0] == 'Clone':
                self.clone = a[1]
            elif a[0] == 'Clipping' and a[1] == 'QUAL':
                # Stored values are one less than the values in the file
                self.clip_start, self.clip_end = int(a[2]) - 1, int(a[3]) - 1

            line = f.readline()

        return True
Exemplo n.º 8
0
    def test_get_next_from_gbk_file(self):
        '''Embl.get_next_from_file() should return the records of the GenBank test file in order'''
        gbk_path = os.path.join(data_dir, 'sequences_test.gbk')
        handle = utils.open_file_read(gbk_path)
        record = sequences.Embl()
        wanted_bases = [
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgatc',
            'gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaatgccatgactcagattctaattttaagctattcaatttctctttgaaa'
        ]
        index = 0

        # The same object is refilled on each call; a false return ends the loop
        while record.get_next_from_file(handle):
            wanted = sequences.Fasta('NAME' + str(index + 1), wanted_bases[index])
            self.assertEqual(record, wanted)
            index += 1

        utils.close(handle)
Exemplo n.º 9
0
    def test_looks_like_gene(self):
        '''looks_like_gene() should give the expected answers, including under different genetic codes'''
        tests = [
            (sequences.Fasta('ID', 'TTT'), False),
            (sequences.Fasta('ID', 'TTGTAA'), True),
            (sequences.Fasta('ID', 'ttgTAA'), True),
            (sequences.Fasta('ID', 'TTGTTTTAA'), True),
            (sequences.Fasta('ID', 'TTGTAATTTTAA'), False),
            (sequences.Fasta('ID', 'TTGTTTTGAA'), False),
        ]

        for seq, expected in tests:
            self.assertEqual(seq.looks_like_gene(), expected)

        # sequences.genetic_code is module-level state.  The finally clause
        # guarantees it ends up at 1 (the value this test has always left
        # behind) even when one of the assertions below fails, so a failure
        # here cannot leak genetic code 11 into other tests.
        try:
            sequences.genetic_code = 1
            self.assertFalse(sequences.Fasta('ID', 'ATTCAGTAA').looks_like_gene())
            sequences.genetic_code = 11
            self.assertTrue(sequences.Fasta('ID', 'ATTCAGTAA').looks_like_gene())
        finally:
            sequences.genetic_code = 1
Exemplo n.º 10
0
 def test_gc_content(self):
     """gc_content() should return the expected value both as a fraction and as a percentage"""
     cases = [
         ('cgCG', 1.0),
         ('tTaA', 0.0),
         ('GCAT', 0.5),
         ('GCATNN', 0.5),
         ('GCATNNS', 0.6),
         ('GCATNNSK', 0.5),
     ]
     for bases, fraction in cases:
         record = sequences.Fasta('ID', bases)
         self.assertAlmostEqual(record.gc_content(), fraction)
         # as_decimal=False is expected to give the same value scaled by 100
         self.assertAlmostEqual(record.gc_content(as_decimal=False), fraction * 100)
Exemplo n.º 11
0
def scaffolds_to_contigs(infile, outfile, number_contigs=False):
    '''Makes a file of contigs from scaffolds by splitting at every N.

    Each sequence in infile is cut into the intervals returned by its
    contig_coords() method and each piece is written to outfile as FASTA.

    Use number_contigs=True to add .1, .2, etc onto end of each contig
    name (numbering restarts for every input sequence), instead of the
    default of appending the 1-based start and end coordinates.

    The output file is closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        for seq in seq_reader:
            for counter, contig in enumerate(seq.contig_coords(), start=1):
                if number_contigs:
                    name = seq.id + '.' + str(counter)
                else:
                    # Interval coordinates are used 0-based for slicing; names use 1-based
                    name = '.'.join([seq.id, str(contig.start + 1), str(contig.end + 1)])
                print(sequences.Fasta(name, seq[contig.start:contig.end+1]), file=fout)
    finally:
        utils.close(fout)
Exemplo n.º 12
0
    def test_file_reader_embl(self):
        '''file_reader() should read the EMBL test file and raise Error on the malformed ones'''
        good_path = os.path.join(data_dir, 'sequences_test.embl')

        # Compare each record as it is produced by the reader
        for number, seq in enumerate(sequences.file_reader(good_path), start=1):
            wanted = sequences.Fasta('seq' + str(number), expected_embl[number - 1])
            self.assertEqual(seq, wanted)

        for bad_name in ('sequences_test.embl.bad', 'sequences_test.embl.bad2'):
            bad_path = os.path.join(data_dir, bad_name)
            # Both creating the reader and consuming it are inside the
            # context, so an Error from either step satisfies the test
            with self.assertRaises(sequences.Error):
                for _ in sequences.file_reader(bad_path):
                    pass
Exemplo n.º 13
0
def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
    '''Makes a multi fasta file of random sequences, all the same length.

    Writes `contigs` sequences of `length` bases each to outfile, every base
    drawn with random.choice from 'ACGT' after seeding the random module
    with `seed`.

    Names are prefix + a number (starting at first_number), or, when
    name_by_letters=True, prefix + a letter A-Z.  Letters are reused from A
    again after Z, so names repeat when more than 26 contigs are requested.

    The output file is closed on every exit path.
    '''
    random.seed(a=seed)
    fout = utils.open_file_write(outfile)
    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    try:
        for i in range(contigs):
            if name_by_letters:
                # Wraps back to 'A' after 'Z'
                name = letters[i % len(letters)]
            else:
                name = str(i + first_number)

            # One random.choice call per base, so that output for a given
            # seed stays the same as before.
            bases = ''.join([random.choice('ACGT') for _ in range(length)])
            print(sequences.Fasta(prefix + name, bases), file=fout)
    finally:
        utils.close(fout)
Exemplo n.º 14
0
Arquivo: tasks.py Projeto: nds/Fastaq
def to_fasta(infile,
             outfile,
             line_length=60,
             strip_after_first_whitespace=False):
    '''Write every sequence of infile to outfile in FASTA format.

    line_length is assigned to the class attribute sequences.Fasta.line_length
    while writing, and the previous value is put back afterwards.
    With strip_after_first_whitespace=True, strip_after_first_whitespace()
    is called on each record before it is written.
    Records whose type is exactly sequences.Fastq are converted to Fasta
    (id and seq only); all others are printed as read.

    The output file is closed and Fasta.line_length restored on every exit
    path, including when reading or writing raises.
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if type(seq) == sequences.Fastq:
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # Restore the shared class attribute first so that it is reset even
        # if closing the file fails.
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)
Exemplo n.º 15
0
 def test_extend(self):
     '''extend() should report how much was added at each end and update the contig sequence'''
     ctg = contig.Contig(sequences.Fasta('ID', 'ACCGT'))

     def check(first_arg, added, bases_after):
         # One extend() call followed by a check of the resulting sequence
         self.assertEqual(ctg.extend(first_arg, 2, 100), added)
         self.assertEqual(ctg.fa, sequences.Fasta('ID', bases_after))

     # No kmers added yet
     check(5, (0, 0), 'ACCGT')

     ctg.add_left_kmer('GT')
     check(1, (2, 0), 'GTACCGT')
     # A second call without new kmers changes nothing
     check(1, (0, 0), 'GTACCGT')

     ctg.add_right_kmer('TG')
     check(1, (0, 2), 'GTACCGTTG')
     check(1, (0, 0), 'GTACCGTTG')

     # Both ends at once
     ctg.add_left_kmer('AG')
     ctg.add_right_kmer('GC')
     check(1, (2, 2), 'AGGTACCGTTGGC')
Exemplo n.º 16
0
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains just one sequence, with the original sequences catted together, preserving their order.

    The merged record is named seqname.  If the first record read is
    exactly a sequences.Fastq, the output is a Fastq whose quality string
    is the concatenation of the input quality strings; otherwise the output
    is a Fasta.

    Raises IndexError if infile contains no sequences (the same exception
    type as before, now with an explicit message).
    '''
    seq_reader = sequences.file_reader(infile)
    bases = []
    quals = []
    is_fastq = None

    # Only the sequence/quality strings are kept, not the record objects,
    # so no copies of the records are needed.
    for seq in seq_reader:
        if is_fastq is None:
            # The type of the first record decides the output format
            is_fastq = type(seq) == sequences.Fastq
        bases.append(seq.seq)
        if is_fastq:
            quals.append(seq.qual)

    if is_fastq is None:
        raise IndexError('No sequences found in input file. Nothing to merge')

    if is_fastq:
        merged = sequences.Fastq(seqname, ''.join(bases), ''.join(quals))
    else:
        merged = sequences.Fasta(seqname, ''.join(bases))

    # Drop the per-record pieces before writing
    del bases, quals

    f = utils.open_file_write(outfile)
    try:
        print(merged, file=f)
    finally:
        utils.close(f)
Exemplo n.º 17
0
def to_fasta(infile,
             outfile,
             line_length=60,
             strip_after_first_whitespace=False,
             check_unique=False):
    '''Write every sequence of infile to outfile in FASTA format.

    line_length is assigned to the class attribute sequences.Fasta.line_length
    while writing, and the previous value is put back afterwards.
    With strip_after_first_whitespace=True, strip_after_first_whitespace()
    is called on each record before it is written.
    Records whose type is exactly sequences.Fastq are converted to Fasta
    (id and seq only); all others are printed as read.

    With check_unique=True, sequence names (after any stripping) are
    counted; once the whole file has been written, every name seen more
    than once is reported on stderr and Error is raised.

    The output file is closed and Fasta.line_length restored on every exit
    path, including when reading or writing raises.
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    used_names = {}  # sequence name => number of times seen (only filled if check_unique)

    try:
        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if check_unique:
                used_names[seq.id] = used_names.get(seq.id, 0) + 1

            if type(seq) == sequences.Fastq:
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # Restore the shared class attribute first so that it is reset even
        # if closing the file fails.
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found',
                      count,
                      'times',
                      file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
Exemplo n.º 18
0
    def test_contig_coords(self):
        '''contig_coords() should return the expected list of intervals for each test sequence'''
        # (sequence, [(start, end), ...]) with the coordinates each interval is built from
        cases = [
            ('ACGT', [(0, 3)]),
            ('NACGT', [(1, 4)]),
            ('NNACGT', [(2, 5)]),
            ('ACGTN', [(0, 3)]),
            ('ACGTNN', [(0, 3)]),
            ('NANNCGT', [(1, 1), (4, 6)]),
            ('ACNNNGTNA', [(0, 1), (5, 6), (8, 8)]),
            ('ANNCGTNNAAAAA', [(0, 0), (3, 5), (8, 12)]),
        ]

        for bases, spans in cases:
            wanted = [intervals.Interval(start, end) for start, end in spans]
            found = sequences.Fasta('ID', bases).contig_coords()
            self.assertListEqual(wanted, found)
Exemplo n.º 19
0
    def test_strip_illumina_suffix(self):
        '''strip_illumina_suffix() should remove one trailing /1 or /2 from an ID and nothing else'''
        # (original name, name expected after stripping)
        cases = [
            ('name/1', 'name'),
            ('name/2', 'name'),
            ('name', 'name'),
            ('name/1/2', 'name/1'),
            ('name/2/1', 'name/2'),
            ('name/3', 'name/3'),
        ]

        records = [sequences.Fasta(before, 'A') for before, _ in cases]
        for record in records:
            record.strip_illumina_suffix()

        for record, (_, after) in zip(records, cases):
            self.assertEqual(record.id, after)
	def run(self):
		'''Produce a filtered fasta file.

		Runs inside self.working_directory and always changes back to the
		original directory afterwards, even if a step raises.

		When there are more contigs than ids to skip, the fasta file is
		aligned against itself with nucmer.  Every contig not in
		self.ids_to_skip is discarded if it is shorter than
		self.cutoff_contig_length, or if it has a non-self hit to another
		contig (one not already marked as contained) whose query coverage
		is at least self.percent_match percent.  The remaining contigs are
		written to self.output_file via tasks.filter.  Temporary files are
		removed unless self.debug is set.

		Otherwise all contigs are written to self.output_file unchanged.

		A summary is written with self._write_summary in both cases.
		'''
		original_dir = os.getcwd()
		os.chdir(self.working_directory)
		try:
			small_contigs = set()
			contained_contigs = set()
			if len(self.contigs) > len(self.ids_to_skip):
				alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_nucmer_filename(), min_percent_id=self.percent_match, run_promer=False)
				for contig_id in self.contigs:
					if contig_id in self.ids_to_skip:
						continue

					if len(self.contigs[contig_id]) < self.cutoff_contig_length:
						small_contigs.add(contig_id)
						continue

					for algn in alignments:
						if (not algn.is_self_hit()) \
						   and algn.qry_name == contig_id \
						   and algn.ref_name != algn.qry_name \
						   and algn.ref_name not in contained_contigs \
						   and (algn.hit_length_qry/algn.qry_length) * 100 >= self.percent_match:
							contained_contigs.add(contig_id)

				discard = small_contigs.union(contained_contigs)
				ids_file = utils.write_ids_to_file(discard, "contig.ids.discard")
				# invert=True: keep everything except the listed ids
				tasks.filter(self.fasta_file, self.output_file, ids_file=ids_file, invert=True)

				if not self.debug:
					utils.delete(ids_file)
					utils.delete(self._build_nucmer_filename())
			else:
				output_fw = fastaqutils.open_file_write(self.output_file)
				try:
					for contig_id in self.contigs:
						print(sequences.Fasta(contig_id, self.contigs[contig_id]), file=output_fw)
				finally:
					fastaqutils.close(output_fw)

			self._write_summary(small_contigs, contained_contigs)
		finally:
			# Never leave the process in the working directory
			os.chdir(original_dir)
Exemplo n.º 21
0
    def test_replace_interval(self):
        '''replace_interval() should splice in new sequence (and qualities) and reject bad input'''
        # ((start, end), sequence expected after replacing that interval of 'ACGTA' with 'NEW')
        fasta_cases = [
            ((0, 0), 'NEWCGTA'),
            ((4, 4), 'ACGTNEW'),
            ((2, 3), 'ACNEWA'),
        ]
        for (start, end), wanted in fasta_cases:
            fa = sequences.Fasta('ID', 'ACGTA')
            fa.replace_interval(start, end, 'NEW')
            self.assertEqual(fa, sequences.Fasta('ID', wanted))

        # Coordinates that must be rejected for a sequence of length 5
        fa = sequences.Fasta('ID', 'ACGTA')
        for start, end in [(3, 2), (1, 5), (5, 10)]:
            with self.assertRaises(sequences.Error):
                fa.replace_interval(start, end, 'x')

        # Same intervals for Fastq, where qualities 'III' go in with 'NEW'
        fastq_cases = [
            ((0, 0), 'NEWCGTA', 'IIIBCDE'),
            ((4, 4), 'ACGTNEW', 'ABCDIII'),
            ((2, 3), 'ACNEWA', 'ABIIIE'),
        ]
        for (start, end), wanted_seq, wanted_qual in fastq_cases:
            fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
            fq.replace_interval(start, end, 'NEW', 'III')
            self.assertEqual(fq, sequences.Fastq('ID', wanted_seq, wanted_qual))

        # fq is the record left over from the last case above; new sequence
        # and qualities of different lengths must be rejected
        with self.assertRaises(sequences.Error):
            fq.replace_interval(1, 1, 'x', 'xx')
Exemplo n.º 22
0
 def test_add_insertions(self):
     '''add_insertions() in test mode should give the expected sequence with inserted Ns'''
     record = sequences.Fasta('X', 'acgtacgtacgt')
     wanted = sequences.Fasta('X', 'acgtNacgtNacgt')

     record.add_insertions(skip=4, window=0, test=True)

     self.assertEqual(record, wanted)
Exemplo n.º 23
0
def run(description):
    '''Command-line entry point that simulates perfect paired-end reads.

    Options are parsed from the command line.  For every sequence in the
    input file, read pairs are simulated and written to the output file as
    interleaved FASTQ, all with quality character 'I'.  The second read of
    each pair is reverse complemented.  Optionally, the fragment each pair
    came from is written as FASTA to a second file.

    NOTE: the description argument is not used; the help text of the
    parser is the literal string below.

    Output files are closed on every exit path.
    '''
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    fout_frags = None
    pair_counter = 1

    try:
        if options.fragments:
            fout_frags = utils.open_file_write(options.fragments)

        for ref in seq_reader:
            # check if current seq is long enough
            if len(ref) < options.mean_insert + 4 * options.insert_std:
                print('Warning, sequence ', ref.id, ' too short.  Skipping it...', file=sys.stderr)
                continue

            # work out how many reads to simulate
            read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

            # it's possible that we pick the same fragment twice, in which case the
            # reads would get the same name. So remember the frag coords
            used_fragments = {}  # (middle_position, length) => count

            # do the simulation:  pick insert size from normal distribution, and
            # position in genome from uniform distribution
            x = 0
            while x < read_pairs:
                # resample until the insert fits in the reference and can hold a read
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
                while isize > len(ref) or isize < options.readlength:
                    isize = int(random.normalvariate(options.mean_insert, options.insert_std))
                middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize))
                read_start1 = int(middle_pos - ceil(0.5 * isize))
                read_start2 = read_start1 + isize - options.readlength

                # name holds the 1-based start of each read
                readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])

                fragment = (middle_pos, isize)
                if fragment in used_fragments:
                    used_fragments[fragment] += 1
                    readname += '.dup.' + str(used_fragments[fragment])
                else:
                    used_fragments[fragment] = 1

                read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
                read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)

                # NOTE(review): a rejected pair does not advance x, so if
                # every candidate pair from a reference contains N/n this
                # loop does not terminate.
                if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
                    continue

                read2.revcomp()

                print(read1, file=fout)
                print(read2, file=fout)

                if fout_frags is not None:
                    frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                    print(frag, file=fout_frags)

                pair_counter += 1
                x += 1
    finally:
        utils.close(fout)
        if fout_frags is not None:
            utils.close(fout_frags)
Exemplo n.º 24
0
 def setUp(self):
     '''Create the Fasta record stored as self.fasta before each test'''
     record_id, bases = 'ID', 'ACGTA'
     self.fasta = sequences.Fasta(record_id, bases)
Exemplo n.º 25
0
    def test_make_into_gene_fasta(self):
        '''make_into_gene() should return None or the expected tuple for each test sequence'''
        # (input sequence, expected result).  An expected result is either
        # None or (gene sequence, second element, third element) exactly as
        # make_into_gene() is expected to return them, with the gene
        # sequence wrapped in a Fasta named 'ID' below.
        cases = [
            ('T', None),
            ('TT', None),
            ('TTT', None),
            ('TTG', None),
            ('TAA', None),
            ('TTGAAATAA', ('TTGAAATAA', '+', 0)),
            ('TTGAAATAT', None),
            ('TTGTAA', ('TTGTAA', '+', 0)),
            ('TTGTAAA', ('TTGTAA', '+', 0)),
            ('TTGTAAAA', ('TTGTAA', '+', 0)),
            ('TTGTAAAAA', None),
            ('ATTGTAA', ('TTGTAA', '+', 1)),
            ('ATTGTAAA', ('TTGTAA', '+', 1)),
            ('ATTGTAAAA', ('TTGTAA', '+', 1)),
            ('ATTGTAAAAA', None),
            ('AATTGTAA', ('TTGTAA', '+', 2)),
            ('AATTGTAAA', ('TTGTAA', '+', 2)),
            ('AATTGTAAAA', ('TTGTAA', '+', 2)),
            ('AATTGTAAAAA', None),
            ('TTACAA', ('TTGTAA', '-', 0)),
            ('ATTACAA', ('TTGTAA', '-', 0)),
            ('AATTACAA', ('TTGTAA', '-', 0)),
            ('AAATTACAA', None),
            ('TTACAAA', ('TTGTAA', '-', 1)),
            ('ATTACAAA', ('TTGTAA', '-', 1)),
            ('AATTACAAA', ('TTGTAA', '-', 1)),
            ('AAATTACAAA', None),
            ('TTACAAAA', ('TTGTAA', '-', 2)),
            ('ATTACAAAA', ('TTGTAA', '-', 2)),
            ('AATTACAAAA', ('TTGTAA', '-', 2)),
            ('AAATTACAAAA', None),
        ]

        for bases, gene in cases:
            if gene is None:
                expected = None
            else:
                expected = (sequences.Fasta('ID', gene[0]),) + gene[1:]
            self.assertEqual(sequences.Fasta('ID', bases).make_into_gene(), expected)
Exemplo n.º 26
0
 def test_to_Fasta_and_qual(self):
     '''to_Fasta_and_qual() should return the Fasta record and the numeric quality scores'''
     fastq = sequences.Fastq('ID', 'ACGT', '>ADI')
     fasta, scores = fastq.to_Fasta_and_qual()

     self.assertEqual(fasta, sequences.Fasta('ID', 'ACGT'))
     # Expected scores are the character codes of '>ADI' minus 33
     self.assertListEqual(scores, [29, 32, 35, 40])
Exemplo n.º 27
0
 def test_subseq(self):
     '''subseq() should return the expected slice, with None meaning an open end'''
     record = sequences.Fasta('name', 'ACGTA')
     # ((start, end), expected bases)
     cases = [
         ((1, 4), 'CGT'),
         ((None, 4), 'ACGT'),
         ((1, None), 'CGTA'),
     ]
     for (start, end), bases in cases:
         self.assertEqual(record.subseq(start, end), sequences.Fasta('name', bases))
Exemplo n.º 28
0
    def test_is_all_Ns(self):
        '''is_all_Ns() should give the expected answer, with and without start/end limits'''
        # (sequence, keyword arguments, whether a true result is expected)
        cases = [
            ('n', {}, True),
            ('N', {}, True),
            ('nNn', {}, True),
            ('a', {}, False),
            ('', {}, False),
            ('anNg', {}, False),
            ('naN', {}, False),
            ('anNg', {'start': 0, 'end': 0}, False),
            ('anNg', {'start': 0, 'end': 1}, False),
            ('anNg', {'start': 1, 'end': 1}, True),
            ('anNg', {'start': 1, 'end': 2}, True),
            ('anNg', {'start': 1}, False),
            ('anN', {'start': 1}, True),
            ('anNg', {'end': 1}, False),
            ('nNA', {'end': 1}, True),
        ]

        for bases, limits, expect_true in cases:
            result = sequences.Fasta('ID', bases).is_all_Ns(**limits)
            # Truthiness is checked, not equality with True/False
            if expect_true:
                self.assertTrue(result)
            else:
                self.assertFalse(result)

        # start after end must be rejected
        with self.assertRaises(sequences.Error):
            sequences.Fasta('ID', 'anNg').is_all_Ns(start=1, end=0)
Exemplo n.º 29
0
    def test_expand_nucleotides(self):
        '''expand_nucleotides() should return one record per combination of ambiguity codes'''
        # (name, sequence, expected expanded sequences in order).  Expected
        # records are named name.1, name.2, ... in that order.
        fasta_cases = [
            ('1', 'A', ['A']),
            ('2', 'C', ['C']),
            ('3', 'G', ['G']),
            ('4', 'T', ['T']),
            ('6', 'R', ['A', 'G']),
            ('7', 'Y', ['C', 'T']),
            ('8', 'S', ['C', 'G']),
            ('9', 'W', ['A', 'T']),
            ('10', 'K', ['G', 'T']),
            ('11', 'M', ['A', 'C']),
            ('12', 'B', ['C', 'G', 'T']),
            ('13', 'D', ['A', 'G', 'T']),
            ('14', 'H', ['A', 'C', 'T']),
            ('15', 'V', ['A', 'C', 'G']),
            ('16', 'N', ['A', 'C', 'G', 'T']),
            ('17', 'ART', ['AAT', 'AGT']),
            ('18', 'ARRT', ['AAAT', 'AAGT', 'AGAT', 'AGGT']),
            ('19', 'ARTR', ['AATA', 'AATG', 'AGTA', 'AGTG']),
        ]

        for name, bases, expansions in fasta_cases:
            wanted = [
                sequences.Fasta(name + '.' + str(number), expanded)
                for number, expanded in enumerate(expansions, start=1)
            ]
            self.assertListEqual(sequences.Fasta(name, bases).expand_nucleotides(), wanted)

        # Fastq records are expected to keep their quality string unchanged
        wanted = [
            sequences.Fastq('20.1', 'AAT', 'GHI'),
            sequences.Fastq('20.2', 'AGT', 'GHI'),
        ]
        self.assertListEqual(sequences.Fastq('20', 'ART', 'GHI').expand_nucleotides(), wanted)
Exemplo n.º 30
0
 def test_replace_bases(self):
     '''replace_bases() should swap every occurrence of the old base for the new one'''
     record = sequences.Fasta('X', 'AUCGTUUACT')
     wanted = sequences.Fasta('X', 'ATCGTTTACT')

     record.replace_bases('U', 'T')

     self.assertEqual(record, wanted)