Exemplo n.º 1
0
 def test_raise_exception(self):
     '''open_file_read() and open_file_write() should raise utils.Error when the file cannot be opened'''
     unreadable = ('this_file_is_not_here_so_throw_error',
                   'this_file_is_not_here_so_throw_error.gz')
     unwritable = (os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
                   os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'))

     # Reading: plain name first, then the .gz name.
     for filename in unreadable:
         with self.assertRaises(utils.Error):
             utils.open_file_read(filename)

     # Writing: same two names, inside the 'not_a_directory' path.
     for filename in unwritable:
         with self.assertRaises(utils.Error):
             utils.open_file_write(filename)
Exemplo n.º 2
0
def barplot_by_input_data(stat_to_plot, prefix, plot_width=21, plot_height=7):
    '''Write and run an R script drawing one bar per (data type, scaffolder) pair.

    The script is written to prefix + '.R' and draws the same barplot to
    prefix + '.png' and prefix + '.pdf'.

    stat_to_plot -- key of the statistic read from each scaffolder's results
    prefix -- prefix of the R script and of the image files
    plot_width, plot_height -- passed to the R graphics devices as width/height
    '''
    bar_heights = []
    colours = []

    # Bars are ordered by data type, then by scaffolder within each data type.
    for t in test_data_types:
        for scaff in scaffolders:
            bar_heights.append(results[t].results[scaff][stat_to_plot])
            colours.append(r_colours[scaff])

    r_script = prefix + '.R'
    f = utils.open_file_write(r_script)

    for device in ['png', 'pdf']:
        # The image name is built from 'prefix'; the previous code used
        # 'outprefix', which is neither a parameter nor a local of this function.
        print(device + '("' + prefix + '.' + device + '", width=', plot_width, ', height=', plot_height, ')', file=f)

        # ','.join() requires the bar heights to be strings already.
        print('barplot(c(' + ','.join(bar_heights), '), ',
                      ' ylab="', stat_to_plot, '", ',
                      'col=c(', ','.join(['"' + x + '"' for x in colours]), ') ',
                      ')', sep='', file=f)

        print('dev.off()', file=f)

    utils.close(f)
    run_r_script(r_script)
Exemplo n.º 3
0
    def plot_scatter(self, stat1, stat2, outprefix, legend=False, main=''):
        '''Write an R script that plots stat1 (x) against stat2 (y), one point per scaffolder, and run it.

        The script goes to outprefix + '.R' and draws the same scatter plot to
        outprefix + '.pdf', '.png' and '.svg'.  Scaffolders whose
        (self.data_type, scaffolder) pair is in bad_runs are left out.  Colours
        and plotting symbols come from self.get_r_vectors().  When legend is
        true, the text returned by r_legend('topleft') follows each plot.
        '''
        script_name = outprefix + '.R'
        out = utils.open_file_write(script_name)

        good_runs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]
        xs = [int(self.results[s][stat1]) for s in good_runs]
        ys = [int(self.results[s][stat2]) for s in good_runs]
        # The maxima are not written to the script (axis limits are left to R),
        # but max() still raises ValueError here when there is no data to plot.
        largest_x = max(xs)
        largest_y = max(ys)
        syms_vec, _syms_legend, cols_vec, _cols_legend = self.get_r_vectors()

        x_values = ','.join(str(v) for v in xs)
        y_values = ','.join(str(v) for v in ys)

        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=out)

            print('plot(c(', x_values, '), c(', y_values, '), ',
                  'xlab="', stat1, '", ylab="', stat2, '", ',
                  'main="', main, '",',
                  'col=', cols_vec, ', pch=', syms_vec, ', bg=', cols_vec,
                  ')', sep='', file=out)

            if legend:
                print(r_legend('topleft'), file=out)

            print('dev.off()', file=out)

        utils.close(out)
        run_r_script(script_name)
Exemplo n.º 4
0
def interleave(infile_1, infile_2, outfile):
    '''Write the records of two sequence files to outfile, alternating between them.

    Each record of infile_1 is followed by the record at the same position in
    infile_2.

    Raises Error if the two files do not hold the same number of records.  The
    output file is closed whether or not an error occurs.
    '''
    seq_reader_1 = file_reader(infile_1)
    seq_reader_2 = file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    try:
        for seq_1 in seq_reader_1:
            # Only exhaustion of the second reader means "no mate"; any other
            # error raised by the reader is left to propagate unchanged.
            try:
                seq_2 = next(seq_reader_2)
            except StopIteration:
                raise Error('Error getting mate for sequence', seq_1.id,
                            ' ... cannot continue')

            print(seq_1, file=f_out)
            print(seq_2, file=f_out)

        # The second file must be exhausted too, otherwise it has extra reads.
        seq_2 = next(seq_reader_2, None)
        if seq_2 is not None:
            raise Error('Error getting mate for sequence', seq_2.id,
                        ' ... cannot continue')
    finally:
        utils.close(f_out)
Exemplo n.º 5
0
    def barplot_of_one_stat_sorted(self, stat, outprefix, main='', stat2=None):
        '''Write an R script drawing one bar per scaffolder, sorted by bar height, and run it.

        The script goes to outprefix + '.R' and draws the same barplot to
        outprefix + '.pdf', '.png' and '.svg'.  Scaffolders whose
        (self.data_type, scaffolder) pair is in bad_runs are left out.  When
        stat2 is given, the bar height is results[stat] + results[stat2].
        '''
        script_name = outprefix + '.R'
        out = utils.open_file_write(script_name)

        good_runs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]

        if stat2 is None:
            raw_heights = [self.results[s][stat] for s in good_runs]
        else:
            # NOTE(review): if the stored values are strings, '+' concatenates
            # them before int() is applied below - confirm the value types.
            raw_heights = [self.results[s][stat] + self.results[s][stat2] for s in good_runs]

        heights_as_int = [int(h) for h in raw_heights]
        run_colours = [r_colours[s] for s in good_runs]

        # Tuples sort by height first, then by name, then by colour.
        rows = sorted(zip(heights_as_int, good_runs, run_colours))

        heights = ','.join([str(height) for height, _, _ in rows])
        names = 'c(' + ','.join(['"' + name + '"' for _, name, _ in rows]) + ')'
        cols = ','.join(['"' + colour + '"' for _, _, colour in rows])

        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=out)
            print('par(mar=c(10,4,4,2) + 0.1)', file=out)

            print('barplot(c(', heights, '), ',
                  'names.arg=', names, ', main="', main, '",',
                  ' ylab="', stat, '", ',
                  'col=c(', cols, '), las=2)', sep='', file=out)

            print('dev.off()', file=out)

        utils.close(out)
        run_r_script(script_name)
def print_dict_as_tsv(d, filename):
    '''Write one tab-separated line (key, start+1, end+1) to filename for every interval in d.

    d maps a key to an iterable of objects with 'start' and 'end' attributes;
    both coordinates are written with 1 added.
    '''
    out = utils.open_file_write(filename)

    for name, intervals in d.items():
        for region in intervals:
            print(name, region.start + 1, region.end + 1, sep='\t', file=out)

    utils.close(out)
Exemplo n.º 7
0
    def write_gff(self, filename):
        '''Write the to_gff() text of every mutation to filename, in sorted key order.'''
        out = utils.open_file_write(filename)

        # Sorting the keys of self.mutations fixes the order of the output lines.
        for key in sorted(self.mutations):
            mutation = self.mutations[key]
            print(mutation.to_gff(), file=out)

        utils.close(out)
Exemplo n.º 8
0
def replace_bases(infile, outfile, old, new):
    '''Copy the sequences of infile to outfile after calling replace_bases(old, new) on each.'''
    out = utils.open_file_write_after_reader = None  # placeholder removed below
    del out, utils.open_file_write_after_reader
    reader = file_reader(infile)
    out = utils.open_file_write(outfile)

    for record in reader:
        record.replace_bases(old, new)
        print(record, file=out)

    utils.close(out)
Exemplo n.º 9
0
def reverse_complement(infile, outfile):
    '''Copy the sequences of infile to outfile after calling revcomp() on each.'''
    reader = file_reader(infile)
    out = utils.open_file_write(outfile)

    for record in reader:
        # revcomp() changes the record in place; the changed record is printed.
        record.revcomp()
        print(record, file=out)

    utils.close(out)
Exemplo n.º 10
0
 def test_file_reader_mpileup(self):
     '''Records read from a pileup file and printed back out should reproduce the file exactly'''
     source = 'mpileup_unittest.mpileup'
     copy_name = 'tmp.mpileup'

     handle = utils.open_file_write(copy_name)
     for record in mpileup.file_reader(source):
         print(record, file=handle)
     utils.close(handle)

     # The round trip must reproduce the input file.
     self.assertTrue(filecmp.cmp(source, copy_name))
     os.unlink(copy_name)
Exemplo n.º 11
0
def fastn_to_quasr_primers(infile, outfile):
    '''Write one tab-separated line per sequence of infile: the sequence, then its revcomp() form.'''
    reader = file_reader(infile)
    out = utils.open_file_write(outfile)

    for forward in reader:
        # Work on a copy so the record being iterated over is left untouched.
        reverse = copy.copy(forward)
        reverse.revcomp()
        print(forward.seq, reverse.seq, sep='\t', file=out)

    utils.close(out)
Exemplo n.º 12
0
 def test_file_reader(self):
     '''Hits read from a nucmer coords file and printed out should match the expected output file'''
     copy_name = 'nucmer_unittest.coords.tmp'

     handle = utils.open_file_write(copy_name)
     for record in nucmer.file_reader('nucmer_unittest.coords'):
         print(record, file=handle)
     utils.close(handle)

     # The printed hits are compared with a separate expected-output file.
     self.assertTrue(filecmp.cmp('nucmer_unittest.coords.out', copy_name))
     os.unlink(copy_name)
Exemplo n.º 13
0
def trim(infile, outfile, start, end):
    '''Call trim(start, end) on every sequence of infile and write the non-empty results to outfile.'''
    reader = file_reader(infile)
    out = utils.open_file_write(outfile)

    for record in reader:
        record.trim(start, end)
        # Sequences left with zero length after trimming are not written.
        if not len(record):
            continue
        print(record, file=out)

    utils.close(out)
Exemplo n.º 14
0
 def test_file_reader_sam(self):
     '''Records read from the BAM file and printed out should match the reference SAM file'''
     copy_name = 'tmp.sam'

     handle = utils.open_file_write(copy_name)
     for record in sam.file_reader('sam_unittest.bam'):
         print(record, file=handle)
     utils.close(handle)

     # The printed records are compared with the SAM version of the same data.
     self.assertTrue(filecmp.cmp('sam_unittest.sam', copy_name))
     os.unlink(copy_name)
Exemplo n.º 15
0
    def test_file_reader(self):
        '''file_reader should iterate through a blast file correctly'''
        tmp_out = 'blast_unittest.m8.tmp'

        for f in ['blast_unittest.m8', 'blast_unittest.m8.with_lengths']:
            # Read and compare the file of this iteration.  The loop variable
            # was previously unused, so 'blast_unittest.m8' was checked twice
            # and the '.with_lengths' file was never read.
            # NOTE(review): assumes printing a hit read from the '.with_lengths'
            # file reproduces that file's lines - confirm against the fixture.
            blast_reader = blast.file_reader(f)
            fout = utils.open_file_write(tmp_out)
            for hit in blast_reader:
                print(hit, file=fout)
            utils.close(fout)
            self.assertTrue(filecmp.cmp(f, tmp_out))
            os.unlink(tmp_out)
Exemplo n.º 16
0
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Split an interleaved file: records 1, 3, 5, ... go to outfile_1 and 2, 4, 6, ... to outfile_2.

    fasta_out -- when true, every record is written as Fasta(seq.id, seq.seq)
                 instead of being printed as it was read.

    Raises Error if infile holds an odd number of records.  Both output files
    are closed whether or not an error occurs.
    '''
    seq_reader = file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)

    try:
        for seq in seq_reader:
            if fasta_out:
                print(Fasta(seq.id, seq.seq), file=f_1)
            else:
                print(seq, file=f_1)

            # Bind the mate explicitly.  The previous code discarded the value
            # returned by next() and printed 'seq' again, which only writes the
            # mate if the reader updates one shared object in place.
            try:
                mate = next(seq_reader)
            except StopIteration:
                raise Error('Error getting mate for sequence. Cannot continue')

            if fasta_out:
                print(Fasta(mate.id, mate.seq), file=f_2)
            else:
                print(mate, file=f_2)
    finally:
        utils.close(f_1)
        utils.close(f_2)
Exemplo n.º 17
0
def add_sequence_lengths(infile, ref_fai, qry_fai, outfile):
    '''Copy the hits of infile to outfile after add_sequence_lengths() has been called on each.

    The reference and query length dictionaries are filled from ref_fai and
    qry_fai by fastn.lengths_from_fai() before any hit is read.
    '''
    lengths_of_refs, lengths_of_qrys = {}, {}
    fastn.lengths_from_fai(ref_fai, lengths_of_refs)
    fastn.lengths_from_fai(qry_fai, lengths_of_qrys)

    out = utils.open_file_write(outfile)
    for record in file_reader(infile):
        record.add_sequence_lengths(lengths_of_refs, lengths_of_qrys)
        print(record, file=out)
    utils.close(out)
Exemplo n.º 18
0
def add_sequence_lengths(infile, ref_fai, qry_fai, outfile):
    '''Write every hit of infile to outfile once its add_sequence_lengths() method has been called.

    Two dictionaries, one for ref_fai and one for qry_fai, are passed to
    fastn.lengths_from_fai() to be filled, then handed to each hit.
    '''
    fai_lengths = {'ref': {}, 'qry': {}}
    fastn.lengths_from_fai(ref_fai, fai_lengths['ref'])
    fastn.lengths_from_fai(qry_fai, fai_lengths['qry'])

    handle = utils.open_file_write(outfile)
    for blast_hit in file_reader(infile):
        blast_hit.add_sequence_lengths(fai_lengths['ref'], fai_lengths['qry'])
        print(blast_hit, file=handle)
    utils.close(handle)
Exemplo n.º 19
0
def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
    '''Splits a fasta/q file into separate files, file size determined by number of bases.

    Puts <= max_bases in each split file.  The exception is that a single
    sequence longer than max_bases is put in a file of its own.  This does not
    split sequences.

    Output files are named outfiles_prefix + '.1', '.2', ...  If max_seqs is
    given, a new file is also started once the current one holds max_seqs
    sequences.  No file is written when infile has no sequences.
    '''
    seq_reader = file_reader(infile)
    base_count = 0
    file_count = 1
    seq_count = 0
    fout = None
    if max_seqs is None:
        max_seqs = float('inf')

    for seq in seq_reader:
        # base_count == 0 means no file is currently being filled.
        if base_count == 0:
            fout = utils.open_file_write(outfiles_prefix + '.' +
                                         str(file_count))
            file_count += 1

        if base_count + len(seq) > max_bases or seq_count >= max_seqs:
            if base_count == 0:
                # The freshly opened file takes this sequence alone; base_count
                # stays 0 so the next sequence opens a new file.
                print(seq, file=fout)
                utils.close(fout)
            else:
                # The current file is full: start the next one with this sequence.
                utils.close(fout)
                fout = utils.open_file_write(outfiles_prefix + '.' +
                                             str(file_count))
                print(seq, file=fout)
                base_count = len(seq)
                file_count += 1
                seq_count = 1
        else:
            base_count += len(seq)
            seq_count += 1
            print(seq, file=fout)

    # fout is still None when the input had no sequences.
    if fout is not None:
        utils.close(fout)
Exemplo n.º 20
0
def fasta_to_fastq(fasta_in, qual_in, outfile):
    '''Combine a fasta file and its quality file into outfile via seq.to_Fastq().

    The two files are read in step; each quality record is split on whitespace
    and converted to a list of ints before being passed to to_Fastq().

    Raises Error if the name of a sequence differs from the name of the
    quality record at the same position.  The output file is closed whether or
    not an error occurs.
    '''
    fa_reader = file_reader(fasta_in)
    qual_reader = file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    try:
        for seq in fa_reader:
            # NOTE(review): if the quality file runs out first, next() raises
            # StopIteration, which is not handled here.
            qual = next(qual_reader)
            if seq.id != qual.id:
                raise Error('Mismatch in names from fasta and qual file', seq.id,
                            qual.id)

            qual.seq = [int(x) for x in qual.seq.split()]
            print(seq.to_Fastq(qual.seq), file=f_out)
    finally:
        utils.close(f_out)
def make_tags_file(filename, seqs_for_tagging, tag_length):
    '''Make a tag from every sequence, write the tags as fasta to filename and return them.

    seqs_for_tagging -- dictionary whose values are the sequences to tag
    tag_length -- passed on to make_tag_from_fastn()

    Returns a dictionary of sequence id -> copy of the tag made from it.
    Prints a message and exits with status 1 if a tag cannot be written.
    '''
    tags = {}
    f = utils.open_file_write(filename)

    # Only the values are needed; tags are keyed by each sequence's own id.
    for seq in seqs_for_tagging.values():
        tag = make_tag_from_fastn(seq, tag_length)
        try:
            print(str(tag.to_fasta()), file=f)
        except Exception:
            # 'Exception' rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not reported as tag errors.
            print('Error! str(tag.to_fasta()), tag=', tag)
            sys.exit(1)
        tags[seq.id] = copy.copy(tag)

    utils.close(f)
    return tags
Exemplo n.º 22
0
    def test_write_and_read(self):
        '''Numbers written with open_file_write() should be read back by open_file_read() for plain, .gz and .bgz names'''
        for filename in ['utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz']:
            # Write the numbers 0, 1, 2, one per line.
            handle = utils.open_file_write(filename)
            for number in range(3):
                print(number, file=handle)
            utils.close(handle)

            # Each line read back must hold its own zero-based line number.
            handle = utils.open_file_read(filename)
            for expected, line in enumerate(handle):
                self.assertEqual(expected, int(line.strip()))
            utils.close(handle)

            os.unlink(filename)
Exemplo n.º 23
0
def fastq_to_mira_xml(infile, outfile):
    '''Write an XML <trace_volume> document to outfile with one <trace> element per sequence of infile.

    Each trace holds the sequence id as trace_name, the sequence length as
    clip_quality_right and a fixed clip_vector_left of 1.
    '''
    reader = file_reader(infile)
    out = utils.open_file_write(outfile)

    print('<?xml version="1.0"?>', file=out)
    print('<trace_volume>', file=out)

    for record in reader:
        trace_lines = [
            '    <trace>',
            '        <trace_name>' + record.id + '</trace_name>',
            '        <clip_quality_right>' + str(len(record)) + '</clip_quality_right>',
            '        <clip_vector_left>1</clip_vector_left>',
            '    </trace>',
        ]
        print('\n'.join(trace_lines), file=out)

    print('</trace_volume>', file=out)
    utils.close(out)
Exemplo n.º 24
0
    def barplot_beside(self, stat1, stat2, y_label, outprefix, main=''):
        r_script = outprefix + '.R'
        f = utils.open_file_write(r_script)
        r_syms_v, r_syms_l, r_cols_v, r_cols_l = self.get_r_vectors()

        bar_heights1 = [self.results[scaff][stat1] for scaff in scaffolders if (self.data_type, scaff) not in bad_runs]
        bar_heights2 = [self.results[scaff][stat2] for scaff in scaffolders if (self.data_type, scaff) not in bad_runs]
        #names = 'c(' + ','.join(['"' + x + '"' for x in scaffolders]) + ')'

        #all_data = [(max(int(bar_heights1[i]), int(bar_heights2[i])), bar_heights1[i], bar_heights2[i], scaffolders[i]) for i in range(len(bar_heights1))]
        all_data = [(int(bar_heights1[i]), bar_heights1[i], bar_heights2[i], scaffolders[i]) for i in range(len(bar_heights1)) if (self.data_type, scaffolders[i]) not in bad_runs]
        all_data.sort()
        bar_heights1 = [all_data[i][1] for i in range(len(all_data))]
        bar_heights2 = [all_data[i][2] for i in range(len(all_data))]
        names = 'c(' + ','.join(['"' + all_data[i][3] + '"' for i in range(len(all_data))]) + ')'
        #cols = [r_colours[x] for x in names]

        print(r''' col2trans = function(colour) {
    x=as.vector(col2rgb(colour))
    return(rgb(x[1],x[2],x[3],20,maxColorValue=255))
}
''', file=f)
        print('cols=', r_cols_v, file=f)
        print('trans_cols=sapply(cols, col2trans)', file=f)

        for type in ['pdf', 'png', 'svg']:
            print(type + '("' + outprefix + '.' + type + '")', file=f)
            print('par(mar=c(10,4,4,2) + 0.1)', file=f)
            print('bar_heights1 = c(' + ','.join(bar_heights1) + ')',
                  'bar_heights2 = c(' + ','.join(bar_heights2) + ')',
                  'bar_heights = t(as.matrix(data.frame(bar_heights1, bar_heights2)))', sep='\n', file=f)
            print('barplot(bar_heights, names.arg=', names, ', '
                  ' ylab="', y_label, '", ',
                  #'col=c(cols, col2trans) ',
                  'col=c("black", "gray"), ',
                  'beside=T,',
                  'main="', main,  '",',
                  'las=2)', sep='', file=f)

            print('dev.off()', file=f)

        utils.close(f)
        run_r_script(r_script)
Exemplo n.º 25
0
    def test_print_line_length(self):
        '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
        # (line length to set, file holding the expected output)
        cases = [
            (0, 'fastn_unittest_one-per-line.fa'),
            (3, 'fastn_unittest_3-per-line.fa'),
        ]
        tmp_out = 'tmp.line_length_test.fa'

        try:
            for line_length, correct_file in cases:
                seq_reader = fastn.file_reader('fastn_unittest_one-per-line.fa')
                fastn.Fasta.line_length = line_length
                f = utils.open_file_write(tmp_out)
                for s in seq_reader:
                    print(s, file=f)
                utils.close(f)
                self.assertTrue(filecmp.cmp(correct_file, tmp_out))
                os.unlink(tmp_out)
        finally:
            # Fasta.line_length is a class attribute shared by every test, so it
            # is reset even when an assertion above fails.
            fastn.Fasta.line_length = 60
Exemplo n.º 26
0
    def barplot_of_one_stat_coloured(self, stat, outprefix, main=''):
        '''Write an R script drawing one bar per scaffolder for stat, and run it.

        The script goes to outprefix + '.R' and draws the same barplot to
        outprefix + '.pdf', '.png' and '.svg'.  Scaffolders whose
        (self.data_type, scaffolder) pair is in bad_runs are left out.  The bar
        colours are the colour vector returned by self.get_r_vectors().
        '''
        script_name = outprefix + '.R'
        out = utils.open_file_write(script_name)
        _syms_vec, _syms_legend, cols_vec, _cols_legend = self.get_r_vectors()

        good_runs = [s for s in scaffolders if (self.data_type, s) not in bad_runs]
        # The stored values are joined as they are, so they must be strings.
        heights = ','.join([self.results[s][stat] for s in good_runs])
        names = 'c(' + ','.join(['"' + s + '"' for s in good_runs]) + ')'

        for device in ['pdf', 'png', 'svg']:
            print(device + '("' + outprefix + '.' + device + '")', file=out)

            # NOTE(review): the emitted call ends in ', )', i.e. with an empty
            # trailing argument - confirm that R accepts this.
            print('barplot(c(', heights, '), ',
                  'names.arg=', names, ', main="', main, '",',
                  ' ylab="', stat, '", ',
                  'col=', cols_vec, ', )', sep='', file=out)

            print('dev.off()', file=out)

        utils.close(out)
        run_r_script(script_name)
Exemplo n.º 27
0
#!/usr/bin/env python3.3

import argparse
import fastn
import utils

# Command line: <infile> <outfile>, both required positional arguments.
parser = argparse.ArgumentParser(
    usage='%(prog)s <infile> <outfile>',
    description='Gets all IDs from a fasta or fastq file')
parser.add_argument('infile',
                    help='Name of fasta/q file to be read')
parser.add_argument('outfile',
                    help='Name of output file')
options = parser.parse_args()

seq_reader = fastn.file_reader(options.infile)
f_out = utils.open_file_write(options.outfile)

# One id per output line, in the order the sequences are read.
for seq in seq_reader:
    print(seq.id, file=f_out)

utils.close(f_out)
Exemplo n.º 28
0
        hit_start = sam_record.cigar.operations[0].number

    if sam_record.cigar.operations[-1].operator == 'S':
        hit_end = len(sam_record.seq) - sam_record.cigar.operations[-1].number

    if sam_record.id not in read_hit_coords:
        read_hit_coords[sam_record.id] = []

    read_hit_coords[sam_record.id].append(genome_intervals.Interval(hit_start - 1, hit_end - 1))

# The index files and the SAM file are removed here; the hits needed below
# are held in read_hit_coords.
external_progs.bwa_index_clean(bwa_index)
os.unlink(bwa_sam)


# Second pass over the input reads: sequences go to <outprefix>.fq and one
# log line per read goes to <outprefix>.log.
seq_reader = fastn.file_reader(options.reads_in)
f_fa = utils.open_file_write(options.outprefix + '.fq')
f_log = utils.open_file_write(options.outprefix + '.log')

for seq in seq_reader:
    if seq.id not in read_hit_coords:
        print(seq, file=f_fa)
        print(seq.id, 'no hit', sep='\t', file=f_log)
    else:
        hits = read_hit_coords[seq.id]
        genome_intervals.merge_overlapping_in_list(hits)
        i = 0

        while i < len(hits) - 1:
            if hits[i+1].start - hits[i].end <= options.join_distance:
                hits[i] = hits[i].union_fill_gap(hits[i+1])
                hits.pop(i+1)
    if qry_index is not None and ref_index is not None:
        clusters[qry_index].add(hit.ref_name)
        if qry_index != ref_index:
            clusters[qry_index].update(clusters[ref_index])
            clusters.pop(ref_index)
    elif qry_index is not None:
        clusters[qry_index].add(hit.ref_name)
    elif ref_index is not None:
        clusters[ref_index].add(hit.qry_name)
    else:
        clusters.append(set([hit.qry_name, hit.ref_name]))

# print clusters ordered by size of cluster
# (largest first; output columns are: cluster number, name, strands[name])
clusters.sort(key=len, reverse=True)
f = utils.open_file_write(options.outfile)

for i in range(len(clusters)):
    for x in clusters[i]:
        # anything that is in a cluster is removed from the unused set
        unused_seqs.discard(x)
        # cluster numbers are written starting from 1
        print(i + 1, x, strands[x], sep='\t', file=f)

# Names still left in unused_seqs get their own negative number (-1, -2, ...)
# in sorted name order, with no strand column.
counter = -1
for x in sorted(unused_seqs):
    print(counter, x, sep='\t', file=f)
    counter -= 1

utils.close(f)

assembled_seqs = []
# Map the tags (fasta input, -f) to the scaffolds with bowtie2, writing SAM.
utils.syscall(external_progs.bowtie2_align + ' -f -x ' + options.scaffolds_fa +
              ' -U ' + tags_fa_file + ' -S ' + samfile)
# Convert the SAM file to BAM, then delete the SAM file.
utils.syscall('samtools view -T ' + options.scaffolds_fa + ' -bS ' + samfile +
              ' > ' + bamfile)
os.unlink(samfile)
# The sort command is given the sorted BAM name minus its last four characters
# (presumably the '.bam' extension, which samtools adds itself - TODO confirm).
utils.syscall('samtools sort ' + bamfile + ' ' + sorted_bamfile[0:-4])
#os.unlink(bamfile)

# Load the hits into memory
previous_sam = None
previous_tag = None
sam_reader = sam.file_reader(sorted_bamfile)
# One counter per flag value, all starting at zero.
flag_counts = {k: 0 for k in [0, 1, 2, 4, 5, 8, 12, 16]}
tags_from_bam = set()
tag_distances = []
f_log = utils.open_file_write(options.outprefix + '.log')
f_tags_and_sam = utils.open_file_write(options.outprefix + '.tags_and_sam.gz')
skipped_tags = 0

for current_sam in sam_reader:
    if current_sam.is_mapped():
        tags_from_bam.add(current_sam.id)
        # Log any mapped tag whose AS value is not 0.
        if current_sam.tags['AS'][1] != 0:
            print('Nonzero alignemnt score', current_sam, file=f_log)
        # Log any mapped tag whose XS value is at least as large as its AS value.
        if 'XS' in current_sam.tags and current_sam.tags['XS'][
                1] >= current_sam.tags['AS'][1]:
            print('Non-unique best hit', current_sam, file=f_log)
    else:
        print('Unmapped', current_sam, file=f_log)
    if previous_sam is None:
    print(scaff)
    scaff_dir = options.dir_in + '/' + scaff
    results[scaff]['flag_counts'] = get_scaff_results(scaff_dir)

    bsub_outfile = scaff_dir + '/' + scaff.split('.')[0].lower() + '.o'
    bsub_out = utils.syscall_get_stdout('bsub-out2stats.py -s ' + bsub_outfile)
    assert len(bsub_out) == 1
    (attempt_no, exit_code, wall_hrs, cpu_secs, cpu_hrs, mem, swap,
     filename) = bsub_out[0].split('\t')
    assert exit_code == '0'

    results[scaff]['CPU'] = int(round(float(cpu_secs), 0))
    results[scaff]['mem'] = mem

# make a tsv file of all the stats
f = utils.open_file_write(options.outprefix + '.stats.tsv')
# Header line: the third entry expands to one column per flag in possible_flags,
# leaving out flags 0 and 16.
print('Scaffolder',
      'Good joins',
      '\t'.join([str(x) for x in possible_flags if x not in [0, 16]]),
      'Bad joins',
      'Total joins',
      '% correct joins',
      'Lost tags',
      'Skipped tags',
      'CPU',
      'Mem',
      'Extra CPU',
      'Extra Mem',
      sep='\t',
      file=f)