Exemplo n.º 1
0
def process(pathnames, good_coverage, bad_coverage, randomization_rate,
            nseconds, use_pbar):
    """
    Feed the data rows of each input file to an Oracle built on three models.

    Each file is read completely, blank lines are dropped, every remaining
    line is split with line_to_row, and every row after the first is handed
    to process_genomic_position together with a shared dict and the oracle.
    NOTE(review): this excerpt ends inside the KeyboardInterrupt handler, so
    the code that builds the returned csv text, and any further exception
    handlers, are not visible here.
    @param pathnames: paths to files to process
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    """
    # define the three models
    # (homozygous and heterozygous use good_coverage, overcovered uses
    # bad_coverage)
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # define the oracle
    # presumably cache_size bounds an internal cache -- confirm in Oracle
    cache_size = 100000
    oracle = Oracle(models, cache_size)
    # do some initialization
    # (out, chromosome_dict and termination_reason are not consumed within
    # the visible excerpt apart from the call below)
    out = StringIO()
    start_time = time.time()
    nfiles = len(pathnames)
    # a progress bar is only created when requested and when there is more
    # than one file, because the bar advances once per file
    pbar = Progress.Bar(nfiles) if (use_pbar and nfiles > 1) else None
    chromosome_dict = {}
    termination_reason = 'finished the analysis'
    try:
        for i, pathname in enumerate(pathnames):
            with open(pathname) as fin:
                # read the whole file, then drop surrounding whitespace
                # and discard lines that are empty after stripping
                lines = fin.readlines()
                lines = [line.strip() for line in lines]
                lines = [line for line in lines if line]
                # validate the number of lines
                if len(lines) < 2:
                    raise ValueError(
                        'there should be at least two lines of input')
                # break the lines of input into rows of elements
                rows = [line_to_row(line) for line in lines]
                # validate the columns of data
                # (the first row must have exactly eight elements and every
                # other row must match its length)
                ncolumns_expected = 8
                ncolumns = len(rows[0])
                if ncolumns != ncolumns_expected:
                    raise ValueError('expected %d columns of input' %
                                     ncolumns_expected)
                for row in rows:
                    if len(row) != ncolumns:
                        raise ValueError(
                            'each row of input should have the same number of elements as the first row'
                        )
                # process the data rows
                # (the first row is skipped; presumably it is a header)
                data_rows = rows[1:]
                for row in data_rows:
                    # the time limit is checked once per data row;
                    # a falsy nseconds (None or 0) disables the check
                    if nseconds and time.time() - start_time > nseconds:
                        raise TimeoutError()
                    process_genomic_position(row, chromosome_dict, oracle)
            # advance the bar after each file has been fully processed
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt, e:
        # Python 2 except syntax; control-c is recorded as the reason for
        # stopping rather than being re-raised in the visible code
        termination_reason = 'early termination by control-c'
Exemplo n.º 2
0
def process(input_lines, good_coverage, bad_coverage, randomization_rate, nstickinesses, nseconds, use_pbar):
    """
    Annotate every chromosome at each stickiness level and return csv text.

    The output starts with a header row (eight fixed columns followed by
    call/hom/het/bad columns for each stickiness level), followed by the
    rows reported by each chromosome after it has been annotated.
    The progress bar, when used, is always finished before this function
    returns or raises, so a timeout or control-c does not leave it open.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nstickinesses: use this many different levels of stickiness
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    @raise TimeoutError: when nseconds is truthy and has been exceeded
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    model_names = ['hom', 'het', 'ovr']
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line
    header_row = [
        'genetic_line', 'chromosome', 'position',
        'A_count', 'C_count', 'G_count', 'T_count', 'gap_count']
    for stickiness in range(nstickinesses):
        for name in ('call', 'hom', 'het', 'bad'):
            header_row.append('%s_%d' % (name, stickiness))
    # write() with an explicit newline emits the same text as the old
    # "print >> out" statement but is valid in both Python 2 and Python 3
    out.write(','.join(header_row) + '\n')
    # prepare to annotate the chromosomes
    # (the bar advances once per chromosome per stickiness level)
    count = 0
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes)*nstickinesses)
    # annotate the chromosomes using the models
    try:
        for chromosome in chromosomes:
            for stickiness in range(nstickinesses):
                # a falsy nseconds (None or 0) disables the time limit
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                chromosome.annotate_posteriors(stickiness, models)
                if pbar:
                    count += 1
                    pbar.update(count)
            rows = chromosome.get_rows_of_strings(model_names)
            out.write('\n'.join(','.join(row) for row in rows) + '\n')
    finally:
        # finish the bar on success and on failure alike
        if pbar:
            pbar.finish()
    # return the output text
    return out.getvalue().strip()
Exemplo n.º 3
0
 def test_flat_model_compatibility(self):
     # FlatState(4, 10) must give (almost) the same likelihood as
     # ReadCoverage.FlatState(10) for the same observation.
     counts = (1, 2, 3, 4)
     reference = ReadCoverage.FlatState(10)
     candidate = FlatState(4, 10)
     expected = reference.get_likelihood(counts)
     actual = candidate.get_likelihood(counts)
     self.assertAlmostEqual(actual, expected)
Exemplo n.º 4
0
 def test_heterozygous_compatibility(self):
     # Heterozygous(4, p, coverage) must give (almost) the same likelihood
     # as ReadCoverage.Heterozygous(p, coverage) for the same observation.
     prob, depth = 0.1, 10
     counts = (1, 2, 3, 4)
     reference = ReadCoverage.Heterozygous(prob, depth)
     candidate = Heterozygous(4, prob, depth)
     expected = reference.get_likelihood(counts)
     actual = candidate.get_likelihood(counts)
     self.assertAlmostEqual(actual, expected)
Exemplo n.º 5
0
 def test_single_pattern_compatibility(self):
     # SinglePatternState must give (almost) the same likelihood as
     # ReadCoverage.SinglePatternState when both are built from the same
     # distribution and coverage.
     distribution = (.1, .2, .3, .4)
     depth = 10
     counts = (1, 2, 3, 4)
     reference = ReadCoverage.SinglePatternState(distribution, depth)
     candidate = SinglePatternState(distribution, depth)
     expected = reference.get_likelihood(counts)
     actual = candidate.get_likelihood(counts)
     self.assertAlmostEqual(actual, expected)
Exemplo n.º 6
0
def process(input_lines, good_coverage, bad_coverage, randomization_rate, T,
            nseconds, use_pbar):
    """
    Annotate each parsed chromosome with likelihoods and posteriors.

    A header row (g_output_header_row) is written first, then for each
    chromosome the rows returned by get_rows_of_strings() are written
    after annotate_likelihoods and annotate_posteriors have been called.
    NOTE(review): this excerpt ends inside the KeyboardInterrupt handler, so
    the code that returns the csv text, and any further exception handlers,
    are not visible here.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param T: a transition matrix relating the hidden states
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the three models
    # (homozygous and heterozygous use good_coverage, overcovered uses
    # bad_coverage)
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line
    # (g_output_header_row is a module-level name defined outside this block)
    print >> out, ','.join(g_output_header_row)
    # prepare to annotate the chromosomes
    # (the bar advances once per chromosome)
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes))
    # annotate the chromosomes using the models
    try:
        for i, chromosome in enumerate(chromosomes):
            # the time limit is checked once per chromosome;
            # a falsy nseconds (None or 0) disables the check
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            # likelihoods are annotated before posteriors
            chromosome.annotate_likelihoods(models)
            chromosome.annotate_posteriors(T, models)
            # emit one comma-separated line per row of this chromosome
            print >> out, '\n'.join(
                ','.join(row) for row in chromosome.get_rows_of_strings())
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt, e:
        # Python 2 except syntax; close the progress bar before letting
        # control-c propagate to the caller
        if pbar:
            pbar.finish()
        raise e
Exemplo n.º 7
0
 def test_heterozygous_distribution_compatibility(self):
     # get_heterozygous_distributions(4, p) must be numerically close to
     # ReadCoverage.get_heterozygous_distributions(p).
     prob = .1
     expected = ReadCoverage.get_heterozygous_distributions(prob)
     actual = get_heterozygous_distributions(4, prob)
     all_close = np.allclose(expected, actual)
     self.assertTrue(all_close)
Exemplo n.º 8
0
def process(input_lines, good_coverage, bad_coverage, randomization_rate,
            nseconds, use_pbar):
    """
    Annotate per-strain coverage columns with three model log likelihoods.

    The input is a header line followed by groups of five data lines, one
    group per genomic position.  Each row has five leading columns followed
    by a (coverage, score) column pair per strain.  In the output, each
    coverage column is followed by three annotation columns (hom, het, ovr)
    and then the score column.
    NOTE(review): this excerpt ends inside the KeyboardInterrupt handler, so
    the code that returns the csv text, and any further exception handlers,
    are not visible here.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the annotated csv file
    """
    # not read anywhere in the visible excerpt
    verbose = False
    # validate the number of lines
    # (one header line plus at least one group of five data lines)
    if len(input_lines) < 6:
        raise ValueError('there should be at least six lines of input')
    if len(input_lines) % 5 != 1:
        raise ValueError(
            'the input lines should consist of a header plus a multiple of five data lines'
        )
    # break the lines of input into rows of elements
    input_rows = [line_to_row(line) for line in input_lines]
    # validate the columns of data
    # (five leading columns plus at least one pair of columns, so the
    # total must be odd and at least seven)
    ncolumns = len(input_rows[0])
    if ncolumns < 7:
        raise ValueError('there should be at least seven columns of input')
    if ncolumns % 2 != 1:
        raise ValueError('the number of input columns should be odd')
    for row in input_rows:
        if len(row) != ncolumns:
            raise ValueError(
                'each row of input should have the same number of elements as the first row'
            )
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # initialize the output header row
    # (the first five headings are copied; each 'cov' heading gains three
    # annotation headings while each 'sco' heading is copied unchanged)
    header_row = input_rows[0]
    output_header_row = header_row[:5]
    for heading in header_row[5:]:
        if heading.endswith('sco'):
            output_header_row.append(heading)
        elif heading.endswith('cov'):
            output_header_row.extend([
                heading, heading + '_hom', heading + '_het', heading + '_ovr'
            ])
        else:
            raise ValueError(
                'each heading after the fifth should end with sco or cov')
    # get the rest of the rows
    data_rows = input_rows[1:]
    # define the number of genomic positions and the number of strains
    # NOTE(review): both divisions rely on Python 2 integer division; the
    # validation above guarantees that they are exact
    npositions = len(data_rows) / 5
    # nstrains is not read anywhere in the visible excerpt
    nstrains = (ncolumns - 5) / 2
    # begin the output
    out = StringIO()
    print >> out, ','.join(output_header_row)
    # initialize some stuff
    start_time = time.time()
    pbar = Progress.Bar(npositions) if use_pbar else None
    try:
        for position in range(npositions):
            # check the time
            # (a falsy nseconds, None or 0, disables the limit)
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            # get a chunk of five consecutive rows
            position_rows = [data_rows[position * 5 + i] for i in range(5)]
            # get the corresponding log likelihoods
            # (one entry per strain; a falsy entry is rendered as dashes)
            log_likelihood_lists = get_log_likelihoods_per_strain(
                position_rows, models)
            # construct five annotated output lines
            for position_row in position_rows:
                output_row = position_row[:5]
                for i, log_likelihoods in enumerate(log_likelihood_lists):
                    # add the coverage, three annotations, and the score
                    # (the coverage sits at column 5 + 2*i and the score
                    # immediately after it)
                    coverage_string = position_row[5 + 2 * i]
                    score_string = position_row[5 + 2 * i + 1]
                    if log_likelihoods:
                        annotations = [str(x) for x in log_likelihoods]
                    else:
                        annotations = ['-', '-', '-']
                    output_row.extend([coverage_string] + annotations +
                                      [score_string])
                print >> out, ','.join(output_row)
            # update the progress bar
            if pbar:
                pbar.update(position + 1)
    except KeyboardInterrupt, e:
        # Python 2 except syntax; close the progress bar before letting
        # control-c propagate to the caller
        if pbar:
            pbar.finish()
        raise e
Exemplo n.º 9
0
import shutil

# Count the lines of the input file with `wc -l`.
# The output is split on runs of whitespace because some `wc` implementations
# (e.g. BSD/macOS) pad the count with leading spaces, in which case
# split(' ')[0] would be the empty string and int() would fail.
num_lines = int(
    subprocess.check_output(['wc', '-l', args.inFile]).decode().split()[0])

print("lines: %u" % num_lines)

# Start from an empty output directory.
# rmtree raises when the directory does not exist (e.g. on the first run),
# so it is only removed when it is actually there.
if os.path.isdir(args.out_dir):
    shutil.rmtree(args.out_dir)

os.mkdir(args.out_dir)

# Every line of the input file, including the first, is turned into a
# ReadCoverage object; two figures are written per line.
with open(args.inFile, 'r') as f:
    for line in f:
        cs = ReadCoverage(line, args)

        print("len(y) %u" % cs.aln_centres.shape[2])
        print("plotting chr %u, pos %u" % (cs.chromosome, cs.pos))
        # NOTE(review): "%3f" is a minimum width of 3 with the default six
        # decimals; "%.3f" may have been intended -- left unchanged
        print("centre of mut reads: %3f" % (cs.expCentreMut))
        print("centre of all reads: %3f" % (cs.expCentre))
        cs.plot()
        cs.print_plot(args.out_dir, 'coverage')
        # close all figures so they do not accumulate across input lines
        plt.close("all")

        cs.plot_scatters('snp_cov')
        cs.print_plot(args.out_dir, 'scatters')
        plt.close("all")