示例#1
0
def headers(samples, datasets):
    '''Build the statistics header row.

    The row starts with four fixed columns and is followed by one column
    per distinct split suffix found across all samples and datasets.

    Args:
        samples: iterable of sample names.
        datasets: iterable of dataset names, or a falsy value when there
            are none.

    Returns:
        A tuple ``(headers, splits_headers)``: ``headers`` is the complete
        list of columns, ``splits_headers`` is the list of split-derived
        columns only, sorted with ``Split.splitkey``.
    '''
    columns = ['Sample', 'Total reads', 'Mapped reads', 'Deduplicated reads']
    names = list(samples)
    if datasets:
        names.extend(datasets)
    suffixes = set()
    for name in names:
        # Drop the first len(name) + 1 characters of each split name
        # (presumably the name itself plus a separator -- confirm against
        # Split.splits) so that equal ranges collapse to a single column.
        prefix_length = len(name) + 1
        suffixes.update(
            split[prefix_length:] for split in Split.splits(name))
    splits_headers = sorted(suffixes, key=Split.splitkey)
    columns.extend(splits_headers)
    return (columns, splits_headers)
def sample_splits_prepgenomecov(sample):
    '''Run the genome coverage preparation on a sample and on its splits.

    Prints a progress message, calls ``prepare_genome_coverage_sample`` on
    the sample itself, then on every name returned by ``sb.splits(sample)``.
    '''
    print('Compute genome coverage on sample {}'.format(sample))
    prepare_genome_coverage_sample(sample)
    # The splits are looked up only after the sample itself is processed.
    for split_name in sb.splits(sample):
        prepare_genome_coverage_sample(split_name)
def sample_splits_genome_coverage(sample,
                                  genome,
                                  scale=None,
                                  strand=None,
                                  input_suffix='',
                                  output_suffix='-cov',
                                  spike_suffix=None,
                                  control_suffix=None,
                                  genomecov_args=()):
    '''Compute genome coverage for a sample and for each of its splits.

    Prints a progress message, then calls ``genome_coverage`` with the same
    options first on the sample and then on every name returned by
    ``Split.splits(sample)``. All parameters other than ``sample`` are
    forwarded to ``genome_coverage`` unchanged.
    '''
    print('Computing genome coverage on sample {}'.format(sample))
    # Options shared by the sample call and every split call.
    shared_args = (genome, scale, strand, input_suffix, output_suffix,
                   spike_suffix, control_suffix, genomecov_args)
    genome_coverage(sample, *shared_args)
    for split_name in Split.splits(sample):
        genome_coverage(split_name, *shared_args)
示例#4
0
def center_annotations_sample_splits(sample,
                                     input_suffix='',
                                     output_suffix='-forcov'):
    '''Center annotations for a sample and for each of its splits.

    Prints a progress message, then calls ``center_annotations_sample``
    with the given suffixes on the sample and on every name returned by
    ``sb.splits(sample)``.
    '''
    print('Center annotations on sample {}'.format(sample))
    suffixes = (input_suffix, output_suffix)
    center_annotations_sample(sample, *suffixes)
    for split_name in sb.splits(sample):
        center_annotations_sample(split_name, *suffixes)
示例#5
0
def dyad_coverage(samples,
                  genes='genes.txt',
                  selection=None,
                  absolute=False,
                  minp=-75,
                  maxp=75,
                  smoothing=None,
                  suffix=None,
                  index=None):
    '''Finds the distribution of distances between fragments and dyad.

    Args:
        samples: passed to ``Parser.first`` to obtain the sample names.
        genes: tab-separated file of gene information; lines starting with
            ``#`` are ignored.
        selection: optional; when truthy, passed to ``Parser.first`` and
            only genes whose third column is in the result are kept.
        absolute, minp, maxp, smoothing, suffix: forwarded unchanged to
            ``dyad_coverage_sample``.
        index: optional position in the sample list; when given, only that
            sample (and its splits) is processed.
    '''
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    # Rows whose seventh column equals -1 are discarded (presumably genes
    # without a known dyad position -- confirm against the genes file).
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    if selection:
        selection_genes = Parser.first(selection)
        genes_info = genes_info[genes_info[genes_info.columns[2]].isin(
            selection_genes)]
    sample_names = Parser.first(samples)
    # Identity test so that index 0 is honoured and no __ne__ is invoked.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        dyad_coverage_sample(sample, genes_info, absolute, minp, maxp, suffix,
                             smoothing)
        for split in sb.splits(sample):
            dyad_coverage_sample(split, genes_info, absolute, minp, maxp,
                                 suffix, smoothing)
示例#6
0
def fit_gaussian(samples='samples.txt',
                 absolute=False,
                 components=False,
                 svg=False,
                 verbose=False,
                 center=None,
                 cmin=None,
                 cmax=None,
                 amp=None,
                 amin=None,
                 sigma=None,
                 smin=None,
                 suffix=None,
                 index=None):
    '''Fits gaussian curve to dyad coverage.

    Args:
        samples: passed to ``Parser.first`` to obtain the sample names.
        absolute, components, svg, verbose, center, cmin, cmax, amp, amin,
            sigma, smin, suffix: forwarded unchanged to
            ``fit_gaussian_sample``.
        index: optional position in the sample list; when given, only that
            sample (and its splits) is processed.
    '''
    sample_names = Parser.first(samples)
    # Identity test so that index 0 is honoured and no __ne__ is invoked.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_gaussian_sample(sample, absolute, components, svg, verbose, center,
                            cmin, cmax, amp, amin, sigma, smin, suffix)
        for split in sb.splits(sample):
            fit_gaussian_sample(split, absolute, components, svg, verbose,
                                center, cmin, cmax, amp, amin, sigma, smin,
                                suffix)
def ignore_strand_sample_splits(sample,
                                input_suffix='',
                                output_suffix='-forcov'):
    '''Run ``ignore_strand_sample`` on a sample and on each of its splits.

    Prints a progress message, then applies ``ignore_strand_sample`` with
    the given suffixes to the sample and to every name returned by
    ``sb.splits(sample)``.
    '''
    print('Duplicate annotations on other strand on sample {}'.format(sample))
    suffixes = (input_suffix, output_suffix)
    ignore_strand_sample(sample, *suffixes)
    for split_name in sb.splits(sample):
        ignore_strand_sample(split_name, *suffixes)
示例#8
0
def test_splits_2(testdir, mock_testclass):
    '''splits() lists four non-adjacent split files in the expected order.'''
    sample = 'POLR2A'
    ranges = ('-100-150', '-200-250', '-300-350', '-400-450')
    expected = [sample + bounds for bounds in ranges]
    # Create one empty BED file per expected split.
    for name in expected:
        Path(name + '.bed').touch()
    splits = s.splits(sample)
    for position, name in enumerate(expected):
        assert splits[position] == name
    assert len(splits) == 4
示例#9
0
def test_splits(testdir, mock_testclass):
    '''splits() lists four adjacent split files in the expected order.'''
    sample = 'POLR2A'
    ranges = ('-100-110', '-110-120', '-120-130', '-130-140')
    expected = [sample + bounds for bounds in ranges]
    # Create one empty BED file per expected split.
    for name in expected:
        Path(name + '.bed').touch()
    splits = s.splits(sample)
    for position, name in enumerate(expected):
        assert splits[position] == name
    assert len(splits) == 4
示例#10
0
def dyadcov(samples, genes, minp, maxp, smoothing, index):
    '''Finds the distribution of distances between fragments and dyad.

    Args:
        samples: tab-separated file without header; the first column holds
            the sample names. Lines starting with ``#`` are ignored.
        genes: tab-separated file of gene information; lines starting with
            ``#`` are ignored.
        minp, maxp, smoothing: forwarded unchanged to
            ``dyad_coverage_sample``.
        index: when not None, only the sample at this index (and its
            splits) is processed.
    '''
    logging.basicConfig(filename='debug.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    # Rows whose seventh column equals -1 are discarded (presumably genes
    # without a known dyad position -- confirm against the genes file).
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    sample_names = pd.read_csv(samples, header=None, sep='\t', comment='#')[0]
    # Identity test so that index 0 is honoured and no __ne__ is invoked.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        dyad_coverage_sample(sample, genes_info, minp, maxp, smoothing)
        for split in sb.splits(sample):
            dyad_coverage_sample(split, genes_info, minp, maxp, smoothing)
示例#11
0
def fitgaussian(samples, components, svg, verbose, center, cmin, cmax, amp,
                amin, sigma, smin, index):
    '''Fits gaussian curve to dyad coverage.

    Args:
        samples: tab-separated file without header; the first column holds
            the sample names. Lines starting with ``#`` are ignored.
        components, svg, verbose, center, cmin, cmax, amp, amin, sigma,
            smin: forwarded unchanged to ``fitgaussian_sample``.
        index: when not None, only the sample at this index (and its
            splits) is processed.
    '''
    logging.basicConfig(filename='debug.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    sample_names = pd.read_csv(samples, header=None, sep='\t', comment='#')[0]
    # Identity test so that index 0 is honoured and no __ne__ is invoked.
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fitgaussian_sample(sample, components, svg, verbose, center, cmin,
                           cmax, amp, amin, sigma, smin)
        for split in sb.splits(sample):
            fitgaussian_sample(split, components, svg, verbose, center, cmin,
                               cmax, amp, amin, sigma, smin)
def fit_double_gaussian(samples='samples.txt',
                        absolute=False,
                        components=False,
                        gaussian=False,
                        svg=False,
                        verbose=False,
                        center1=None,
                        cmin1=None,
                        cmax1=None,
                        amp1=None,
                        amin1=None,
                        sigma1=None,
                        smin1=None,
                        center2=None,
                        cmin2=None,
                        cmax2=None,
                        amp2=None,
                        amin2=None,
                        sigma2=None,
                        smin2=None,
                        suffix=None,
                        index=None):
    '''Fits double gaussian curve to dyad coverage.

    Args:
        samples: passed to ``Parser.first`` to obtain the sample names.
        index: optional position in the sample list; when given, only that
            sample (and its splits) is processed.

    Every other parameter is forwarded unchanged, in declaration order, to
    ``fit_double_gaussian_sample``.
    '''
    sample_names = Parser.first(samples)
    # Identity test so that index 0 is honoured and no __ne__ is invoked.
    if index is not None:
        sample_names = [sample_names[index]]
    # Options shared by the sample call and every split call.
    fit_args = (absolute, components, gaussian, svg, verbose,
                center1, cmin1, cmax1, amp1, amin1, sigma1, smin1,
                center2, cmin2, cmax2, amp2, amin2, sigma2, smin2,
                suffix)
    for sample in sample_names:
        fit_double_gaussian_sample(sample, *fit_args)
        for split in sb.splits(sample):
            fit_double_gaussian_sample(split, *fit_args)
示例#13
0
def test_splits_none(testdir, mock_testclass):
    '''splits() returns an empty result when no split file was created.'''
    found = s.splits('POLR2A')
    assert len(found) == 0