def dyad_coverage(samples, genes='genes.txt', selection=None, absolute=False,
                  minp=-75, maxp=75, smoothing=None, suffix=None, index=None):
    '''Finds the distribution of distances between fragments and dyad.

    :param samples: file listing sample names (first column)
    :param genes: tab-separated genes file; lines starting with '#' are ignored
    :param selection: optional file whose first column restricts the genes kept
    :param index: when set, process only the sample at this position
    Other parameters are forwarded to dyad_coverage_sample unchanged.
    '''
    genes_info = pd.read_csv(genes, sep='\t', comment='#')
    # Drop genes flagged with -1 in the seventh column.
    genes_info = genes_info.loc[genes_info[genes_info.columns[6]] != -1]
    if selection:
        selection_genes = Parser.first(selection)
        genes_info = genes_info[genes_info[genes_info.columns[2]].isin(
            selection_genes)]
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        dyad_coverage_sample(sample, genes_info, absolute, minp, maxp, suffix,
                             smoothing)
        # Process each size-split variant of the sample as well.
        for split in sb.splits(sample):
            dyad_coverage_sample(split, genes_info, absolute, minp, maxp,
                                 suffix, smoothing)
def statistics_samples(samples='samples.txt', datasets='dataset.txt', output='statistics.txt'):
    '''Creates statistics file for samples.

    Dataset names are included only when the datasets file exists on disk.
    '''
    names = Parser.first(samples)
    dataset_names = Parser.first(datasets) if os.path.exists(datasets) else []
    compute_statistics(names, dataset_names, output)
def fit_gaussian(samples='samples.txt', absolute=False, components=False, svg=False,
                 verbose=False, center=None, cmin=None, cmax=None, amp=None,
                 amin=None, sigma=None, smin=None, suffix=None, index=None):
    '''Fits gaussian curve to dyad coverage.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    Other parameters are forwarded to fit_gaussian_sample unchanged.
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_gaussian_sample(sample, absolute, components, svg, verbose, center,
                            cmin, cmax, amp, amin, sigma, smin, suffix)
        # Fit each size-split variant of the sample as well.
        for split in sb.splits(sample):
            fit_gaussian_sample(split, absolute, components, svg, verbose, center,
                                cmin, cmax, amp, amin, sigma, smin, suffix)
def removesecondmate_samples(samples='samples.txt', input_suffix='-dedup',
                             output_suffix='-mate1', threads=None, index=None):
    '''Removes second mate from BAM.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        removesecondmate_sample(sample, input_suffix, output_suffix, threads)
def split_samples(samples='samples.txt', index=None, binlength=10,
                  binminlength=100, binmaxlength=500):
    '''Split BED files from samples based on length of annotations.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    :param binlength: width of each length bin
    :param binminlength: smallest annotation length binned
    :param binmaxlength: largest annotation length binned
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        split_sample(sample, binlength, binminlength, binmaxlength)
def genome_coverage_samples(samples='samples.txt', genome='sacCer3.chrom.sizes',
                            scale=None, strand=None, input_suffix='',
                            output_suffix='-cov', spike_suffix=None,
                            control_suffix=None, index=None, genomecov_args=()):
    '''Compute genome coverage on samples.

    :param samples: file listing sample names (first column)
    :param genome: chromosome sizes file passed to coverage computation
    :param index: when set, process only the sample at this position
    Other parameters are forwarded to sample_splits_genome_coverage unchanged.
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        sample_splits_genome_coverage(sample, genome, scale, strand, input_suffix,
                                      output_suffix, spike_suffix, control_suffix,
                                      genomecov_args)
def center_annotations_samples(samples='samples.txt', input_suffix='',
                               output_suffix='-forcov', index=None):
    '''Prepare BED file used for genome coverage on samples.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        center_annotations_sample_splits(sample, input_suffix, output_suffix)
def intersectannotations(input, annotations, output):
    '''Filter BED file to keep only annotations present in annotations.

    :param input: BED file to filter; the fourth column is the annotation name
    :param annotations: file whose first column lists the names to keep
    :param output: path of the filtered BED file to write
    '''
    logging.basicConfig(filename='seqtools.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    # Use a set so each membership test is O(1) instead of scanning a list.
    genes = set(Parser.first(annotations))
    incolumns = Parser.columns(input)
    with open(output, 'w') as outfile:
        for columns in incolumns:
            if columns[3] in genes:
                outfile.write('\t'.join(str(column) for column in columns))
                outfile.write('\n')
def plot2do_samples(file, input_suffix='', index=None, plot2do_args=()):
    '''Run plot2DO on samples.

    :param file: file listing sample names (first column); samples are resolved
        relative to this file's directory
    :param index: when set, process only the sample at this position
    :param plot2do_args: extra arguments forwarded to plot2do_sample
    '''
    file_parent = Path(file).parent
    sample_names = Parser.first(file)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        plot2do_sample(str(file_parent / sample), input_suffix, plot2do_args)
def test_columns():
    '''Checks that samples.txt parses into the expected name/run columns.'''
    samples_file = Path(__file__).parent.parent.joinpath('samples.txt')
    parsed = p.columns(samples_file)
    assert (parsed[0][0], parsed[0][1]) == ('POLR2A', 'SRR8518913')
    assert (parsed[1][0], parsed[1][1]) == ('ASDURF', 'SRX5322424')
    assert (parsed[2][0], parsed[2][1]) == ('POLR1C', 'SRR8518915')
def merge_datasets(datasets='dataset.txt', index=None):
    '''Merge BED files related to samples.

    :param datasets: file where each line is a dataset name followed by its
        sample names, tab-separated
    :param index: when set, process only the dataset at this position
    '''
    datasets_columns = Parser.columns(datasets)
    if index is not None:
        datasets_columns = [datasets_columns[index]]
    for columns in datasets_columns:
        name = columns[0]
        # Remaining columns are the sample names belonging to this dataset.
        merge_dataset(name, list(columns[1:]))
def bowtie_samples(samples='samples.txt', threads=None, output_suffix='',
                   index=None, bowtie_args=()):
    '''Align samples using bowtie2 program.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    :param bowtie_args: extra arguments forwarded to bowtie_sample
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        bowtie_sample(sample, threads, output_suffix, bowtie_args)
def merge_datasets(datasets='dataset.txt', sizes='sacCer3.chrom.sizes', index=None):
    '''Merge bigWig files related to samples.

    :param datasets: file where each line is a dataset name followed by its
        sample names, tab-separated
    :param sizes: chromosome sizes file forwarded to merge_dataset
    :param index: when set, process only the dataset at this position
    '''
    datasets_columns = Parser.columns(datasets)
    if index is not None:
        datasets_columns = [datasets_columns[index]]
    for columns in datasets_columns:
        name = columns[0]
        # Remaining columns are the sample names belonging to this dataset.
        merge_dataset(name, list(columns[1:]), sizes)
def bwa_samples(samples='samples.txt', fasta='sacCer3.fa', threads=None,
                output_suffix='', index=None, bwa_args=()):
    '''Align samples using bwa program.

    :param samples: file listing sample names (first column)
    :param fasta: reference FASTA forwarded to bwa_sample
    :param index: when set, process only the sample at this position
    :param bwa_args: extra arguments forwarded to bwa_sample
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        bwa_sample(sample, fasta, threads, output_suffix, bwa_args)
def shift_annotations_samples(samples='samples.txt', input_suffix='',
                              output_suffix='-forcov', index=None, bedtools_args=()):
    '''Moves annotations contained in BED files.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    :param bedtools_args: extra arguments forwarded to shift_annotations_sample
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        shift_annotations_sample(sample, input_suffix, output_suffix, bedtools_args)
def test_columns_merge():
    '''Checks that dataset.txt parses into the expected dataset/sample columns.'''
    dataset_file = Path(__file__).parent.parent.joinpath('dataset.txt')
    parsed = p.columns(dataset_file)
    assert (parsed[0][0], parsed[0][1], parsed[0][2]) == ('POLR2A', 'POLR2A_1', 'POLR2A_2')
    assert (parsed[1][0], parsed[1][1], parsed[1][2]) == ('ASDURF', 'ASDURF_1', 'ASDURF_2')
    assert (parsed[2][0], parsed[2][1], parsed[2][2]) == ('POLR1C', 'POLR1C_1', 'POLR1C_2')
def download_samples(samples='samples.txt', fast=True, threads=None, mem='100MB',
                     index=None):
    '''Download reads of all samples.

    :param samples: file where each line is a sample name optionally followed
        by its SRA accession, tab-separated
    :param index: when set, process only the sample at this position
    '''
    sample_columns = Parser.columns(samples)
    if index is not None:
        sample_columns = [sample_columns[index]]
    for columns in sample_columns:
        sample = columns[0]
        # Second column, when present, is the SRA accession.
        srr = columns[1] if len(columns) > 1 else None
        download_sample(sample, srr, fast, threads, mem)
def filter_bam(samples='samples.txt', paired=True, dedup=True, threads=None,
               input_suffix='', output_suffix='', index=None):
    '''Filter BAM file to keep only properly paired reads and remove
    supplementary alignments and duplicates.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        filter_bam_sample(sample, paired, dedup, threads, input_suffix,
                          output_suffix)
def fit_double_gaussian(samples='samples.txt', absolute=False, components=False,
                        gaussian=False, svg=False, verbose=False, center1=None,
                        cmin1=None, cmax1=None, amp1=None, amin1=None, sigma1=None,
                        smin1=None, center2=None, cmin2=None, cmax2=None, amp2=None,
                        amin2=None, sigma2=None, smin2=None, suffix=None, index=None):
    '''Fits double gaussian curve to dyad coverage.

    :param samples: file listing sample names (first column)
    :param index: when set, process only the sample at this position
    Curve parameters (center/cmin/cmax/amp/amin/sigma/smin, suffixed 1 and 2)
    are forwarded to fit_double_gaussian_sample unchanged.
    '''
    sample_names = Parser.first(samples)
    if index is not None:
        sample_names = [sample_names[index]]
    for sample in sample_names:
        fit_double_gaussian_sample(sample, absolute, components, gaussian, svg,
                                   verbose, center1, cmin1, cmax1, amp1, amin1,
                                   sigma1, smin1, center2, cmin2, cmax2, amp2,
                                   amin2, sigma2, smin2, suffix)
        # Fit each size-split variant of the sample as well.
        for split in sb.splits(sample):
            fit_double_gaussian_sample(split, absolute, components, gaussian, svg,
                                       verbose, center1, cmin1, cmax1, amp1, amin1,
                                       sigma1, smin1, center2, cmin2, cmax2, amp2,
                                       amin2, sigma2, smin2, suffix)
def merge_dataset(name, samples, sizes):
    '''Merge bigWig files related to samples.

    Sums per-base values of every "<sample>.bw" across the chromosomes listed
    in the sizes file, then writes the result as "<name>.bw".

    :param name: dataset name; also the stem of the output bigWig
    :param samples: sample names whose bigWigs are merged
    :param sizes: chromosome sizes file (name, size per line)
    '''
    print('Merging samples {} into dataset {}'.format(samples, name))
    sizes_columns = Parser.columns(sizes)
    bws = [pbw.open(sample + '.bw') for sample in samples]
    try:
        merge_temp_o, merge_temp = tempfile.mkstemp(suffix='.bed')
        # open() takes ownership of the descriptor and closes it on exit.
        with open(merge_temp_o, 'w') as output:
            output.write('track type=bedGraph name="' + name + '"\n')
            for size_columns in sizes_columns:
                chromosome = size_columns[0]
                # Sizes come from a parsed text file: convert before arithmetic.
                size = int(size_columns[1])
                sums = [0] * size
                for bw in bws:
                    bw_size = bw.chroms(chromosome) if bw.chroms(chromosome) else 0
                    if bw_size == 0:
                        continue
                    values = bw.values(chromosome, 0, min(size, bw_size))
                    # Accumulate in place so positions beyond a shorter bigWig
                    # keep their totals instead of being truncated away.
                    for i in range(min(size, bw_size)):
                        if not math.isnan(values[i]):
                            sums[i] += values[i]
                for i in range(len(sums)):
                    output.write('{}\t{}\t{}\t{}\n'.format(chromosome, i, i + 1,
                                                           sums[i]))
    finally:
        # Release bigWig handles even if merging fails.
        for bw in bws:
            bw.close()
    sort_temp_o, sort_temp = tempfile.mkstemp(suffix='.bed')
    os.close(sort_temp_o)  # only the path is needed; avoid leaking the descriptor
    Bed.sort(merge_temp, sort_temp)
    merged_bw = name + '.bw'
    Bed.bedgraph_to_bigwig(sort_temp, merged_bw, sizes)
    os.remove(sort_temp)
    os.remove(merge_temp)
def test_first_merge():
    '''Checks that the dataset names in dataset.txt are parsed in order.'''
    dataset_file = Path(__file__).parent.parent.joinpath('dataset.txt')
    expected = ['POLR2A', 'ASDURF', 'POLR1C']
    assert p.first(dataset_file) == expected