def gather_files(path, replicates, name):
    """Group the "annotated" files found under *path* by replicate prefix.

    Every entry produced by ``get_only_files(path)`` that contains the
    substring ``"annotated"`` is kept (the test is made against the whole
    returned string, not only the base name) and the kept entries are sorted.

    Args:
        path: Location handed to ``get_only_files``
            (presumably a directory -- confirm against that helper).
        replicates: Iterable of prefixes; a file is attached to a prefix when
            its base name starts with it. It is re-iterated for every file.
        name: Key under which the complete sorted list is stored.

    Returns:
        A ``defaultdict(list)`` mapping each matching prefix to its files,
        plus ``name`` mapped to the complete sorted list. A prefix equal to
        ``name`` is overwritten by the complete list.
    """
    annotated = sorted(x for x in get_only_files(path) if "annotated" in x)
    grouped = defaultdict(list)
    for file_path in annotated:
        base = os.path.basename(file_path)
        for prefix in replicates:
            if base.startswith(prefix):
                grouped[prefix].append(file_path)
    # Stored last so that it wins over a replicate prefix with the same key.
    grouped[name] = annotated
    return grouped
type=int, help="Flank length of the peak area around its highest point") parser.add_argument('--outdir', required=True, nargs='?', type=str, help="Path to the output directory") args = parser.parse_args() def check_interval(interval, mincov): return all( [float(x) > mincov for x in interval.attrs['topcoverage'].split(",")]) for path in get_only_files(args.path): if (path.endswith('gff') or path.endswith('bed')): bedtool = BedTool(path) if (len(bedtool)): with open(os.path.join(args.outdir, os.path.basename(path)), 'w') as f: for interval in bedtool: if (check_interval(interval, args.mincov)): center = int(interval.name) f.write( str( Interval(interval.chrom, center - args.flank, center + args.flank, name=interval.name, strand=interval.strand,
'--log', nargs='?', default=False, const=True, type=bool, help="If set, log2 transformation is apllied to the coverage") parser.add_argument('--labels', nargs='?', required=True, type=str, help="Path to the file with peak labels") args = parser.parse_args() NAME_ORDER = ['glu_wt', 'glu_ko_cyab', 'ace_glu_wt', 'ace_glu_ko_cyab'] files = [x for x in get_only_files(args.path) if "normalized" in x] name2files = defaultdict(list) for f in files: name = "_".join(os.path.basename(f).split("_")[:-1]) name2files[name].append(f) size = len(name2files) name2coverage = {} for name, local_files in name2files.items(): local_coverages = [list(coverage2dict(f).values())[0] for f in local_files] averaged_coverage = np.mean(local_coverages, axis=0) if (args.log): averaged_coverage = [np.log2(x + 1) for x in averaged_coverage] name2coverage[name] = averaged_coverage length_coverage = len(averaged_coverage)
for x in genes if x.strand == '+'] tss_list.extend([(x, fasta[x.chrom].seq[x.end - downstream:x.end + upstream]) for x in genes if x.strand == '-']) for tss, seq in tss_list: for adstart, adend, lseq, at_fraction, gc_count in get_at_rich_stretches( seq, 5, 20, 4, 0.4): stretches.append( (tss.chrom, tss.strand, tss.start + adstart, tss.start + adend, lseq, stretch_score(at_fraction, adend, adstart), seq)) stretches.sort(key=lambda x: x[5], reverse=True) return stretches[:top] raw_files = sorted(get_only_files(args.path)) files_list = [(os.path.basename(x[1][0]).split(".")[0], x[1][0], x[1][1]) for x in enumerate(zip(raw_files, raw_files[1:])) if x[0] % 2 == 0] #sys.exit() for name, fasta_path, gff_path in files_list[:]: genome = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta")) genes = BedTool(gff_path) with open(os.path.join(args.outdir, "%s.at_stretches.tsv" % name), 'w') as f: f.write("chromosome\tstrand\tstart\tstop\tseq\tscore\tall_tss_seq\n") for el in get_tss_at_contents(genes, genome, args.upstream, args.downstream, args.top): f.write("%s\t%s\t%d\t%d\t%s\t%1.2f\t%s\n" % el)
# Command-line interface: folder with reads, sample table and output folder.
parser.add_argument(
    'path',
    metavar='N',
    nargs='?',
    type=str,
    help="Path to the folder with reads",
)
parser.add_argument(
    '--table',
    nargs='?',
    type=os.path.abspath,
    required=True,
    help="Path to the sample table, tsv format",
)
parser.add_argument(
    '--outdir',
    nargs='?',
    type=str,
    required=True,
    help="Path to the output directory",
)
args = parser.parse_args()

# Read the tab-separated sample table. Column 0 and column 1 (with spaces and
# dots removed) are shared by the two sample names on the row: column 2 is
# registered as a "chap" sample and column 3 as a "control" sample.
sample2type = {}
with open(args.table) as table:
    for row in table:
        fields = row.strip().split("\t")
        condition = fields[0]
        timepoint = fields[1].replace(" ", "").replace(".", "")
        sample2type[fields[2]] = (condition, timepoint, "chap")
        sample2type[fields[3]] = (condition, timepoint, "control")

# Create one "<condition>_<type>" output folder per condition and sample type.
conditions = {entry[0] for entry in sample2type.values()}
for cond in conditions:
    for type_ in ('chap', 'control'):
        folder = os.path.join(args.outdir, "%s_%s" % (cond, type_))
        Path(folder).mkdir(parents=True, exist_ok=True)

# Copy each fastq file into its folder, renamed to "<time>.<mate>.fastq".
# The base name must consist of exactly three dot-separated parts, the first
# being a sample name present in the table.
for source in get_only_files(args.path):
    if not source.endswith('fastq'):
        continue
    name, mate, _ = os.path.basename(source).split(".")
    cond, timepoint, type_ = sample2type[name]
    target_dir = os.path.join(args.outdir, "%s_%s" % (cond, type_))
    copyfile(source, os.path.join(target_dir, "%s.%s.fastq" % (timepoint, mate)))
parser = argparse.ArgumentParser(description='Detects nondepleted rRNA regions beased on the genomic coverage'); parser.add_argument('path', metavar = 'N', nargs = '?', type = str, help = "Path to the coverage folder"); parser.add_argument('--rrna', nargs = '?', required=True, type = str, help = "Path to the rRNA, gff file"); parser.add_argument('--genome', nargs = '?', required=True, type = str, help = "Path to the genome, fasta file"); parser.add_argument('--minfraction', nargs = '?', default=1, type = float, help = "Minimal required fraction/multiplier (of the mean rRNA coverage) for a particular position to be counted as nondepleted"); parser.add_argument('--minlength', nargs = '?', default=20, type = float, help = "Minimal required length of non-depleted regions"); parser.add_argument('--outtype', nargs = '?', choices=['fa', 'tsv'], default='tsv', type = str, help = "Type of the output file, fasta or tsv"); args = parser.parse_args(); strand_conv = {'plus': '+', 'minus': '-'} genome = SeqIO.to_dict(SeqIO.parse(args.genome,'fasta')) #rrna = BedTool(args.rrna); files = get_only_files(args.path) strand2coverage = {} for f in files[:]: strand = strand_conv[f.split(".")[-2]] for chrom, cov in coverage2dict(f, cpos=2).items(): if( (chrom, strand) in strand2coverage): strand2coverage[(chrom, strand)] += cov else: strand2coverage[(chrom, strand)] = cov total_sum = sum([sum(x) for x in strand2coverage.values()]) #print(total_sum) rrna2coverage =[]; for interval in BedTool(args.rrna):
mlist.append('clean:\n\techo "nothing to clean."\n') return "\n\n".join(mlist) seq_package = os.path.join(args.package, 'sequencing') sample2name = {} with open(args.table) as f: for l in f: a = l.strip().split("\t") sample2name[a[0]] = a[1:] if (args.paired): name2sample = defaultdict(lambda: [None] * 2) for sample in get_only_files(args.path): a = sample2name.get(os.path.basename(sample)) if (a): name2sample[a[0]][int(a[1]) - 1] = sample else: sys.stderr.write( "Sample %s was not found in the provided table\n" % sample) else: name2sample = {} for sample in get_only_files(args.path): a = sample2name.get(os.path.basename(sample)) if (a): name2sample[a[0]] = sample else: sys.stderr.write( "Sample %s was not found in the provided table\n" % sample)
next(f) for l in f: a = l.strip().split("\t") start = int(a[1]) stop = int(a[2]) temp = ["%1.1f" % float(x) if x != 'None' else '0.0' for x in a[9:14]] interval = Interval("NC_003450.3", start, stop, "_".join(a[5:7] + temp), '0', '+') #print(interval.name) reference.append(interval) reference = BedTool(reference) overlap_counter = defaultdict(int) for folder in args.replicates: files = [x for x in get_only_files(folder) if "filtered" in x] replicates_list = [BedTool(x) for x in files] for r in reference.intersect(b=replicates_list, u=True): overlap_counter[(r.name)] += 1 #for km in replicates_list: #for r in reference.intersect(b=km, u = True): #overlap_counter[(r.start, r.stop)] += 1; for name, a in name2string.items(): #if(overlap_counter[name]==2): a.append(str(overlap_counter[name])) print("\t".join(a)) #for k, v in overlap_counter.items(): #m = name2string[k]
def get_tpms(path): return dict([(x.name, float(x.attrs['tpm'])) for x in BedTool(path)]) def line2score(l): return sum([sum(x) for x in l[1:]]) gene2annotation = dict([(x.attrs['ID'], x) for x in BedTool(args.annotation)]) #print(gene2annotation) label2file = defaultdict(list) for f in sorted(get_only_files(args.path)): label = "_".join(os.path.basename(f).split("_")[:-1]) label2file[label].append(f) if (args.order): label2file = [(x, label2file[x]) for x in args.order] else: label2file = list(sorted(label2file.items(), key=lambda x: x[0])) expression = [] labels = [] genes = set() for label, local_files in label2file: local_expr = [] for lf in local_files: #print(lf)
plt.xticks(xvals, xticks, rotation=45) plt.savefig(os.path.join(args.outdir, "%s.%s" % (name, args.format)), format=args.format) plt.clf() plt.close() if (args.mode == 'length'): original = [(int(x.name), float(x.score)) for x in BedTool(args.original)] original.sort(key=lambda x: x[0]) #for r1, r2 in split2chunks(original, 2): #print(abs(r1[0] - r2[0])) length2step = [] for path in [x for x in get_only_files(args.detected) if 'annotated' in x]: name = get_name(path, args.mode) xticks, yvals = process_detected(path, original) length2step.append((int(name), find_step_for_sample(yvals, xticks, name, args.detection_fraction))) draw_single(yvals, xticks, name) xlabel = 'Read length' elif (args.mode == 'ratio'): original_dict = {} for path in get_only_files(args.original): name = os.path.basename(path).split(".")[0][1:] original = [(int(x.name), float(x.score)) for x in BedTool(path)] original.sort(key=lambda x: x[0]) original_dict[name] = copy.copy(original)
import yaml from afbio.generators import get_only_files _confdir = os.path.dirname(os.path.realpath(__file__)) class LocalConfigError(Exception): pass #CONFIGS is used for shortcuts while calling configuration file #CONFIGS = {'samstat': 'samstat.yml', 'bedstat': 'bedstat.yml', 'chiflex': 'chiflex.yml', 'doublechiflex': 'doublechiflex.yml', 'chipchap': 'chipchap.yml', 'lrg': 'lrg.yml'} CONFIGS = {} for path in get_only_files(_confdir): CONFIGS[os.path.basename(path).split(".")[0]] = path def load_config(configuration): p = CONFIGS.get(configuration) if (not p): p = configuration with open(p, 'r') as f: d = yaml.load(f, Loader=yaml.FullLoader) if (isinstance(d, dict)): return d else: raise LocalConfigError( 'Config file %s is malformatted. It has to be convertible into dictionary'
def parse_file(path): with open(path) as f: labels = next(f).strip().split("\t")[1:] expression = [[float(y) for y in x.split(",")] for x in next(f).strip().split("\t")[1:]] variants, var_names = [], [] for l in f: a = l.strip().split("\t") variants.append([[float(y) for y in x.split(",")] for x in a[1:]]) var_names.append(a[0]) return labels, var_names, expression, variants normal_list, spurious_list = [], [] for path in [x for x in get_only_files(args.path) if x.endswith("tsv")]: name = os.path.basename(path).split(".")[0] labels, var_names, expression, variants = parse_file(path) res = check(labels, var_names, expression, variants, args.minexpr, args.mindiff, args.minfraction) if (res[0]): normal_list.append((name, res[0])) if (res[1]): spurious_list.append((name, res[1])) normal_list.sort(reverse=True, key=lambda x: x[1][-1]) spurious_list.sort(reverse=True, key=lambda x: x[1][-1]) with open(os.path.join(args.outdir, "normal.tsv"), 'w') as f: header = [ "gene", "time1", "time2", "tss", "score", "expression1", "expression2", "fraction1", "fraction2"