def output_counts(end_function, count_file):
    """Write a tab-separated table of per-file end counts as row fractions.

    ``config``, ``stage``, ``curr_files``, ``count_ends``, ``apply_seqio``,
    ``safe_makedir`` and ``_short_name`` are not parameters; they are read
    from the surrounding scope.

    Args:
        end_function: Callable handed to ``apply_seqio`` for every input
            file (presumably extracts the read end of interest from each
            record -- confirm against the callers).
        count_file: File name of the table, placed inside the directory
            ``config["dir"]["results"]/<stage>``.

    Returns:
        The full path of the count table, whether it already existed or was
        written by this call.
    """
    outdir = os.path.join(config["dir"]["results"], stage)
    safe_makedir(outdir)
    count_file = os.path.join(outdir, count_file)
    # An existing table is treated as up to date and is not regenerated.
    if os.path.exists(count_file):
        return count_file

    # One accumulator per input file: the apply_seqio output for each FASTQ
    # file is folded into a fresh dict with count_ends.
    counts = [
        reduce(count_ends, apply_seqio(x, end_function, kind="fastq"), {})
        for x in curr_files
    ]
    # Build the row labels as a real list so the index does not depend on
    # how a lazy map object is handled.
    row_names = [_short_name(x) for x in curr_files]
    df = pd.DataFrame(counts, index=row_names).astype(float)

    # Normalise every row by its own sum and keep the raw sum alongside it
    # in a "total" column.
    total = df.sum(axis=1)
    df = df.div(total, axis=0)
    df["total"] = total
    df.to_csv(count_file, sep="\t")
    # Return the path here too, so both branches give callers the same thing.
    return count_file
def _trim(fasta_file, input_files):
    """Write a copy of ``fasta_file`` with every record cut to its hit range.

    A lookup keyed by sequence id is built from ``input_files`` with
    ``_make_length_dict``; each entry must provide ``"qstart"`` and
    ``"qend"``. (Per the original notes the lookup presumably keeps one
    entry per sequence id, chosen by length -- verify in
    ``_make_length_dict``.)

    Args:
        fasta_file: Path of the FASTA file whose records are trimmed.
        input_files: Passed unchanged to ``_make_length_dict``.

    Returns:
        Path of the trimmed FASTA file, ``append_stem(fasta_file, "trimmed")``.

    A record whose id is missing from the lookup fails at the lookup itself
    (``KeyError`` for a plain dict).
    """
    hits = _make_length_dict(input_files)

    def trim_record(record):
        # The record is modified in place and handed back, as before.
        entry = hits[record.id]
        start = int(entry["qstart"])
        end = int(entry["qend"])
        # NOTE(review): if qstart/qend are 1-based inclusive BLAST
        # coordinates, this slice leaves out the first aligned base --
        # confirm what _make_length_dict stores before changing it.
        record.seq = record.seq[start:end]
        return record

    out_file = append_stem(fasta_file, "trimmed")
    # The context manager guarantees the output is flushed and closed; the
    # explicit loop makes sure the writes happen even where map() is lazy.
    with open(out_file, "w") as out_handle:
        for record in fasta.apply_seqio(fasta_file, trim_record, "fasta"):
            SeqIO.write(record, out_handle, "fasta")
    return out_file
def _make_combined_csv(fasta_file, input_files, org_names, out_file=None):
    """Merge per-organism blastn tables on ``qseqid`` and add the sequences.

    Every table in ``input_files`` is read with ``pd.read_table`` and paired
    positionally with the matching entry of ``org_names``. All of its
    columns get the suffix ``"_<org_name>"``, except that the shared key is
    renamed back to ``qseqid``. The tables are merged on ``qseqid`` with
    ``pd.merge`` (pandas' default join), reduced to the ``sseqid``,
    ``evalue``, ``length``, ``pident``, ``sstart`` and ``send`` columns of
    each organism, and merged with the ``qseqid``/``seq`` pairs read from
    ``fasta_file``.

    Args:
        fasta_file: FASTA file supplying the ``seq`` column.
        input_files: Paths of the tabular blastn outputs, one per organism.
        org_names: Organism names, in the same order as ``input_files``.
        out_file: Destination of the tab-separated result. If left as
            ``None``, ``DataFrame.to_csv`` returns the text instead of
            writing a file; that text is discarded here.

    Returns:
        ``out_file``, exactly as it was passed in.

    Raises:
        IndexError: If ``input_files`` or ``org_names`` is empty, because
            there is no first table to start the merge from.
    """
    # Materialise the suffixes: they are iterated several times below, which
    # a one-shot Python 3 map object would not survive.
    suffixes = ["_" + name for name in org_names]

    # Columns to keep: the join key plus six fields per organism, grouped
    # by field and then by organism.
    per_organism_fields = ("sseqid", "evalue", "length", "pident",
                           "sstart", "send")
    columns_to_keep = list(flatten(
        ["qseqid"]
        + [[field + suffix for suffix in suffixes]
           for field in per_organism_fields]))

    # Read each table and tag all of its columns with the organism suffix.
    suffixed = []
    for suffix, path in zip(suffixes, input_files):
        table = pd.read_table(path)
        suffixed.append(
            table.rename(columns=lambda name, suffix=suffix: name + suffix))

    # Restore the un-suffixed join key so the tables can be merged on it.
    restore_key = {"qseqid" + suffix: "qseqid" for suffix in suffixes}
    renamed = [table.rename(columns=restore_key) for table in suffixed]

    merged = reduce(lambda left, right: pd.merge(left, right, on="qseqid"),
                    renamed[1:], renamed[0])
    df_subset = merged[columns_to_keep]

    # Add the sequence from the FASTA file.
    seqs = pd.DataFrame(fasta.apply_seqio(
        fasta_file,
        lambda record: {'qseqid': record.id, 'seq': str(record.seq)},
        "fasta"))
    merged = pd.merge(df_subset, seqs, on="qseqid")
    merged.to_csv(out_file, index=False, sep="\t")
    return out_file