def combine_spikein(samples):
    """Combine per-sample spikein quantification files into a single tidy table.

    Samples without spikein counts are left out of the combination. If no
    sample has counts, ``samples`` is returned unchanged; otherwise every
    sample's spikein counts entry is pointed at the combined file.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    # partition yields (samples-without-counts, samples-with-counts);
    # only the second group is used, so discard the first explicitly
    _, to_combine = partition(dd.get_spikein_counts,
                              dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file,
                                                          samplename)
            # first frame is taken as-is; later frames are row-bound on
            df = new_df if df.empty else rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
def counts_spikein(data):
    """Quantify spikein sequences for one sample with salmon.

    Returns ``data`` unchanged when no spikein FASTA is configured;
    otherwise indexes the FASTA, quantifies the sample's reads against it
    and stores the resulting counts file on the sample data.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        # no spikeins configured for this sample; nothing to do
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    if dd.get_analysis(data).lower() == "smallrna-seq":
        # small RNA reads are very short, so index with a small fixed k-mer
        kmersize = 15
    else:
        readlength = fastq.estimate_read_length(fq1)
        # salmon requires an odd k-mer size
        if readlength % 2 == 0:
            readlength -= 1
        kmersize = min(readlength, 31)
    # log AFTER the final k-mer size is decided so the message matches the
    # value actually passed to the indexer (the original logged the
    # pre-smallRNA-override value)
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
def combine_spikein(samples):
    """Merge each sample's spikein count file into one tidy ``spikein.sf``.

    If no sample carries spikein counts, the input is returned untouched.
    Otherwise every sample is updated to reference the combined file.
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    # only the with-counts half of the partition is needed; name the
    # unused without-counts half "_" instead of a dead local
    _, to_combine = partition(dd.get_spikein_counts,
                              dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file,
                                                          samplename)
            df = new_df if df.empty else rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
def counts_spikein(data):
    """Run salmon quantification of spikein sequences for a single sample.

    When no spikein FASTA is configured the sample passes through
    unchanged; otherwise the counts file path is recorded on the sample.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        # nothing configured for spikeins; pass the sample through untouched
        return data
    files = dd.get_input_sequence_files(data)
    fq1 = files[0]
    fq2 = files[1] if len(files) == 2 else None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    # small RNA reads are short, so index with a smaller k-mer
    if dd.get_analysis(data).lower() == "smallrna-seq":
        kmer = 15
    else:
        kmer = 31
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    return dd.set_spikein_counts(data, out_file)
def counts_spikein(data):
    """Quantify spikein sequences for one sample with salmon.

    Returns ``data`` unchanged when no spikein FASTA is configured;
    otherwise indexes the FASTA at an appropriate k-mer size, quantifies
    the reads and records the counts file on the sample data.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        # no spikeins configured for this sample; nothing to do
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    if dd.get_analysis(data).lower() == "smallrna-seq":
        # small RNA reads are very short, so use a small fixed k-mer and
        # skip the (now unnecessary) read-length estimate
        kmersize = 15
    else:
        readlength = fastq.estimate_read_length(fq1)
        # salmon requires an odd k-mer size
        if readlength % 2 == 0:
            readlength -= 1
        kmersize = min(readlength, 31)
    # log only after the final value is chosen; previously the message was
    # emitted before the smallRNA override and could report an unused size
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data