def load_data(hmmcopy_filename, alignment_filename, colnames):
    hmmcopy_data = csvutils.read_csv_and_yaml(hmmcopy_filename)
    alignment_data = csvutils.read_csv_and_yaml(alignment_filename)

    hmmcopy_data = hmmcopy_data.set_index('cell_id')
    alignment_data = alignment_data.set_index('cell_id')

    data = []
    for colname in colnames:
        if colname in hmmcopy_data:
            coldata = hmmcopy_data[colname]
        else:
            coldata = alignment_data[colname]

        if colname == 'scaled_halfiness':
            # haploid poison adds inf; replace with a big number since 0 is
            # considered good and we want the score to decrease
            coldata = coldata.replace(np.inf, 1e10)

        data.append(coldata)

    data = pd.concat(data, axis=1)

    data = data.replace(-np.inf, np.nan)
    data = data.fillna(0)

    return data
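# Hedged sketch of the column-selection logic in load_data above, on in-memory tables
# (pd/np assumed to be imported as in the surrounding snippets; the column names and
# values are made up): columns are pulled from the hmmcopy metrics when present,
# otherwise from the alignment metrics, and scaled_halfiness inf values are capped.
def _load_data_example():
    hmmcopy = pd.DataFrame({'scaled_halfiness': [0.1, np.inf]}, index=['c1', 'c2'])
    alignment = pd.DataFrame({'coverage_depth': [0.02, 0.03]}, index=['c1', 'c2'])

    cols = []
    for colname in ['scaled_halfiness', 'coverage_depth']:
        coldata = hmmcopy[colname] if colname in hmmcopy else alignment[colname]
        if colname == 'scaled_halfiness':
            coldata = coldata.replace(np.inf, 1e10)
        cols.append(coldata)

    return pd.concat(cols, axis=1).replace(-np.inf, np.nan).fillna(0)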
def compare_annotation(annotation, refannotation):
    annotation = csvutils.read_csv_and_yaml(annotation)
    refannotation = csvutils.read_csv_and_yaml(refannotation)

    common_cols = _check_for_missing_cols(annotation, refannotation)

    for col in common_cols:
        ann = annotation[col].dropna()
        ref = refannotation[col].dropna()
        assert set(ann) == set(ref)
def test_breakpoint_calling(args):
    output_path = args[1]
    ref_path = args[2]

    ref_strelka, ref_museq, ref_snpeff = get_inputs(ref_path)
    strelka, museq, snpeff = get_inputs(output_path)

    compare.compare_variant_calls(ref_snpeff, snpeff)

    ref_strelka = csvutils.read_csv_and_yaml(ref_strelka)
    strelka = csvutils.read_csv_and_yaml(strelka)
    assert ref_strelka.empty and strelka.empty
def read_data(filename, tablename, gzipped=True):
    fileformat = single_cell.utils.helpers.get_file_format(filename)
    if fileformat == 'h5':
        data = read_from_h5(filename, tablename)
    elif fileformat == 'csv':
        data = csvutils.read_csv_and_yaml(filename)
    elif fileformat == 'gzip':
        data = csvutils.read_csv_and_yaml(filename)
    else:
        raise Exception("unknown file format")
    return data
def cell_cycle_classifier(
        hmmcopy_reads, hmmcopy_metrics, alignment_metrics, output, tempdir,
        docker_image=None
):
    helpers.makedirs(tempdir)

    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier',
        'train-classify',
        hmmcopy_reads,
        hmmcopy_metrics,
        alignment_metrics,
        temp_output
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    cell_cycle_df = pd.read_csv(temp_output)

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)
    hmm_metrics_df = hmm_metrics_df.merge(cell_cycle_df, on=['cell_id'], how='outer')

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output)
def add_corrupt_tree_order(corrupt_tree, metrics, output):
    """adds corrupt tree order to metrics"""
    with open(corrupt_tree) as newickfile:
        newickdata = newickfile.readline()
        assert newickfile.readline() == ''

    tree = Tree(newickdata, format=1)

    leaves = [node.name for node in tree.traverse("levelorder")]
    leaves = [val[len('cell_'):] for val in leaves if val.startswith("cell_")]

    ordering = {val: i for i, val in enumerate(leaves)}

    metrics = csvutils.read_csv_and_yaml(metrics)
    cells = metrics.cell_id

    for cellid in cells:
        order = ordering.get(cellid, float('nan'))
        metrics.loc[metrics["cell_id"] == cellid, "order_corrupt_tree"] = order

    csvutils.write_dataframe_to_csv_and_yaml(metrics, output, write_header=True)
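# Hedged sketch of the leaf-ordering logic in add_corrupt_tree_order, on a tiny
# hypothetical newick string; assumes ete3's Tree (as used above) and the same
# 'cell_' prefix convention for leaf names.
def _corrupt_tree_order_example():
    newick = "((cell_A:1,cell_B:1)node_1:1,cell_C:2);"
    tree = Tree(newick, format=1)

    # level-order traversal, then strip the 'cell_' prefix from cell leaf names
    leaves = [node.name for node in tree.traverse("levelorder")]
    leaves = [val[len('cell_'):] for val in leaves if val.startswith("cell_")]

    return {val: i for i, val in enumerate(leaves)}   # e.g. {'C': 0, 'A': 1, 'B': 2}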
def add_contamination_status(infile, outfile, config, reference='grch37', threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)
    data = data.set_index('cell_id', drop=False)

    organisms = [genome['name'] for genome in config['genomes']]

    if reference not in organisms:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in organisms if col != reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(data, outfile, dtypes()['metrics'])
def cell_cycle_classifier(
        hmmcopy_reads, hmmcopy_metrics, alignment_metrics, output, tempdir,
        genome_labels
):
    helpers.makedirs(tempdir)

    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier',
        'train-classify',
        hmmcopy_reads,
        hmmcopy_metrics,
        alignment_metrics,
        temp_output
    ]

    pypeliner.commandline.execute(*cmd)

    cell_cycle_df = pd.read_csv(temp_output)
    cols_cell_cycle = cell_cycle_df.columns.values

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)
    hmm_metrics_df = hmm_metrics_df.merge(cell_cycle_df, on=['cell_id'], how='outer')

    out_dtypes = dtypes(genome_labels)
    for colname in cols_cell_cycle:
        hmm_metrics_df[colname] = hmm_metrics_df[colname].astype(out_dtypes[colname])

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output, out_dtypes)
def filter_plot_tar(metrics, src_tar, pass_tar, fail_tar, tempdir, filters):
    allplots = os.path.join(tempdir, 'allplots')
    helpers.makedirs(allplots)
    helpers.extract_tar(src_tar, allplots)

    metrics_data = csvutils.read_csv_and_yaml(metrics)
    all_cells = metrics_data.cell_id.tolist()

    metrics_data = helpers.filter_metrics(metrics_data, filters)
    good_cells = metrics_data.cell_id.tolist()
    bad_cells = [cell for cell in all_cells if cell not in good_cells]

    plotdir = os.path.join(tempdir, 'segs_pass')
    helpers.makedirs(plotdir)
    for cell in good_cells:
        src_path = os.path.join(allplots, 'segments', '{}_{}.png'.format(cell, 'segments'))
        dest_path = os.path.join(plotdir, '{}_{}.png'.format(cell, 'segments'))
        shutil.copyfile(src_path, dest_path)
    helpers.make_tarfile(pass_tar, plotdir)

    plotdir = os.path.join(tempdir, 'segs_fail')
    helpers.makedirs(plotdir)
    for cell in bad_cells:
        src_path = os.path.join(allplots, 'segments', '{}_{}.png'.format(cell, 'segments'))
        dest_path = os.path.join(plotdir, '{}_{}.png'.format(cell, 'segments'))
        shutil.copyfile(src_path, dest_path)
    helpers.make_tarfile(fail_tar, plotdir)
def test_breakpoint_calling(args):
    output_path = args[1]
    ref_path = args[2]

    ref_must_exist, ref_lumpy, ref_destruct = get_inputs(ref_path)
    must_exist, lumpy, destruct = get_inputs(output_path)

    assert all(map(os.path.exists, ref_must_exist))
    assert all(map(os.path.exists, must_exist))

    compare.compare_breakpoint_calls(ref_lumpy, lumpy)

    ref_destruct = csvutils.read_csv_and_yaml(ref_destruct)
    destruct = csvutils.read_csv_and_yaml(destruct)
    assert ref_destruct.empty and destruct.empty
def load_data(infile, gc=False):
    df = csvutils.read_csv_and_yaml(infile)

    if gc:
        df.index = df.cell_id
        del df['cell_id']
        df.columns = [int(colname) for colname in df.columns.values]

    return df
def _load(file, by, reindex=False):
    loaded = csvutils.read_csv_and_yaml(file)
    loaded = loaded.sort_values(by, ascending=[True] * len(by))

    if reindex:
        loaded = loaded.set_index(by)

    return loaded
def get_good_cells(metrics, cell_filters):
    metrics_data = csvutils.read_csv_and_yaml(metrics)

    if not cell_filters:
        return metrics_data.cell_id.tolist()

    metrics_data = helpers.filter_metrics(metrics_data, cell_filters)

    return metrics_data.cell_id.tolist()
def load_hmmcopy_reads_data(readsfile):
    keepcols = ['ideal', 'valid', 'gc', 'map', 'state', 'cor_gc', 'copy']

    reads = csvutils.read_csv_and_yaml(readsfile)
    reads = reads.set_index(['cell_id', 'chr', 'start', 'end'])

    reads = reads[keepcols]

    return reads
def get_mappability_col(reads, annotated_reads):
    reads = csvutils.read_csv_and_yaml(reads, chunksize=100)

    alldata = []
    for read_data in reads:
        read_data['is_low_mappability'] = (read_data['map'] <= 0.9)
        alldata.append(read_data)

    alldata = pd.concat(alldata)

    csvutils.write_dataframe_to_csv_and_yaml(
        alldata, annotated_reads, dtypes()['reads'], write_header=True
    )
def write_to_output(hmmcopy_filename, output, predictions):
    data = csvutils.read_csv_and_yaml(hmmcopy_filename)

    data['quality'] = data['cell_id'].map(predictions)
    data.quality = data.quality.astype(float)

    fileformat = single_cell.utils.helpers.get_file_format(output)

    if fileformat == 'csv':
        write_to_csv(output, data)
    elif fileformat == "gzip":
        write_to_csv(output, data, gzipped=True)
    else:
        raise Exception("unknown file format")
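# Hedged sketch of the quality-mapping step in write_to_output: predictions is assumed
# to be a dict keyed by cell_id, and cells without a prediction end up with NaN quality
# (pd assumed imported as in the surrounding snippets; values are illustrative).
def _quality_map_example():
    demo = pd.DataFrame({'cell_id': ['c1', 'c2', 'c3']})
    predictions = {'c1': 0.95, 'c2': 0.10}   # hypothetical classifier output

    demo['quality'] = demo['cell_id'].map(predictions).astype(float)
    return demo   # c3 gets NaN quality because it has no prediction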
def annotate_metrics(metrics, output, sample_info, cells):
    """adds sample information to the metrics table and writes the result to output"""
    metrics = csvutils.read_csv_and_yaml(metrics)

    for cellid in cells:
        cellinfo = sample_info[cellid]
        for colname, value in cellinfo.items():
            metrics.loc[metrics["cell_id"] == cellid, colname] = value

    csvutils.write_dataframe_to_csv_and_yaml(metrics, output)
def get_hierarchical_clustering_order(reads_filename, chromosomes=None):
    data = []

    chunksize = 10 ** 5
    for chunk in csvutils.read_csv_and_yaml(reads_filename, chunksize=chunksize):
        chunk["bin"] = list(zip(chunk.chr, chunk.start, chunk.end))

        # pivot doesn't handle an Int64 state column, so cast to float first
        chunk['state'] = chunk['state'].astype('float')

        chunk = chunk.pivot(index='cell_id', columns='bin', values='state')

        data.append(chunk)

    # merge chunks, sum cells that get split across chunks
    table = pd.concat(data)
    table = table.groupby(table.index).sum()

    bins = pd.DataFrame(
        table.columns.values.tolist(),
        columns=['chr', 'start', 'end']
    )
    bins['chr'] = bins['chr'].astype(str)
    bins = sort_bins(bins, chromosomes)

    table = table.sort_values(bins, axis=0)

    data_mat = np.array(table.values)
    data_mat[np.isnan(data_mat)] = -1

    row_linkage = hc.linkage(
        sp.distance.pdist(data_mat, 'cityblock'), method='ward')
    order = hc.leaves_list(row_linkage)

    samps = table.index
    order = [samps[i] for i in order]
    order = {v: i for i, v in enumerate(order)}

    return order
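# Hedged sketch of the leaf-ordering step in get_hierarchical_clustering_order, on a
# toy copy-number matrix: cityblock distances plus Ward linkage, then leaves_list gives
# the row order. hc/sp/np are assumed to be scipy.cluster.hierarchy, scipy.spatial and
# numpy, as in the function above; the cell names and values are made up.
def _clustering_order_example():
    toy = np.array([
        [2, 2, 2, 2],   # cell_a
        [2, 2, 3, 3],   # cell_b
        [4, 4, 4, 4],   # cell_c
    ], dtype=float)

    linkage = hc.linkage(sp.distance.pdist(toy, 'cityblock'), method='ward')
    leaf_order = hc.leaves_list(linkage)

    cells = ['cell_a', 'cell_b', 'cell_c']
    return {cells[i]: rank for rank, i in enumerate(leaf_order)}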
def test_contamination(tmpdir):
    data = {}

    cols = [
        'fastqscreen_nohit',
        'fastqscreen_grch37', 'fastqscreen_grch37_multihit',
        'fastqscreen_mm10', 'fastqscreen_mm10_multihit',
        'fastqscreen_salmon', 'fastqscreen_salmon_multihit'
    ]

    for i in range(5):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = i * 10
        data[i]['fastqscreen_grch37'] = i * 1000
        data[i]['fastqscreen_mm10'] = i * 100

    for i in range(5, 10):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = i * 10
        data[i]['fastqscreen_grch37'] = i * 1000

    data = pd.DataFrame.from_dict(data, orient='index')
    data['total_reads'] = data[cols].sum(axis=1)

    dtypes = {col: 'int' for col in cols}
    dtypes['cell_id'] = 'str'
    dtypes['total_reads'] = 'int'

    infile = os.path.join(tmpdir, 'input.csv.gz')
    outfile = os.path.join(tmpdir, 'output.csv.gz')

    csvutils.write_dataframe_to_csv_and_yaml(data, infile, dtypes)

    config = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}, {'name': 'salmon'}]}

    tasks.add_contamination_status(infile, outfile, config)

    output = csvutils.read_csv_and_yaml(outfile)

    assert output['is_contaminated'].tolist() == [False] + [True] * 4 + [False] * 5
def read_input_data(self, infile, tablename):
    fileformat = helpers.get_file_format(infile)

    if fileformat == "csv" or fileformat == 'gzip':
        metrics = csvutils.read_csv_and_yaml(infile)
    else:
        with pd.HDFStore(infile, 'r') as metrics_store:
            metrics = metrics_store[tablename]
        metrics = metrics.reset_index()

    if 'cell_call' in metrics.columns.values:
        # plotting code doesn't work with nan;
        # tenx data will have nan for cell call, experimental condition, row, col
        metrics['cell_call'] = metrics["cell_call"].fillna("nan")
        metrics['experimental_condition'] = metrics["experimental_condition"].fillna("nan")

    return metrics
def load(self, fname):
    '''
    load an h5 or csv metrics file into a pandas data frame
    '''
    extension = os.path.splitext(fname)[-1]

    if extension in [".h5", ".hdf5"]:
        with pandas.HDFStore(self.input, 'r') as metrics_store:
            data = metrics_store[self.tablename]
        data = data.reset_index()
    else:
        data = csvutils.read_csv_and_yaml(fname)

    # data['chromosome'] = data['chromosome'].astype(str)

    return data
def add_contamination_status(
        infile, outfile,
        reference='grch37', ref_threshold=0.6, alt_threshold=0.2,
        strict_validation=True
):
    data = csvutils.read_csv_and_yaml(infile)
    data = data.set_index('cell_id', drop=False)

    fastqscreen_cols = [
        col for col in data.columns.values if col.startswith('fastqscreen_')
    ]

    reference = "fastqscreen_{}".format(reference)
    if reference not in fastqscreen_cols:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in fastqscreen_cols if col != reference]

    data['is_contaminated'] = False

    perc_ref = data[reference] / data['total_reads']
    data.loc[perc_ref <= ref_threshold, 'is_contaminated'] = True

    for altcol in alts:
        perc_alt = data[altcol] / data['total_reads']
        data.loc[perc_alt > alt_threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(
        data, outfile, write_header=True, dtypes=dtypes()['metrics']
    )

    # get cells that are contaminated and have enough human reads
    check_df = data.loc[data['is_contaminated'] == True]
    check_df['perc_ref'] = data[reference] / data['total_reads']
    check_df = check_df[check_df['perc_ref'] > ref_threshold]

    if strict_validation and (len(check_df) / len(data) > 0.2):
        logging.error("over 20% of cells are contaminated")
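# Hedged sketch of the thresholding rule in add_contamination_status above, on an
# in-memory frame: a cell is flagged when the reference fraction is at or below
# ref_threshold, or any alternate genome exceeds alt_threshold of total reads.
# Column names mirror the fastqscreen convention used above; the counts are made up.
def _contamination_rule_example(ref_threshold=0.6, alt_threshold=0.2):
    demo = pd.DataFrame({
        'cell_id': ['c1', 'c2', 'c3'],
        'fastqscreen_grch37': [950, 500, 990],
        'fastqscreen_mm10': [10, 400, 5],
        'total_reads': [1000, 1000, 1000],
    })

    is_contaminated = (
        (demo['fastqscreen_grch37'] / demo['total_reads'] <= ref_threshold)
        | (demo['fastqscreen_mm10'] / demo['total_reads'] > alt_threshold)
    )
    return list(zip(demo['cell_id'], is_contaminated))   # c2 is flagged, c1 and c3 are not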
def read_metrics(self):
    """
    read metrics and get cell to quality mapping
    """
    metrics = csvutils.read_csv_and_yaml(self.metrics)
    metrics = metrics.set_index("cell_id")

    cell_order = metrics.order.sort_values().index

    # assume all cells are good, don't filter
    if 'quality' not in metrics.columns.values:
        logging.getLogger("single_cell.hmmcopy.igv_seg").warn(
            "quality column missing in data")
        metrics['quality'] = 1

    qual_cell_map = {
        cell: mad for cell, mad in zip(metrics.index, metrics["quality"])
    }

    return qual_cell_map, cell_order
def classify_fastqscreen(training_data_path, metrics_path, metrics_output, dtypes):
    df = csvutils.read_csv_and_yaml(metrics_path)
    features_train, feature_transformer, model = train(training_data_path)

    features = [
        "fastqscreen_nohit_ratio", "fastqscreen_grch37_ratio",
        "fastqscreen_mm10_ratio", "fastqscreen_salmon_ratio"
    ]
    label_to_species = {0: "grch37", 1: "mm10", 2: "salmon"}

    # check if all the features exist; if yes, make predictions, else create an empty species column.
    exist = all([feature[:-6] in df for feature in features])
    if exist:
        # make the feature columns
        for feature in features:
            df[feature] = df[feature[:-6]].divide(df["total_reads"])

        # check if there's any missing value
        feature_test = df[features]
        feature_test = feature_test.replace([np.inf, -np.inf], np.nan)
        feature_test.fillna(features_train.mean(), inplace=True)

        # scale the features
        scaled_features = feature_transformer.transform(feature_test)

        df["species"] = model.predict(scaled_features)
        df["species"].replace(label_to_species, inplace=True)

    csvutils.write_dataframe_to_csv_and_yaml(df, metrics_output, dtypes)
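# Hedged sketch of the ratio-feature construction in classify_fastqscreen: each
# fastqscreen_* count is divided by total_reads to form the *_ratio features fed to
# the model. The toy frame and counts are illustrative only (pd assumed as above).
def _ratio_features_example():
    demo = pd.DataFrame({
        'fastqscreen_grch37': [900, 100],
        'fastqscreen_mm10': [50, 850],
        'total_reads': [1000, 1000],
    })

    for col in ['fastqscreen_grch37', 'fastqscreen_mm10']:
        demo[col + '_ratio'] = demo[col].divide(demo['total_reads'])

    return demo[['fastqscreen_grch37_ratio', 'fastqscreen_mm10_ratio']]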
def add_contamination_status(infile, outfile, genome_labels, reference='grch37', threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)
    data = data.set_index('cell_id', drop=False)

    if reference not in genome_labels:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in genome_labels if col != reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    data['is_contaminated'] = data['is_contaminated'].astype('bool')

    csvutils.write_dataframe_to_csv_and_yaml(data, outfile, dtypes(genome_labels))
def load_metrics_data(filename):
    reads = csvutils.read_csv_and_yaml(filename)
    reads = reads.set_index(['cell_id'])
    return reads
def read_csv(self, infile):
    return csvutils.read_csv_and_yaml(infile)
def get_max_cn(reads):
    df = csvutils.read_csv_and_yaml(reads)

    max_cn = np.nanpercentile(df['copy'], 99)

    return max_cn