def run_hmmcopy(
        bam_file,
        corrected_reads_filename,
        segments_filename,
        parameters_filename,
        metrics_filename,
        hmmcopy_tar,
        cell_id,
        hmmparams,
        tempdir,
        docker_image
):
    """Run the per-cell hmmcopy workflow end to end.

    Generates a wig file from the bam, GC/mappability-corrects the read
    counts, runs the hmmcopy segmentation script, rewrites each raw output
    csv with the project dtypes, and tars the raw hmmcopy working directory.

    :param bam_file: input bam for one cell
    :param corrected_reads_filename: output csv of corrected reads
    :param segments_filename: output csv of segments
    :param parameters_filename: output csv of hmmcopy parameters
    :param metrics_filename: output csv of per-cell metrics
    :param hmmcopy_tar: output tarball of the raw hmmcopy working dir
    :param cell_id: cell identifier, used to namespace the scratch dir
    :param hmmparams: hmmcopy configuration passed through to the tools
    :param tempdir: scratch directory (created if missing)
    :param docker_image: docker image used to run the hmmcopy tools
    """
    helpers.makedirs(tempdir)

    readcount_wig = os.path.join(tempdir, 'readcounter.wig')
    corrected_reads = os.path.join(tempdir, 'corrected_reads.csv')

    run_correction_hmmcopy(
        bam_file,
        corrected_reads,
        readcount_wig,
        hmmparams,
        docker_image
    )

    hmmcopy_tempdir = os.path.join(tempdir, '{}_hmmcopy'.format(cell_id))
    helpers.makedirs(hmmcopy_tempdir)

    run_hmmcopy_script(
        corrected_reads,
        hmmcopy_tempdir,
        cell_id,
        hmmparams,
        docker_image
    )

    # hmmcopy writes its outputs under a single numbered subdirectory "0"
    hmmcopy_outdir = os.path.join(hmmcopy_tempdir, str(0))

    # call dtypes() once and table-drive the four identical rewrites
    # (raw filename, final destination, dtypes key)
    all_dtypes = dtypes()
    rewrites = [
        ("reads.csv", corrected_reads_filename, 'reads'),
        ("params.csv", parameters_filename, 'params'),
        ("segs.csv", segments_filename, 'segs'),
        ("metrics.csv", metrics_filename, 'metrics'),
    ]
    for raw_name, final_path, dtype_key in rewrites:
        csvutils.rewrite_csv_file(
            os.path.join(hmmcopy_outdir, raw_name),
            final_path,
            dtypes=all_dtypes[dtype_key]
        )

    helpers.make_tarfile(hmmcopy_tar, hmmcopy_tempdir)
def add_quality(hmmcopy_metrics, alignment_metrics, multipliers, output,
                training_data, tempdir):
    """Score cell quality with a trained classifier and write annotated metrics.

    Trains the quality classifier on the supplied training data, loads the
    hmmcopy and alignment metric tables, predicts per-cell quality, writes
    per-table intermediates into tempdir, then prepares the final output csv.
    """
    helpers.makedirs(tempdir)

    hmmcopy_tables = [
        '/hmmcopy/metrics/{}'.format(mult) for mult in multipliers
    ]

    model = classify.train_classifier(training_data)
    feature_names = model.feature_names_

    data = classify.load_data(
        hmmcopy_metrics,
        alignment_metrics,
        hmmcopy_tables,
        '/alignment/metrics',
        feature_names
    )

    for table_idx, (hmmcopy_table, tabledata) in enumerate(data):
        intermediate_output = os.path.join(
            tempdir,
            '{}_metrics_with_quality.csv.gz'.format(table_idx)
        )
        predictions = classify.classify(model, tabledata)
        classify.write_to_output(
            hmmcopy_metrics,
            hmmcopy_table,
            intermediate_output,
            predictions
        )

    # NOTE(review): only the intermediate file from the last iteration is
    # promoted to the final output — presumably `data` yields a single table
    # in practice; confirm against callers.
    csvutils.prep_csv_files(
        intermediate_output, output, dtypes=dtypes()['metrics']
    )
def concatenate_csv(inputs, output, data_type, low_memory=False):
    """Concatenate csv files into one output file.

    Looks up the dtypes for data_type (when provided) and dispatches to the
    low-memory concatenation helper when low_memory is set, otherwise to the
    standard one.
    """
    ref_dtypes = dtypes()[data_type] if data_type else None

    if low_memory:
        csvutils.concatenate_csv_files_quick_lowmem(
            inputs, output, dtypes=ref_dtypes
        )
    else:
        csvutils.concatenate_csv(inputs, output, dtypes=ref_dtypes)
def get_mappability_col(reads, annotated_reads, map_cutoff=0.9):
    """Annotate the reads table with an is_low_mappability flag.

    Streams the reads csv in chunks, flags rows whose mappability value
    (the 'map' column) is at or below map_cutoff, and writes the annotated
    table with the project 'reads' dtypes.

    :param reads: input reads csv, read in 100-row chunks
    :param annotated_reads: path for the annotated output csv
    :param map_cutoff: mappability threshold; rows with map <= cutoff are
        flagged low-mappability. Defaults to 0.9, preserving the previous
        hard-coded behavior.
    """
    chunks = csvutils.read_csv_and_yaml(reads, chunksize=100)

    annotated = []
    for chunk in chunks:
        chunk['is_low_mappability'] = (chunk['map'] <= map_cutoff)
        annotated.append(chunk)

    # NOTE(review): pd.concat raises on an empty list — assumes the reader
    # always yields at least one chunk for a valid input file; confirm.
    alldata = pd.concat(annotated)

    csvutils.write_dataframe_to_csv_and_yaml(
        alldata, annotated_reads, dtypes()['reads'], write_header=True
    )
def add_clustering_order(
        reads, metrics, output, chromosomes=None, sample_info=None):
    """Annotate the metrics table with hierarchical clustering order.

    Computes a per-cell ordering from hierarchical clustering of the reads
    table and merges it into sample_info under the 'order' key, then writes
    the annotated metrics csv.

    :param reads: reads table used to compute the clustering
    :param metrics: input metrics csv
    :param output: path for the annotated metrics csv
    :param chromosomes: optional chromosome subset for the clustering
    :param sample_info: optional per-cell annotation dict; a fresh dict is
        used when omitted
    """
    # cell_id -> position in the hierarchical clustering ordering
    clustering_order = get_hierarchical_clustering_order(
        reads, chromosomes=chromosomes
    )

    if sample_info is None:
        sample_info = {}

    # previously the loop target shadowed the dict being iterated
    # (`for cell_id, order in order.items()`); renamed for clarity —
    # behavior is unchanged since .items() is evaluated once
    for cell_id, cell_order in clustering_order.items():
        sample_info.setdefault(cell_id, {})['order'] = cell_order

    csvutils.annotate_csv(metrics, sample_info, output, dtypes()['metrics'])