def concatenate_csv(inputs, output, low_memory=False):
    if low_memory:
        csvutils.concatenate_csv_files_quick_lowmem(inputs, output)
    else:
        csvutils.concatenate_csv(inputs, output)
def concatenate_csv(inputs, output, data_type, low_memory=False):
    ref_dtypes = None
    if data_type:
        ref_dtypes = dtypes()[data_type]
    if low_memory:
        csvutils.concatenate_csv_files_quick_lowmem(inputs, output, dtypes=ref_dtypes)
    else:
        csvutils.concatenate_csv(inputs, output, dtypes=ref_dtypes)
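# For context: the dtypes() call in the wrapper above is assumed to return a
# registry mapping a data-type name to a column -> dtype dictionary. This is a
# hypothetical minimal sketch of that assumption, not the pipeline's actual
# implementation; the keys and column names here are illustrative only.
def dtypes():
    return {
        'metrics': {'cell_id': 'str', 'total_reads': 'int'},
        'gcbias': {'cell_id': 'str', 'norm_coverage': 'float'},
    }

# ref_dtypes = dtypes()['metrics'] then feeds csvutils.concatenate_csv so
# every input shard is parsed against one consistent schema.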
def collect_gc(infiles, outfile, tempdir):
    helpers.makedirs(tempdir)
    tempouts = []
    for cell_id, infile in infiles.items():
        tempout = os.path.join(tempdir, "{}.parsed.csv".format(cell_id))
        tempouts.append(tempout)
        gen_gc = GenerateCNMatrix(infile, tempout, ',', 'NORMALIZED_COVERAGE',
                                  cell_id, 'gcbias')
        gen_gc.main()
    csvutils.concatenate_csv(tempouts, outfile, dtypes=dtypes()['metrics'])
def test_concat_csv_multiple_files_to_concat(self, tmpdir, n_rows, n_frames):
    """
    concat multiple files with the same columns
    """
    dtypes = [{v: "int" for v in 'ABCD'} for _ in range(n_frames)]
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs, csvs, ref = self.base_test_concat(n_rows, dtypes, write=True,
                                           get_ref=True, dir=tmpdir)
    csvutils.concatenate_csv(csvs, concatenated)
    assert self.dfs_exact_match(ref, concatenated)
def test_concat_csv_input_as_dict(self, tmpdir, n_rows):
    """
    test concatenating a dictionary of csvs
    :param tmpdir: temp dir to test in
    :param n_rows: length of test dfs
    """
    dtypes = {v: "int" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs, csvs, ref = self.base_test_concat(n_rows, [dtypes, dtypes], write=True,
                                           get_ref=True, dir=tmpdir)
    csvutils.concatenate_csv({"a": csvs[0], "b": csvs[1]}, concatenated)
    # verify the output against the reference, as in the other concat tests
    assert self.dfs_exact_match(ref, concatenated)
def test_concat_csv_empty_inputs(self, tmpdir):
    """
    test concatenating csvs with data-less input csvs
    :param tmpdir: tempdir to test in
    """
    dtypes = {v: "int" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs, csvs, ref = self.base_test_concat(0, [dtypes, dtypes], write=True,
                                           get_ref=True, dir=tmpdir)
    csvutils.concatenate_csv(csvs, concatenated)
    assert self.dfs_exact_match(concatenated, ref)
def test_concat_csv_different_cols(self, tmpdir, n_rows):
    """
    concat two dataframes with different columns
    """
    dtypes1 = {v: "float" for v in 'ABCD'}
    dtypes2 = {v: "float" for v in 'ABGF'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs, csvs, ref = self.base_test_concat(n_rows, [dtypes1, dtypes2], write=True,
                                           get_ref=True, dir=tmpdir)
    csvutils.concatenate_csv(csvs, concatenated)
    assert self.dfs_exact_match(ref, concatenated)
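# Why the different-columns test above declares float dtypes: concatenating
# frames with non-identical columns yields the union of columns, with the
# gaps filled by NaN, and NaN forces a float dtype. A standalone pandas
# illustration of the behavior the test relies on (assuming concatenate_csv
# mirrors pd.concat semantics for mismatched columns):
import pandas as pd

df1 = pd.DataFrame({"A": [1.0], "B": [2.0]})
df2 = pd.DataFrame({"A": [3.0], "G": [4.0]})
print(pd.concat([df1, df2], ignore_index=True))
#      A    B    G
# 0  1.0  2.0  NaN
# 1  3.0  NaN  4.0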
def test_concat_csv(self, tmpdir, n_rows):
    """
    basic sanity check - concat two csvs with same cols
    :param tmpdir: temporary directory to write in
    :param n_rows: number of rows in test csvs
    """
    dtypes = {v: "int" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs, csvs, ref = self.base_test_concat(n_rows, [dtypes, dtypes], write=True,
                                           get_ref=True, dir=tmpdir)
    csvutils.concatenate_csv(csvs, concatenated)
    assert self.dfs_exact_match(ref, concatenated)
def test_concat_csv_one_file_to_concat(self, tmpdir, n_rows):
    """
    provide just 1 file to concat
    :param tmpdir: temp dir to test in
    :param n_rows: length of test dfs
    """
    dtypes = {v: "int" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    df, csv, ref = self.base_test_concat(n_rows, [dtypes], write=True,
                                         get_ref=True, dir=tmpdir)
    csvutils.concatenate_csv(csv, concatenated)
    assert self.dfs_exact_match(ref, concatenated)
def collect_metrics(flagstat_metrics, markdups_metrics, insert_metrics,
                    wgs_metrics, tempdir, merged_metrics):
    helpers.makedirs(tempdir)
    sample_outputs = []
    for sample in flagstat_metrics.keys():
        flgstat = flagstat_metrics[sample]
        mkdup = markdups_metrics[sample]
        insrt = insert_metrics[sample]
        wgs = wgs_metrics[sample]
        outfile = os.path.join(tempdir, sample + "_metrics.csv")
        sample_outputs.append(outfile)
        collmet = CollectMetrics(wgs, insrt, flgstat, mkdup, outfile, sample)
        collmet.main()
    csvutils.concatenate_csv(sample_outputs, merged_metrics)
def collect_gc(infiles, outfile, tempdir):
    helpers.makedirs(tempdir)
    tempouts = []
    for cell_id, infile in infiles.items():  # .iteritems() is python 2 only
        tempout = os.path.join(tempdir, os.path.basename(infile) + ".parsed.csv")
        tempouts.append(tempout)
        gen_gc = GenerateCNMatrix(infile, tempout, ',', 'NORMALIZED_COVERAGE',
                                  cell_id, 'gcbias')
        gen_gc.main()
    merged_csv = os.path.join(tempdir, "merged_gc_metrics.csv")
    csvutils.concatenate_csv(tempouts, merged_csv)
    hdfutils.convert_csv_to_hdf(merged_csv, outfile, '/alignment/gc_metrics')
def test_concat_csv_no_header(self, tmpdir, n_rows):
    """
    test concatenating csvs with no headers
    :param tmpdir: temporary directory to write in
    :param n_rows: number of rows in test csvs
    """
    dtypes = {v: "int" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs, csvs, ref = self.base_test_concat(n_rows, [dtypes, dtypes], write=True,
                                           get_ref=True, dir=tmpdir,
                                           write_head=False)
    csvutils.concatenate_csv(csvs, concatenated, write_header=False)
    assert self.dfs_exact_match(ref, concatenated)
    concatenated = pd.read_csv(concatenated)  # ignore separate yaml
    assert all([col not in concatenated.columns.tolist()
                for col in dtypes.keys()])
def test_concat_csv_with_nans(self, tmpdir, n_rows):
    """
    concat two csvs with NaNs
    """
    dtypes = {v: "float" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')
    dfs = self.make_test_dfs([dtypes, dtypes], n_rows)
    csvs = [os.path.join(tmpdir, "0.csv.gz"), os.path.join(tmpdir, "1.csv.gz")]
    dfs[0].iloc[2, dfs[0].columns.get_loc("A")] = np.nan
    dfs[1].iloc[2, dfs[1].columns.get_loc("A")] = np.nan
    csvutils.write_dataframe_to_csv_and_yaml(dfs[0], csvs[0], dtypes)
    csvutils.write_dataframe_to_csv_and_yaml(dfs[1], csvs[1], dtypes)
    ref = pd.concat(dfs, ignore_index=True)
    csvutils.concatenate_csv(csvs, concatenated)
    assert self.dfs_exact_match(ref, concatenated)
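# Note on the NaN test above: NaN is a float value in numpy, so a pandas
# column that must hold NaN cannot keep an integer dtype, which is why the
# test declares float dtypes for every column. A quick standalone check:
import numpy as np
import pandas as pd

print(pd.Series([1, np.nan]).dtype)  # float64, not int64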
def merge_csvs(input_csvs, merged_csv):
    """
    merges input csv files into one csv
    """
    csvutils.concatenate_csv(input_csvs, merged_csv, write_header=True)
def concatenate_csv(inputs, output):
    csvutils.concatenate_csv(inputs, output, write_header=True)