def merge_csv(
    in_filenames: Union[List[str], Dict[str, str]],
    out_filename: str,
    how: str,
    on: List[str],
    write_header: bool = True,
) -> None:
    """
    Join several CSV inputs into a single output CSV.

    The files are described by the module's authors as gzipped CSVs; this
    function does not deal with compression itself, it only wires the
    reader, the merge helpers and the writer together.

    @param in_filenames: List of input file paths, or a dictionary whose
        values are the input file paths (only the values are used).
    @param out_filename: Path to newly merged CSV
    @param how: How to join DataFrames (inner, outer, left, right).
    @param on: Column name(s) to join on, forwarded to utils.merge_frames.
    @param write_header: boolean, True = write header, False = don't write header
    @return: None
    """
    # A dict input contributes only its values; a list is used as given.
    if isinstance(in_filenames, dict):
        paths = list(in_filenames.values())
    else:
        paths = in_filenames

    # Keep the three phases separate: build every reader first, then read
    # every file, then collect every reader's dtypes.
    readers: List[CsverveInput] = [CsverveInput(path) for path in paths]
    frames = [reader.read_csv() for reader in readers]
    dtype_maps: List[Dict[str, str]] = [reader.dtypes for reader in readers]

    merged_frame: pd.DataFrame = utils.merge_frames(frames, how, on)
    merged_dtypes: Dict[str, str] = utils.merge_dtypes(dtype_maps)

    writer: CsverveOutputDataFrame = CsverveOutputDataFrame(
        merged_frame, out_filename, merged_dtypes, write_header=write_header
    )
    writer.write_df()
def test_merge_frames_multiple_cols(self, n_rows):
    """
    test merging of 2 dfs on multiple columns ("A" and "B") with inner merge

    :param n_rows: number of rows in simulated dfs
    :return: assertion
    """
    join_type = "inner"
    join_cols = ["A", "B"]
    suffixes = ["", ""]

    # dtype specs for the two frames; both include the join columns A and B
    left_dtypes = dict.fromkeys("ABC", "int")
    right_dtypes = dict.fromkeys("ABDF", "int")

    frames, expected = self.base_merge_test(
        n_rows, join_type, join_cols, suffixes, [left_dtypes, right_dtypes]
    )
    result = utils.merge_frames(frames, how=join_type, on=join_cols)

    assert self.dfs_exact_match(expected, result)
def merge_frames_directional_test(self, length, direction):
    """
    merge frames in a given direction; corresponds to "how"

    :param length: length of test dfs
    :param direction: direction to merge in (outer, inner etc.)
    :return: assertion
    """
    join_cols = ["A"]
    suffixes = ["", ""]

    # dtype specs for the two frames; "A" is the only key they share
    dtype_specs = [
        dict.fromkeys("ABC", "int"),
        dict.fromkeys("ADF", "int"),
    ]

    frames, expected = self.base_merge_test(
        length, direction, join_cols, suffixes, dtype_specs
    )
    result = utils.merge_frames(frames, how=direction, on=join_cols)

    assert self.dfs_exact_match(expected, result)
def test_merge_frames_with_nans(self, n_rows):
    """
    test merging of 2 dfs on 1 col which contains NaNs in each

    The reference result is built with pandas' own DataFrame.merge and
    compared against utils.merge_frames.

    :param n_rows: forwarded to make_mergeable_test_dfs (presumably the row
        count -- TODO confirm); row index 2 must exist because it is
        overwritten with NaN below
    :return: assertion
    """
    dtypes1 = {v: "float" for v in 'ACD'}
    dtypes2 = {v: "float" for v in 'AEG'}

    how = "outer"
    on = ['A']

    dfs = self.make_mergeable_test_dfs([dtypes1, dtypes2], on, n_rows)

    # Put a NaN into the join column of both frames (third row).
    # np.nan is used instead of np.NaN: the capitalised alias was removed in
    # NumPy 2.0, while np.nan is available in every NumPy version.
    for frame in (dfs[0], dfs[1]):
        frame.iloc[2, frame.columns.get_loc(on[0])] = np.nan

    ref = dfs[0].merge(dfs[1], how=how, on=on)

    merged = utils.merge_frames(dfs, how=how, on=on)

    assert self.dfs_exact_match(ref, merged)
def test_merge_frames_one_frame(self, n_rows):
    """
    provide just one df; the merged result is compared against that df

    :param n_rows: number of rows in simulated df
    :return: assertion
    """
    join_type = "inner"
    join_cols = ["A"]
    suffixes = ["", ""]
    only_dtypes = dict.fromkeys("ABC", "int")

    # get_ref=False: only the test frame(s) are requested, no reference
    frames = self.base_merge_test(
        n_rows, join_type, join_cols, suffixes, [only_dtypes], get_ref=False
    )
    result = utils.merge_frames(frames, how=join_type, on=join_cols)

    assert self.dfs_exact_match(frames[0], result)