def join(left_df: pd.DataFrame, right_df: pd.DataFrame, left_columns: typing.List[typing.List[int]], right_columns: typing.List[typing.List[int]], **kwargs) -> JoinResult:
    """Plain pandas left join over 1-1 key mappings.

    Each entry of ``left_columns``/``right_columns`` is a list of column
    indices; only the first index of every entry is used as a join key.

    Raises:
        ValueError: if the two key lists differ in length.
    """
    key_left = [cols[0] for cols in left_columns]
    key_right = [cols[0] for cols in right_columns]
    if len(key_left) != len(key_right):
        raise ValueError("Default join only perform on 1-1 mapping")

    # Rename the right key columns so they carry the left key column names.
    rename_map = {
        right_df.columns[r_idx]: left_df.columns[l_idx]
        for l_idx, r_idx in zip(key_left, key_right)
    }
    right_df = right_df.rename(columns=rename_map)

    joined = pd.merge(
        left=left_df,
        right=right_df,
        left_on=[left_df.columns[idx] for idx in key_left],
        right_on=[right_df.columns[idx] for idx in key_right],
        how='left')
    return JoinResult(df=joined)
def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict, right_metadata: dict) -> JoinResult:
    # Record-linkage join: score every blocked record pair with similarity
    # functions, pick a best one-to-one match per left row, then concatenate
    # the matched right rows onto the left dataframe.
    # print(left_metadata)
    # step 1 : transform columns
    """
    1. merge columns if multi-multi relation, mark as "merged"
        so we use set-based functions only
    2. pull out the mapped columns and form to new datasets with same order
        to support
    """
    fp = FeaturePairs(left_df, right_df, left_columns, right_columns,
                      left_metadata, right_metadata)

    # Candidate pairs come pre-blocked so we do not score the full cross product.
    record_pairs = rltk.get_record_pairs(fp.left_rltk_dataset,
                                         fp.right_rltk_dataset,
                                         block=fp.get_rltk_block())

    # Dense similarity matrix, sim[left_row][right_row].
    # NOTE(review): indexing by int(r.id) assumes record ids are the
    # stringified dataframe positions — confirm against FeaturePairs.
    sim = [[0 for __ in range(len(right_df))] for _ in range(len(left_df))]

    for r1, r2 in record_pairs:
        similarities = []
        for f1, f2 in fp.pairs:
            v1 = f1.value_merge_func(r1)
            v2 = f2.value_merge_func(r2)
            if self.exact_match:
                # Exact-match mode: binary score, skip similarity functions.
                similarities.append(1 if v1 == v2 else 0)
                continue
            # print(v1, v2, type(f1), type(f2))
            for similarity_func in f1.similarity_functions():
                similarity = similarity_func(v1, v2)
                similarities.append(similarity)
                # print(f1.name, f2.name, v1, v2, similarity, similarity_func, type(f1))
                # TODO: now only consider the first similarity function for now
                break
        # print(v1, v2, similarities)
        # Average over feature-pair scores; 0 when no feature produced a score.
        sim[int(r1.id)][int(
            r2.id
        )] = sum(similarities) / len(similarities) if similarities else 0

    matched_rows = self.simple_best_match(sim)
    res = self.one_to_one_concat(matched_rows, left_df, right_df,
                                 right_columns)

    # step 2 : analyze target columns - get ranked similarity functions for each columns
    """
    see https://paper.dropbox.com/doc/ER-for-Datamart--ASlKtpR4ceGaj~6cN4Q7EWoSAQ-tRug6oRX6g5Ko5jzaeynT
    """
    # step 3 : check if 1-1, 1-n, n-1, or m-n relations,
    # based on the analyze in step 2 we can basically know if it is one-to-one relation
    return JoinResult(res, matched_rows)
def join(left_data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset],
         right_data: typing.Union[Dataset, int, pd.DataFrame, str, d3m_ds.Dataset],
         left_columns: typing.List[typing.List[typing.Union[int, str]]],
         right_columns: typing.List[typing.List[typing.Union[int, str]]],
         left_meta: dict = None,
         joiner=JoinerType.RLTK
         ) -> JoinResult:
    """
    :param left_data: a tabular data
    :param right_data: a tabular data or the datamart.Dataset(metadata with materialize info)
           or an int for the datamart_id - Recommend to use datamart.Dataset or ID
    :param left_columns: list of index(indices)/header(headers) for each "key" for joining
    :param right_columns: list of index(indices)/header(headers) for each "key" for joining
           (same length as left_columns)
    :return: a pandas.DataFrame(joined table)

    NOTE: the parameter annotations previously used ``X or Y``, which Python
    evaluates to just ``X`` — they are now real ``typing.Union`` types.
    """
    # A datamart.Dataset already carries materialize info; delegate to the
    # full augment() pipeline which knows how to use it.
    if isinstance(right_data, Dataset):
        return augment(left_data, right_data, (left_columns, right_columns), joiner)

    print(" - start loading data")
    left_df = DataLoader.load_data(left_data)
    right_metadata = None
    if isinstance(right_data, int):
        # Bare datamart id: fetch both the metadata and the materialized table.
        right_metadata, right_df = DataLoader.load_meta_and_data_by_id(right_data)
    else:
        right_df = DataLoader.load_data(right_data)

    # Without two dataframes and join keys there is nothing to join;
    # hand back the (possibly loaded) left table unchanged.
    if not (isinstance(left_df, pd.DataFrame) and isinstance(right_df, pd.DataFrame)
            and left_columns and right_columns):
        return JoinResult(left_df, [])

    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)
    print(" - start augmenting")  # bug fix: message previously read "satrt"
    augmented_data = augmenter.join(
        left_df=left_df,
        right_df=right_df,
        left_columns=left_columns,
        right_columns=right_columns,
        left_metadata=left_meta,
        right_metadata=right_metadata,
        joiner=joiner
    )
    return augmented_data
def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict, right_metadata: dict) -> JoinResult:
    """Exact-match join via RLTK hash blocking on the target column pair.

    Blocks the two datasets on ``self.join_target_column_names`` and, for
    every blocked record pair, appends the right row's new (right-only)
    columns onto the matching left row.

    Returns:
        JoinResult with the joined dataframe and the shared-name columns of
        the first matched pair (empty when nothing matched).
    """

    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    # RLTK record ids must be strings; mirror the positional index.
    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        # Drop the artifact column produced by CSVs saved with an index.
        right_df = right_df.drop(columns=['Unnamed: 0'])

    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))
    left_df = left_df.set_index('id')
    right_df = right_df.set_index('id')

    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    rows = []
    column_names_to_join = None
    columns_new = None
    # bug fix: matched_rows was unbound when no pairs matched,
    # making the final return raise UnboundLocalError.
    matched_rows = []
    for r1, r2 in pairs:
        left_res = left_df.loc[r1.id]
        right_res = right_df.loc[r2.id]
        if column_names_to_join is None:
            # Right-only columns get appended; shared ones are reported.
            column_names_to_join = right_res.index.difference(left_res.index)
            matched_rows = right_res.index.intersection(left_res.index)
            columns_new = left_res.index.tolist()
            columns_new.extend(column_names_to_join.tolist())
        rows.append(pd.concat([left_res, right_res[column_names_to_join]]))

    # bug fix: DataFrame.append was removed in pandas 2.0 (and was O(n^2));
    # collect the rows and build the frame once instead.
    if rows:
        df_joined = pd.DataFrame(rows).reset_index(drop=True)
        # ensure that the original dataframe columns are at the first left part
        df_joined = df_joined[columns_new]
    else:
        df_joined = pd.DataFrame()
    return JoinResult(df_joined, matched_rows)
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset,
            joining_columns: typing.Tuple[typing.List[typing.List[int or str]],
                                          typing.List[typing.List[int or str]]]=None,
            joiner=JoinerType.RLTK
            ) -> JoinResult:
    """
    Perform the augmentation (either join or union).
    Follow the API defined by
    https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the user's tabular data (or something loadable into one)
        augment_data: the datamart Dataset to augment with
        joining_columns: user defined which columns to be joined

    Returns:
        JoinResult for the augmented table, or the loaded original data
        when no join columns are available.
    """
    source_df = DataLoader.load_data(original_data)

    # Best effort: a bad user-supplied column spec must not abort the call.
    if joining_columns:
        try:
            augment_data.set_join_columns(*joining_columns)
        except Exception as e:
            print("FAILED SET JOINING COLUMNS:", e)

    # Without usable join columns, hand the loaded data back untouched.
    if not augment_data.join_columns:
        return JoinResult(source_df, [])

    columns_left, columns_right = augment_data.join_columns
    return Augment(es_index=PRODUCTION_ES_INDEX).join(
        left_df=source_df,
        right_df=augment_data.materialize(),
        left_columns=columns_left,
        right_columns=columns_right,
        left_metadata=None,
        right_metadata=augment_data.metadata,
        joiner=joiner
    )
def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict = None, right_metadata: dict = None,
         joiner: JoinerType = JoinerType.DEFAULT) -> JoinResult:
    """Join two dataframes based on different joiner.

    Args:
        left_df: pandas Dataframe
        right_df: pandas Dataframe
        left_metadata: metadata of left dataframe
        right_metadata: metadata of right dataframe
        left_columns: list of integers from left df for join
        right_columns: list of integers from right df for join
        joiner: string of joiner, default to be "default"

    Returns:
        JoinResult
    """
    # Joiner instances are cached per type, so repeated joins reuse them.
    if joiner not in self.joiners:
        self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)

    if not self.joiners[joiner]:
        warnings.warn("No suitable joiner, return original dataframe")
        return JoinResult(left_df, [])

    print(" - start profiling")
    # Profile whichever side is missing metadata before joining.
    if not (left_metadata and left_metadata.get("variables")):
        # Left df is the user provided one.
        # We will generate metadata just based on the data itself, profiling and so on
        left_metadata = Utils.generate_metadata_from_dataframe(
            data=left_df, original_meta=left_metadata)

    if not right_metadata:
        right_metadata = Utils.generate_metadata_from_dataframe(
            data=right_df)

    # Only profile the joining columns, otherwise it will be too slow:
    left_metadata = Utils.calculate_dsbox_features(
        data=left_df, metadata=left_metadata,
        selected_columns=set(chain.from_iterable(left_columns)))

    right_metadata = Utils.calculate_dsbox_features(
        data=right_df, metadata=right_metadata,
        selected_columns=set(chain.from_iterable(right_columns)))

    # update with implicit_variable on the user supplied dataset
    if left_metadata.get('implicit_variables'):
        Utils.append_columns_for_implicit_variables_and_add_meta(
            left_metadata, left_df)

    print(" - start joining tables")
    # Delegate the actual join to the selected joiner implementation.
    res = self.joiners[joiner].join(
        left_df=left_df,
        right_df=right_df,
        left_columns=left_columns,
        right_columns=right_columns,
        left_metadata=left_metadata,
        right_metadata=right_metadata,
    )

    return res