Exemplo n.º 1
0
    def join(left_df: pd.DataFrame, right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]],
             **kwargs) -> JoinResult:
        """Left-join two dataframes with a plain pandas merge.

        Only 1-1 key mappings are supported: the first index of each
        inner list is taken as the join key for its side.
        """
        # Collapse each [idx, ...] group to its first column index.
        left_keys = [group[0] for group in left_columns]
        right_keys = [group[0] for group in right_columns]

        if len(left_keys) != len(right_keys):
            raise ValueError("Default join only perform on 1-1 mapping")

        # Relabel the right-hand join columns so they carry the same
        # names as their left-hand counterparts before merging.
        rename_map = {
            right_df.columns[r_idx]: left_df.columns[l_idx]
            for l_idx, r_idx in zip(left_keys, right_keys)
        }
        right_df = right_df.rename(columns=rename_map)

        merged = pd.merge(
            left=left_df,
            right=right_df,
            left_on=[left_df.columns[i] for i in left_keys],
            right_on=[right_df.columns[i] for i in right_keys],
            how='left')

        return JoinResult(df=merged)
Exemplo n.º 2
0
    def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]], left_metadata: dict,
             right_metadata: dict) -> JoinResult:
        """Join two dataframes by record-linkage similarity (rltk).

        Builds feature pairs from the mapped columns, scores every
        candidate record pair produced by rltk blocking, keeps the best
        match per left row, and concatenates the matched right columns
        onto the left dataframe.

        Args:
            left_df: user-supplied dataframe.
            right_df: candidate dataframe to be joined in.
            left_columns: per-key lists of column indices on the left.
            right_columns: per-key lists of column indices on the right.
            left_metadata: profiled metadata for ``left_df``.
            right_metadata: profiled metadata for ``right_df``.

        Returns:
            JoinResult wrapping the joined dataframe and matched rows.
        """
        # print(left_metadata)

        # step 1 : transform columns
        """
        1. merge columns if multi-multi relation, mark as "merged" so we use set-based functions only
        2. pull out the mapped columns and form to new datasets with same order to support
        """

        # FeaturePairs wraps both frames as rltk datasets and derives a
        # blocking strategy plus typed feature objects per column pair.
        fp = FeaturePairs(left_df, right_df, left_columns, right_columns,
                          left_metadata, right_metadata)

        record_pairs = rltk.get_record_pairs(fp.left_rltk_dataset,
                                             fp.right_rltk_dataset,
                                             block=fp.get_rltk_block())
        # Dense similarity matrix: sim[left_row][right_row]; pairs that
        # never survive blocking keep the default score of 0.
        sim = [[0 for __ in range(len(right_df))] for _ in range(len(left_df))]

        for r1, r2 in record_pairs:
            similarities = []
            for f1, f2 in fp.pairs:
                v1 = f1.value_merge_func(r1)
                v2 = f2.value_merge_func(r2)
                if self.exact_match:
                    # Exact mode: binary score, skip similarity functions.
                    similarities.append(1 if v1 == v2 else 0)
                    continue
                # print(v1, v2, type(f1), type(f2))
                for similarity_func in f1.similarity_functions():
                    similarity = similarity_func(v1, v2)
                    similarities.append(similarity)
                    # print(f1.name, f2.name, v1, v2, similarity, similarity_func, type(f1))
                    # TODO: now only consider the first similarity function for now
                    break
                # print(v1, v2, similarities)
            # Average the per-feature scores; record ids index the matrix.
            sim[int(r1.id)][int(
                r2.id
            )] = sum(similarities) / len(similarities) if similarities else 0

        # Pick the best-scoring right row per left row, then stitch the
        # matched right columns onto the left dataframe one-to-one.
        matched_rows = self.simple_best_match(sim)
        res = self.one_to_one_concat(matched_rows, left_df, right_df,
                                     right_columns)

        # step 2 : analyze target columns - get ranked similarity functions for each columns
        """
        see https://paper.dropbox.com/doc/ER-for-Datamart--ASlKtpR4ceGaj~6cN4Q7EWoSAQ-tRug6oRX6g5Ko5jzaeynT
        """

        # step 3 : check if 1-1, 1-n, n-1, or m-n relations,
        # based on the analyze in step 2 we can basically know if it is one-to-one relation

        return JoinResult(res, matched_rows)
Exemplo n.º 3
0
def join(left_data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset],
         right_data: typing.Union[Dataset, int, pd.DataFrame, str, d3m_ds.Dataset],
         left_columns: typing.List[typing.List[typing.Union[int, str]]],
         right_columns: typing.List[typing.List[typing.Union[int, str]]],
         left_meta: dict=None,
         joiner=JoinerType.RLTK
         ) -> JoinResult:
    """Join a user table against another table or a datamart dataset.

    :param left_data: a tabular data
    :param right_data: a tabular data or the datamart.Dataset(metadata with materialize info)
                        or an int for the datamart_id - Recommend to use datamart.Dataset or ID
    :param left_columns: list of index(indices)/header(headers) for each "key" for joining
    :param right_columns: list of index(indices)/header(headers) for each "key" for joining(same length as left_columns)
    :param left_meta: optional pre-computed metadata for the left table
    :param joiner: which joiner implementation to use (default RLTK)
    :return: a pandas.DataFrame(joined table)
    """
    # NOTE: the annotations above used to be written as `X or Y`, which
    # evaluates to just `X` at runtime; typing.Union is the correct form.

    # A datamart Dataset already knows how to materialize itself — defer
    # to the augment() path in that case.
    if isinstance(right_data, Dataset):
        return augment(left_data, right_data, (left_columns, right_columns), joiner)

    print(" - start loading data")
    left_df = DataLoader.load_data(left_data)
    right_metadata = None
    if isinstance(right_data, int):
        # An int is interpreted as a datamart id; fetch metadata + data.
        right_metadata, right_df = DataLoader.load_meta_and_data_by_id(right_data)
    else:
        right_df = DataLoader.load_data(right_data)

    # Bail out with the (possibly loaded) left table when either side
    # failed to load or no join keys were supplied.
    if not (isinstance(left_df, pd.DataFrame) and isinstance(right_df, pd.DataFrame) and left_columns and right_columns):
        return JoinResult(left_df, [])

    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    print(" - start augmenting")
    augmented_data = augmenter.join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_meta,
            right_metadata=right_metadata,
            joiner=joiner
    )
    return augmented_data
Exemplo n.º 4
0
    def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]], left_metadata: dict,
             right_metadata: dict) -> JoinResult:
        """Join two dataframes via rltk hash blocking on the target columns.

        Rows are matched by equal values in the configured
        ``join_target_column_names``; matched right-only columns are
        appended to the corresponding left rows.

        Returns:
            JoinResult with the joined dataframe and the overlapping
            column labels (empty when no pair matched).
        """
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        # rltk records are keyed by a string 'id' column.
        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))
        left_df = left_df.set_index('id')
        right_df = right_df.set_index('id')

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        joined_rows = []
        column_names_to_join = None
        # Initialized up-front: the originals were unbound (NameError)
        # when blocking produced no pairs at all.
        matched_rows = []
        columns_new = []
        for r1, r2 in pairs:
            left_res = left_df.loc[r1.id]
            right_res = right_df.loc[r2.id]
            if column_names_to_join is None:
                # Computed once from the first pair: right-only columns
                # to append, plus the final column order (left first).
                column_names_to_join = right_res.index.difference(
                    left_res.index)
                matched_rows = right_res.index.intersection(left_res.index)
                columns_new = left_res.index.tolist()
                columns_new.extend(column_names_to_join.tolist())
            joined_rows.append(
                pd.concat([left_res, right_res[column_names_to_join]]))

        # Build the frame in one shot: DataFrame.append in a loop was
        # deprecated in pandas 1.4 and removed in 2.0, and copied the
        # whole frame on every iteration (quadratic).
        df_joined = pd.DataFrame(joined_rows).reset_index(drop=True)
        # ensure that the original dataframe columns are at the first left part
        if columns_new:
            df_joined = df_joined[columns_new]

        return JoinResult(df_joined, matched_rows)
Exemplo n.º 5
0
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset,
            joining_columns: typing.Tuple[typing.List[typing.List[int or str]], typing.List[typing.List[int or str]]]=None,
            joiner=JoinerType.RLTK
            ) -> JoinResult:
    """Perform the augmentation (either join or union).

    Follow the API defined by
    https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the user's table (dataframe, path, or d3m dataset).
        augment_data: the datamart Dataset to augment with.
        joining_columns: user defined which columns to be joined.
        joiner: which joiner implementation to use.

    Returns:
        JoinResult from the joiner, or the loaded original data wrapped
        in a JoinResult when no join columns are available.
    """
    df = DataLoader.load_data(original_data)

    # Best-effort: a bad user mapping should not abort the augmentation.
    if joining_columns:
        try:
            augment_data.set_join_columns(*joining_columns)
        except Exception as e:
            print("FAILED SET JOINING COLUMNS:", e)

    join_columns = augment_data.join_columns
    if not join_columns:
        return JoinResult(df, [])

    left_cols, right_cols = join_columns
    return Augment(es_index=PRODUCTION_ES_INDEX).join(
            left_df=df,
            right_df=augment_data.materialize(),
            left_columns=left_cols,
            right_columns=right_cols,
            left_metadata=None,
            right_metadata=augment_data.metadata,
            joiner=joiner
    )
Exemplo n.º 6
0
    def join(self,
             left_df: pd.DataFrame,
             right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]],
             left_metadata: dict = None,
             right_metadata: dict = None,
             joiner: JoinerType = JoinerType.DEFAULT) -> JoinResult:
        """Join two dataframes based on different joiner.

          Args:
              left_df: pandas Dataframe
              right_df: pandas Dataframe
              left_metadata: metadata of left dataframe
              right_metadata: metadata of right dataframe
              left_columns: list of integers from left df for join
              right_columns: list of integers from right df for join
              joiner: string of joiner, default to be "default"

          Returns:
               JoinResult
          """
        # Lazily build and cache one joiner instance per joiner type.
        if joiner not in self.joiners:
            self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)

        selected_joiner = self.joiners[joiner]
        if not selected_joiner:
            warnings.warn("No suitable joiner, return original dataframe")
            return JoinResult(left_df, [])

        print(" - start profiling")
        # The left df is user-supplied; profile it unless variable
        # metadata was already provided by the caller.
        if not (left_metadata and left_metadata.get("variables")):
            left_metadata = Utils.generate_metadata_from_dataframe(
                data=left_df, original_meta=left_metadata)

        if not right_metadata:
            right_metadata = Utils.generate_metadata_from_dataframe(
                data=right_df)

        # Restrict the expensive dsbox profiling to the join columns —
        # profiling everything would be too slow.
        left_metadata = Utils.calculate_dsbox_features(
            data=left_df,
            metadata=left_metadata,
            selected_columns=set(chain.from_iterable(left_columns)))
        right_metadata = Utils.calculate_dsbox_features(
            data=right_df,
            metadata=right_metadata,
            selected_columns=set(chain.from_iterable(right_columns)))

        # Propagate implicit variables found on the user dataset.
        if left_metadata.get('implicit_variables'):
            Utils.append_columns_for_implicit_variables_and_add_meta(
                left_metadata, left_df)

        print(" - start joining tables")
        return selected_joiner.join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_metadata,
            right_metadata=right_metadata,
        )