示例#1
0
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset) -> pd.DataFrame:
    """
    Join the caller's data with a datamart dataset on its matched columns.

    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to augment; it is passed through
            ``DataLoader.load_data`` before use.
        augment_data: the datamart ``Dataset`` that provides the right-hand
            table (``materialize()``), its ``metadata`` and the
            ``matched_cols`` pair of (left columns, right columns).

    Returns:
        The loaded original data as-is when ``augment_data.matched_cols`` is
        empty, otherwise the value returned by ``Augment.join``.

    """
    left_table = DataLoader.load_data(original_data)

    # Without matched columns there is nothing to join on.
    if not augment_data.matched_cols:
        return left_table

    left_keys, right_keys = augment_data.matched_cols
    engine = Augment(es_index=DEFAULT_ES)

    join_kwargs = {
        'left_df': left_table,
        'right_df': augment_data.materialize(),
        'left_columns': left_keys,
        'right_columns': right_keys,
        'left_metadata': None,
        'right_metadata': augment_data.metadata,
        'joiner': 'rltk',
    }
    return engine.join(**join_kwargs)
示例#2
0
def search(url: str,
           query: dict,
           data: pd.DataFrame or str or d3m_ds.Dataset=None,
           send_data=True,
           max_return_docs: int=20,
           return_named_entity: bool=False) -> typing.List[Dataset]:
    """
    Search the datamart for datasets that can augment ``data``.

    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        url: str - the datamart server(for ISI's datamart it is meaningless, just a flag).
            Anything that does not start with ``SEARCH_URL`` yields an empty list.
        query: JSON object describing the query(https://datadrivendiscovery.org/wiki/display/work/Query+results+schema).
            The caller's dict is never modified.
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file
        send_data: (for ISI's datamart it is meaningless)
        max_return_docs: forwarded to ``Augment.query_by_json`` as ``size``.
        return_named_entity: forwarded unchanged to ``Augment.query_by_json``.

    Returns: a list of datamart.Dataset objects

    """
    if not url.startswith(SEARCH_URL):
        return []

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    if (query and ('required_variables' in query)) or (loaded_data is None):
        # "required_variables" is given, or there is no data to derive it from:
        # run the query exactly as supplied.
        es_results = augmenter.query_by_json(query, loaded_data,
                                             size=max_return_docs,
                                             return_named_entity=return_named_entity) or []
    else:
        # No "required_variables" but a dataset exists: try every queryable
        # column as the required variable and concatenate the results.
        # Work on a shallow copy so the per-column "required_variables" written
        # below never leaks into the dict owned by the caller.
        query = dict(query) if query else {}
        seen_ids = set()
        es_results = []
        for col in loaded_data:
            if not Utils.is_column_able_to_query(loaded_data[col]):
                continue
            query['required_variables'] = [{
                "type": "dataframe_columns",
                "names": [col]
            }]
            cur_results = augmenter.query_by_json(query, loaded_data,
                                                  size=max_return_docs,
                                                  return_named_entity=return_named_entity)
            for res in cur_results or []:
                # Keep only the first occurrence of each document.
                if res['_id'] not in seen_ids:
                    # TODO: how about the score ??
                    seen_ids.add(res['_id'])
                    es_results.append(res)

    # NOTE(review): in the per-column branch every Dataset receives the query as
    # it stands after the last queried column, not the query that produced it.
    return [Dataset(es_result, original_data=loaded_data, query_json=query) for es_result in es_results]
示例#3
0
    def setUp(self):
        """Prepare the shared fixture: an ``Augment`` on the "fake" index, the
        frame comparison helper, and a four-row Name/Age/Date DataFrame."""
        self.augment = Augment(es_index="fake")
        self.assertDataframeEqual = assert_frame_equal

        names = ['Tom', 'Jack', 'Steve', 'Ricky']
        ages = [28, 34, 29, 42]
        dates = ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        frame = pd.DataFrame({'Name': names, 'Age': ages, 'Date': dates})
        self.df = frame.infer_objects()
示例#4
0
class SearchMetadata(object):
    """Find datamart datasets that could augment the columns of a DataFrame."""

    # Maximum number of dataset hits reported for a single column.
    MAX_MATCH = 10

    def __init__(self, es_index="datamart_all"):
        """Create the ``Augment`` helper bound to ``es_index``."""
        self.augment = Augment(es_index=es_index)

    def default_search_by_csv(self, request, old_df):
        """Query the datamart once per queryable column of ``old_df``.

        Args:
            request: object whose ``args`` mapping may carry "query_string"
                and "minimum_should_match_for_column"; both are forwarded
                to ``Augment.query`` as given (``None`` when absent).
            old_df: the DataFrame whose columns are searched by position.

        Returns:
            A dict with a "message" string and a "result" list. Each result
            entry holds the "column_idx" and at most ``MAX_MATCH`` hits under
            "datasets_metadata". Columns without hits are omitted.
        """
        query_string = request.args.get("query_string", None)
        # ``get`` already returns None for a missing key, so no membership test
        # is needed.
        minimum_should_match = request.args.get(
            "minimum_should_match_for_column")

        ret = {
            "message": "Created Dataframe and finding datasets for augmenting",
            "result": []
        }

        for idx in range(old_df.shape[1]):
            column = old_df.iloc[:, idx]
            if not Utils.is_column_able_to_query(column):
                continue
            this_column_result = self.augment.query(
                col=column,
                minimum_should_match_ratio_for_col=minimum_should_match,
                query_string=query_string)
            if this_column_result:
                ret["result"].append({
                    "column_idx": idx,
                    # Cap with the class constant instead of a magic number.
                    "datasets_metadata": this_column_result[:self.MAX_MATCH]
                })
        return ret
示例#5
0
class TestAugment(unittest.TestCase):
    """Unit tests for ``Augment`` using a small in-memory people table."""

    def setUp(self):
        """Create an ``Augment`` on the "fake" index and the shared DataFrame."""
        self.augment = Augment(es_index="fake")
        self.assertDataframeEqual = assert_frame_equal

        names = ['Tom', 'Jack', 'Steve', 'Ricky']
        ages = [28, 34, 29, 42]
        dates = ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        frame = pd.DataFrame({'Name': names, 'Age': ages, 'Date': dates})
        self.df = frame.infer_objects()

    @Utils.test_print
    def test_joiner(self):
        """Join the frame with its own first two rows using the "default" joiner."""
        # Rows without a partner on the right side carry NaN in the *_y columns.
        expected_columns = {
            'Age': [28, 34, 29, 42],
            'Date_x': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"],
            'Name_x': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Date_y': ["2018-10-05", "2014-02-23", np.nan, np.nan],
            'Name_y': ['Tom', 'Jack', np.nan, np.nan]
        }
        expected = pd.DataFrame(expected_columns,
                                columns=expected_columns.keys())

        first_two_rows = self.df.iloc[:2, :]
        joined = self.augment.join(left_df=self.df,
                                   right_df=first_two_rows,
                                   left_columns=[[0]],
                                   right_columns=[[0]],
                                   joiner="default")
        self.assertDataframeEqual(joined.df, expected)
示例#6
0
def join(left_data: pd.DataFrame or str or d3m_ds.Dataset,
         right_data: Dataset or int or pd.DataFrame or str or d3m_ds.Dataset,
         left_columns: typing.List[typing.List[int or str]],
         right_columns: typing.List[typing.List[int or str]],
         left_meta: dict=None,
         joiner=JoinerType.RLTK
         ) -> JoinResult:
    """
    Join two tables on the given key columns.

    :param left_data: a tabular data
    :param right_data: a tabular data or the datamart.Dataset(metadata with materialize info)
                        or an int for the datamart_id - Recommend to use datamart.Dataset or ID
    :param left_columns: list of index(indices)/header(headers) for each "key" for joining
    :param right_columns: list of index(indices)/header(headers) for each "key" for joining(same length as left_columns)
    :param left_meta: metadata of the left table, forwarded to ``Augment.join``
                      (not forwarded when ``right_data`` is a datamart.Dataset)
    :param joiner: the joiner to use, forwarded unchanged
    :return: a JoinResult - the result of ``augment``/``Augment.join``, or
             ``JoinResult(left_df, [])`` when either side did not load as a
             DataFrame or a column list is empty
    """

    # datamart.Dataset inputs are delegated to augment().
    if isinstance(right_data, Dataset):
        join_keys = (left_columns, right_columns)
        return augment(left_data, right_data, join_keys, joiner)

    print(" - start loading data")
    left_table = DataLoader.load_data(left_data)
    if isinstance(right_data, int):
        # An int is a datamart id: fetch its metadata together with the data.
        right_meta, right_table = DataLoader.load_meta_and_data_by_id(right_data)
    else:
        right_meta, right_table = None, DataLoader.load_data(right_data)

    both_tabular = isinstance(left_table, pd.DataFrame) and isinstance(right_table, pd.DataFrame)
    if not both_tabular or not left_columns or not right_columns:
        return JoinResult(left_table, [])

    engine = Augment(es_index=PRODUCTION_ES_INDEX)

    print(" - satrt augmenting")
    return engine.join(
            left_df=left_table,
            right_df=right_table,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_meta,
            right_metadata=right_meta,
            joiner=joiner
    )
示例#7
0
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset,
            joining_columns: typing.Tuple[typing.List[typing.List[int or str]], typing.List[typing.List[int or str]]]=None,
            joiner=JoinerType.RLTK
            ) -> JoinResult:
    """
    Join the caller's data with a datamart dataset.

    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to augment; it is passed through
            ``DataLoader.load_data`` before use.
        augment_data: the datamart ``Dataset`` providing the right-hand table,
            its metadata and its ``join_columns``.
        joining_columns: optional (left columns, right columns) pair chosen by
            the user; when given it is applied to ``augment_data`` with
            ``set_join_columns``. A failure there is printed, not raised.
        joiner: the joiner to use, forwarded unchanged to ``Augment.join``.

    Returns:
        ``JoinResult(loaded_data, [])`` when ``augment_data`` has no join
        columns, otherwise the value returned by ``Augment.join``.

    """

    left_table = DataLoader.load_data(original_data)

    if joining_columns:
        # Best effort: keep going with whatever join columns are already set.
        try:
            augment_data.set_join_columns(*joining_columns)
        except Exception as e:
            print("FAILED SET JOINING COLUMNS:", e)

    if not augment_data.join_columns:
        return JoinResult(left_table, [])

    left_keys, right_keys = augment_data.join_columns
    engine = Augment(es_index=PRODUCTION_ES_INDEX)

    join_kwargs = {
        'left_df': left_table,
        'right_df': augment_data.materialize(),
        'left_columns': left_keys,
        'right_columns': right_keys,
        'left_metadata': None,
        'right_metadata': augment_data.metadata,
        'joiner': joiner,
    }
    return engine.join(**join_kwargs)
示例#8
0
class JoinDatasets(object):
    """Join an uploaded CSV with a dataset selected from a datamart search."""

    def __init__(self, es_index="datamart"):
        """Create the ``Augment`` helper bound to ``es_index``."""
        # The attribute spelling is kept as-is; code outside this class may read it.
        self.augument = Augment(es_index=es_index)

    @staticmethod
    def _initial_constrains(query_data):
        """Return the request's "constrains" dict, or a fresh dict when the key
        is missing or its value is not a dict (e.g. JSON ``null``)."""
        constrains = query_data.get("constrains")
        return constrains if isinstance(constrains, dict) else {}

    def default_join(self, request):
        """Join the uploaded CSV with the dataset named in the request.

        Args:
            request: object exposing ``form['data']`` (a JSON string with
                "selected_metadata", "old_df_column_ids" and optionally
                "constrains") and ``files['file']`` (the CSV to augment).

        Returns:
            The CSV text of the joined table, or the CSV text of the uploaded
            table unchanged when the selected metadata yields no matched
            queries.
        """
        # print(request.form, request.files)
        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]

        old_df = pd.read_csv(request.files['file']).infer_objects()

        offset_and_matched_queries = Utils.get_offset_and_matched_queries_from_variable_metadata(
            metadata=selected_metadata)

        if not offset_and_matched_queries:
            return old_df.to_csv()

        # Always a dict, so the item assignment below cannot fail on None.
        constrains = self._initial_constrains(query_data)
        constrains["named_entity"] = {
            offset: matched_queries
            for offset, matched_queries in offset_and_matched_queries
        }

        new_df = self.augument.get_dataset(
            metadata=selected_metadata["_source"], constrains=constrains)

        df = self.augument.join(
            left_df=old_df,
            right_df=new_df,
            left_columns=[int(x) for x in query_data["old_df_column_ids"]],
            right_columns=[offset for offset, _ in offset_and_matched_queries])

        return df.to_csv()
示例#9
0
def search(
    query: dict,
    data: pd.DataFrame or str or d3m_ds.Dataset = None
) -> typing.List[Dataset]:
    """
    Search the datamart for datasets that can augment ``data``.

    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        query: JSON object describing the query(https://datadrivendiscovery.org/wiki/display/work/Query+results+schema).
            The caller's dict is never modified.
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file

    Returns: a list of datamart.Dataset objects (empty when nothing is found).

    """
    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=DEFAULT_ES)

    has_required = bool(query) and 'required_variables' in query
    if not has_required and loaded_data is not None:
        # No "required_variables" but a dataset exists: require every queryable
        # column. Build this on a shallow copy so the caller's dict does not
        # gain a key it never set.
        query = dict(query) if query else {}
        query['required_variables'] = [
            {"type": "dataframe_columns", "names": [col]}
            for col in loaded_data
            if Utils.is_column_able_to_query(loaded_data[col])
        ]

    es_results = augmenter.query_by_json(query, loaded_data) or []
    return [
        Dataset(es_result, original_data=loaded_data, query_json=query)
        for es_result in es_results
    ]
示例#10
0
 def __init__(self, es_index="datamart_all"):
     """Create an ``Augment`` bound to ``es_index`` and keep it on ``self.augment``."""
     helper = Augment(es_index=es_index)
     self.augment = helper
示例#11
0
class JoinDatasets(object):
    """Join a caller-supplied DataFrame with a dataset picked from a default search."""

    def __init__(self, es_index="datamart"):
        """Create the ``Augment`` helper bound to ``es_index``."""
        self.augment = Augment(es_index=es_index)

    @staticmethod
    def _initial_constrains(query_data):
        """Return the request's "constrains" dict, or a fresh dict when the key
        is missing or its value is not a dict (e.g. JSON ``null``)."""
        constrains = query_data.get("constrains")
        return constrains if isinstance(constrains, dict) else {}

    @staticmethod
    def _first_date_range(left_metadata):
        """Return {"start", "end"} from the first variable whose
        "temporal_coverage" has both values set, or None when there is none."""
        for variable in left_metadata.get("variables") or []:
            coverage = variable.get("temporal_coverage")
            if coverage and coverage.get("start") and coverage.get("end"):
                return {"start": coverage["start"], "end": coverage["end"]}
        return None

    def default_join(self, request, old_df):
        """Fetch the selected dataset and join it with ``old_df``.

        Args:
            request: object exposing ``form['data']``, a JSON string with
                "selected_metadata", "columns_mapping" (entries holding
                "old_cols" and "new_cols") and optionally "constrains".
            old_df: the DataFrame to augment.

        Returns:
            The CSV text of the joined table on success. Otherwise a JSON
            string with a "message" explaining which step failed: no inner
            hits in the selected metadata, fetching the dataset, or joining.
        """
        left_metadata = Utils.generate_metadata_from_dataframe(data=old_df)

        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]
        columns_mapping = query_data["columns_mapping"]

        # Always a dict, so the item assignments below cannot fail on None.
        constrains = self._initial_constrains(query_data)

        matches = Utils.get_inner_hits_info(hitted_es_result=selected_metadata)

        if not matches:
            return json.dumps({
                "message":
                "Default join should perform after default search using default search result"
            })

        constrains[
            "named_entity"] = Utils.get_named_entity_constrain_from_inner_hits(
                matches)

        # get temporal coverage from provided dataframe
        date_range = self._first_date_range(left_metadata)
        if date_range is not None:
            constrains["date_range"] = date_range

        # ``except Exception`` keeps the best-effort error messages while
        # letting KeyboardInterrupt/SystemExit propagate.
        try:
            new_df = Utils.get_dataset(metadata=selected_metadata["_source"],
                                       constrains=constrains)
        except Exception:
            return json.dumps({
                "message":
                "Failed to join, not getting complementary dataset"
            })

        try:
            df = self.augment.join(
                left_df=old_df,
                right_df=new_df,
                left_columns=[x["old_cols"] for x in columns_mapping],
                right_columns=[x["new_cols"] for x in columns_mapping],
                left_metadata=left_metadata,
                right_metadata=selected_metadata["_source"],
                joiner="default")
        except Exception:
            return json.dumps(
                {"message": "Failed to join, con not join two dataframes"})

        return df.to_csv()
示例#12
0
class SearchMetadata(object):
    """Find datamart datasets that could augment the columns of an uploaded CSV."""

    # Maximum number of dataset hits reported for a single column.
    MAX_MATCH = 10

    MAX_DISPLAY_NAMED_ENTITY = 10

    def __init__(self, es_index="datamart_all"):
        """Create the ``Augment`` helper bound to ``es_index``."""
        # The attribute spelling is kept as-is; code outside this class may read it.
        self.augument = Augment(es_index=es_index)

    @classmethod
    def _top_matches(cls, column_hits, allowed_ids=None):
        """Return at most ``MAX_MATCH`` hits, first restricted to those whose
        ``_source.datamart_id`` is in ``allowed_ids`` when that set is given."""
        if allowed_ids is not None:
            column_hits = [
                hit for hit in column_hits
                if hit["_source"]["datamart_id"] in allowed_ids
            ]
        return column_hits[:cls.MAX_MATCH]

    def default_search_by_csv(self, request):
        """Query the datamart once per object-dtype column of the uploaded CSV.

        Args:
            request: object exposing ``args`` (may carry "query_string" and an
                integer-like "minimum_should_match_for_column") and
                ``files['file']`` (the CSV to search with).

        Returns:
            A JSON string with a failure "message" when the CSV parses to an
            empty DataFrame. Otherwise a dict with "message" and "result",
            where each result entry holds a "column_idx" and up to
            ``MAX_MATCH`` hits under "datasets_metadata". When the query
            string itself produced hits, the column hits are limited to those
            datasets.
        """
        query_string = request.args.get("query_string", None)
        minimum_should_match_for_column = int(
            request.args.get("minimum_should_match_for_column")
        ) if "minimum_should_match_for_column" in request.args else None

        df = pd.read_csv(request.files['file']).infer_objects()
        if df.empty:
            return json.dumps({
                "message":
                "Failed to create Dataframe from csv, nothing found"
            })

        ret = {
            "message": "Created Dataframe and finding datasets for augmenting",
            "result": []
        }

        query_string_result = self.augument.query_any_field_with_string(
            query_string=query_string) if query_string else None

        # None means "no restriction"; a set restricts the column hits to the
        # datasets that also matched the free-text query.
        query_string_result_ids = None
        if query_string_result:
            query_string_result_ids = {
                x["_source"]["datamart_id"]
                for x in query_string_result
            }

        for idx in range(df.shape[1]):
            column = df.iloc[:, idx]
            if not is_object_dtype(column):
                continue
            this_column_result = self.augument.query_by_column(
                col=column,
                minimum_should_match=minimum_should_match_for_column)
            if this_column_result:
                # An entry is added even when the restriction leaves no hits.
                ret["result"].append({
                    "column_idx": idx,
                    "datasets_metadata": self._top_matches(
                        this_column_result, query_string_result_ids)
                })
        return ret