def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset) -> pd.DataFrame:
    """
    Perform the augmentation (either join or union).
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to be augmented (a pandas.DataFrame, a D3M Dataset, or a path to one)
        augment_data: the datamart.Dataset (from a search result) to augment with

    Returns:
        the augmented pandas.DataFrame, or the original data if there are no matched columns
    """

    loaded_data = DataLoader.load_data(original_data)

    if not augment_data.matched_cols:
        return loaded_data

    left_cols, right_cols = augment_data.matched_cols
    default_joiner = 'rltk'

    augmenter = Augment(es_index=DEFAULT_ES)
    augmented_data = augmenter.join(left_df=loaded_data,
                                    right_df=augment_data.materialize(),
                                    left_columns=left_cols,
                                    right_columns=right_cols,
                                    left_metadata=None,
                                    right_metadata=augment_data.metadata,
                                    joiner=default_joiner)
    return augmented_data
def search(url: str,
           query: dict,
           data: pd.DataFrame or str or d3m_ds.Dataset=None,
           send_data=True,
           max_return_docs: int=20,
           return_named_entity: bool=False) -> typing.List[Dataset]:
    """
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        url: str - the datamart server (for ISI's datamart it is meaningless, just a flag)
        query: JSON object describing the query (https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file
        send_data: (for ISI's datamart it is meaningless)

    Returns:
        a list of datamart.Dataset objects
    """

    if not url.startswith(SEARCH_URL):
        return []

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)
    es_results = []

    if (query and ('required_variables' in query)) or (loaded_data is None):
        # if "required_variables" exists in the query, or there is no data, query directly:
        es_results = augmenter.query_by_json(query, loaded_data,
                                             size=max_return_docs,
                                             return_named_entity=return_named_entity) or []
    else:
        # if there is no "required_variables" in the query JSON, but the dataset exists,
        # try each named-entity column as "required_variables" and concat the results:
        query = query or {}
        exist = set()
        for col in loaded_data:
            if Utils.is_column_able_to_query(loaded_data[col]):
                query['required_variables'] = [{
                    "type": "dataframe_columns",
                    "names": [col]
                }]
                cur_results = augmenter.query_by_json(query, loaded_data,
                                                      size=max_return_docs,
                                                      return_named_entity=return_named_entity)
                if not cur_results:
                    continue
                for res in cur_results:
                    if res['_id'] not in exist:
                        # TODO: how about the score ??
                        exist.add(res['_id'])
                        es_results.append(res)

    return [Dataset(es_result, original_data=loaded_data, query_json=query)
            for es_result in es_results]
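# Usage sketch (illustrative, not part of the module): how a client might call the
# search()/augment() pair above. The URL, query content, and CSV path are assumptions
# for the example; in particular the URL must start with the module's SEARCH_URL
# constant for search() to return anything.
if __name__ == "__main__":
    import pandas as pd

    supply_df = pd.read_csv("my_table.csv")            # hypothetical input table
    query = {"dataset": {"about": "poverty rate"}}     # hypothetical query JSON
    candidates = search("https://datamart.example/search", query, data=supply_df)
    if candidates:
        augmented_df = augment(supply_df, candidates[0])
        print(augmented_df.head())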
def setUp(self):
    self.augment = Augment(es_index="fake")
    self.assertDataframeEqual = assert_frame_equal
    data = {
        'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
        'Age': [28, 34, 29, 42],
        'Date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
    }
    self.df = pd.DataFrame(data).infer_objects()
class SearchMetadata(object):
    MAX_MATCH = 10

    def __init__(self, es_index="datamart_all"):
        self.augment = Augment(es_index=es_index)

    def default_search_by_csv(self, request, old_df):
        query_string = request.args.get("query_string", None)
        minimum_should_match = request.args.get("minimum_should_match_for_column")

        ret = {
            "message": "Created Dataframe and finding datasets for augmenting",
            "result": []
        }

        for idx in range(old_df.shape[1]):
            if Utils.is_column_able_to_query(old_df.iloc[:, idx]):
                this_column_result = self.augment.query(
                    col=old_df.iloc[:, idx],
                    minimum_should_match_ratio_for_col=minimum_should_match,
                    query_string=query_string)
                if this_column_result:
                    ret["result"].append({
                        "column_idx": idx,
                        "datasets_metadata": this_column_result[:10]
                    })

        return ret
class TestAugment(unittest.TestCase):

    def setUp(self):
        self.augment = Augment(es_index="fake")
        self.assertDataframeEqual = assert_frame_equal
        data = {
            'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Age': [28, 34, 29, 42],
            'Date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        }
        self.df = pd.DataFrame(data).infer_objects()

    @Utils.test_print
    def test_joiner(self):
        data = {
            'Age': [28, 34, 29, 42],
            'Date_x': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"],
            'Name_x': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Date_y': ["2018-10-05", "2014-02-23", np.nan, np.nan],
            'Name_y': ['Tom', 'Jack', np.nan, np.nan]
        }
        expected = pd.DataFrame(data, columns=data.keys())
        self.assertDataframeEqual(
            self.augment.join(left_df=self.df,
                              right_df=self.df.iloc[:2, :],
                              left_columns=[[0]],
                              right_columns=[[0]],
                              joiner="default").df,
            expected)
def join(left_data: pd.DataFrame or str or d3m_ds.Dataset,
         right_data: Dataset or int or pd.DataFrame or str or d3m_ds.Dataset,
         left_columns: typing.List[typing.List[int or str]],
         right_columns: typing.List[typing.List[int or str]],
         left_meta: dict=None,
         joiner=JoinerType.RLTK
         ) -> JoinResult:
    """
    :param left_data: a tabular data
    :param right_data: a tabular data, a datamart.Dataset (metadata with materialize info),
                       or an int for the datamart_id - datamart.Dataset or ID is recommended
    :param left_columns: list of indices/headers of each "key" column for joining
    :param right_columns: list of indices/headers of each "key" column for joining
                          (same length as left_columns)
    :return: a JoinResult wrapping the joined pandas.DataFrame
    """

    if isinstance(right_data, Dataset):
        return augment(left_data, right_data, (left_columns, right_columns), joiner)

    print(" - start loading data")
    left_df = DataLoader.load_data(left_data)

    right_metadata = None
    if isinstance(right_data, int):
        right_metadata, right_df = DataLoader.load_meta_and_data_by_id(right_data)
    else:
        right_df = DataLoader.load_data(right_data)

    if not (isinstance(left_df, pd.DataFrame) and isinstance(right_df, pd.DataFrame)
            and left_columns and right_columns):
        return JoinResult(left_df, [])

    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    print(" - start augmenting")
    augmented_data = augmenter.join(
        left_df=left_df,
        right_df=right_df,
        left_columns=left_columns,
        right_columns=right_columns,
        left_metadata=left_meta,
        right_metadata=right_metadata,
        joiner=joiner
    )
    return augmented_data
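# Usage sketch (illustrative, not part of the module): joining a local table against a
# datamart dataset referenced by id. The datamart_id, CSV path, and column indices are
# assumptions for the example; JoinResult is assumed to expose the joined table as .df,
# as in the test above.
if __name__ == "__main__":
    import pandas as pd

    left_df = pd.read_csv("my_table.csv")              # hypothetical input table
    result = join(left_data=left_df,
                  right_data=123456,                    # hypothetical datamart_id
                  left_columns=[[0]],                   # join on the first column of each side
                  right_columns=[[0]])
    print(result.df.head())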
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset,
            joining_columns: typing.Tuple[typing.List[typing.List[int or str]],
                                          typing.List[typing.List[int or str]]]=None,
            joiner=JoinerType.RLTK
            ) -> JoinResult:
    """
    Perform the augmentation (either join or union).
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to be augmented (a pandas.DataFrame, a D3M Dataset, or a path to one)
        augment_data: the datamart.Dataset (from a search result) to augment with
        joining_columns: user-defined pair of (left_columns, right_columns) to join on

    Returns:
        a JoinResult wrapping the augmented data (the original data if no join columns are available)
    """

    loaded_data = DataLoader.load_data(original_data)

    if joining_columns:
        try:
            augment_data.set_join_columns(*joining_columns)
        except Exception as e:
            print("FAILED TO SET JOINING COLUMNS:", e)

    if not augment_data.join_columns:
        return JoinResult(loaded_data, [])

    left_cols, right_cols = augment_data.join_columns

    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)
    augmented_data = augmenter.join(
        left_df=loaded_data,
        right_df=augment_data.materialize(),
        left_columns=left_cols,
        right_columns=right_cols,
        left_metadata=None,
        right_metadata=augment_data.metadata,
        joiner=joiner
    )
    return augmented_data
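# Usage sketch (illustrative, not part of the module): overriding the pre-computed join
# columns when augmenting. `candidate` stands for a datamart.Dataset returned by a prior
# search() call; the CSV path and column indices are assumptions for the example.
def _augment_with_explicit_columns(candidate: Dataset) -> JoinResult:
    import pandas as pd

    supply_df = pd.read_csv("my_table.csv")            # hypothetical input table
    # Join the first column of the local table against the first column of the candidate.
    return augment(supply_df, candidate, joining_columns=([[0]], [[0]]))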
class JoinDatasets(object):

    def __init__(self, es_index="datamart"):
        self.augument = Augment(es_index=es_index)

    def default_join(self, request):
        # print(request.form, request.files)
        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]
        old_df = pd.read_csv(request.files['file']).infer_objects()

        offset_and_matched_queries = Utils.get_offset_and_matched_queries_from_variable_metadata(
            metadata=selected_metadata)

        if not offset_and_matched_queries:
            return old_df.to_csv()

        # Use the caller-provided constraints if present; fall back to an empty dict so the
        # named-entity constraint below can always be attached.
        constrains = query_data.get("constrains") or {}

        constrains["named_entity"] = {}
        for offset, matched_queries in offset_and_matched_queries:
            constrains["named_entity"][offset] = matched_queries

        new_df = self.augument.get_dataset(
            metadata=selected_metadata["_source"], constrains=constrains)

        df = self.augument.join(
            left_df=old_df,
            right_df=new_df,
            left_columns=[int(x) for x in query_data["old_df_column_ids"]],
            right_columns=[offset for offset, _ in offset_and_matched_queries])

        return df.to_csv()
def search(query: dict,
           data: pd.DataFrame or str or d3m_ds.Dataset = None) -> typing.List[Dataset]:
    """
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        query: JSON object describing the query (https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file

    Returns:
        a list of datamart.Dataset objects.
    """

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=DEFAULT_ES)

    if not (query and ('required_variables' in query)) and (loaded_data is not None):
        # no "required_variables" in the query but data was provided:
        # build one entry per column that can be queried.
        query = query or {}
        query['required_variables'] = []
        for col in loaded_data:
            if Utils.is_column_able_to_query(loaded_data[col]):
                query['required_variables'].append({
                    "type": "dataframe_columns",
                    "names": [col]
                })

    es_results = augmenter.query_by_json(query, loaded_data)
    if es_results:
        return [Dataset(es_result, original_data=loaded_data, query_json=query)
                for es_result in es_results]
    return []
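# Usage sketch (illustrative, not part of the module): a minimal search-then-augment flow
# with the query-only signature above. The query content and CSV path are assumptions.
if __name__ == "__main__":
    import pandas as pd

    supply_df = pd.read_csv("my_table.csv")            # hypothetical input table
    candidates = search({"dataset": {"about": "poverty rate"}}, data=supply_df)
    for candidate in candidates[:3]:
        result = augment(supply_df, candidate)          # uses the dataset's pre-computed join columns
        print(result.df.shape)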
def __init__(self, es_index="datamart_all"):
    self.augment = Augment(es_index=es_index)
class JoinDatasets(object):

    def __init__(self, es_index="datamart"):
        self.augment = Augment(es_index=es_index)

    def default_join(self, request, old_df):
        left_metadata = Utils.generate_metadata_from_dataframe(data=old_df)
        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]
        columns_mapping = query_data["columns_mapping"]

        # Use the caller-provided constraints if present; fall back to an empty dict.
        constrains = query_data.get("constrains") or {}

        matches = Utils.get_inner_hits_info(hitted_es_result=selected_metadata)
        if not matches:
            return json.dumps({
                "message": "Default join should be performed after a default search, using the default search result"
            })

        constrains["named_entity"] = Utils.get_named_entity_constrain_from_inner_hits(matches)

        # get temporal coverage from the provided dataframe
        if left_metadata.get("variables", []):
            for variable in left_metadata["variables"]:
                if variable.get("temporal_coverage") and \
                        variable["temporal_coverage"].get("start") and \
                        variable["temporal_coverage"].get("end"):
                    constrains["date_range"] = {
                        "start": variable["temporal_coverage"]["start"],
                        "end": variable["temporal_coverage"]["end"]
                    }
                    break

        try:
            new_df = Utils.get_dataset(metadata=selected_metadata["_source"],
                                       constrains=constrains)
        except Exception:
            return json.dumps({
                "message": "Failed to join: could not get the complementary dataset"
            })

        try:
            df = self.augment.join(
                left_df=old_df,
                right_df=new_df,
                left_columns=[x["old_cols"] for x in columns_mapping],
                right_columns=[x["new_cols"] for x in columns_mapping],
                left_metadata=left_metadata,
                right_metadata=selected_metadata["_source"],
                joiner="default")
        except Exception:
            return json.dumps(
                {"message": "Failed to join: cannot join the two dataframes"})

        return df.to_csv()
class SearchMetadata(object):
    MAX_MATCH = 10
    MAX_DISPLAY_NAMED_ENTITY = 10

    def __init__(self, es_index="datamart_all"):
        self.augument = Augment(es_index=es_index)

    def default_search_by_csv(self, request):
        query_string = request.args.get("query_string", None)
        minimum_should_match_for_column = int(
            request.args.get("minimum_should_match_for_column")
        ) if "minimum_should_match_for_column" in request.args else None

        df = pd.read_csv(request.files['file']).infer_objects()
        if df is None or df.empty:
            return json.dumps({
                "message": "Failed to create Dataframe from csv, nothing found"
            })

        ret = {
            "message": "Created Dataframe and finding datasets for augmenting",
            "result": []
        }

        query_string_result = self.augument.query_any_field_with_string(
            query_string=query_string) if query_string else None

        query_string_result_ids = None
        if query_string_result:
            query_string_result_ids = {
                x["_source"]["datamart_id"] for x in query_string_result
            }

        for idx in range(df.shape[1]):
            if is_object_dtype(df.iloc[:, idx]):
                this_column_result = self.augument.query_by_column(
                    col=df.iloc[:, idx],
                    minimum_should_match=minimum_should_match_for_column)
                if this_column_result:
                    if not query_string_result:
                        ret["result"].append({
                            "column_idx": idx,
                            "datasets_metadata": this_column_result[:10]
                        })
                    else:
                        ret["result"].append({
                            "column_idx": idx,
                            "datasets_metadata": [
                                x for x in this_column_result
                                if x["_source"]["datamart_id"] in query_string_result_ids
                            ][:10]
                        })

        return ret
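# Usage sketch (illustrative, not part of the module): one way the request handlers above
# could be exposed as HTTP endpoints. Flask, the route paths, and the index names are
# assumptions for the example, not something this code prescribes; the handlers only
# require a request object exposing .args, .form, and .files.
from flask import Flask, request, jsonify

app = Flask(__name__)
search_metadata = SearchMetadata(es_index="datamart_all")
join_datasets = JoinDatasets(es_index="datamart")

@app.route("/search", methods=["POST"])
def search_endpoint():
    # default_search_by_csv reads the uploaded CSV and query args from the request object.
    return jsonify(search_metadata.default_search_by_csv(request))

@app.route("/join", methods=["POST"])
def join_endpoint():
    # default_join returns either a CSV string or a JSON error message.
    return join_datasets.default_join(request)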