Example No. 1
 def test_temporal_coverage_validate(self):
     coverage = {}
     self.assertEqual(Utils.temporal_coverage_validate(coverage), {
         "start": None,
         "end": None
     })
     coverage = {"start": None}
     self.assertEqual(Utils.temporal_coverage_validate(coverage), {
         "start": None,
         "end": None
     })
     coverage = {"end": None}
     self.assertEqual(Utils.temporal_coverage_validate(coverage), {
         "start": None,
         "end": None
     })
     coverage = {"start": "2018-09-23T00:00:00", "end": "2018-10-10"}
     self.assertEqual(Utils.temporal_coverage_validate(coverage), {
         'end': '2018-10-10T00:00:00',
         'start': '2018-09-23T00:00:00'
     })
     coverage = {"start": "2018-00", "end": "2018-10-10"}
     self.assertEqual(Utils.temporal_coverage_validate(coverage), {
         'end': '2018-10-10T00:00:00',
         'start': None
     })
Example No. 2
def upload(meta_list: typing.List[dict],
           es_index: str = PRODUCTION_ES_INDEX,
           deduplicate: bool = True,
           index_builder: IndexBuilder = None) -> typing.List[dict]:
    ib = index_builder or IndexBuilder()
    succeeded = []
    for meta in meta_list:
        try:
            Utils.validate_schema(meta)
            meta['datamart_status'] = 'not_profiled'
            if deduplicate:
                exist_id = check_existence(meta['materialization'],
                                           es_index=es_index)
                if exist_id:
                    success = ib.updating_send_trusted_metadata(
                        metadata=meta, es_index=es_index, datamart_id=exist_id)
                else:
                    success = ib.indexing_send_to_es(metadata=meta,
                                                     es_index=es_index)
            else:
                success = ib.indexing_send_to_es(metadata=meta,
                                                 es_index=es_index)
            if success:
                succeeded.append(success)
        except Exception as e:
            print('UPLOAD FAILED: ', str(e))
            continue
    return succeeded
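As a usage sketch (not from the source): the metadata fields below are illustrative placeholders and would still need to pass Utils.validate_schema, and a reachable Elasticsearch cluster is assumed.

sample_meta = {
    "title": "NOAA daily temperature",                  # hypothetical dataset
    "description": "Daily TAVG readings by station",
    "materialization": {
        "python_path": "noaa_materializer",
        "arguments": {"type": "TAVG"}
    }
}
# Upload to a test index without deduplication; the successfully indexed docs are returned.
uploaded = upload(meta_list=[sample_meta], es_index="test_index", deduplicate=False)
print(len(uploaded), "document(s) indexed")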
Example No. 3
    def match_temporal_coverage(cls,
                                start: str = None,
                                end: str = None) -> typing.Optional[dict]:
        """Generate query body for query by temporal_coverage.

        Args:
            start: the dataset's temporal coverage should begin no later than this date.
            end: the dataset's temporal coverage should extend to at least this date.

        Returns:
            dict of query body
        """

        start = Utils.date_validate(date_text=start) if start else None
        end = Utils.date_validate(date_text=end) if end else None
        if not start and not end:
            warnings.warn("Start and end are valid")
            return None

        body = {
            "nested": {
                "path": "variables",
                "inner_hits": {
                    "_source": ["temporal_coverage"]
                },
                "query": {
                    "bool": {
                        "must": []
                    }
                }
            }
        }

        if start:
            body["nested"]["query"]["bool"]["must"].append({
                "range": {
                    "variables.temporal_coverage.start": {
                        "lte": start,
                        "format": "yyyy-MM-dd'T'HH:mm:ss"
                    }
                }
            })

        if end:
            body["nested"]["query"]["bool"]["must"].append({
                "range": {
                    "variables.temporal_coverage.end": {
                        "gte": end,
                        "format": "yyyy-MM-dd'T'HH:mm:ss"
                    }
                }
            })

        return body
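For illustration, a hedged sketch of calling this classmethod; the owning class name QueryBuilder is an assumption, not shown in the snippet.

# Hypothetical call; both dates are normalized through Utils.date_validate first.
body = QueryBuilder.match_temporal_coverage(start="2018-09-23", end="2018-10-10")
# body["nested"]["query"]["bool"]["must"] now holds two range clauses:
#   variables.temporal_coverage.start lte "2018-09-23T00:00:00"
#   variables.temporal_coverage.end   gte "2018-10-10T00:00:00"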
Example No. 4
    def updating(self,
                 description_path: str,
                 es_index: str,
                 document_id: int,
                 data_path: str = None,
                 query_data_for_updating: bool = False) -> dict:
        """Update document in elastic search.

        By providing description file, index builder should be able to process it and create metadata json for the
        dataset, update document in elastic search

        Args:
            description_path: Path to description json file.
            es_index: str, es index for this dataset
            document_id: int, document id of document which need to be updated
            data_path: Path to data csv file.
            query_data_for_updating: bool. If no data is provided and query_data_for_updating is False, only
                metadata will be created from the description json. If query_data_for_updating is True and no data
                is provided, Utils.materialize will be used to query the data for profiling and indexing.

        Returns:
            metadata dictionary

        """
        """
        Not keep up to date for a while, may not work well. But updating is not very useful as well.
        """

        self._check_es_index(es_index=es_index)

        description, data = self._read_data(description_path, data_path)
        if not data and query_data_for_updating:
            try:
                data = Utils.materialize(metadata=description).infer_objects()
            except Exception:
                traceback.print_exc()
                warnings.warn(
                    "Materialization Failed, index based on schema json only. (%s)"
                    % description_path)

        metadata = self.construct_global_metadata(
            description=description,
            data=data,
            overwrite_datamart_id=document_id)
        Utils.validate_schema(metadata)

        self.im.update_doc(index=es_index,
                           doc_type='document',
                           body={"doc": metadata},
                           id=metadata['datamart_id'])

        return metadata
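A minimal sketch of a call, assuming updating is a method of IndexBuilder as the neighbouring examples suggest; the path and document id are placeholders.

ib = IndexBuilder()
# Hypothetical description file and id; requires a reachable Elasticsearch index.
updated_meta = ib.updating(description_path="resources/sample_schema.json",
                           es_index="datamart_test",
                           document_id=123456,
                           query_data_for_updating=True)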
Example No. 5
 def updating_send_trusted_metadata(self, metadata: dict, datamart_id: int,
                                    es_index: str):
     self.update_datamart_id(metadata=metadata, datamart_id=datamart_id)
     Utils.validate_schema(metadata)
     try:
         self.im.update_doc(index=es_index,
                            doc_type='_doc',
                            body={"doc": metadata},
                            id=metadata['datamart_id'])
         return metadata
     except TransportError as e:
         print(e.info)
     except Exception:
         pass
Example No. 6
    def join(self,
             left_df: pd.DataFrame,
             right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]],
             left_metadata: dict = None,
             right_metadata: dict = None,
             joiner: str = "default") -> typing.Optional[pd.DataFrame]:
        """Join two dataframes based on different joiner.

          Args:
              left_df: pandas Dataframe
              right_df: pandas Dataframe
              left_metadata: metadata of left dataframe
              right_metadata: metadata of right dataframe
              left_columns: list of integers from left df for join
              right_columns: list of integers from right df for join
              joiner: name of the joiner to use; defaults to "default"

          Returns:
               Dataframe
          """

        if joiner not in self.joiners:
            self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)

        if not self.joiners[joiner]:
            warnings.warn("No suitable joiner, return original dataframe")
            return left_df

        if not left_metadata:
            # Left df is the user provided one.
            # We will generate metadata just based on the data itself, profiling and so on
            left_metadata = Utils.generate_metadata_from_dataframe(
                data=left_df)

        left_metadata = Utils.calculate_dsbox_features(data=left_df,
                                                       metadata=left_metadata)
        right_metadata = Utils.calculate_dsbox_features(
            data=right_df, metadata=right_metadata)

        return self.joiners[joiner].join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_metadata,
            right_metadata=right_metadata,
        )
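A short sketch of a call, assuming "augmenter" is an instance of the class that owns join (a hypothetical name); columns are addressed by integer position.

import pandas as pd

left = pd.DataFrame({"city": ["los angeles", "new york"], "value": [1, 2]})
right = pd.DataFrame({"city": ["new york", "los angeles"], "temp": [60, 75]})
# Hypothetical owner instance; join on column 0 of each dataframe.
joined = augmenter.join(left_df=left, right_df=right,
                        left_columns=[[0]], right_columns=[[0]],
                        joiner="default")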
Example No. 7
    def default_search_by_csv(self, request, old_df):

        query_string = request.args.get("query_string", None)
        minimum_should_match = request.args.get(
            "minimum_should_match_for_column")

        ret = {
            "message": "Created Dataframe and finding datasets for augmenting",
            "result": []
        }

        for idx in range(old_df.shape[1]):
            if Utils.is_column_able_to_query(old_df.iloc[:, idx]):
                this_column_result = self.augment.query(
                    col=old_df.iloc[:, idx],
                    minimum_should_match_ratio_for_col=minimum_should_match,
                    query_string=query_string)
                if this_column_result:
                    ret["result"].append({
                        "column_idx":
                        idx,
                        "datasets_metadata":
                        this_column_result[:10]
                    })
        return ret
Example No. 8
 def test_validate_schema(self):
     with open(
             os.path.join(os.path.dirname(__file__),
                          "resources/sample_schema.json"), "r") as f:
         description = json.load(f)
     self.assertEqual(Utils.validate_schema(description["description"]),
                      True)
Example No. 9
    def test_get_dataset(self):
        fake_metadata = {
            "materialization": {
                "python_path": "noaa_materializer",
                "arguments": {
                    "type": "TAVG"
                }
            }
        }

        fake_constrains = {
            "named_entity": {
                2: ["new york", "sdasds"]
            },
            "date_range": {
                "start": "2018-09-23T00:00:00",
                "end": "2018-09-30T00:00:00"
            }
        }

        df = Utils.get_dataset(metadata=fake_metadata,
                               variables=[0, 2, 3],
                               constrains=fake_constrains)

        ground_truth = pd.read_csv(
            os.path.join(os.path.dirname(__file__), "./resources",
                         "test_augment.csv"))
        self.dataframe_equal(ground_truth, df)
Example No. 10
 def test_materialize(self):
     fake_metadata = {
         "materialization": {
             "python_path": "noaa_materializer",
             "arguments": {
                 "type": 'PRCP'
             }
         }
     }
     fake_constrains = {
         "date_range": {
             "start": "2016-09-23",
             "end": "2016-09-23"
         },
         "named_entity": {
             2: ["los angeles"]
         }
     }
     result = Utils.materialize(metadata=fake_metadata,
                                constrains=fake_constrains).infer_objects()
     print(result)
     expected = pd.read_csv(
         os.path.join(os.path.dirname(__file__),
                      "resources/noaa_result.csv"))
     self.dataframe_equal(result, expected)
Example No. 11
def search(url: str,
           query: dict,
           data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset] = None,
           send_data: bool = True,
           max_return_docs: int = 20,
           return_named_entity: bool = False) -> typing.List[Dataset]:
    """
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        url: str - the datamart server URL (for ISI's datamart it is meaningless, just a flag)
        query: JSON object describing the query (https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file
        send_data: (for ISI's datamart it is meaningless)
        max_return_docs: maximum number of documents to return
        return_named_entity: whether to include named entities in the returned results

    Returns: a list of datamart.Dataset objects

    """
    if not url.startswith(SEARCH_URL):
        return []

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    es_results = []
    if (query and ('required_variables' in query)) or (loaded_data is None):
        # if ("required_variables" exists or no data):
        es_results = augmenter.query_by_json(query, loaded_data,
                                             size=max_return_docs,
                                             return_named_entity=return_named_entity) or []
    else:
        # if there is no "required_variables" in the query JSON, but the dataset exists,
        # try each named entity column as "required_variables" and concat the results:
        query = query or {}
        exist = set()
        for col in loaded_data:
            if Utils.is_column_able_to_query(loaded_data[col]):
                query['required_variables'] = [{
                    "type": "dataframe_columns",
                    "names": [col]
                }]
                cur_results = augmenter.query_by_json(
                    query, loaded_data,
                    size=max_return_docs,
                    return_named_entity=return_named_entity)
                if not cur_results:
                    continue
                for res in cur_results:
                    if res['_id'] not in exist:
                        # TODO: how about the score ??
                        exist.add(res['_id'])
                        es_results.append(res)
    return [Dataset(es_result, original_data=loaded_data, query_json=query) for es_result in es_results]
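A hedged usage sketch: the URL and CSV path are placeholders, and the URL must start with the module's SEARCH_URL constant for the call to return anything.

# Hypothetical values.
results = search(url="https://datamart.example.org",   # must be prefixed by SEARCH_URL
                 query={"dataset": {"about": "weather, temperature, precipitation"}},
                 data="my_table.csv",
                 max_return_docs=5)
for dataset in results:
    print(dataset)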
Example No. 12
    def _read_data(description_path: str,
                   data_path: str = None) -> typing.Tuple[dict, pd.DataFrame]:
        """Read dataset description json and dataset if present.

        Args:
            description_path: Path to description json file.
            data_path: Path to data csv file.

        Returns:
            Tuple of (description json, dataframe of data)
        """

        description = json.load(open(description_path, 'r'))
        Utils.validate_schema(description)
        if data_path:
            data = pd.read_csv(data_path)
        else:
            data = None
        return description, data
Example No. 13
    def _read_data(description_path: typing.Union[str, dict],
                   data_path: str = None) -> typing.Tuple[dict, pd.DataFrame]:
        """Read dataset description json and dataset if present.

        Args:
            description_path: Path to description json file, or the description JSON in Python dict.
            data_path: Path to data csv file.

        Returns:
            Tuple of (description json, dataframe of data)
        """
        if isinstance(description_path, str):
            description = json.load(open(description_path, 'r'))
        else:
            description = description_path
        Utils.validate_schema(description)
        if data_path:
            data = pd.read_csv(data_path)
        else:
            data = None
        return description, data
Example No. 14
    def indexing_generate_metadata(
            self,
            description_path: typing.Union[str, dict],
            data_path: typing.Union[str, pd.DataFrame] = None,
            query_data_for_indexing: bool = False,
            save_to_file: str = None,
            save_to_file_mode: str = "a+",
            cache_dataset_path: str = None,
            enable_two_ravens_profiler: bool = False) -> dict:

        description, data = self._read_data(description_path, data_path)
        if data is None and query_data_for_indexing:
            try:
                data = Utils.materialize(metadata=description).infer_objects()
                if cache_dataset_path:
                    data.to_csv(cache_dataset_path, index=False)
            except Exception:
                traceback.print_exc()
                warnings.warn(
                    "Materialization Failed, index based on schema json only. (%s)"
                    % description_path)

        # construct global metadata without generating valid datamart_id
        metadata = self.construct_global_metadata(description=description,
                                                  data=data,
                                                  overwrite_datamart_id=0)

        if data is not None:
            metadata = self.profile(
                data=data,
                metadata=metadata,
                enable_two_ravens_profiler=enable_two_ravens_profiler)
        Utils.validate_schema(metadata)

        if save_to_file:
            self._save_data(save_to_file=save_to_file,
                            save_mode=save_to_file_mode,
                            metadata=metadata)

        return metadata
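A sketch of a call, assuming this is a method of IndexBuilder (as the other examples suggest) and using placeholder paths.

ib = IndexBuilder()
# With query_data_for_indexing=True the materializer is asked for data, so the
# resulting metadata is also profiled; otherwise only the description json is used.
meta = ib.indexing_generate_metadata(description_path="resources/sample_schema.json",
                                     query_data_for_indexing=True,
                                     save_to_file="generated_metadata.jsonl")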
Example No. 15
 def test_calculate_dsbox_features(self):
     expected = {
         'variables': [{
             'dsbox_profiled': {
                 'ratio_of_numeric_values': 1.0,
                 'number_of_outlier_numeric_values': 0
             }
         }, {
             'dsbox_profiled': {
                 'most_common_tokens': [{
                     'name': '2014-02-23',
                     'count': 1
                 }, {
                     'name': '2018-10-05',
                     'count': 1
                 }, {
                     'name': '2020-09-23',
                     'count': 1
                 }, {
                     'name': '2023-02-13',
                     'count': 1
                 }],
                 'number_of_tokens_containing_numeric_char': 4,
                 'ratio_of_tokens_containing_numeric_char': 1.0,
                 'number_of_values_containing_numeric_char': 4,
                 'ratio_of_values_containing_numeric_char': 1.0
             }
         }, {
             'dsbox_profiled': {
                 'most_common_tokens': [{
                     'name': 'Jack',
                     'count': 1
                 }, {
                     'name': 'Ricky',
                     'count': 1
                 }, {
                     'name': 'Steve',
                     'count': 1
                 }, {
                     'name': 'Tom',
                     'count': 1
                 }]
             }
         }]
     }
     self.assertDictEqual(
         Utils.calculate_dsbox_features(
             data=self.df, metadata={"variables": [{}, {}, {}]}), expected)
Example No. 16
 def load_meta_and_data_by_id(datamart_id: int,
                              first_n_rows: int = None,
                              constrains=None):
     qm = QueryManager(es_host=ES_HOST,
                       es_port=ES_PORT,
                       es_index=PRODUCTION_ES_INDEX)
     res = qm.get_by_id(datamart_id)
     if res and res.get('_source'):
         df = Utils.get_dataset(res['_source'], constrains=constrains)
         if first_n_rows:
             df = df.head(first_n_rows)
         return res['_source'], df
     return None, None
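A brief sketch (the id is a placeholder); the helper returns a (metadata, dataframe) pair, or (None, None) when the id is not found.

meta, df = load_meta_and_data_by_id(datamart_id=123456, first_n_rows=10)
if meta is not None:
    print(meta.get("title"), df.shape)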
Example No. 17
 def test_two_ravens_profiler(self):
     data = pd.DataFrame({
         'Name': ['Tom', 'Jack', 'Steve'],
         'Age': [28, 34, 29],
         'Date': ["2018-10-05", "2014-02-23", "2020-09-23"]
     })
     meta = Utils.generate_metadata_from_dataframe(data)
     res = TwoRavensProfiler().profile(data, meta)
     if meta == res:
         # TwoRavensProfiler is probably down
         print('TwoRavensProfiler is probably down. Skipping test.')
     else:
         expected_file = os.path.join(resources_path, "two_ravens.json")
         with open(expected_file) as f:
             expected = json.load(f)
         self.assertEqual(res, expected)
Example No. 18
 def test_get_named_entity_constrain_from_inner_hits(self):
     expected = {2: ['new york'], 1: ['united states']}
     self.assertDictEqual(
         Utils.get_named_entity_constrain_from_inner_hits(
             matches=[{
                 'offset': 2,
                 'matched_queries': ['new york'],
                 'highlight': {
                     'variables.named_entity': ['new york']
                 }
             }, {
                 'offset': 1,
                  'matched_queries': ['united states of american'],
                 'highlight': {
                     'variables.named_entity': ['united states']
                 }
             }]), expected)
Example No. 19
    def test_get_inner_hits_info(self):
        fake_es_result = {
            "inner_hits": {
                "variables": {
                    "hits": {
                        "hits": [{
                            "_nested": {
                                "field": "variables",
                                "offset": 2
                            },
                            "highlight": {
                                "variables.named_entity": ["new york"]
                            },
                            "matched_queries": ["new york"]
                        }, {
                            "_nested": {
                                "field": "variables",
                                "offset": 1
                            },
                            "highlight": {
                                "variables.named_entity": ["united states"]
                            },
                            "matched_queries": ["united states of american"]
                        }]
                    }
                }
            }
        }

        expected = [{
            'offset': 2,
            'matched_queries': ['new york'],
            'highlight': {
                'variables.named_entity': ['new york']
            }
        }, {
            'offset': 1,
            'matched_queries': ['united states of american'],
            'highlight': {
                'variables.named_entity': ['united states']
            }
        }]
        self.assertListEqual(
            Utils.get_inner_hits_info(hitted_es_result=fake_es_result),
            expected)
Example No. 20
    def test_get_metadata_intersection(self):
        metadata_lst1 = [{
            "_source": {
                "datamart_id": 0
            }
        }, {
            "_source": {
                "datamart_id": 1
            }
        }, {
            "_source": {
                "datamart_id": 2
            }
        }]

        metadata_lst2 = [{
            "_source": {
                "datamart_id": 0
            }
        }, {
            "_source": {
                "datamart_id": 2
            }
        }, {
            "_source": {
                "datamart_id": 3
            }
        }]

        metadata_lst3 = [{
            "_source": {
                "datamart_id": 0
            }
        }, {
            "_source": {
                "datamart_id": 3
            }
        }]

        expect = [{'_source': {'datamart_id': 0}}]
        self.assertListEqual(
            Utils.get_metadata_intersection(metadata_lst1, metadata_lst2,
                                            metadata_lst3), expect)
Example No. 21
    def test_append_columns_for_implicit_variables(self):
        implicit_variables = [{
            "name": "indicator",
            "value": "born"
        }, {
            "name": "city",
            "value": "shanghai"
        }]

        data = {
            'Age': [28, 34, 29, 42],
            'Date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"],
            'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'indicator': ["born", "born", "born", "born"],
            'city': ['shanghai', 'shanghai', 'shanghai', 'shanghai']
        }
        expected = pd.DataFrame(data, columns=data.keys())

        self.dataframe_equal(
            Utils.append_columns_for_implicit_variables(
                implicit_variables=implicit_variables, df=self.df), expected)
Example No. 22
def bulk_generate_metadata(
        html_page: str,
        description: dict = None,
        enable_two_ravens_profiler=False) -> typing.List[typing.List[dict]]:
    """

    :param html_page:
    :param description:
    :param es_index:
    :return:
    """
    succeeded = []
    hp = HTMLProcesser(html_page)
    html_meta = hp.extract_description_from_meta()
    for text, href in hp.generate_a_tags_from_html():
        try:
            cur_description = copy.deepcopy(description) or {}
            if not Utils.validate_url(href):
                continue
            if not cur_description.get('title'):
                black_list = set(
                    text.lower().split()).intersection(TITLE_BLACK_LIST)
                if not black_list:
                    cur_description['title'] = text.strip()
            if not cur_description.get('description'):
                cur_description['description'] = html_meta
            cur_description['materialization_arguments'] = {'url': href}
            # Not to extract html tables, otherwise there will be too many FPs:
            cur_metadata = generate_metadata(
                cur_description,
                ignore_html=True,
                enable_two_ravens_profiler=enable_two_ravens_profiler)
            if cur_metadata:
                succeeded.append(cur_metadata)
        except Exception as e:
            print(
                ' - FAILED GENERATE METADATA ON \n\ttext = %s, \n\thref = %s \n%s'
                % (text, href, str(e)))
    return succeeded
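A hedged usage sketch with a placeholder HTML file; every link that passes Utils.validate_url yields one candidate metadata entry.

# Hypothetical page.
with open("some_data_portal.html") as f:
    html = f.read()
all_meta = bulk_generate_metadata(html, description={"description": "open data portal"})
print(sum(len(m) for m in all_meta), "metadata records generated")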
Example No. 23
    def __init__(self,
                 description: dict,
                 datamart_id: typing.Union[int, None] = None) -> None:
        """Init method of VariableMetadata.

        Args:
            description: description dict.
            datamart_id: unique datamart_id.

        Returns:

        """

        super().__init__()

        self._metadata["datamart_id"] = datamart_id

        if "name" in description:
            self._metadata["name"] = description["name"]

        if "description" in description:
            self._metadata["description"] = description["description"]

        self._metadata["semantic_type"] = description.get("semantic_type", [])

        if "named_entity" in description:
            self._metadata["named_entity"] = description["named_entity"]

        if "temporal_coverage" in description:
            self._metadata["temporal_coverage"] = description[
                "temporal_coverage"]

        if self.temporal_coverage is not False:
            self.temporal_coverage = Utils.temporal_coverage_validate(
                self.temporal_coverage)

        if "spatial_coverage" in description:
            self._metadata["spatial_coverage"] = description[
                "spatial_coverage"]
Example No. 24
def search(
    query: dict,
    data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset] = None
) -> typing.List[Dataset]:
    """
    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        query: JSON object describing the query (https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file

    Returns: a list of datamart.Dataset objects.

    """
    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=DEFAULT_ES)
    if not (query and
            ('required_variables' in query)) and (loaded_data is not None):
        query = query or {}
        query['required_variables'] = []
        for col in loaded_data:
            if Utils.is_column_able_to_query(loaded_data[col]):
                query['required_variables'].append({
                    "type": "dataframe_columns",
                    "names": [col]
                })
    es_results = augmenter.query_by_json(query, loaded_data)
    if es_results:
        return [
            Dataset(es_result, original_data=loaded_data, query_json=query)
            for es_result in es_results
        ]
    return []
Example No. 25
 def test_generate_metadata_from_dataframe(self):
     expected = {
         'datamart_id': None,
         'materialization': {
             'python_path': 'default_materializer',
             'arguments': None
         },
         'variables': [{
             'datamart_id': None,
             'semantic_type': [],
             'name': 'Age',
             'description': 'column name: Age, dtype: int64'
         }, {
             'datamart_id': None,
             'semantic_type': [],
             'name': 'Date',
             'description': 'column name: Date, dtype: object',
             'temporal_coverage': {
                 'start': '2014-02-23T00:00:00',
                 'end': '2023-02-13T00:00:00'
             }
         }, {
             'datamart_id': None,
             'semantic_type': [],
             'name': 'Name',
             'description': 'column name: Name, dtype: object',
             'named_entity': ['Tom', 'Jack', 'Steve', 'Ricky']
         }],
         'title': 'Age Date Name',
         'description': 'Age : int64, Date : object, Name : object',
         'keywords': ['Age', 'Date', 'Name']
     }
     self.assertEqual(Utils.generate_metadata_from_dataframe(data=self.df),
                      expected)
Example No. 26
from datamart.utilities.utils import Utils
from datamart import search, augment


schema = {
    "materialization": {
        "python_path": "general_materializer",
        "arguments": {
            "url": "https://en.wikipedia.org/wiki/List_of_Rock_and_Roll_Hall_of_Fame_inductees",
            "file_type": "html"
        }
    }
}
hof_df = Utils.get_dataset(schema)

print(hof_df)

query = {
    "dataset": {
        "about": "rock and roll, music, rock music, rock artist, rock band, music award, artist award, hall of fame, singer"
    },
    "required_variables": [
        {
            "type": "dataframe_columns",
            "index": [2]
        }
    ]
}
candidates = search(query, hof_df)

res = []
Example No. 27
    def join(self,
             left_df: pd.DataFrame,
             right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]],
             left_metadata: dict = None,
             right_metadata: dict = None,
             joiner: JoinerType = JoinerType.DEFAULT) -> JoinResult:
        """Join two dataframes based on different joiner.

          Args:
              left_df: pandas Dataframe
              right_df: pandas Dataframe
              left_metadata: metadata of left dataframe
              right_metadata: metadata of right dataframe
              left_columns: list of integers from left df for join
              right_columns: list of integers from right df for join
              joiner: JoinerType enum member; defaults to JoinerType.DEFAULT

          Returns:
               JoinResult
          """

        if joiner not in self.joiners:
            self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)

        if not self.joiners[joiner]:
            warnings.warn("No suitable joiner, return original dataframe")
            return JoinResult(left_df, [])

        print(" - start profiling")
        if not (left_metadata and left_metadata.get("variables")):
            # Left df is the user provided one.
            # We will generate metadata just based on the data itself, profiling and so on
            left_metadata = Utils.generate_metadata_from_dataframe(
                data=left_df, original_meta=left_metadata)

        if not right_metadata:
            right_metadata = Utils.generate_metadata_from_dataframe(
                data=right_df)

        # Only profile the joining columns, otherwise it will be too slow:
        left_metadata = Utils.calculate_dsbox_features(
            data=left_df,
            metadata=left_metadata,
            selected_columns=set(chain.from_iterable(left_columns)))

        right_metadata = Utils.calculate_dsbox_features(
            data=right_df,
            metadata=right_metadata,
            selected_columns=set(chain.from_iterable(right_columns)))

        # update with implicit_variable on the user supplied dataset
        if left_metadata.get('implicit_variables'):
            Utils.append_columns_for_implicit_variables_and_add_meta(
                left_metadata, left_df)

        print(" - start joining tables")
        res = self.joiners[joiner].join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_metadata,
            right_metadata=right_metadata,
        )

        return res
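Compared with Example No. 6, this variant takes a JoinerType enum and returns a JoinResult. A brief hedged sketch of a call; the owner instance name and the attributes of JoinResult are assumptions.

# Hypothetical owner instance; same positional column addressing as before.
result = augmenter.join(left_df=left, right_df=right,
                        left_columns=[[0]], right_columns=[[0]],
                        joiner=JoinerType.DEFAULT)
# The joined dataframe is presumably carried on the returned JoinResult object.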
Example No. 28
 def test_is_column_able_to_query(self):
     self.assertTrue(Utils.is_column_able_to_query(col=self.df['Name']))
     self.assertFalse(Utils.is_column_able_to_query(col=self.df['Date']))
     self.assertFalse(Utils.is_column_able_to_query(col=self.df['Age']))
Example No. 29
 def test_load_materializer(self):
     materializer = Utils.load_materializer("noaa_materializer")
     self.assertEqual(issubclass(type(materializer), MaterializerBase),
                      True)
     self.assertEqual(type(materializer).__name__, NoaaMaterializer.__name__)
Example No. 30
 def test_date_validate(self):
     self.assertEqual(Utils.date_validate("2018-10-10"),
                      "2018-10-10T00:00:00")