Пример #1
0
    def indexing(self,
                 description_path: str,
                 es_index: str,
                 data_path: str = None,
                 query_data_for_indexing: bool = False,
                 save_to_file: str = None,
                 save_to_file_mode: str = "a+",
                 delete_old_es_index: bool = False) -> dict:
        """API for the index builder.

        Reads a description file (and optionally a data file), builds the
        global metadata for the dataset, validates it, optionally saves it to
        a file, and creates a document for it in the given es index.

        Args:
            description_path: Path to description json file.
            es_index: str, es index for this dataset.
            data_path: Path to data csv file.
            query_data_for_indexing: Bool. If no data is presented and this is
                False, metadata is created from the description json only.
                If it is True and no data is presented, ``Utils.materialize``
                is used to query data for profiling and indexing.
            save_to_file: str, a path to the json line file. Nothing is saved
                when this is empty or None.
            save_to_file_mode: str, mode for saving, default "a+".
            delete_old_es_index: bool, whether to delete the original es index
                if it exists.

        Returns:
            The metadata dictionary (``metadata.value``).

        Note:
            A failed materialization is not fatal: the traceback is printed, a
            warning is issued and indexing continues from the description only.
        """

        self._check_es_index(es_index=es_index,
                             delete_old_es_index=delete_old_es_index)

        # Refresh the id counter when it has not been loaded yet, or when the
        # index was just asked to be recreated.
        if not self.current_global_index or delete_old_es_index:
            self.current_global_index = self.im.current_global_datamart_id(
                index=es_index)

        description, data = self._read_data(description_path, data_path)

        # Compare against None rather than relying on truthiness: if `data` is
        # a pandas DataFrame (presumably what _read_data returns for a csv --
        # TODO confirm), `not data` raises ValueError.
        if query_data_for_indexing and data is None:
            try:
                data = Utils.materialize(metadata=description)
            except Exception:
                # Best effort only. `except Exception` (instead of a bare
                # except) lets KeyboardInterrupt / SystemExit propagate.
                traceback.print_exc()
                warnings.warn(
                    "Materialization Failed, index based on schema json only")

        metadata = self.construct_global_metadata(description=description,
                                                  data=data)
        Utils.validate_schema(metadata.value)

        if save_to_file:
            self._save_data(save_to_file=save_to_file,
                            save_mode=save_to_file_mode,
                            metadata=metadata)

        self.im.create_doc(index=es_index,
                           doc_type='_doc',
                           body=metadata.value,
                           id=metadata.value['datamart_id'])

        return metadata.value
Пример #2
0
    def get_dataset(metadata: dict, variables: list = None, constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Get the dataset with materializer.

        Thin pass-through: all three arguments are handed to
        ``Utils.materialize`` unchanged and its result is returned as is.

        Args:
            metadata: metadata dict.
            variables: forwarded to the materializer as ``variables``.
            constrains: forwarded to the materializer as ``constrains``.

        Returns:
            Whatever ``Utils.materialize`` returns (annotated as an optional
            pandas dataframe).
        """
        materialize_kwargs = {
            "metadata": metadata,
            "variables": variables,
            "constrains": constrains,
        }
        return Utils.materialize(**materialize_kwargs)
Пример #3
0
 def test_materialize(self):
     """Check Utils.materialize output against resources/noaa_result.csv."""
     # Metadata pointing at the materializer under test and its arguments.
     materialization = {
         "python_path": "noaa_materializer",
         "arguments": {"type": 'PRCP'},
     }
     metadata = {"materialization": materialization}

     # Query constraints: a one-day date range plus a named-entity filter.
     date_range = {"start": "2016-09-23", "end": "2016-09-23"}
     constrains = {
         "date_range": date_range,
         "named_entity": {2: ["los angeles"]},
     }

     actual = Utils.materialize(metadata=metadata, constrains=constrains)

     # The expected frame lives next to this test module.
     here = os.path.dirname(__file__)
     expected = pd.read_csv(os.path.join(here, "resources/noaa_result.csv"))
     assert_frame_equal(actual, expected)
Пример #4
0
    def get_dataset(metadata: dict,
                    variables: list = None,
                    constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Get the dataset with materializer.

        When ``constrains`` contains a ``"date_range"`` entry, a missing or
        empty ``"start"`` is replaced by ``Augment.DEFAULT_START_DATE`` and a
        missing or empty ``"end"`` by the current local time formatted as
        ``%Y-%m-%dT%H:%M:%S``. The caller's dictionaries are left untouched;
        the defaults are filled in on copies.

        Args:
            metadata: metadata dict.
            variables: optional positional column selector; when non-empty it
                is applied to the materialized frame via ``df.iloc[:, ...]``.
            constrains: optional constraints dict handed to the materializer.
                May be None, in which case it is passed through as is.

        Returns:
            pandas dataframe (restricted to ``variables`` when given), or
            None when the materializer returns None.
        """
        # `constrains` defaults to None, so guard before the membership test
        # (`"date_range" in None` raises TypeError).
        if constrains is not None and "date_range" in constrains:
            # Copy before filling defaults so the caller's dict is not mutated.
            date_range = dict(constrains["date_range"])
            if not date_range.get("start", None):
                date_range["start"] = Augment.DEFAULT_START_DATE
            if not date_range.get("end", None):
                date_range["end"] = datetime.now().strftime(
                    '%Y-%m-%dT%H:%M:%S')
            constrains = {**constrains, "date_range": date_range}

        df = Utils.materialize(metadata=metadata, constrains=constrains)

        # The return type is Optional, so only slice when a frame came back.
        if variables and df is not None:
            return df.iloc[:, variables]
        return df