def test_index_csv(self):
    """Index a CSV described by the user schema file and verify the produced metadata."""
    schema_path = os.path.join(
        os.path.dirname(__file__), "resources/user_index_schema.json")
    with open(schema_path, "r") as schema_file:
        description = json.load(schema_file)

    builder = IndexBuilder()
    result = builder.indexing(description_path=description,
                              es_index='datamart_tmp',
                              query_data_for_indexing=True)

    # The assigned id is generated at index time; the rest of the expected
    # document is keyed off it (each column gets id + offset).
    datamart_id = result.get('datamart_id')
    self.assertNotEqual(datamart_id, None)

    expected = {
        'datamart_id': datamart_id,
        'title': 'A short description of the dataset',
        'description': 'https://cerc.blackboard.com/Page/1189',
        'implicit_variables': [{
            'name': 'year',
            'value': '2007',
            'semantic_type': [
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Time'
            ]
        }],
        'materialization': {
            'python_path': 'general_materializer',
            'arguments': {
                'url': 'http://insight.dev.schoolwires.com/HelpAssets/C2Assets/C2Files/C2ImportFamRelSample.csv',
                'file_type': 'csv'
            }
        },
        'variables': [{
            'datamart_id': datamart_id + 1,
            'semantic_type': [],
            'name': 'Parent Identifier',
            'description': 'column name: Parent Identifier, dtype: int64'
        }, {
            'datamart_id': datamart_id + 2,
            'semantic_type': [],
            'name': 'Student Identifier',
            'description': 'column name: Student Identifier, dtype: int64'
        }],
        'keywords': ['Parent Identifier', 'Student Identifier']
    }
    print(result)
    self.assertEqual(result, expected)
def setUp(self):
    """Create a fresh IndexBuilder and fixture DataFrames before each test."""
    self.ib = IndexBuilder()
    self.global_datamart_id = 10000
    # The same date column is used by both fixtures; pandas copies the data
    # into each frame, so sharing the list is safe.
    dates = ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
    cities = ["abu dhabi", "ajman", "dubai", "sharjah"]
    self.df_for_global = pd.DataFrame({"city": cities, 'date': dates})
    self.df_for_variable = pd.DataFrame({'date': dates})
def upload(meta_list: typing.List[dict],
           es_index: str = PRODUCTION_ES_INDEX,
           deduplicate: bool = True,
           index_builder: IndexBuilder = None) -> typing.List[dict]:
    """Validate and index a list of metadata documents into Elasticsearch.

    Each metadata dict is schema-validated, marked 'not_profiled', and either
    indexed as a new document or, when ``deduplicate`` is set and a document
    with the same materialization already exists, used to update that document.
    Failures are printed and skipped; they do not abort the batch.

    Args:
        meta_list: metadata dicts to index; each is mutated in place
            (``datamart_status`` is set).
        es_index: target Elasticsearch index name.
        deduplicate: when True, look up an existing document by
            materialization before indexing.
        index_builder: optional IndexBuilder to reuse; a new one is created
            when None.

    Returns:
        The list of successful results, one entry per indexed/updated doc.
    """
    builder = index_builder or IndexBuilder()
    uploaded = []
    for meta in meta_list:
        try:
            Utils.validate_schema(meta)
            meta['datamart_status'] = 'not_profiled'
            # Only probe for an existing document when deduplication is on.
            existing_id = (check_existence(meta['materialization'],
                                           es_index=es_index)
                           if deduplicate else None)
            if existing_id:
                outcome = builder.updating_send_trusted_metadata(
                    metadata=meta, es_index=es_index, datamart_id=existing_id)
            else:
                outcome = builder.indexing_send_to_es(metadata=meta,
                                                      es_index=es_index)
            if outcome:
                uploaded.append(outcome)
        except Exception as e:
            # Best-effort batch: report and continue with the next document.
            print('UPLOAD FAILED: ', str(e))
            continue
    return uploaded
def bulk_upload(list_of_meta_list: typing.List[typing.List[dict]],
                es_index: str = PRODUCTION_ES_INDEX,
                deduplicate: bool = True) -> typing.List[typing.List[dict]]:
    """Upload several batches of metadata, sharing one IndexBuilder.

    Args:
        list_of_meta_list: batches of metadata dicts; each batch is passed
            to ``upload``.
        es_index: target Elasticsearch index name.
        deduplicate: forwarded to ``upload``.

    Returns:
        The per-batch success lists; batches with no successes are omitted.
    """
    builder = IndexBuilder()
    results = []
    for batch in list_of_meta_list:
        successes = upload(batch, es_index, deduplicate, builder)
        if successes:
            results.append(successes)
    return results
def upload(description: dict, es_index: str = None) -> dict:
    """Index a single dataset description into Elasticsearch.

    Rewrites the caller-supplied ``materialization_arguments`` into the
    ``materialization`` block expected by the index schema, then runs the
    full indexing pipeline (including querying the data).

    Args:
        description: dataset description; must contain the key
            'materialization_arguments'. Mutated in place.
        es_index: target index name; DEFAULT_ES is used when falsy.

    Returns:
        The metadata document produced by IndexBuilder.indexing.
    """
    # pop() both reads and removes materialization_arguments in one step.
    description['materialization'] = {
        'python_path': 'general_materializer',
        'arguments': description.pop('materialization_arguments')
    }
    builder = IndexBuilder()
    return builder.indexing(description_path=description,
                            es_index=es_index or DEFAULT_ES,
                            query_data_for_indexing=True)
class TestIndexBuilder(unittest.TestCase):
    """Unit tests for IndexBuilder's metadata-construction methods.

    Covers construct_variable_metadata (per-column metadata, with and
    without a data sample to profile) and construct_global_metadata
    (whole-dataset metadata). Fixture ids start at 10000; the builder
    appears to allocate the next global id as current + 10000 and
    per-variable ids as global + column offset + 1 — TODO confirm
    against IndexBuilder's implementation.
    """

    def setUp(self):
        # Fresh builder and fixture frames for every test.
        self.ib = IndexBuilder()
        self.global_datamart_id = 10000
        self.df_for_global = pd.DataFrame({
            "city": ["abu dhabi", "ajman", "dubai", "sharjah"],
            'date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        })
        self.df_for_variable = pd.DataFrame(
            {'date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]})

    @Utils.test_print
    def test_construct_variable_metadata_with_empty_variable(self):
        """An empty description is filled entirely by profiling the data."""
        variable_metadata = self.ib.construct_variable_metadata(
            description={},
            global_datamart_id=self.global_datamart_id,
            col_offset=0,
            data=self.df_for_variable)
        expected = {
            'datamart_id': 10001,
            'semantic_type': [],
            'name': 'date',
            'description': 'column name: date, dtype: object',
            # min/max of the fixture dates, normalized to ISO timestamps.
            'temporal_coverage': {
                'start': '2014-02-23T00:00:00',
                'end': '2023-02-13T00:00:00'
            }
        }
        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_1(self):
        """A fully-specified temporal variable is passed through (dates ISO-normalized)."""
        variable_description = {
            "name": "date",
            "description": "the date of data",
            "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Time"],
            "temporal_coverage": {
                "start": "1874-10-13",
                "end": "2018-10-01"
            }
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0)
        expected = {
            'datamart_id': 10001,
            'name': 'date',
            'description': 'the date of data',
            'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Time'],
            'temporal_coverage': {
                'start': '1874-10-13T00:00:00',
                'end': '2018-10-01T00:00:00'
            }
        }
        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_1_with_data(self):
        """Missing name and None coverage bounds are derived from the data sample."""
        variable_description = {
            "description": "the date of data",
            "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Time"],
            "temporal_coverage": {
                "start": None,
                "end": None
            }
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0,
            data=self.df_for_variable)
        expected = {
            'datamart_id': 10001,
            'name': 'date',
            'description': 'the date of data',
            'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Time'],
            'temporal_coverage': {
                'start': '2014-02-23T00:00:00',
                'end': '2023-02-13T00:00:00'
            }
        }
        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_2(self):
        """An explicit named_entity list is kept verbatim."""
        variable_description = {
            "name": "city",
            "description": "the city data belongs to",
            "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Location"],
            "named_entity": [
                "abu dhabi", "ajman", "dubai", "sharjah", "kabul", "kandahar",
                "algiers", "annaba", "batna"
            ]
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0)
        expected = {
            'datamart_id': 10001,
            'name': 'city',
            'description': 'the city data belongs to',
            'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Location'],
            'named_entity': [
                'abu dhabi', 'ajman', 'dubai', 'sharjah', 'kabul', 'kandahar',
                'algiers', 'annaba', 'batna'
            ]
        }
        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_2_with_data(self):
        """named_entity=None triggers extraction of entities from the data column."""
        data = {
            "city": [
                "abu dhabi", "ajman", "dubai", "sharjah", "kabul", "kandahar",
                "algiers", "annaba", "batna"
            ]
        }
        df = pd.DataFrame(data)
        variable_description = {
            "name": "city",
            "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Location"],
            "named_entity": None
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0,
            data=df)
        expected = {
            'datamart_id': 10001,
            'name': 'city',
            'description': 'column name: city, dtype: object',
            'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Location'],
            'named_entity': [
                'abu dhabi', 'ajman', 'dubai', 'sharjah', 'kabul', 'kandahar',
                'algiers', 'annaba', 'batna'
            ]
        }
        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_global_metadata(self):
        """A fully-specified description is assembled without needing data."""
        self.ib.current_global_index = 10000
        description = {
            "title": "TAVG",
            "description": "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
            "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
            "keywords": ["Average Temperature."],
            "provenance": {
                "resource": "noaa.org"
            },
            "materialization": {
                "python_path": "noaa_materializer",
                "arguments": {
                    "type": "TAVG"
                }
            },
            "variables": [{
                "name": "date",
                "description": "the date of data",
                "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Time"],
                "temporal_coverage": {
                    "start": "1874-10-13",
                    "end": "2018-10-01"
                }
            }, {
                "name": "city",
                "description": "the city data belongs to",
                "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Location"],
                "named_entity": ["abu dhabi", "ajman", "dubai", "sharjah"]
            }],
            "date_updated": "2018-09-28"
        }
        global_metadata = self.ib.construct_global_metadata(
            description=description)
        expected = {
            'datamart_id': 20000,
            'title': 'TAVG',
            'description': "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
            'url': 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt',
            'keywords': ['Average Temperature.'],
            'date_updated': '2018-09-28T00:00:00',
            'provenance': {
                "resource": "noaa.org"
            },
            'materialization': {
                'python_path': 'noaa_materializer',
                'arguments': {
                    'type': 'TAVG'
                }
            },
            'variables': [{
                'datamart_id': 20001,
                'name': 'date',
                'description': 'the date of data',
                'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Time'],
                'temporal_coverage': {
                    'start': '1874-10-13T00:00:00',
                    'end': '2018-10-01T00:00:00'
                }
            }, {
                'datamart_id': 20002,
                'name': 'city',
                'description': 'the city data belongs to',
                'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Location'],
                'named_entity': ['abu dhabi', 'ajman', 'dubai', 'sharjah']
            }]
        }
        self.assertEqual(global_metadata, expected)

    @Utils.test_print
    def test_construct_global_metadata_with_data(self):
        """Missing title/description and None variable fields are profiled from data."""
        self.ib.current_global_index = 10000
        description = {
            "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
            "keywords": ["Average Temperature."],
            "provenance": {
                "resource": "noaa.org"
            },
            "materialization": {
                "python_path": "noaa_materializer",
                "arguments": {
                    "type": "TAVG"
                }
            },
            "variables": [{
                "name": "city",
                "description": "the city data belongs to",
                "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Location"],
                "named_entity": None
            }, {
                "name": "date",
                "description": "the date of data",
                "semantic_type": ["https://metadata.datadrivendiscovery.org/types/Time"],
                "temporal_coverage": None
            }],
            "date_updated": "2018-09-28"
        }
        global_metadata = self.ib.construct_global_metadata(
            description=description, data=self.df_for_global)
        expected = {
            'datamart_id': 20000,
            # Title/description generated from the fixture frame's columns.
            'title': 'city date',
            'description': 'city : object, date : object',
            'url': 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt',
            'keywords': ['Average Temperature.'],
            'date_updated': '2018-09-28T00:00:00',
            'provenance': {
                "resource": "noaa.org"
            },
            'materialization': {
                'python_path': 'noaa_materializer',
                'arguments': {
                    'type': 'TAVG'
                }
            },
            'variables': [{
                'datamart_id': 20001,
                'name': 'city',
                'description': 'the city data belongs to',
                'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Location'],
                'named_entity': ['abu dhabi', 'ajman', 'dubai', 'sharjah']
            }, {
                'datamart_id': 20002,
                'name': 'date',
                'description': 'the date of data',
                'semantic_type': ['https://metadata.datadrivendiscovery.org/types/Time'],
                'temporal_coverage': {
                    'start': '2014-02-23T00:00:00',
                    'end': '2023-02-13T00:00:00'
                }
            }]
        }
        self.assertEqual(global_metadata, expected)

    @Utils.test_print
    def test_construct_global_metadata_with_basic_fields(self):
        """A minimal description (materializer only) is fully profiled from data."""
        self.ib.current_global_index = 10000
        description = {"materialization": {"python_path": "noaa_materializer"}}
        global_metadata = self.ib.construct_global_metadata(
            description=description, data=self.df_for_global)
        expected = {
            'datamart_id': 20000,
            'materialization': {
                'python_path': 'noaa_materializer',
                'arguments': None
            },
            'variables': [{
                'datamart_id': 20001,
                'semantic_type': [],
                'name': 'city',
                'description': 'column name: city, dtype: object',
                "named_entity": ["abu dhabi", "ajman", "dubai", "sharjah"]
            }, {
                'datamart_id': 20002,
                'semantic_type': [],
                'name': 'date',
                'description': 'column name: date, dtype: object',
                'temporal_coverage': {
                    'start': '2014-02-23T00:00:00',
                    'end': '2023-02-13T00:00:00'
                }
            }],
            'title': 'city date',
            'description': 'city : object, date : object',
            'keywords': ['city', 'date']
        }
        self.assertEqual(global_metadata, expected)
def generate_metadata(description: dict, ignore_html=False,
                      enable_two_ravens_profiler=False) -> typing.List[dict]:
    """
    Step 1 for indexing, user provide a description with url for materializing,
    datamart will try to generate metadata, by materializing, profiling the data,
    and will return to the users to take a look or edit for final indexing.

    :param description: a dict, must have the key "materialization_arguments",
        description["materialization_arguments"] must have the key "url" which is
        a valid url pointing to the real data. Mutated in place: a
        "materialization" block replaces "materialization_arguments", and
        missing "title"/"url" are filled in from the URL.
    :param ignore_html: when True, skip URLs whose suffix maps to an HTML
        parser (or to no parser at all).
    :param enable_two_ravens_profiler: forwarded to
        IndexBuilder.indexing_generate_metadata.
    :return: List of dict (mostly only one dict in the list, unless cases like
        excel file with multiple sheets). Each dict is a metadata that can be
        indexed to elasticsearch.
    """
    # Normalize the URL and bail out early on anything that is not a valid URL.
    url = description['materialization_arguments']['url'].rstrip('/')
    if not (url and isinstance(url, str) and Utils.validate_url(url)):
        return []
    # Last path segment split into at most [stem, suffix].
    file_name = url.rsplit('/', 1)[-1].rsplit('.', 1)
    if not file_name:
        return []
    if len(file_name) == 2:
        # Drop any '#fragment' from the suffix before classifying it.
        file_suffix = file_name[1].split('#', 1)[0]
        if file_suffix.lower() in FILE_BLACK_LIST:
            return []
        if ignore_html:
            parser = GeneralMaterializer().type2parser.get(file_suffix.lower())
            # NOTE(review): this checks for an HTMLParser *instance*; if
            # type2parser maps to classes, the isinstance test never matches —
            # confirm against GeneralMaterializer.
            if isinstance(parser, HTMLParser) or parser is None:
                return []
    # Humanize the file stem and use it as fallback title.
    file_name = file_name[0].replace('-', ' ').replace('_', ' ')
    if not description.get('title'):
        description['title'] = file_name
    if not description.get('url'):
        description['url'] = url
    # Rewrite materialization_arguments into the schema's materialization block.
    description['materialization'] = {
        'python_path': 'general_materializer',
        'arguments': description['materialization_arguments']
    }
    del description['materialization_arguments']
    ib = IndexBuilder()
    meta_list = []
    parse_results = GeneralMaterializer().parse(description)
    for res in parse_results:
        try:
            df = res.dataframe
            idx = res.index
            if len(parse_results) > 1:
                # Multi-sheet sources: disambiguate the title per sub-result.
                sub_name = '(%s)' % res.name if res.name else ''
                if sub_name or description.get('title'):
                    description['title'] = description.get('title', '') + sub_name
            description['materialization']['arguments']['index'] = idx or 0
            # TODO: make use of res.metadata?
            indexed = ib.indexing_generate_metadata(
                description_path=description,
                data_path=df,
                enable_two_ravens_profiler=enable_two_ravens_profiler)
            meta_list.append(indexed)
        except Exception as e:
            # Best effort: a failing sheet/result is reported and skipped.
            print(
                'IndexBuilder.indexing_generate_metadata, FAIL ON %d' % res.index,
                e)
            continue
    return meta_list
def setUp(self):
    """Give each test its own IndexBuilder and base datamart id."""
    self.global_datamart_id = 10000
    self.ib = IndexBuilder()