Example #1
    def test_index_csv(self):
        with open(
                os.path.join(os.path.dirname(__file__),
                             "resources/user_index_schema.json"), "r") as f:
            description = json.load(f)
        ib = IndexBuilder()
        res = ib.indexing(description_path=description,
                          es_index='datamart_tmp',
                          query_data_for_indexing=True)
        _id = res.get('datamart_id')
        self.assertIsNotNone(_id)
        expected = {
            'datamart_id': _id,
            'title': 'A short description of the dataset',
            'description': 'https://cerc.blackboard.com/Page/1189',
            'implicit_variables': [{
                'name': 'year',
                'value': '2007',
                'semantic_type': [
                    'http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Time'
                ]
            }],
            'materialization': {
                'python_path': 'general_materializer',
                'arguments': {
                    'url':
                    'http://insight.dev.schoolwires.com/HelpAssets/C2Assets/C2Files/C2ImportFamRelSample.csv',
                    'file_type': 'csv'
                }
            },
            'variables': [{
                'datamart_id': _id + 1,
                'semantic_type': [],
                'name': 'Parent Identifier',
                'description': 'column name: Parent Identifier, dtype: int64'
            }, {
                'datamart_id': _id + 2,
                'semantic_type': [],
                'name': 'Student Identifier',
                'description': 'column name: Student Identifier, dtype: int64'
            }],
            'keywords': ['Parent Identifier', 'Student Identifier']
        }
        print(res)
        self.assertEqual(res, expected)
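
# A description dict shaped roughly like the one this test loads could also be
# built inline as below. This is a hedged sketch reconstructed from the expected
# metadata above, not the actual contents of resources/user_index_schema.json.
description = {
    "title": "A short description of the dataset",
    "description": "https://cerc.blackboard.com/Page/1189",
    "implicit_variables": [{
        "name": "year",
        "value": "2007",
        "semantic_type": [
            "http://schema.org/Integer",
            "https://metadata.datadrivendiscovery.org/types/Time"
        ]
    }],
    "materialization": {
        "python_path": "general_materializer",
        "arguments": {
            "url": "http://insight.dev.schoolwires.com/HelpAssets/C2Assets/C2Files/C2ImportFamRelSample.csv",
            "file_type": "csv"
        }
    }
}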
Example #2
    def setUp(self):
        self.ib = IndexBuilder()
        self.global_datamart_id = 10000
        self.df_for_global = pd.DataFrame({
            "city": ["abu dhabi", "ajman", "dubai", "sharjah"],
            'date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        })

        self.df_for_variable = pd.DataFrame(
            {'date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]})
def upload(meta_list: typing.List[dict],
           es_index: str = PRODUCTION_ES_INDEX,
           deduplicate: bool = True,
           index_builder: IndexBuilder = None) -> typing.List[dict]:
    """Validate each metadata dict in meta_list and send it to the given
    Elasticsearch index; when deduplicate is True, an already-indexed dataset
    (matched by its materialization) is updated in place instead of re-indexed.
    Returns the results of the successful uploads."""
    ib = index_builder or IndexBuilder()
    succeeded = []
    for meta in meta_list:
        try:
            Utils.validate_schema(meta)
            meta['datamart_status'] = 'not_profiled'
            if deduplicate:
                exist_id = check_existence(meta['materialization'],
                                           es_index=es_index)
                if exist_id:
                    success = ib.updating_send_trusted_metadata(
                        metadata=meta, es_index=es_index, datamart_id=exist_id)
                else:
                    success = ib.indexing_send_to_es(metadata=meta,
                                                     es_index=es_index)
            else:
                success = ib.indexing_send_to_es(metadata=meta,
                                                 es_index=es_index)
            if success:
                succeeded.append(success)
        except Exception as e:
            print('UPLOAD FAILED: ', str(e))
            continue
    return succeeded
def bulk_upload(list_of_meta_list: typing.List[typing.List[dict]],
                es_index: str = PRODUCTION_ES_INDEX,
                deduplicate: bool = True) -> typing.List[typing.List[dict]]:
    """Upload several lists of metadata dicts, sharing a single IndexBuilder
    across calls to upload(); returns the per-list results of the successful
    uploads."""
    succeeded = []
    ib = IndexBuilder()
    for meta_list in list_of_meta_list:
        success_list = upload(meta_list, es_index, deduplicate, ib)
        if success_list:
            succeeded.append(success_list)
    return succeeded
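
# Hedged usage sketch for upload()/bulk_upload(): the metadata dict, url, and
# index name below are illustrative placeholders, and a real metadata dict must
# pass Utils.validate_schema.
example_meta = {
    'title': 'example dataset',
    'materialization': {
        'python_path': 'general_materializer',
        'arguments': {'url': 'http://example.com/data.csv', 'file_type': 'csv'}
    }
}
uploaded = upload([example_meta], es_index='datamart_tmp', deduplicate=True)
uploaded_batches = bulk_upload([[example_meta]], es_index='datamart_tmp')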
Example #5
def upload(description: dict, es_index: str = None) -> dict:
    """

    Args:
        description:

    Returns:

    """

    description['materialization'] = {
        'python_path': 'general_materializer',
        'arguments': description['materialization_arguments']
    }
    del description['materialization_arguments']
    ib = IndexBuilder()
    metadata = ib.indexing(description_path=description,
                           es_index=es_index or DEFAULT_ES,
                           query_data_for_indexing=True)

    return metadata
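
# Hedged usage sketch: the caller supplies 'materialization_arguments' pointing
# at the raw data, and upload() rewrites it into a 'materialization' entry
# before indexing; the url and index name below are placeholders.
metadata = upload(
    description={
        'title': 'example dataset',
        'materialization_arguments': {'url': 'http://example.com/data.csv',
                                      'file_type': 'csv'}
    },
    es_index='datamart_tmp')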
Example #6
class TestIndexBuilder(unittest.TestCase):
    def setUp(self):
        self.ib = IndexBuilder()
        self.global_datamart_id = 10000
        self.df_for_global = pd.DataFrame({
            "city": ["abu dhabi", "ajman", "dubai", "sharjah"],
            'date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        })

        self.df_for_variable = pd.DataFrame(
            {'date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]})

    @Utils.test_print
    def test_construct_variable_metadata_with_empty_variable(self):
        variable_metadata = self.ib.construct_variable_metadata(
            description={},
            global_datamart_id=self.global_datamart_id,
            col_offset=0,
            data=self.df_for_variable)
        expected = {
            'datamart_id': 10001,
            'semantic_type': [],
            'name': 'date',
            'description': 'column name: date, dtype: object',
            'temporal_coverage': {
                'start': '2014-02-23T00:00:00',
                'end': '2023-02-13T00:00:00'
            }
        }

        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_1(self):
        variable_description = {
            "name": "date",
            "description": "the date of data",
            "semantic_type":
            ["https://metadata.datadrivendiscovery.org/types/Time"],
            "temporal_coverage": {
                "start": "1874-10-13",
                "end": "2018-10-01"
            }
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0)
        expected = {
            'datamart_id': 10001,
            'name': 'date',
            'description': 'the date of data',
            'semantic_type':
            ['https://metadata.datadrivendiscovery.org/types/Time'],
            'temporal_coverage': {
                'start': '1874-10-13T00:00:00',
                'end': '2018-10-01T00:00:00'
            }
        }

        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_1_with_data(self):
        variable_description = {
            "description": "the date of data",
            "semantic_type":
            ["https://metadata.datadrivendiscovery.org/types/Time"],
            "temporal_coverage": {
                "start": None,
                "end": None
            }
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0,
            data=self.df_for_variable)
        expected = {
            'datamart_id': 10001,
            'name': 'date',
            'description': 'the date of data',
            'semantic_type':
            ['https://metadata.datadrivendiscovery.org/types/Time'],
            'temporal_coverage': {
                'start': '2014-02-23T00:00:00',
                'end': '2023-02-13T00:00:00'
            }
        }

        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_2(self):
        variable_description = {
            "name": "city",
            "description": "the city data belongs to",
            "semantic_type":
            ["https://metadata.datadrivendiscovery.org/types/Location"],
            "named_entity": [
                "abu dhabi", "ajman", "dubai", "sharjah", "kabul", "kandahar",
                "algiers", "annaba", "batna"
            ]
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0)
        expected = {
            'datamart_id': 10001,
            'name': 'city',
            'description': 'the city data belongs to',
            'semantic_type':
            ['https://metadata.datadrivendiscovery.org/types/Location'],
            'named_entity': [
                'abu dhabi', 'ajman', 'dubai', 'sharjah', 'kabul', 'kandahar',
                'algiers', 'annaba', 'batna'
            ]
        }

        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_variable_metadata_2_with_data(self):
        data = {
            "city": [
                "abu dhabi", "ajman", "dubai", "sharjah", "kabul", "kandahar",
                "algiers", "annaba", "batna"
            ]
        }
        df = pd.DataFrame(data)
        variable_description = {
            "name": "city",
            "semantic_type":
            ["https://metadata.datadrivendiscovery.org/types/Location"],
            "named_entity": None
        }
        variable_metadata = self.ib.construct_variable_metadata(
            description=variable_description,
            global_datamart_id=self.global_datamart_id,
            col_offset=0,
            data=df)
        expected = {
            'datamart_id': 10001,
            'name': 'city',
            'description': 'column name: city, dtype: object',
            'semantic_type':
            ['https://metadata.datadrivendiscovery.org/types/Location'],
            'named_entity': [
                'abu dhabi', 'ajman', 'dubai', 'sharjah', 'kabul', 'kandahar',
                'algiers', 'annaba', 'batna'
            ]
        }

        self.assertEqual(variable_metadata.value, expected)

    @Utils.test_print
    def test_construct_global_metadata(self):
        self.ib.current_global_index = 10000
        description = {
            "title": "TAVG",
            "description":
            "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
            "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
            "keywords": ["Average Temperature."],
            "provenance": {
                "resource": "noaa.org"
            },
            "materialization": {
                "python_path": "noaa_materializer",
                "arguments": {
                    "type": "TAVG"
                }
            },
            "variables": [{
                "name": "date",
                "description": "the date of data",
                "semantic_type":
                ["https://metadata.datadrivendiscovery.org/types/Time"],
                "temporal_coverage": {
                    "start": "1874-10-13",
                    "end": "2018-10-01"
                }
            }, {
                "name": "city",
                "description": "the city data belongs to",
                "semantic_type":
                ["https://metadata.datadrivendiscovery.org/types/Location"],
                "named_entity": ["abu dhabi", "ajman", "dubai", "sharjah"]
            }],
            "date_updated": "2018-09-28"
        }
        global_metadata = self.ib.construct_global_metadata(
            description=description)
        expected = {
            'datamart_id': 20000,
            'title': 'TAVG',
            'description':
            "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
            'url': 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt',
            'keywords': ['Average Temperature.'],
            'date_updated': '2018-09-28T00:00:00',
            'provenance': {
                'resource': 'noaa.org'
            },
            'materialization': {
                'python_path': 'noaa_materializer',
                'arguments': {
                    'type': 'TAVG'
                }
            },
            'variables': [{
                'datamart_id': 20001,
                'name': 'date',
                'description': 'the date of data',
                'semantic_type':
                ['https://metadata.datadrivendiscovery.org/types/Time'],
                'temporal_coverage': {
                    'start': '1874-10-13T00:00:00',
                    'end': '2018-10-01T00:00:00'
                }
            }, {
                'datamart_id': 20002,
                'name': 'city',
                'description': 'the city data belongs to',
                'semantic_type':
                ['https://metadata.datadrivendiscovery.org/types/Location'],
                'named_entity': ['abu dhabi', 'ajman', 'dubai', 'sharjah']
            }]
        }

        self.assertEqual(global_metadata, expected)

    @Utils.test_print
    def test_construct_global_metadata_with_data(self):
        self.ib.current_global_index = 10000
        description = {
            "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
            "keywords": ["Average Temperature."],
            "provenance": {
                "resource": "noaa.org"
            },
            "materialization": {
                "python_path": "noaa_materializer",
                "arguments": {
                    "type": "TAVG"
                }
            },
            "variables": [{
                "name": "city",
                "description": "the city data belongs to",
                "semantic_type":
                ["https://metadata.datadrivendiscovery.org/types/Location"],
                "named_entity": None
            }, {
                "name": "date",
                "description": "the date of data",
                "semantic_type":
                ["https://metadata.datadrivendiscovery.org/types/Time"],
                "temporal_coverage": None
            }],
            "date_updated": "2018-09-28"
        }
        global_metadata = self.ib.construct_global_metadata(
            description=description, data=self.df_for_global)

        expected = {
            'datamart_id': 20000,
            'title': 'city date',
            'description': 'city : object, date : object',
            'url': 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt',
            'keywords': ['Average Temperature.'],
            'date_updated': '2018-09-28T00:00:00',
            'provenance': {
                'resource': 'noaa.org'
            },
            'materialization': {
                'python_path': 'noaa_materializer',
                'arguments': {
                    'type': 'TAVG'
                }
            },
            'variables': [{
                'datamart_id': 20001,
                'name': 'city',
                'description': 'the city data belongs to',
                'semantic_type':
                ['https://metadata.datadrivendiscovery.org/types/Location'],
                'named_entity': ['abu dhabi', 'ajman', 'dubai', 'sharjah']
            }, {
                'datamart_id': 20002,
                'name': 'date',
                'description': 'the date of data',
                'semantic_type':
                ['https://metadata.datadrivendiscovery.org/types/Time'],
                'temporal_coverage': {
                    'start': '2014-02-23T00:00:00',
                    'end': '2023-02-13T00:00:00'
                }
            }]
        }

        self.assertEqual(global_metadata, expected)

    @Utils.test_print
    def test_construct_global_metadata_with_basic_fields(self):
        self.ib.current_global_index = 10000
        description = {"materialization": {"python_path": "noaa_materializer"}}
        global_metadata = self.ib.construct_global_metadata(
            description=description, data=self.df_for_global)

        expected = {
            'datamart_id': 20000,
            'materialization': {
                'python_path': 'noaa_materializer',
                'arguments': None
            },
            'variables': [{
                'datamart_id': 20001,
                'semantic_type': [],
                'name': 'city',
                'description': 'column name: city, dtype: object',
                'named_entity': ['abu dhabi', 'ajman', 'dubai', 'sharjah']
            }, {
                'datamart_id': 20002,
                'semantic_type': [],
                'name': 'date',
                'description': 'column name: date, dtype: object',
                'temporal_coverage': {
                    'start': '2014-02-23T00:00:00',
                    'end': '2023-02-13T00:00:00'
                }
            }],
            'title': 'city date',
            'description': 'city : object, date : object',
            'keywords': ['city', 'date']
        }

        self.assertEqual(global_metadata, expected)
def generate_metadata(description: dict,
                      ignore_html=False,
                      enable_two_ravens_profiler=False) -> typing.List[dict]:
    """
    Step 1 of indexing: the user provides a description with a url for
    materializing; datamart materializes and profiles the data to generate
    metadata, which is returned to the user for review or editing before the
    final indexing step.

    :param description: a dict; must have the key "materialization_arguments", and
           description["materialization_arguments"] must have the key "url", a
           valid url pointing to the real data
    :param ignore_html: if True, skip urls whose file type maps to an HTML parser
           (or has no known parser)
    :param enable_two_ravens_profiler: passed through to
           IndexBuilder.indexing_generate_metadata
    :return: list of dicts (usually a single dict, unless e.g. an excel file has
             multiple sheets); each dict is metadata that can be indexed into
             elasticsearch
    """

    url = description['materialization_arguments']['url'].rstrip('/')
    if not (url and isinstance(url, str) and Utils.validate_url(url)):
        return []

    file_name = url.rsplit('/', 1)[-1].rsplit('.', 1)
    if not file_name:
        return []
    if len(file_name) == 2:
        file_suffix = file_name[1].split('#', 1)[0]
        if file_suffix.lower() in FILE_BLACK_LIST:
            return []
        if ignore_html:
            parser = GeneralMaterializer().type2parser.get(file_suffix.lower())
            if isinstance(parser, HTMLParser) or parser is None:
                return []

    file_name = file_name[0].replace('-', ' ').replace('_', ' ')
    if not description.get('title'):
        description['title'] = file_name

    if not description.get('url'):
        description['url'] = url

    description['materialization'] = {
        'python_path': 'general_materializer',
        'arguments': description['materialization_arguments']
    }
    del description['materialization_arguments']

    ib = IndexBuilder()

    meta_list = []
    parse_results = GeneralMaterializer().parse(description)
    for res in parse_results:
        try:
            df = res.dataframe
            idx = res.index
            if len(parse_results) > 1:
                sub_name = '(%s)' % res.name if res.name else ''
                if sub_name or description.get('title'):
                    description['title'] = description.get('title',
                                                           '') + sub_name
            description['materialization']['arguments']['index'] = idx or 0
            # TODO: make use of res.metadata?
            indexed = ib.indexing_generate_metadata(
                description_path=description,
                data_path=df,
                enable_two_ravens_profiler=enable_two_ravens_profiler)
            meta_list.append(indexed)
        except Exception as e:
            print(
                'IndexBuilder.indexing_generate_metadata, FAIL ON %d' %
                res.index, e)
            continue
    return meta_list
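
# Hedged sketch of the two-step flow described in the docstring above: generate
# metadata first, let the user review or edit it, then send it to Elasticsearch
# (here via the upload() helper shown earlier); the url and index name are
# placeholders.
description = {'materialization_arguments': {'url': 'http://example.com/data.csv'}}
meta_list = generate_metadata(description, ignore_html=True)
# ... user reviews / edits meta_list here ...
uploaded = upload(meta_list, es_index='datamart_tmp')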
Example #8
    def setUp(self):
        self.ib = IndexBuilder()
        self.global_datamart_id = 10000