Пример #1
0
 def test_numbers(self):
     """clean_data(number=...) normalizes numeric strings in #meta+count columns."""
     # Inputs exercise: surrounding whitespace, thousands separators,
     # a non-string (int) cell, and scientific notation.
     DATA_IN = [
         ['Organisation', 'Cluster', 'District', 'Count'],
         ['#org', '#sector+list', '#adm1', '#meta+count'],
         ['NGO A', 'WASH', 'Coast', '  200'],
         ['NGO B', 'Education', 'Plains', '1,100 '],
         ['NGO B', 'Child Protection', 'Coast', 300],
         ['NGO A', 'Logistics', 'Coast', '1.7E5']
     ]
     DATA_OUT = [
         ['NGO A', 'WASH', 'Coast', '200'],
         ['NGO B', 'Education', 'Plains', '1100'],
         ['NGO B', 'Child Protection', 'Coast', '300'],
         ['NGO A', 'Logistics', 'Coast', '170000']
     ]
     # With number_format, values are rendered via the given printf-style spec.
     DATA_OUT_FORMATTED = [
         ['NGO A', 'WASH', 'Coast', '200.00'],
         ['NGO B', 'Education', 'Plains', '1100.00'],
         ['NGO B', 'Child Protection', 'Coast', '300.00'],
         ['NGO A', 'Logistics', 'Coast', '170000.00']
     ]
     self.assertEqual(DATA_OUT, hxl.data(DATA_IN).clean_data(number='meta+count').values)
     self.assertEqual(
         DATA_OUT_FORMATTED,
         hxl.data(DATA_IN).clean_data(number='meta+count', number_format='0.2f').values
     )
Пример #2
0
 def test_purge_malformed_data(self):
     """With purge=True, unparseable cells become empty strings; with purge=False they pass through."""
     DATA_IN = [
         ['#date', '#affected', '#geo+lat', '#geo+lon'],
         ['1/Mar/2017', 'bad', '45N30', 'bad'],
         ['bad', '2,000', 'bad', '75W30'],
     ]
     # Unpurged: valid cells are cleaned, 'bad' cells kept verbatim.
     DATA_OUT_UNPURGED = [
         ['2017-03-01', 'bad', '45.5000', 'bad'],
         ['bad', '2000', 'bad', '-75.5000'],
     ]
     # Purged: 'bad' cells are replaced with ''.
     DATA_OUT_PURGED = [
         ['2017-03-01', '', '45.5000', ''],
         ['', '2000', '', '-75.5000'],
     ]
     self.assertEqual(
         DATA_OUT_UNPURGED,
         hxl.data(DATA_IN).clean_data(date='date',
                                      number='affected',
                                      latlon='geo',
                                      purge=False).values)
     self.assertEqual(
         DATA_OUT_PURGED,
         hxl.data(DATA_IN).clean_data(date='date',
                                      number='affected',
                                      latlon='geo',
                                      purge=True).values)
Пример #3
0
 def test_dates(self):
     """clean_data(date=...) normalizes mixed date formats; date_format reformats output."""
     # Inputs use three different textual date styles.
     DATA_IN = [
         ['Organisation', 'Cluster', 'District', 'Date'],
         ['#org', '#sector+list', '#adm1', '#date'],
         ['NGO A', 'WASH', 'Coast', 'January 1 2015'],
         ['NGO B', 'Education', 'Plains', '2/2/15'],
         ['NGO B', 'Child Protection', 'Coast', '1 Mar/15']
     ]
     # Default output is ISO 8601 (YYYY-MM-DD).
     DATA_OUT = [
         ['NGO A', 'WASH', 'Coast', '2015-01-01'],
         ['NGO B', 'Education', 'Plains', '2015-02-02'],
         ['NGO B', 'Child Protection', 'Coast', '2015-03-01']
     ]
     DATA_OUT_WEEK = [
         ['NGO A', 'WASH', 'Coast', '2015-W01'],
         ['NGO B', 'Education', 'Plains', '2015-W06'],
         ['NGO B', 'Child Protection', 'Coast', '2015-W09']
     ]
     DATA_OUT_MONTH = [
         ['NGO A', 'WASH', 'Coast', '2015-01'],
         ['NGO B', 'Education', 'Plains', '2015-02'],
         ['NGO B', 'Child Protection', 'Coast', '2015-03']
     ]
     DATA_OUT_YEAR = [
         ['NGO A', 'WASH', 'Coast', '2015'],
         ['NGO B', 'Education', 'Plains', '2015'],
         ['NGO B', 'Child Protection', 'Coast', '2015']
     ]
     self.assertEqual(DATA_OUT, hxl.data(DATA_IN).clean_data(date='date').values)
     self.assertEqual(DATA_OUT_WEEK, hxl.data(DATA_IN).clean_data(date='date', date_format="%Y-W%V").values)
     self.assertEqual(DATA_OUT_MONTH, hxl.data(DATA_IN).clean_data(date='date', date_format="%Y-%m").values)
     self.assertEqual(DATA_OUT_YEAR, hxl.data(DATA_IN).clean_data(date='date', date_format="%Y").values)
Пример #4
0
    def assertRowErrors(self,
                        row_values,
                        errors_expected,
                        schema_values=None,
                        columns=None):
        """Set up a HXL row and count the errors in it

        Args:
            row_values: list of cell values for the row under test.
            errors_expected: exact number of validation errors expected.
            schema_values: raw rows for the schema; defaults to DEFAULT_SCHEMA.
            columns: HXL tag strings for the row; defaults to DEFAULT_COLUMNS.
        """
        errors = []

        # Collect every error the schema reports so we can count them below.
        def callback(error):
            errors.append(error)

        if schema_values is None:
            schema = hxl.schema(hxl.data(self.DEFAULT_SCHEMA),
                                callback=callback)
        else:
            schema = hxl.schema(hxl.data(schema_values), callback=callback)

        if columns is None:
            columns = self.DEFAULT_COLUMNS

        row = Row(values=row_values,
                  columns=[Column.parse(tag) for tag in columns])

        # Reset any per-run schema state before validating a single row.
        schema.start()

        if errors_expected == 0:
            self.assertTrue(schema.validate_row(row))
        else:
            self.assertFalse(schema.validate_row(row))
        self.assertEqual(len(errors), errors_expected)
Пример #5
0
    def test_merge_patterns(self):
        """merge_data should accept both a wildcard tag pattern and an explicit tag list."""
        SOURCE_DATA = [
            ['P-code', 'District'],
            ['#adm1+code', '#adm1+name'],
            ['001', 'Coast'],
            ['002', 'Plains'],
        ]
        # Merge rows are deliberately out of key order to prove matching is by key.
        MERGE_DATA = [
            ['P-code', 'Population (female)', 'Population (male)', 'Population (total)'],
            ['#adm1+code', '#population+f', '#population+m', '#population+total'],
            ['002', '51000', '49000', '100000'],
            ['001', '76000', '74000', '150000'],
        ]
        EXPECTED = [
            ['P-code', 'District', 'Population (female)', 'Population (male)', 'Population (total)'],
            ['#adm1+code', '#adm1+name', '#population+f', '#population+m', '#population+total'],
            ['001', 'Coast', '76000', '74000', '150000'],
            ['002', 'Plains', '51000', '49000', '100000'],
        ]

        # Wildcard pattern '#population' should pull in all +f/+m/+total columns.
        result = hxl.data(SOURCE_DATA).merge_data(hxl.data(MERGE_DATA), keys='#adm1+code', tags='#population')
        self.assertEqual(EXPECTED[0], result.headers)
        self.assertEqual(EXPECTED[1], result.display_tags)
        self.assertEqual(EXPECTED[2:], result.values)

        # Explicit comma-separated tag list should produce the identical result.
        result = hxl.data(SOURCE_DATA).merge_data(hxl.data(MERGE_DATA), keys='#adm1+code', tags='#population+f,#population+m,#population+total')
        self.assertEqual(EXPECTED[0], result.headers)
        self.assertEqual(EXPECTED[1], result.display_tags)
        self.assertEqual(EXPECTED[2:], result.values)
Пример #6
0
    def test_merge_patterns(self):
        """merge_data should accept both a wildcard tag pattern and an explicit tag list."""
        SOURCE_DATA = [
            ['P-code', 'District'],
            ['#adm1+code', '#adm1+name'],
            ['001', 'Coast'],
            ['002', 'Plains'],
        ]
        # Merge rows are deliberately out of key order to prove matching is by key.
        MERGE_DATA = [
            ['P-code', 'Population (female)', 'Population (male)', 'Population (total)'],
            ['#adm1+code', '#population+f', '#population+m', '#population+total'],
            ['002', '51000', '49000', '100000'],
            ['001', '76000', '74000', '150000'],
        ]
        EXPECTED = [
            ['P-code', 'District', 'Population (female)', 'Population (male)', 'Population (total)'],
            ['#adm1+code', '#adm1+name', '#population+f', '#population+m', '#population+total'],
            ['001', 'Coast', '76000', '74000', '150000'],
            ['002', 'Plains', '51000', '49000', '100000'],
        ]

        # Wildcard pattern '#population' should pull in all +f/+m/+total columns.
        result = hxl.data(SOURCE_DATA).merge_data(hxl.data(MERGE_DATA), keys='#adm1+code', tags='#population')
        self.assertEqual(EXPECTED[0], result.headers)
        self.assertEqual(EXPECTED[1], result.display_tags)
        self.assertEqual(EXPECTED[2:], result.values)

        # Explicit comma-separated tag list should produce the identical result.
        result = hxl.data(SOURCE_DATA).merge_data(hxl.data(MERGE_DATA), keys='#adm1+code', tags='#population+f,#population+m,#population+total')
        self.assertEqual(EXPECTED[0], result.headers)
        self.assertEqual(EXPECTED[1], result.display_tags)
        self.assertEqual(EXPECTED[2:], result.values)
Пример #7
0
def is_hxl(url):
    """Return True if *url* parses as a HXL dataset, False otherwise.

    Accessing .columns forces the parser to read far enough to find (or
    fail to find) the HXL hashtag row, so any exception means "not HXL".
    """
    try:
        hxl.data(url).columns
        return True
    except Exception:
        # Narrowed from a bare except: a bare clause would also swallow
        # KeyboardInterrupt and SystemExit.
        return False
Пример #8
0
    def assertRowErrors(self, row_values, errors_expected, schema_values=None, columns=None):
        """Set up a HXL row and count the errors in it

        Args:
            row_values: list of cell values for the row under test.
            errors_expected: exact number of validation errors expected.
            schema_values: raw rows for the schema; defaults to DEFAULT_SCHEMA.
            columns: HXL tag strings for the row; defaults to DEFAULT_COLUMNS.
        """
        errors = []

        # Collect every error the schema reports so we can count them below.
        def callback(error):
            errors.append(error)

        if schema_values is None:
            schema = hxl.schema(hxl.data(self.DEFAULT_SCHEMA), callback=callback)
        else:
            schema = hxl.schema(hxl.data(schema_values), callback=callback)

        if columns is None:
            columns = self.DEFAULT_COLUMNS

        row = Row(
            values=row_values,
            columns=[Column.parse(tag) for tag in columns]
        )

        # Reset any per-run schema state before validating a single row.
        schema.start()

        if errors_expected == 0:
            self.assertTrue(schema.validate_row(row))
        else:
            self.assertFalse(schema.validate_row(row))
        self.assertEqual(len(errors), errors_expected)
Пример #9
0
 def test_dates(self):
     """clean_data(date=...) normalizes mixed date formats; date_format reformats output."""
     # Inputs use three different textual date styles.
     DATA_IN = [
         ['Organisation', 'Cluster', 'District', 'Date'],
         ['#org', '#sector+list', '#adm1', '#date'],
         ['NGO A', 'WASH', 'Coast', 'January 1 2015'],
         ['NGO B', 'Education', 'Plains', '2/2/15'],
         ['NGO B', 'Child Protection', 'Coast', '1 Mar/15']
     ]
     # Default output is ISO 8601 (YYYY-MM-DD).
     DATA_OUT = [
         ['NGO A', 'WASH', 'Coast', '2015-01-01'],
         ['NGO B', 'Education', 'Plains', '2015-02-02'],
         ['NGO B', 'Child Protection', 'Coast', '2015-03-01']
     ]
     DATA_OUT_WEEK = [
         ['NGO A', 'WASH', 'Coast', '2015-W01'],
         ['NGO B', 'Education', 'Plains', '2015-W06'],
         ['NGO B', 'Child Protection', 'Coast', '2015-W09']
     ]
     DATA_OUT_MONTH = [
         ['NGO A', 'WASH', 'Coast', '2015-01'],
         ['NGO B', 'Education', 'Plains', '2015-02'],
         ['NGO B', 'Child Protection', 'Coast', '2015-03']
     ]
     DATA_OUT_YEAR = [
         ['NGO A', 'WASH', 'Coast', '2015'],
         ['NGO B', 'Education', 'Plains', '2015'],
         ['NGO B', 'Child Protection', 'Coast', '2015']
     ]
     self.assertEqual(DATA_OUT, hxl.data(DATA_IN).clean_data(date='date').values)
     self.assertEqual(DATA_OUT_WEEK, hxl.data(DATA_IN).clean_data(date='date', date_format="%Y-W%V").values)
     self.assertEqual(DATA_OUT_MONTH, hxl.data(DATA_IN).clean_data(date='date', date_format="%Y-%m").values)
     self.assertEqual(DATA_OUT_YEAR, hxl.data(DATA_IN).clean_data(date='date', date_format="%Y").values)
Пример #10
0
 def test_numbers(self):
     """clean_data(number=...) normalizes numeric strings in #meta+count columns."""
     # Inputs exercise: surrounding whitespace, thousands separators,
     # a non-string (int) cell, and scientific notation.
     DATA_IN = [
         ['Organisation', 'Cluster', 'District', 'Count'],
         ['#org', '#sector+list', '#adm1', '#meta+count'],
         ['NGO A', 'WASH', 'Coast', '  200'],
         ['NGO B', 'Education', 'Plains', '1,100 '],
         ['NGO B', 'Child Protection', 'Coast', 300],
         ['NGO A', 'Logistics', 'Coast', '1.7E5']
     ]
     DATA_OUT = [
         ['NGO A', 'WASH', 'Coast', '200'],
         ['NGO B', 'Education', 'Plains', '1100'],
         ['NGO B', 'Child Protection', 'Coast', '300'],
         ['NGO A', 'Logistics', 'Coast', '170000']
     ]
     # With number_format, values are rendered via the given printf-style spec.
     DATA_OUT_FORMATTED = [
         ['NGO A', 'WASH', 'Coast', '200.00'],
         ['NGO B', 'Education', 'Plains', '1100.00'],
         ['NGO B', 'Child Protection', 'Coast', '300.00'],
         ['NGO A', 'Logistics', 'Coast', '170000.00']
     ]
     self.assertEqual(DATA_OUT, hxl.data(DATA_IN).clean_data(number='meta+count').values)
     self.assertEqual(
         DATA_OUT_FORMATTED,
         hxl.data(DATA_IN).clean_data(number='meta+count', number_format='0.2f').values
     )
Пример #11
0
    def countriesdata(cls, use_live=True):
        # type: (bool) -> List[Dict[Dict]]
        """
        Read countries data from OCHA countries feed (falling back to file)

        The result is memoized on the class (cls._countriesdata), so the
        feed/file is only read on the first call.

        Args:
            use_live (bool): Try to get use latest data from web rather than file in package. Defaults to True.

        Returns:
            List[Dict[Dict]]: Countries dictionaries
        """
        if cls._countriesdata is None:
            countries = None
            if use_live:
                try:
                    countries = hxl.data(cls._ochaurl)
                except IOError:
                    # Network failure is non-fatal: we log and fall back
                    # to the CSV bundled with the package below.
                    logger.exception(
                        'Download from OCHA feed failed! Falling back to stored file.'
                    )
            if countries is None:
                countries = hxl.data(script_dir_plus_file(
                    'Countries & Territories Taxonomy MVP - C&T Taxonomy with HXL Tags.csv',
                    Country),
                                     allow_local=True)
            cls.set_countriesdata(countries)
        return cls._countriesdata
Пример #12
0
 def test_truthy(self):
     """A schema using truthy JSON values should still validate correctly."""
     schema_path = resolve_path('files/test_validation/truthy-schema.json')
     schema = hxl.schema(hxl.data(schema_path, allow_local=True))
     failing_rows = [['#sector'], ['Health']]
     passing_rows = [['#adm2+code'], ['xxx']]
     self.assertFalse(schema.validate(hxl.data(failing_rows)))
     self.assertTrue(schema.validate(hxl.data(passing_rows)))
Пример #13
0
 def test_minmax_numbers(self):
     """'is max' / 'is min' row queries should select the numeric extremes."""
     DATA = [['#date+year', '#affected', '#adm1'], ['2016', '200', 'Coast'],
             ['2016', '100', 'Plains'], ['2015', '300', 'Coast'],
             ['2015', '200', 'Plains'], ['2014', '400', 'Coast'],
             ['2014', '300', 'Plains']]
     max_rows = hxl.data(DATA).with_rows('#affected is max').values
     min_rows = hxl.data(DATA).with_rows('#affected is min').values
     self.assertEqual(max_rows, [DATA[5]])
     self.assertEqual(min_rows, [DATA[2]])
Пример #14
0
 def test_blank_merge(self):
     """Merging should skip a blank key cell and match on the later #org+name column."""
     data1 = hxl.data([['#sector+list', '#org+name', '#org+name'],
                       ['Health', '', 'Red Cross']])
     data2 = hxl.data([['#org+name', '#org+code'], ['XX', 'YY'],
                       ['Red Cross', 'IFRC']])
     expected = [['#sector+list', '#org+name', '#org+name', '#org+code'],
                 ['Health', '', 'Red Cross', 'IFRC']]
     merged = data1.merge_data(data2, '#org+name', '#org+code')
     # Compare data rows only; expected[0] is the hashtag row.
     self.assertEqual(expected[1:], merged.values)
Пример #15
0
 def test_empty_header_row(self):
     """Test for exception parsing an empty header row"""
     rows = [
         [],
         ['X', 'Y'],
         ['#adm1', '#affected'],
         ['Coast', '100']
     ]
     # Accessing .columns forces the parse; success means no exception raised.
     hxl.data(rows).columns
Пример #16
0
    def test_remote_google(self):
        """Test reading from a Google Sheet (will fail without connectivity)."""

        # default tab; timeout keeps the suite from hanging on network stalls
        with hxl.data(URL_GOOGLE_NOHASH, timeout=5) as source:
            self.compare_input(source)

        # specific tab (URL carries a #gid fragment)
        with hxl.data(URL_GOOGLE_HASH, timeout=5) as source:
            self.compare_input(source)
Пример #17
0
 def test_values_displaced_key(self):
     """Test that the filter scans all candidate keys."""
     # The matching value ('Red Cross') is in the SECOND #org+name column.
     data1 = hxl.data([['#sector+list', '#org+name', '#org+name'],
                       ['Health', 'xxx', 'Red Cross']])
     data2 = hxl.data([['#org+name', '#org+code'], ['XX', 'YY'],
                       ['Red Cross', 'IFRC']])
     expected = [['#sector+list', '#org+name', '#org+name', '#org+code'],
                 ['Health', 'xxx', 'Red Cross', 'IFRC']]
     merged = data1.merge_data(data2, '#org+name', '#org+code')
     # Compare data rows only; expected[0] is the hashtag row.
     self.assertEqual(expected[1:], merged.values)
Пример #18
0
 def test_aggregates_mixed_types(self):
     """Test selecting on rows with mixed datatypes"""
     # Column mixes a numeric string, a raw int, and a non-numeric string.
     DATA = [
         ["#affected"],
         ["1"],
         [2],
         ["N/A"],
     ]
     # min picks the smallest numeric value; max treats the non-numeric
     # string as the largest (sorts after numbers).
     self.assertEqual([["1"]], hxl.data(DATA).with_rows('#affected is min').values)
     self.assertEqual([["N/A"]], hxl.data(DATA).with_rows('#affected is max').values)
Пример #19
0
 def test_aggregates_mixed_types(self):
     """Test selecting on rows with mixed datatypes"""
     # Column mixes a numeric string, a raw int, and a non-numeric string.
     DATA = [
         ["#affected"],
         ["1"],
         [2],
         ["N/A"],
     ]
     # min picks the smallest numeric value; max treats the non-numeric
     # string as the largest (sorts after numbers).
     self.assertEqual([["1"]], hxl.data(DATA).with_rows('#affected is min').values)
     self.assertEqual([["N/A"]], hxl.data(DATA).with_rows('#affected is max').values)
Пример #20
0
    def test_remote_google(self):
        """Test reading from a Google Sheet (will fail without connectivity)."""

        # default tab
        with hxl.data(URL_GOOGLE_NOHASH) as source:
            self.compare_input(source)

        # specific tab (URL carries a #gid fragment)
        with hxl.data(URL_GOOGLE_HASH) as source:
            self.compare_input(source)
Пример #21
0
 def test_optional_params(self):
     """make_input and hxl.data should accept verify_ssl/timeout/http_headers kwargs."""
     url = 'https://data.humdata.org/dataset/hxl-master-vocabulary-list/resource/d22dd1b6-2ff0-47ab-85c6-08aeb911a832'
     # No assertions: the test passes if neither call raises on these kwargs.
     hxl.input.make_input(url,
                          verify_ssl=True,
                          timeout=30,
                          http_headers={'User-Agent': 'libhxl-python'})
     hxl.data(url,
              verify_ssl=True,
              timeout=30,
              http_headers={'User-Agent': 'libhxl-python'})
Пример #22
0
    def test_aggregator_dates(self):
        """min()/max() aggregators should compare dates semantically, not lexically."""
        # Three date formats that would sort wrongly as plain strings.
        DATA_IN = [['#event', '#date'], ['Flood', '2017-01-10 00:00:00'],
                   ['Flood', '1 Jan 2018'], ['Flood', '06/30/2018']]

        # minimum date (original cell text is preserved in the output)
        self.assertEqual([['Flood', '2017-01-10 00:00:00']],
                         hxl.data(DATA_IN).count('event', 'min(#date)').values)

        # maximum date
        self.assertEqual([['Flood', '06/30/2018']],
                         hxl.data(DATA_IN).count('event', 'max(#date)').values)
Пример #23
0
 def test_minmax_numbers(self):
     """'is max' / 'is min' row queries should select the numeric extremes."""
     DATA = [
         ['#date+year', '#affected', '#adm1'],
         ['2016', '200', 'Coast'],
         ['2016', '100', 'Plains'],
         ['2015', '300', 'Coast'],
         ['2015', '200', 'Plains'],
         ['2014', '400', 'Coast'],
         ['2014', '300', 'Plains']
     ]
     # DATA[5] holds the max (#affected=400), DATA[2] the min (#affected=100).
     self.assertEqual(hxl.data(DATA).with_rows('#affected is max').values, [DATA[5]])
     self.assertEqual(hxl.data(DATA).with_rows('#affected is min').values, [DATA[2]])
Пример #24
0
 def test_truthy(self):
     """A schema using truthy JSON values should still validate correctly."""
     schema = hxl.schema(hxl.data(resolve_path('files/test_validation/truthy-schema.json'), allow_local=True))
     BAD_DATA = [
         ['#sector'],
         ['Health']
     ]
     self.assertFalse(schema.validate(hxl.data(BAD_DATA)))
     GOOD_DATA = [
         ['#adm2+code'],
         ['xxx']
     ]
     self.assertTrue(schema.validate(hxl.data(GOOD_DATA)))
Пример #25
0
    def test_aggregator_strings(self):
        """min()/max() aggregators should also work over plain strings."""
        DATA_IN = [['#event', '#sector'], ['Flood', 'Food'],
                   ['Flood', 'Health'], ['Flood', 'Education']]

        # minimum string (alphabetical first)
        self.assertEqual([['Flood', 'Education']],
                         hxl.data(DATA_IN).count('event',
                                                 'min(#sector)').values)

        # maximum string (alphabetical last)
        self.assertEqual([['Flood', 'Health']],
                         hxl.data(DATA_IN).count('event',
                                                 'max(#sector)').values)
Пример #26
0
    def test_json_selector(self):
        """Both legacy key selectors and JSONPath selectors should pick sub-objects from JSON input."""
        SEL1_DATA = [["Coast", "100"]]
        SEL2_DATA = [["Plains", "200"]]

        # make sure legacy selectors still work
        with make_input(FILE_JSON_SELECTOR, True, selector="sel1") as input:
            self.assertEqual(SEL1_DATA, hxl.data(input).values)
        with make_input(FILE_JSON_SELECTOR, True, selector="sel2") as input:
            self.assertEqual(SEL2_DATA, hxl.data(input).values)

        # test JSONPath support ($.sel1 must behave like the legacy "sel1")
        with make_input(FILE_JSON_SELECTOR, True, selector="$.sel1") as input:
            self.assertEqual(SEL1_DATA, hxl.data(input).values)
Пример #27
0
 def test_min_year(self):
     """min('#date') should return the earliest of two bare years."""
     rows = [['#date'], ['2018'], ['2017']]
     self.assertEqual('2017', hxl.data(rows).min('#date'))
Пример #28
0
 def test_min_date(self):
     """min('#date') should compare mixed date formats chronologically."""
     rows = [['#date'], ['2018-01-01'], ['1/1/2019']]
     self.assertEqual('2018-01-01', hxl.data(rows).min('#date'))
Пример #29
0
    def assertDatasetErrors(self, dataset, errors_expected, schema=None):
        """Validate a whole dataset and assert the exact number of errors reported.

        Args:
            dataset: raw rows to validate.
            errors_expected: exact number of validation errors expected.
            schema: raw schema rows; defaults to self.SCHEMA.
        """
        errors = []

        # Collect every error the schema reports so we can count them below.
        def callback(error):
            errors.append(error)

        if schema is None:
            schema = self.SCHEMA
        schema = hxl.schema(schema, callback)

        if errors_expected == 0:
            self.assertTrue(schema.validate(hxl.data(dataset)))
        else:
            self.assertFalse(schema.validate(hxl.data(dataset)))

        self.assertEqual(len(errors), errors_expected)
Пример #30
0
 def test_blank_merge(self):
     """Merging should skip a blank key cell and match on the later #org+name column."""
     data1 = hxl.data([
         ['#sector', '#org+name', '#org+name'],
         ['Health', '', 'Red Cross']
         ])
     data2 = hxl.data([
         ['#org+name', '#org+code'],
         ['XX', 'YY'],
         ['Red Cross', 'IFRC']
         ])
     expected = [
         ['#sector', '#org+name', '#org+name', '#org+code'],
         ['Health', '', 'Red Cross', 'IFRC']
         ]
     merged = data1.merge_data(data2, '#org+name', '#org+code')
     # Compare data rows only; expected[0] is the hashtag row.
     self.assertEqual(expected[1:], merged.values)
Пример #31
0
 def test_min_date(self):
     """min('#date') should compare mixed date formats chronologically."""
     DATA = [
         ['#date'],
         ['2018-01-01'],
         ['1/1/2019']
     ]
     self.assertEqual('2018-01-01', hxl.data(DATA).min('#date'))
Пример #32
0
    def convert(self, url_or_filename, allow_local=False):
        """ Top-level method to convert a HXLated 3W to IATI

        Args:
            url_or_filename: the 3W data source (usually a URL).
            allow_local: if True, allow reading from a local file path.

        Raises whatever check_hashtags raises when required HXL hashtags
        are missing; XML is written through self.xmlout.
        """

        # open the data source (usually a URL)
        source = hxl.data(url_or_filename, allow_local=allow_local)

        # check that the required hashtags are present
        check_hashtags(source, self.REQUIRED_HASHTAGS)

        self.xmlout.start_document()

        # <iati-activities> wrapper, stamped with generation time and
        # the IATI standard version.
        self.xmlout.start_block(
            "iati-activities", {
                "generated-datetime": datetime.datetime.now().isoformat(),
                "version": "2.03",
            })

        # one <iati-activity> per 3W row
        for row in source:
            self.do_activity(row)

        self.xmlout.end_block("iati-activities")

        self.xmlout.end_document()
Пример #33
0
def process_dataset(dataset):
    """Do something with a dataset tagged hxl

    Builds a summary record of the dataset and its resources' HXL columns.

    Returns:
        the record dict on success, None if any resource fails to parse
        as HXL, or False if no resources were collected.
    """
    record = {
        'type': 'dataset',
        'name': dataset['name'],
        'title': dataset['title'],
        'source': dataset['dataset_source'],
        'resources': []
    }
    for resource in dataset['resources']:
        try:
            resource_info = {
                'type': 'resource',
                'name': resource['name'],
                'url': resource['url'],
                'columns': []
            }
            # Accessing .columns forces a remote parse of the header rows.
            columns = hxl.data(resource['url']).columns
            # NOTE(review): this appends one nested list, so 'columns' ends up
            # as [[{...}, ...]] -- confirm consumers expect the extra level
            # (extend() would flatten it).
            resource_info['columns'].append([{
                'tag': column.tag,
                'display_tag': column.display_tag,
                'attributes': list(column.attributes)
            } for column in columns])
            record['resources'].append(resource_info)
        except Exception:
            # Narrowed from a bare except so Ctrl-C still interrupts; any
            # parse/network error still aborts the whole dataset (as before).
            print("  Skipped {} (not valid HXL)".format(resource['name']), file=sys.stderr)
            return None
    if record['resources']:
        return record
    else:
        return False
Пример #34
0
 def test_repeat(self):
     # Test repeating a cache filter directly: iterating twice must yield
     # the same rows both times (the cache makes the source re-iterable).
     source = hxl.data(DATA).cache()
     rows1 = [row.values for row in source]
     rows2 = [row.values for row in source]
     self.assertEqual(3, len(rows1))
     self.assertEqual(rows1, rows2)
Пример #35
0
 def test_row_count(self):
     """Iterating the parsed CSV should yield the expected number of logical rows."""
     row_count = 0
     with hxl.data(FILE_CSV, True) as source:
         # logical row count (header/hashtag rows are not yielded)
         for row in source:
             row_count += 1
     self.assertEqual(TestParser.EXPECTED_ROW_COUNT, row_count)
Пример #36
0
    def assertDatasetErrors(self, dataset, errors_expected, schema=None):
        """Validate a whole dataset and assert the exact number of errors reported.

        Args:
            dataset: raw rows to validate.
            errors_expected: exact number of validation errors expected.
            schema: raw schema rows; defaults to self.SCHEMA.
        """
        errors = []

        # Collect every error the schema reports so we can count them below.
        def callback(error):
            errors.append(error)

        if schema is None:
            schema = self.SCHEMA
        schema = hxl.schema(schema, callback)

        if errors_expected == 0:
            self.assertTrue(schema.validate(hxl.data(dataset)))
        else:
            self.assertFalse(schema.validate(hxl.data(dataset)))

        self.assertEqual(len(errors), errors_expected)
Пример #37
0
 def test_min_year(self):
     """min('#date') should return the earliest of two bare years."""
     DATA = [
         ['#date'],
         ['2018'],
         ['2017']
     ]
     self.assertEqual('2017', hxl.data(DATA).min('#date'))
Пример #38
0
 def test_repeat_sub(self):
     # Test repeating a cache filter backing another filter: the downstream
     # row filter must produce identical results on both passes.
     source = hxl.data(DATA).cache().with_rows('org=NGO A')
     rows1 = [row.values for row in source]
     rows2 = [row.values for row in source]
     self.assertEqual(1, len(rows1))
     self.assertEqual(rows1, rows2)
Пример #39
0
def readXlsx(fileLocation):
    # Download an .xlsx workbook from a URL, normalize dates and strings,
    # and run the HXL pipeline over it. Python 2 code (print statements,
    # basestring). Returns processHXLData() output, or False on errors.
    print "Trying to download XLSX"
    try:
        response = urlopen(fileLocation)
        try:
            print "Reading XLSX"
            wb = load_workbook(BytesIO(response.read()))
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt.
            print "Error reading " + str(fileLocation)
            return False
        sheet = wb.active
        data = {}
    except URLError as e:
        # NOTE(review): execution falls through with 'sheet' unbound, so the
        # loop below raises NameError after a failed download -- this branch
        # should probably return False.
        print("XLS Failed to download")
    try:
        rows_iter = sheet.iter_rows(min_col=1,
                                    min_row=1,
                                    max_col=sheet.max_column,
                                    max_row=sheet.max_row)
        dataset = [[cell.value for cell in row] for row in rows_iter]
        # Coerce cells: dates -> MM/DD/YYYY strings, text -> ASCII bytes.
        for i, row in enumerate(dataset):
            for j, cell in enumerate(dataset[i]):
                if isinstance(cell, datetime.date):
                    dataset[i][j] = cell.strftime('%m/%d/%Y')
                elif isinstance(cell, basestring):
                    dataset[i][j] = cell.encode('ascii', 'ignore')
        # cache() makes the in-memory dataset re-iterable downstream.
        dataset = hxl.data(dataset).cache()
        output = processHXLData(dataset)
        print "HXL output"
        return output
    except Exception as e:
        print e
        return False
Пример #40
0
    def read_external_filter(self, datasetinfo):
        # type: (Dict) -> None
        """Read filter list from external url pointing to a HXLated file

        Populates self.filters (via dict_of_lists_add) with the
        '#country+code' value of each row, keyed by either the display tag
        or the human header depending on use_hxl.

        Args:
            datasetinfo (Dict): Dictionary of information about dataset

        Returns:
            None
        """
        external_filter = datasetinfo.get('external_filter')
        if not external_filter:
            # No external filter configured for this dataset.
            return
        hxltags = external_filter['hxltags']
        data = hxl.data(external_filter['url'])
        use_hxl = datasetinfo.get('use_hxl', False)
        for row in data:
            for hxltag in data.columns:
                if hxltag.display_tag in hxltags:
                    if use_hxl:
                        header = hxltag.display_tag
                    else:
                        header = hxltag.header
                    dict_of_lists_add(self.filters, header,
                                      row.get('#country+code'))
Пример #41
0
def readXls(fileLocation):
    # Download a legacy .xls workbook from a URL, coerce text cells to ASCII,
    # and run the HXL pipeline over it. Python 2 code (print statements,
    # basestring). Returns processHXLData() output, or False on errors.
    print "Trying to download XLS"
    try:
        response = urlopen(fileLocation).read()
        try:
            print "Reading XLS"
            wb = xlrd.open_workbook(file_contents=response)
        except Exception as e:
            print e
            print "Error reading " + str(fileLocation)
            return False
        xl_sheet = wb.sheet_by_index(0)
    except URLError as e:
        # NOTE(review): execution falls through with 'xl_sheet' unbound, so
        # the loop below raises NameError after a failed download -- this
        # branch should probably return False.
        print("XLS Failed to download")
    try:
        dataset = []
        for row in range(0, xl_sheet.nrows):
            r = []
            for col in range(0, xl_sheet.ncols):
                if isinstance(xl_sheet.cell_value(row, col), basestring):
                    r.append(
                        xl_sheet.cell_value(row,
                                            col).encode('ascii', 'ignore'))
                else:
                    r.append(xl_sheet.cell_value(row, col))
                #if isinstance(cell, datetime.date):
                #    dataset[i][j] = cell.strftime('%m/%d/%Y')
            dataset.append(r)
        # cache() makes the in-memory dataset re-iterable downstream.
        dataset = hxl.data(dataset).cache()
        output = processHXLData(dataset)
        print "HXL output"
        return output
    except Exception as e:
        print e
        return False
Пример #42
0
 def test_write_json_attribute_normalisation(self):
     """write_json should emit attributes in normalized (sorted) order."""
     # Input has +es+cluster; JSON output key should be +cluster+es.
     DATA_IN = [['#sector+es+cluster'], ['Hygiene']]
     DATA_OUT = [{'#sector+cluster+es': 'Hygiene'}]
     buffer = StringIO()
     source = hxl.data(DATA_IN)
     hxl.input.write_json(buffer, source, use_objects=True)
     self.assertEqual(DATA_OUT, json.loads(buffer.getvalue()))
Пример #43
0
 def test_google_row_number(self):
     """source_row_number and column_number should be populated for Google Sheet input."""
     source = hxl.data('https://docs.google.com/spreadsheets/d/1rOO0-xYa3kIOfI-6KR-mLgMTdgIEijNxM52Nfhs8uvg/edit#gid=0')
     for row in source:
         self.assertTrue(row.source_row_number is not None)
         self.assertEqual(row.source_row_number, row.row_number+1) # there are two header rows and the hashtags
         for i, column in enumerate(row.columns):
             self.assertEqual(i, column.column_number)
Пример #44
0
 def test_repeat_sub(self):
     # Test repeating a cache filter backing another filter: the downstream
     # row filter must produce identical results on both passes.
     source = hxl.data(DATA).cache().with_rows('org=NGO A')
     rows1 = [row.values for row in source]
     rows2 = [row.values for row in source]
     self.assertEqual(2, len(rows1))
     self.assertEqual(rows1, rows2)
Пример #45
0
 def test_attributes(self):
     """Every column's parsed attribute set should match the expected fixture."""
     with hxl.data(FILE_CSV, True) as source:
         for row in source:
             for column_number, column in enumerate(row.columns):
                 # Compare as sets: attribute order is not significant.
                 self.assertEqual(
                     set(TestParser.EXPECTED_ATTRIBUTES[column_number]),
                     column.attributes)
Пример #46
0
 def test_write_json_objects(self):
     """write_json(use_objects=True) output should match the stored fixture file."""
     with open(FILE_JSON_OBJECTS_OUT) as input:
         expected = input.read()
         buffer = StringIO()
         with hxl.data(FILE_CSV, True) as source:
             hxl.input.write_json(buffer, source, use_objects=True)
             self.assertEqual(expected, buffer.getvalue())
Пример #47
0
 def test_repeat(self):
     # Test repeating a cache filter directly: iterating twice must yield
     # the same rows both times (the cache makes the source re-iterable).
     source = hxl.data(DATA).cache()
     rows1 = [row.values for row in source]
     rows2 = [row.values for row in source]
     self.assertEqual(4, len(rows1))
     self.assertEqual(rows1, rows2)
Пример #48
0
 def test_filter(self):
     """ Confirm that the JSONPath implementation supports filters. """
     data = [
         ["#x"],
         ['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]'],
     ]
     # The filter [?(@.a=1)] should select the object whose "a" is 1.
     result = hxl.data(data).jsonpath('$[?(@.a=1)].b').values
     self.assertEqual("2", result[0][0])
Пример #49
0
 def test_write_json_objects(self):
     """write_json(use_objects=True) output should match the stored fixture file."""
     with open(FILE_JSON_OBJECTS_OUT) as input:
         expected = input.read()
         buffer = StringIO()
         with hxl.data(FILE_CSV, True) as source:
             hxl.io.write_json(buffer, source, use_objects=True)
             self.assertEqual(expected, buffer.getvalue())
Пример #50
0
 def test_row_count(self):
     """Iterating the parsed CSV should yield the expected number of logical rows."""
     row_count = 0
     with hxl.data(FILE_CSV, True) as source:
         # logical row count (header/hashtag rows are not yielded)
         for row in source:
             row_count += 1
     self.assertEqual(TestParser.EXPECTED_ROW_COUNT, row_count)
Пример #51
0
    def test_outliers(self):
        """#valid_value+outliers should flag only the statistical outliers in a column."""
        BAD_VALUES = ['1', '1000000']

        # Two extreme values surrounded by many values in the 100-800 range.
        raw_data = [
            ['#affected'],
            ['1'],
            ['1000000']
        ]

        # Repeat the normal values so the outliers stand out statistically.
        for i in range(0, 10):
            raw_data += [
                ['100'],
                ['200'],
                ['800']
            ]

        seen_callback = False

        def callback(e):
            # Every reported error must concern one of the two planted outliers.
            nonlocal seen_callback
            seen_callback = True
            self.assertTrue(e.value in BAD_VALUES)

        schema = hxl.schema([
            ['#valid_tag', '#valid_value+outliers'],
            ['#affected', 'true']
        ], callback=callback)

        data = hxl.data(raw_data)

        self.assertFalse(schema.validate(data))
        self.assertTrue(seen_callback)
Пример #52
0
 def test_values_displaced_key(self):
     """Test that the filter scans all candidate keys."""
     # The matching value ('Red Cross') is in the SECOND #org+name column.
     data1 = hxl.data([
         ['#sector+list', '#org+name', '#org+name'],
         ['Health', 'xxx', 'Red Cross']
         ])
     data2 = hxl.data([
         ['#org+name', '#org+code'],
         ['XX', 'YY'],
         ['Red Cross', 'IFRC']
         ])
     expected = [
         ['#sector+list', '#org+name', '#org+name', '#org+code'],
         ['Health', 'xxx', 'Red Cross', 'IFRC']
         ]
     merged = data1.merge_data(data2, '#org+name', '#org+code')
     # Compare data rows only; expected[0] is the hashtag row.
     self.assertEqual(expected[1:], merged.values)
Пример #53
0
 def test_write_csv(self):
     """write_hxl round-trip output should match the stored CSV fixture byte-for-byte."""
     with open(FILE_CSV_OUT, 'rb') as input:
         expected = input.read()
         buffer = StringIO()
         with hxl.data(FILE_CSV, True) as source:
             hxl.io.write_hxl(buffer, source)
             # Need to work with bytes to handle CRLF
             self.assertEqual(expected, buffer.getvalue().encode('utf-8'))
Пример #54
0
 def test_taxonomy_bad(self):
     """Data violating an external taxonomy should fail validation with one local issue."""
     schema = hxl.schema(SCHEMA_TAXONOMY)
     result = hxl.validate(hxl.data(DATA_TAXONOMY_BAD), schema)
     self.assertFalse(result['is_valid'])
     # Exactly one local error, no external (taxonomy-loading) issues.
     self.assertEqual(1, result['stats']['error'])
     self.assertEqual(0, result['stats']['external'])
     self.assertEqual(1, len(result['issues']))
     self.assertEqual(0, len(result['external_issues']))
Пример #55
0
def add_append_filter(source, args, index):
    """Add the hxlappend filter to the end of the chain.

    Scans numbered form fields append-datasetNN-MM for this filter index
    and appends each configured dataset to the source.
    """
    exclude_columns = args.get('append-exclude-columns%02d' % index, False)
    subindex = 1
    while subindex < 100:
        url = args.get('append-dataset%02d-%02d' % (index, subindex))
        if url:
            source = source.append(hxl.data(url), not exclude_columns)
        subindex += 1
    return source
Пример #56
0
def add_merge_filter(source, args, index):
    """Add the hxlmerge filter to the end of the pipeline.

    Reads the numbered merge-* form fields for this filter index and
    merges the referenced dataset into the source.
    """
    lookup = args.get
    tag_patterns = hxl.TagPattern.parse_list(lookup('merge-tags%02d' % index, []))
    key_patterns = hxl.TagPattern.parse_list(lookup('merge-keys%02d' % index, []))
    replace_flag = lookup('merge-replace%02d' % index) == 'on'
    overwrite_flag = lookup('merge-overwrite%02d' % index) == 'on'
    merge_source = hxl.data(lookup('merge-url%02d' % index))
    return source.merge_data(
        merge_source,
        keys=key_patterns,
        tags=tag_patterns,
        replace=replace_flag,
        overwrite=overwrite_flag,
    )
Пример #57
0
 def test_wide_data(self):
     """Test for very wide data"""
     # Tagger maps header substrings to HXL tags on an untagged CSV.
     tagging_specs = [
         ('cod_wardsr', '#adm3+code',),
         ('food_monthly', '#value+expenditure+food_monthly',),
     ]
     filename = resolve_path("files/test_converters/wide-tagging-test.csv")
     source = hxl.data(hxl.converters.Tagger(hxl.io.make_input(filename, allow_local=True), tagging_specs)).cache()
     self.assertTrue('#value+expenditure+food_monthly' in source.display_tags)
Пример #58
0
 def test_latlon(self):
     """clean_data(latlon=...) should normalize DMS coordinates to signed decimal degrees."""
     # #foo is untouched; lat/lon/coord columns under #geo are converted.
     DATA_IN = [
         ['#foo', '#geo+lat', '#geo+lon', '#geo+coord'],
         ['75W 30 00', '45N 30 00', '75W 30 00', '45N 30 00,75W 30 00'],
     ]
     DATA_OUT = [
         ['75W 30 00', '45.5000', '-75.5000', '45.5000,-75.5000'],
     ]
     self.assertEqual(DATA_OUT, hxl.data(DATA_IN).clean_data(latlon='geo').values)
Пример #59
0
 def test_taxonomy_missing(self):
     """Handle a missing external taxonomy."""
     schema = hxl.schema(SCHEMA_TAXONOMY_MISSING)
     result = hxl.validate(hxl.data(DATA_TAXONOMY_GOOD), schema)
     # Data is still considered valid; the unloadable taxonomy is reported
     # as an "external" issue rather than a validation error.
     self.assertTrue(result['is_valid'])
     self.assertTrue('external_issues' in result)
     self.assertEqual(0, result['stats']['error'])
     self.assertEqual(1, result['stats']['external'])
     self.assertEqual(0, len(result['issues']))
     self.assertEqual(1, len(result['external_issues']))
Пример #60
0
    def test_aggregator_strings(self):
        """min()/max() aggregators should also work over plain strings."""
        DATA_IN = [
            ['#event', '#sector'],
            ['Flood', 'Food'],
            ['Flood', 'Health'],
            ['Flood', 'Education']
        ]

        # minimum string (alphabetical first)
        self.assertEqual(
            [['Flood', 'Education']],
            hxl.data(DATA_IN).count('event', 'min(#sector)').values
        )

        # maximum string (alphabetical last)
        self.assertEqual(
            [['Flood', 'Health']],
            hxl.data(DATA_IN).count('event', 'max(#sector)').values
        )