def update_datapackage_sources(self):
    """Update the 'sources' property of datapackage with the new sources"""
    datapackage_check = DataPackageChecker(self.config)
    required_resources = [self.source_file, self.publisher_file]
    datapackage_check.check_database_completeness(required_resources)
    datapackage_check.run()
    self.datapackage.descriptor['sources'] = []
    datapkg_path = os.path.join(self.datapackage.base_path, 'datapackage.json')
    with compat.UnicodeDictReader(self.source_file) as sources_file:
        for source in sources_file:
            src_info = {
                'name': source['title'],
                'web': source[self.data_key]
            }
            self.datapackage.descriptor['sources'].append(src_info)
    with io.open(datapkg_path, mode='w+', encoding='utf-8') as datapkg_file:
        new_datapkg = json.dumps(self.datapackage.to_dict(), indent=4,
                                 sort_keys=True)
        datapkg_file.write(compat.str(new_datapkg))

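# Standalone sketch (illustrative only, not part of the task code): the shape of
# the entry that update_datapackage_sources appends for each source_file row.
# Using 'data' as the value of self.data_key is an assumption for this example.
source_row = {'title': 'Example dataset', 'data': 'http://example.org/data.csv'}
data_key = 'data'
src_info = {'name': source_row['title'], 'web': source_row[data_key]}
print(src_info)  # {'name': 'Example dataset', 'web': 'http://example.org/data.csv'}
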
def read_file_contents(self, file_name):
    """Return file contents as list of dicts"""
    contents = []
    with compat.UnicodeDictReader(file_name) as src_file:
        for line in src_file:
            contents.append(line)
    return contents

def get_publishers(self):
    """Return list of publisher ids."""
    publisher_ids = []
    with compat.UnicodeDictReader(self.publisher_file) as publishers_file:
        for row in publishers_file:
            publisher_ids.append(row['id'])
    return publisher_ids

def test_performance_calculation(self):
    """Test that PerformanceAssessor task calculates performance correctly"""
    config = self.config
    assess_performance_task = tasks.PerformanceAssessor(config)
    assess_performance_task.run()
    test_dict = {
        'files_count_to_date': '1',
        'valid_to_date': '100',
        'score_to_date': '100',
        'score': '100',
        'month_of_creation': '2015-01-01',
        'publisher_id': 'xx_dept1',
        'valid': '100',
        'files_count': '1'
    }
    with compat.UnicodeDictReader(assess_performance_task.performance_file) as pf:
        self.assertGreater(self.find_in_sequence(pf, test_dict), -1)

def get_lookup(self):
    """Return a lookup list of source dicts restricted to the expected keys."""
    _keys = [
        'id',
        'publisher_id',
        self.data_key,
        'created_at',
        'title',
        'period_id'
    ]
    lookup = []
    with compat.UnicodeDictReader(self.source_file) as sources_file:
        for row in sources_file:
            lookup.append({k: v for k, v in row.items() if k in _keys})
    return lookup

def get_sources(self, publisher_id):
    """Return list of sources of a publisher with id, period and score."""
    sources = []
    with compat.UnicodeDictReader(self.source_file) as sources_file:
        for row in sources_file:
            if row['publisher_id'] == publisher_id:
                source = {}
                source['id'] = row['id']
                source['created_at'] = utilities.date_from_string(
                    row['created_at'])
                source['score'] = self.get_source_score(source['id'])
                sources.append(source)
    return sources

def get_source_score(self, source_id):
    """Return latest score of a source from results.

    Args:
        source_id: id of the source whose score is wanted
    """
    score = 0
    latest_timestamp = pytz.timezone('UTC').localize(datetime.datetime.min)
    with compat.UnicodeDictReader(self.result_file) as result_file:
        for row in result_file:
            if row['source_id'] == source_id:
                timestamp = dateutil.parser.parse(row['timestamp'])
                if timestamp > latest_timestamp:
                    latest_timestamp = timestamp
                    score = int(row['score'])
    return score

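# Standalone sketch (illustrative only) of the "latest score wins" selection used
# by get_source_score: timestamps are parsed with dateutil and compared against a
# timezone-aware minimum, so the score attached to the newest result is kept.
import datetime
import dateutil.parser
import pytz

rows = [
    {'source_id': 's1', 'timestamp': '2015-01-01T00:00:00Z', 'score': '80'},
    {'source_id': 's1', 'timestamp': '2015-02-01T00:00:00Z', 'score': '95'},
]
score = 0
latest_timestamp = pytz.timezone('UTC').localize(datetime.datetime.min)
for row in rows:
    timestamp = dateutil.parser.parse(row['timestamp'])
    if timestamp > latest_timestamp:
        latest_timestamp = timestamp
        score = int(row['score'])
print(score)  # 95, the score attached to the most recent timestamp
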
def extract_period_from_sources(self):
    """Try to extract relevance period for each source or return None"""
    sources = []
    with compat.UnicodeDictReader(self.source_file) as source_file:
        timeliness_set = set(self.timeliness_strategy)
        found_fields = timeliness_set.intersection(set(source_file.header))
        if not found_fields:
            raise ValueError(('At least one of the "timeliness_strategy" '
                              'fields must be present in your "source_file".'))
        if not found_fields.issuperset(timeliness_set):
            missing_fields = timeliness_set.difference(found_fields)
            print(('Fields "{0}" from "timeliness_strategy" were not found '
                   'in your `source_file`').format(missing_fields))
        for source in source_file:
            timeliness_fields = {field: val for field, val in source.items()
                                 if field in self.timeliness_strategy}
            extracted_period = self.identify_period(timeliness_fields)
            source['period_id'] = extracted_period
            sources.append(source)
    return sources
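
# Standalone sketch (illustrative only) of the header check performed in
# extract_period_from_sources: the timeliness_strategy fields are matched against
# the source_file header with set intersection, and missing fields are reported.
timeliness_strategy = ['period_id', 'created_at']
header = ['id', 'publisher_id', 'created_at', 'title']

timeliness_set = set(timeliness_strategy)
found_fields = timeliness_set.intersection(set(header))
missing_fields = timeliness_set.difference(found_fields)
assert found_fields == {'created_at'}
assert missing_fields == {'period_id'}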