예제 #1
0
    def update_datapackage_sources(self):
        """Rebuild the datapackage 'sources' list and save the descriptor.

        A DataPackageChecker is first asked to check the source and publisher
        files and is then run. After that ``descriptor['sources']`` is
        replaced by one ``name``/``web`` entry per row of the source file and
        the datapackage is serialized to ``datapackage.json`` inside its
        base path.
        """

        checker = DataPackageChecker(self.config)
        checker.check_database_completeness(
            [self.source_file, self.publisher_file])
        checker.run()

        # Reset the list so that only the rows read below end up in it.
        self.datapackage.descriptor['sources'] = []
        descriptor_path = os.path.join(self.datapackage.base_path,
                                       'datapackage.json')

        with compat.UnicodeDictReader(self.source_file) as reader:
            for row in reader:
                self.datapackage.descriptor['sources'].append({
                    'name': row['title'],
                    'web': row[self.data_key]
                })

        with io.open(descriptor_path, mode='w+',
                     encoding='utf-8') as descriptor_file:
            serialized = json.dumps(self.datapackage.to_dict(),
                                    indent=4,
                                    sort_keys=True)
            descriptor_file.write(compat.str(serialized))
예제 #2
0
    def read_file_contents(self, file_name):
        """Read every row of a file and return them as a list.

        Args:
            file_name: path handed to ``compat.UnicodeDictReader``

        Returns:
            A list with one item per row yielded by the reader, in file order.
        """

        with compat.UnicodeDictReader(file_name) as reader:
            return list(reader)
예제 #3
0
    def get_publishers(self):
        """Collect the ``id`` column of every row in the publisher file.

        Returns:
            A list of publisher ids, in the order the rows are read.
        """

        with compat.UnicodeDictReader(self.publisher_file) as reader:
            return [publisher['id'] for publisher in reader]
    def test_performance_calculation(self):
        """Check that PerformanceAssessor produces the expected row."""

        task = tasks.PerformanceAssessor(self.config)
        task.run()

        expected_row = {
            'files_count_to_date': '1',
            'valid_to_date': '100',
            'score_to_date': '100',
            'score': '100',
            'month_of_creation': '2015-01-01',
            'publisher_id': 'xx_dept1',
            'valid': '100',
            'files_count': '1',
        }

        with compat.UnicodeDictReader(task.performance_file) as rows:
            # NOTE(review): find_in_sequence presumably returns -1 when the
            # row is absent -- confirm against the helper.
            position = self.find_in_sequence(rows, expected_row)
            self.assertGreater(position, -1)
예제 #5
0
    def get_lookup(self):
        """Return the source file rows reduced to the lookup columns.

        Each row becomes a dict holding only the ``id``, ``publisher_id``,
        ``created_at``, ``title`` and ``period_id`` columns plus the column
        named by ``self.data_key``. Columns a row does not have are simply
        left out of its dict.
        """

        wanted = (
            'id', 'publisher_id', self.data_key, 'created_at', 'title',
            'period_id',
        )

        def pick(row):
            # Keep only the wanted columns of a single row.
            return {name: value for name, value in row.items()
                    if name in wanted}

        with compat.UnicodeDictReader(self.source_file) as reader:
            return [pick(row) for row in reader]
예제 #6
0
    def get_sources(self, publisher_id):
        """Return the sources that belong to one publisher.

        Args:
            publisher_id: value compared against each row's ``publisher_id``

        Returns:
            A list of dicts with the keys ``id``, ``created_at`` (the row's
            ``created_at`` passed through ``utilities.date_from_string``) and
            ``score`` (the result of ``get_source_score`` for that id).
        """

        matching = []

        with compat.UnicodeDictReader(self.source_file) as reader:
            for row in reader:
                # Rows of other publishers are skipped.
                if row['publisher_id'] != publisher_id:
                    continue
                source_id = row['id']
                matching.append({
                    'id': source_id,
                    'created_at': utilities.date_from_string(
                        row['created_at']),
                    'score': self.get_source_score(source_id),
                })
        return matching
예제 #7
0
    def get_source_score(self, source_id):
        """Return the most recent score recorded for a source.

        Scans the result file for rows whose ``source_id`` matches and keeps
        the score of the row with the latest ``timestamp``.

        Args:
            source_id: id of the source whose score is wanted

        Returns:
            The score as an int, or 0 when no matching row is found.
        """

        # Timezone-aware minimum used as the starting point for comparisons.
        newest = pytz.timezone('UTC').localize(datetime.datetime.min)
        latest_score = 0

        with compat.UnicodeDictReader(self.result_file) as rows:
            for row in rows:
                if row['source_id'] != source_id:
                    continue
                recorded_at = dateutil.parser.parse(row['timestamp'])
                # Strict comparison: on equal timestamps the earlier row wins.
                if recorded_at > newest:
                    newest, latest_score = recorded_at, int(row['score'])
        return latest_score
    def extract_period_from_sources(self):
        """Return the source rows, each extended with a ``period_id``.

        For every row the values of the ``timeliness_strategy`` fields are
        passed to ``self.identify_period`` and the result is stored under
        ``period_id``. A notice is printed when only some of the strategy
        fields are columns of the source file.

        Raises:
            ValueError: if none of the ``timeliness_strategy`` fields is a
                column of the source file.
        """

        with compat.UnicodeDictReader(self.source_file) as reader:
            strategy_fields = set(self.timeliness_strategy)
            present = strategy_fields & set(reader.header)
            if not present:
                raise ValueError(('At least one of the "timeliness_strategy" '
                                  'fields must be present in your "source_file".'))

            # Anything left over is a strategy field the file does not have.
            absent = strategy_fields - present
            if absent:
                print(('Fields "{0}" from "timeliness_strategy" were not found '
                       'in your `source_file`').format(absent))

            annotated = []
            for row in reader:
                relevant = {name: value for name, value in row.items()
                            if name in self.timeliness_strategy}
                row['period_id'] = self.identify_period(relevant)
                annotated.append(row)
        return annotated