Exemplo n.º 1
0
def score_by_url_extension(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from its URL extension.

    The extension variants of the stripped URL are tried in turn; the first
    one that format_get() recognises decides the result. An explanation of
    each step is appended to the score_reasons list.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) if the URL yields no extension variants, or none of
        them maps to a known format
      * (score, format) if the format has a truthy configured score
      * (1, format) if the format's configured score is missing or falsy

    The log argument is accepted but not used by this function.
    '''
    candidates = extension_variants(resource.url.strip())
    if not candidates:
        score_reasons.append(
            _('Could not determine a file extension in the URL.'))
        return (None, None)

    for candidate in candidates:
        detected = format_get(candidate)
        if not detected:
            # Record the miss and move on to the next variant.
            score_reasons.append(
                _('URL extension "%s" is an unknown format.') % candidate)
            continue

        configured = lib.resource_format_scores().get(detected)
        if configured:
            explanation = _('URL extension "%s" relates to format "%s" and receives score: %s.')
            score_reasons.append(
                explanation % (candidate, detected, configured))
            return configured, detected

        # Known format without a usable configured score: default to 1.
        fallback = 1
        explanation = _('URL extension "%s" relates to format "%s" but a score for that format is not configured, so giving it default score %s.')
        score_reasons.append(explanation % (candidate, detected, fallback))
        return fallback, detected

    return (None, None)
Exemplo n.º 2
0
def score_by_format_field(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from its format field.

    The field is looked up in ckan_helpers.resource_formats(), first by its
    lower-cased value and then, only if that finds nothing, by the value
    returned from lib.munge_format_to_be_canonical(). An explanation of the
    outcome is appended to the score_reasons list.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) if the field is blank or matches no known format
      * Otherwise format_string is element [1] of the matched entry and
        score is whatever lib.resource_format_scores() holds for it
        (None if nothing is configured)

    The log argument is accepted but not used by this function.
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # Second lookup happens only when the first one comes back empty.
    matched = ckan_helpers.resource_formats().get(declared.lower())
    if not matched:
        matched = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(declared))

    if not matched:
        score_reasons.append(
            _('Format field "%s" does not correspond to a known format.') %
            declared)
        return (None, None)

    format_name = matched[1]
    score = lib.resource_format_scores().get(format_name)
    score_reasons.append(
        _('Format field "%s" receives score: %s.') % (declared, score))
    return (score, format_name)
Exemplo n.º 3
0
def score_by_url_extension(resource, score_reasons, log):
    '''
    Derive a format and openness score from the extension in a resource URL.

    Every extension variant of the stripped URL is examined in turn and the
    first variant that format_get() can map to a format settles the outcome.
    Human-readable explanations are appended to score_reasons.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) when there are no extension variants, or none maps to
        a known format
      * (score, format) when the format has a truthy configured score
      * (1, format) when the configured score is missing or falsy

    The log argument is accepted but not used by this function.
    '''
    variants = extension_variants(resource.url.strip())
    if not variants:
        score_reasons.append('Could not determine a file extension in the URL.')
        return (None, None)

    for ext in variants:
        matched_format = format_get(ext)
        if not matched_format:
            # Note the unknown extension, then try the next variant.
            score_reasons.append('URL extension "%s" is an unknown format.' % ext)
            continue

        configured_score = lib.resource_format_scores().get(matched_format)
        if not configured_score:
            # Recognised format but no usable score configured: default to 1.
            default_score = 1
            score_reasons.append('URL extension "%s" relates to format "%s" but a score for that format is not configured, so giving it default score %s.' % (ext, matched_format, default_score))
            return default_score, matched_format

        score_reasons.append('URL extension "%s" relates to format "%s" and receives score: %s.' % (ext, matched_format, configured_score))
        return configured_score, matched_format

    return (None, None)
Exemplo n.º 4
0
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Looks inside a data file's contents to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.

    Arguments:
      * archival - object exposing cache_filepath; may be None/falsy
      * resource - accepted but not used by this function
      * score_reasons - list that explanation strings are appended to
      * log - passed through to sniff_file_format()

    Return values:
      * It returns a tuple: (score, format_string)
      * (None, None) if there is no archival / cached file path, the cached
        file does not exist on disk, or the contents are not recognised
      * Otherwise format_string is the sniffed format and score is whatever
        lib.resource_format_scores() holds for it (None if not configured)

    A missing cache_filepath is always reported with the generic "had not
    been downloaded" reason; the archival status is not inspected.
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    # Analyse the cached file. filepath is guaranteed truthy past the guard
    # above, so no further emptiness check is needed.
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'))
        return (None, None)

    format_name = sniffed_format['format']
    score = lib.resource_format_scores().get(format_name)
    score_reasons.append(
        _('Content of file appeared to be format "%s" which receives openness score: %s.'
          ) % (format_name, score))
    return score, format_name
Exemplo n.º 5
0
def score_by_sniffing_data(archival, resource, score_reasons, log):
    """
    Looks inside a data file's contents to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.

    Arguments:
      * archival - object exposing cache_filepath; may be None/falsy
      * resource - accepted but not used by this function
      * score_reasons - list that explanation strings are appended to
      * log - passed through to sniff_file_format()

    Return values:
      * It returns a tuple: (score, format_string)
      * (None, None) if there is no archival / cached file path, the cached
        file does not exist on disk, or the contents are not recognised
      * Otherwise format_string is the sniffed format and score is whatever
        lib.resource_format_scores() holds for it (None if not configured)

    A missing cache_filepath is always reported with the generic "had not
    been downloaded" reason; the archival status is not inspected.
    """
    if not archival or not archival.cache_filepath:
        score_reasons.append("This file had not been downloaded at the time of scoring it.")
        return (None, None)

    # Analyse the cached file. filepath is guaranteed truthy past the guard
    # above, so no further emptiness check is needed.
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Cache filepath does not exist: "%s".' % filepath)
        return (None, None)

    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append("The format of the file was not recognized from its contents.")
        return (None, None)

    format_name = sniffed_format["format"]
    score = lib.resource_format_scores().get(format_name)
    score_reasons.append(
        'Content of file appeared to be format "%s" which receives openness score: %s.'
        % (format_name, score)
    )
    return score, format_name
Exemplo n.º 6
0
def score_by_format_field(resource, score_reasons, log):
    '''
    Derive a format and openness score from a resource's format field.

    The field value is looked up in ckan_helpers.resource_formats(): first
    lower-cased, then (only when that finds nothing) after being passed
    through lib.munge_format_to_be_canonical(). The reasoning is appended to
    the score_reasons list.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) when the field is blank or matches no known format
      * Otherwise format_string is element [1] of the matched entry and
        score is whatever lib.resource_format_scores() holds for it
        (None if nothing is configured)

    The log argument is accepted but not used by this function.
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # The canonical-form lookup runs only if the lower-case one is empty.
    entry = ckan_helpers.resource_formats().get(declared.lower())
    if not entry:
        entry = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(declared))

    if not entry:
        score_reasons.append('Format field "%s" does not correspond to a known format.' % declared)
        return (None, None)

    format_name = entry[1]
    score = lib.resource_format_scores().get(format_name)
    score_reasons.append('Format field "%s" receives score: %s.' %
                         (declared, score))
    return (score, format_name)
    # GTFS check - a GTFS is a zip which containing specific filenames
    filenames = set((os.path.basename(f) for f in filepaths))
    if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt',
                 'stop_times.txt', 'calendar.txt')) - set(filenames)):
        log.info('GTFS detected')
        return {'format': 'GTFS'}

    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    for filepath in filepaths:
        extension = os.path.splitext(filepath)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
Exemplo n.º 8
0
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s',
                    e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s',
                    e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(int) # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
    log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
Exemplo n.º 9
0
    def validate_resource(self, id):
        """Handle the validate-then-upload workflow for a dataset's resource.

        Non-POST requests just render the validation page. A POST is handled
        in one of three ways, chosen from the request parameters:

        * ``check_schema`` set: build a schema from the submitted
          ``field_name`` / ``field_type`` values, validate the file at
          ``file_path`` against it, sniff the file format to obtain an
          openness score and render the report.
        * ``upload_data`` set: load the resource metadata stored in Redis
          under ``file_path``, mark the dataset active, create the resource
          with the stored file attached, delete the temporary file and
          redirect to the dataset page.
        * neither set: store the uploaded file under ``ckan.storage_path``,
          read its headers, save the resource metadata in Redis for one day
          and render the page with the detected fields.

        :param id: dataset identifier; passed to the template as ``pkg_name``
            and used as the package id for the CKAN actions.
        :returns: the rendered template, or the result of the redirect.
        """
        template = 'tayside/package/validate_resource.html'

        if toolkit.request.method != 'POST':
            return toolkit.render(template, {'pkg_name': id})

        data = dict_fns.unflatten(
            tuplize_dict(parse_params(toolkit.request.POST)))
        check_schema = toolkit.request.params.get('check_schema')
        upload_data = toolkit.request.params.get('upload_data')
        # NOTE(review): file_path comes straight from the request and is
        # later opened, used as a Redis key and passed to os.remove();
        # consider restricting it to files under ckan.storage_path.
        file_path = data.get('file_path')

        # Logic for validating a resource against a specified schema
        if check_schema:
            def as_list(value):
                # NOTE(review): parse_params presumably collapses a
                # single-valued key to a bare value instead of a list
                # (confirm against CKAN); iterating a bare string would
                # yield one "field" per character.
                if value is None:
                    return []
                if isinstance(value, (list, tuple)):
                    return list(value)
                return [value]

            field_names = as_list(data.get('field_name'))
            field_types = as_list(data.get('field_type'))

            # Schema is populated from data entered by the user
            schema = {
                'fields': [
                    {'name': field_name, 'type': field_type}
                    for field_name, field_type in zip(field_names,
                                                      field_types)
                ]
            }

            # File is validated with Goodtables
            report = validate(file_path, schema=schema)

            log = logging.getLogger('ckanext.tayside')

            # Score is calculated based on Sir Tim Berners-Lee's five star
            # of openness. The sniffer result is checked before indexing so
            # an unrecognised file gives no score instead of a TypeError.
            sniffed_format = sniff_file_format(file_path, log)
            score = resource_format_scores().get(sniffed_format['format']) \
                if sniffed_format else None

            return toolkit.render(template, extra_vars={
                'report': report,
                'pkg_name': id,
                'stars': score,
                'file_path': file_path
            })

        if upload_data:
            # Handles creating a resource in CKAN

            # Metadata for the resource is stored in Redis
            # NOTE(review): the key is stored with a one day expiry in the
            # plain-upload path below; r.get() returns None once it is gone,
            # which makes json.loads() raise. Consider handling that case.
            r = redis.StrictRedis()
            data = json.loads(r.get(file_path))
            data['package_id'] = id

            # Dataset's state is changed from 'draft' to 'active'
            toolkit.get_action('package_patch')({}, {
                'id': id,
                'state': 'active'
            })

            # FieldStorage instance is created which is needed to upload
            # the file to Filestore and Datastore
            fs = cgi.FieldStorage()
            fs.file = fs.make_file()
            fs.filename = data.get('url')

            # Binary mode keeps xls/xlsx/ods payloads intact, and the
            # context manager closes the handle even if the copy fails.
            with open(file_path, 'rb') as cached_file:
                fs.file.write(cached_file.read())
            fs.file.seek(0)

            data['upload'] = fs

            try:
                toolkit.get_action('resource_create')({}, data)
            except Exception:
                return toolkit.render(template, extra_vars={
                    'upload_error': 'An error occured while creating the '
                    'resource.',
                    'pkg_name': id
                })

            # File is uploaded on Filestore, and now it is safe to be
            # removed from the temporary location
            os.remove(file_path)

            return toolkit.redirect_to(
                controller='package', action='read', id=id)

        # Plain upload: stash the file and show its detected fields
        upload = data.get('upload')
        if not isinstance(upload, cgi.FieldStorage):
            return toolkit.render(template, extra_vars={
                'format_error': 'No file provided for validation.',
                'pkg_name': id
            })

        supported_formats = ['csv', 'tsv', 'xls', 'xlsx', 'ods']
        # A missing url is treated as an empty name (unsupported format)
        # rather than raising AttributeError.
        file_name = data.get('url') or ''
        current_format = file_name.split('.')[-1]

        if current_format not in supported_formats:
            return toolkit.render(template, extra_vars={
                'format_error': 'Format not supported.',
                'pkg_name': id
            })

        # Logic for storing the file locally and extracting
        # its headers (fields)
        # NOTE(review): file_name is user supplied and is concatenated
        # unsanitised; a name containing path separators could escape
        # storage_path.
        storage_path = config.get('ckan.storage_path')
        file_path = storage_path + '/' + file_name

        # Copy the uploaded file to the local path (binary safe)
        uploaded_file = upload.file
        uploaded_file.seek(0)
        with open(file_path, 'wb') as local_file:
            local_file.write(uploaded_file.read())

        # Inspect the headers (fields) of the file
        with Stream(file_path, headers=1) as stream:
            fields = stream.headers

        if not is_redis_available():
            return toolkit.render(template,
                                  {'redis_error': 'Redis not available'})

        # Store the metadata of the resource in Redis for later usage
        r = redis.StrictRedis()
        resource_data = {
            'name': data.get('name'),
            'description': data.get('description'),
            'format': data.get('format'),
            'url': data.get('url'),
        }
        r.set(file_path, json.dumps(resource_data))

        # Store it for 1 day
        r.expire(file_path, 86400)

        return toolkit.render(template, extra_vars={
            'fields': fields,
            'pkg_name': id,
            'file_path': file_path
        })
Exemplo n.º 10
0
    def custom_resource_score(self, resource, resource_score):
        """Apply site-specific overrides to a QA openness score.

        :param resource: object exposing ``format``, ``url`` and ``url_type``
            and, optionally, an ``extras`` mapping read via ``.get``.
        :param resource_score: dict holding ``format`` and
            ``openness_score``; it is updated in place.
        :returns: the same ``resource_score`` dict.

        Rules, applied in this order:

        * Score 3, detected format CSV, and the resource has a truthy
          ``schema`` extra plus a ``validation_status`` extra equal to
          "success" (case-insensitive): the score becomes 4.
        * Then, only if the score is positive:

          - detected TIFF and declared GEOTIFF: score is replaced by the one
            configured for GEOTIFF;
          - detected ZIP and declared format containing "GDB": format
            becomes GDB, with the score configured for it;
          - declared format containing "GPKG" and the URL extension also
            matching GPKG: format becomes GPKG, with the score configured
            for it.

        Every rule that fires also rewrites ``openness_score_reason``.
        """
        detected = resource_score.get('format')
        detected_format = detected.upper() if detected is not None else ''
        declared_format = resource.format.upper() \
            if resource.format is not None else ''

        def reason_for(format_name):
            # Translate the untouched template first and interpolate
            # afterwards; formatting before translating would make the
            # already-interpolated text the catalogue lookup key, which can
            # never match a translation entry.
            return toolkit._(
                'Content of file appeared to be format "{0}" which receives openness score: {1}.'
            ).format(format_name, resource_score.get('openness_score', ''))

        # If resource openness_score is 3 and format is CSV
        if resource_score.get('openness_score', 0) == 3 \
                and detected_format == 'CSV':
            # If resource has a JSON schema which validated successfully,
            # set score to 4
            if hasattr(resource, 'extras') \
                    and resource.extras.get('schema', None) \
                    and resource.extras.get(
                        'validation_status', '').lower() == 'success':
                resource_score['openness_score'] = 4
                resource_score['openness_score_reason'] = reason_for(
                    detected_format)

        # "or 0" keeps a stored None score from being compared with an int.
        if (resource_score.get('openness_score') or 0) > 0:
            # QA cannot determine file formats that are not part of its own
            # 'resource_format_openness_scores.json' file and CKAN
            # resource_formats.json file. The below are dataqld specific file
            # formats that are not part of the default CKAN
            # resource_formats.json file and need custom scoring

            # If QA believes the resource is a TIFF file, check the resource
            # format selected, if it's GEOTIFF apply custom score
            if detected_format == 'TIFF' and declared_format == 'GEOTIFF':
                resource_score['openness_score'] = \
                    qa_lib.resource_format_scores().get(declared_format)
                resource_score['openness_score_reason'] = reason_for(
                    declared_format)

            # If QA believes the resource is a ZIP file, check the resource
            # format selected, if it's GDB apply custom score
            if detected_format == 'ZIP' and 'GDB' in declared_format:
                resource_score['format'] = 'GDB'
                resource_score['openness_score'] = \
                    qa_lib.resource_format_scores().get(
                        resource_score['format'])
                resource_score['openness_score_reason'] = reason_for(
                    declared_format)

            # QA by default does not know how to handle GPKG formats, check
            # the resource format selected and extension, if it's GPKG apply
            # custom score
            if 'GPKG' in declared_format:
                # Uploads are judged by the extension of the stored URL,
                # links by the extension variants of the URL.
                if (resource.url_type == 'upload'
                        and 'GPKG' in os.path.splitext(
                            resource.url)[1].upper()) \
                        or (resource.url_type == 'url'
                            and 'GPKG' in (
                                ext.upper() for ext in
                                qa_tasks.extension_variants(resource.url))):
                    resource_score['format'] = 'GPKG'
                    resource_score['openness_score'] = \
                        qa_lib.resource_format_scores().get(
                            resource_score['format'])
                    resource_score['openness_score_reason'] = reason_for(
                        declared_format)

        return resource_score