Exemplo n.º 1
0
Arquivo: tasks.py Projeto: tbalaz/test
def score_by_format_field(resource, score_reasons, log):
    '''
    Derive a resource's format and openness score from its format field.

    Every step of the reasoning is appended, as a string, to the
    score_reasons list supplied by the caller.

    Return values:
      * Always a tuple: (score, format_display_name)
      * (None, None) when the format field is blank or matches no
        known format
      * Otherwise the matched format's 'openness' value and its
        'display_name'
    '''
    raw_format = resource.format or ''
    if raw_format:
        # Try progressively looser matches, stopping at the first hit:
        # exact display name, lower-cased extension, then reduced name.
        matched = Formats.by_display_name().get(raw_format)
        if not matched:
            matched = Formats.by_extension().get(raw_format.lower())
        if not matched:
            matched = Formats.by_reduced_name().get(Formats.reduce(raw_format))

        if matched:
            openness = matched['openness']
            score_reasons.append(
                'Polje formata "%s" ima ocijenu otvorenosti: %s.' %
                (raw_format, openness))
            return (openness, matched['display_name'])

        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.' %
            raw_format)
    else:
        score_reasons.append('Format field is blank.')
    return (None, None)
Exemplo n.º 2
0
def score_by_format_field(resource, score_reasons, log):
    '''
    Use the resource's format field to identify its format and score.

    Explanations of how the result was reached are appended to the
    score_reasons list.

    Return values:
      * A tuple: (score, format_display_name)
      * Both elements are None if the format field is blank or is not
        recognised as a known format
      * Otherwise score is the format's 'openness' value and
        format_display_name is its 'display_name'
    '''
    no_result = (None, None)

    field_value = resource.format or ''
    if not field_value:
        score_reasons.append('Format field is blank.')
        return no_result

    # Lookups are wrapped in lambdas so that each one is only evaluated
    # when all the previous ones failed to find a format.
    lookups = (
        lambda: Formats.by_display_name().get(field_value),
        lambda: Formats.by_extension().get(field_value.lower()),
        lambda: Formats.by_reduced_name().get(Formats.reduce(field_value)),
    )
    known_format = None
    for lookup in lookups:
        known_format = lookup()
        if known_format:
            break

    if not known_format:
        message = 'Polje formata "%s" ne odgovara ni jednom poznatom formatu.' % \
                  field_value
        score_reasons.append(message)
        return no_result

    openness = known_format['openness']
    message = 'Polje formata "%s" ima ocijenu otvorenosti: %s.' % \
              (field_value, openness)
    score_reasons.append(message)
    return (openness, known_format['display_name'])
Exemplo n.º 3
0
 def test_fugue_icons_exist(self):
     # Climb three directory levels up from this file, then descend
     # into theme/public/images/fugue where the icon files are kept.
     base_dir = os.path.dirname(
         os.path.dirname(os.path.dirname(__file__)))
     fugue_dir = os.path.join(base_dir, 'theme', 'public', 'images', 'fugue')
     assert os.path.isdir(fugue_dir)
     available = os.listdir(fugue_dir)
     # document.png must always be present in the folder
     assert 'document.png' in available, \
         'document.png not found in %s' % fugue_dir
     # Every format that names an icon must have a matching .png file
     for fmt in Formats.by_display_name().values():
         icon = fmt['icon']
         if icon != '':
             expected = icon + '.png'
             assert expected in available, \
                 '%s not found in %s' % (expected, fugue_dir)
Exemplo n.º 4
0
 def test_fugue_icons_exist(self):
     # Locate the fugue icon folder: go up three directory levels from
     # this file, then down into theme/public/images/fugue.
     root = __file__
     for _ in range(3):
         root = os.path.dirname(root)
     icon_dir = os.path.join(root, "theme", "public", "images", "fugue")
     assert os.path.isdir(icon_dir)
     present = os.listdir(icon_dir)
     # The document.png icon has to exist
     assert "document.png" in present, "document.png not found in %s" % icon_dir
     # Formats with a non-empty icon name each need their own .png
     for fmt in Formats.by_display_name().values():
         if fmt["icon"] != "":
             wanted = fmt["icon"] + ".png"
             assert wanted in present, "%s not found in %s" % (wanted, icon_dir)
Exemplo n.º 5
0
 def test_by_display_name(self):
     # Looking up 'JSON' by display name should give a format whose
     # extension is 'json'.
     json_format = Formats.by_display_name()['JSON']
     assert_equal(json_format['extension'], 'json')
Exemplo n.º 6
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    The mime type reported by magic is used as the starting point and
    is then refined by format-specific checks on the file contents.

    Returns a Format dict, with a key to say if it is contained
    in a zip or something,
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it cannot tell what it is.

    Parameters:
      * filepath - path of the file to inspect
      * log - a logger, either a Celery one or a standard Python
        logging one
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # magic is handed a utf8-encoded path when filepath is unicode
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: mime types that need a closer look at the file before
        # the mime type can be trusted.
        if mime_type == 'application/xml':
            # Only the first 5000 characters are examined
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                # Last resort for this mime type: see if the first 500
                # characters look like HTML.
                # NOTE(review): is_html's result is used directly as the
                # format - presumably a Format dict or None; confirm.
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        # A mime-specific check above found the format, so skip the
        # generic mime type lookup and refinements below.
        if format_:
            return format_

        # Stage 2: generic lookup of the mime type
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            # Unknown text/* mime type: sniff the first 10000 characters
            # for JSON, then CSV, then PSV.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            # Stage 3: refine generic results (TXT and HTML) by looking at
            # the contents for a more specific format.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Magic gave no mime type at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
Exemplo n.º 7
0
        int)  # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        if extension in Formats.by_extension():
            format_ = Formats.by_extension()[extension]
            if format_['openness'] > top_score:
                top_score = format_['openness']
                top_scoring_extension_counts = defaultdict(int)
            if format_['openness'] == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return Formats.by_display_name()['Zip']

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
    log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
             top_extension, top_scoring_extension_counts)
    format_ = Formats.by_extension()[top_extension]
    # take a copy of the format_ dict to avoid altering the copy held in Formats
    format_ = copy.deepcopy(format_)
    format_['container'] = Formats.by_display_name()['Zip']['display_name']
    log.info('Zipped file format detected: %s', format_['display_name'])
    return format_


def is_excel(filepath, log):
Exemplo n.º 8
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Detection starts from the mime type that magic reports for the
    file; content-based checks then confirm or refine that result.

    Returns a Format dict, with a key to say if it is contained
    in a zip or something,
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it cannot tell what it is.

    Parameters:
      * filepath - path of the file to inspect
      * log - a logger, either a Celery one or a standard Python
        logging one
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # A unicode filepath is utf8-encoded before being passed to magic
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # First pass: mime types for which the file itself is inspected
        # before the mime type is taken at face value.
        if mime_type == 'application/xml':
            # Read just the first 5000 characters to pick the XML variant
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                # Neither check matched: test the first 500 characters
                # for HTML.
                # NOTE(review): the return value of is_html becomes the
                # format - presumably a Format dict or None; confirm.
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        # The first pass identified the format, so return it without
        # running the generic lookup and refinements that follow.
        if format_:
            return format_

        # Second pass: map the mime type straight to a format
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            # The mime type is unknown but is some kind of text: check
            # the first 10000 characters for JSON, then CSV, then PSV.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s', mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s', format_['display_name'])

            # Third pass: TXT and HTML are generic results, so inspect
            # the contents for a more specific format.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Magic did not report any mime type.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
Exemplo n.º 9
0
    top_score = 0
    top_scoring_extension_counts = defaultdict(int) # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        if extension in Formats.by_extension():
            format_ = Formats.by_extension()[extension]
            if format_['openness'] > top_score:
                top_score = format_['openness']
                top_scoring_extension_counts = defaultdict(int)
            if format_['openness'] == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return Formats.by_display_name()['Zip']

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
    log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
             top_extension, top_scoring_extension_counts)
    format_ = Formats.by_extension()[top_extension]
    # take a copy of the format_ dict to avoid altering the copy held in Formats
    format_ = copy.deepcopy(format_)
    format_['container'] = Formats.by_display_name()['Zip']['display_name']
    log.info('Zipped file format detected: %s', format_['display_name'])
    return format_


def is_excel(filepath, log):
Exemplo n.º 10
0
def set_sniffed_format(format_display_name):
    '''Set the module-level sniffed_format global.

    A truthy format_display_name is looked up in
    Formats.by_display_name() (an unknown name raises KeyError from
    that lookup); a falsy one resets sniffed_format to None.
    '''
    global sniffed_format
    sniffed_format = (Formats.by_display_name()[format_display_name]
                      if format_display_name else None)
Exemplo n.º 11
0
 def test_by_display_name(self):
     # The format registered under the display name "JSON" is expected
     # to carry the extension "json".
     formats_by_name = Formats.by_display_name()
     extension = formats_by_name["JSON"]["extension"]
     assert_equal(extension, "json")
Exemplo n.º 12
0
def set_sniffed_format(format_display_name):
    '''Point the global sniffed_format at the named format, or clear it.

    With a falsy format_display_name the global is set to None.
    Otherwise it is set to the entry that Formats.by_display_name()
    holds under that name.
    '''
    global sniffed_format
    if not format_display_name:
        # Nothing named: clear the global
        sniffed_format = None
        return
    sniffed_format = Formats.by_display_name()[format_display_name]