Exemplo n.º 1
0
def run_bsd_file(filepath, log):
    '''Run the BSD command-line tool "file" to determine file type.

    Only a few results are mapped to a Format: MS Office documents
    (identified by the "Name of Creating Application" property that "file"
    prints) and ESRI Shapefiles.

    :param filepath: path of the file to inspect
    :param log: logger used to report the outcome
    :returns: the matching Format, or None if "file" could not be run or
              its output does not identify one of the formats above
    '''
    # Imported locally so the exception type is in scope however the module
    # chose to import check_output.
    import subprocess

    try:
        result = check_output(['file', filepath])
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError: the "file" binary is missing or not executable.
        # CalledProcessError: it ran but exited with a non-zero status.
        log.warning('"file" could not be run on "%s": %s', filepath, e)
        return None
    if not isinstance(result, str):
        # Where check_output returns bytes the str patterns below cannot
        # search it; a str result (Python 2) is left untouched.
        result = result.decode('utf8', 'replace')

    match = re.search('Name of Creating Application: ([^,]*),', result)
    if match:
        app_name = match.group(1)
        format_map = {'Microsoft Office PowerPoint': 'ppt',
                      'Microsoft PowerPoint': 'ppt',
                      'Microsoft Excel': 'xls',
                      'Microsoft Office Word': 'doc',
                      'Microsoft Word 10.0': 'doc',
                      'Microsoft Macintosh Word': 'doc',
                      }
        if app_name in format_map:
            extension = format_map[app_name]
            format_ = Formats.by_extension()[extension]
            log.info('"file" detected file format: %s',
                     format_['display_name'])
            return format_
        # An unmapped application name falls through to the Shapefile check.
    if re.search(': ESRI Shapefile', result):
        format_ = Formats.by_extension()['shp']
        log.info('"file" detected file format: %s',
                 format_['display_name'])
        return format_
    log.info('"file" could not determine file format of "%s": %s',
             filepath, result)
    return None
Exemplo n.º 2
0
Arquivo: tasks.py Projeto: tbalaz/test
def score_by_format_field(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from its format field.

    A human-readable explanation of each conclusion is appended to the
    score_reasons list.

    Return values:
      * A tuple: (score, format_display_name)
      * (None, None) if the format field is blank or matches no known format
      * Otherwise the matched format's 'openness' and 'display_name'
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Try progressively looser matches: exact display name, then extension,
    # then the reduced form of the name.
    known_format = Formats.by_display_name().get(declared)
    if not known_format:
        known_format = Formats.by_extension().get(declared.lower())
    if not known_format:
        known_format = Formats.by_reduced_name().get(Formats.reduce(declared))
    if not known_format:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % declared)
        return (None, None)

    openness = known_format['openness']
    score_reasons.append(
        'Polje formata "%s" ima ocijenu otvorenosti: %s.'
        % (declared, openness))
    return (openness, known_format['display_name'])
Exemplo n.º 3
0
def get_xml_variant_without_xml_declaration(buf, log):
    '''Identify the XML-based format of a buffer that has no XML declaration
    or other boilerplate in front of its first tag.

    Returns the Format registered under the first tag's name (after the
    aliases below are applied), the generic XML Format if a tag is found but
    its name is not a known extension, or None if no leading tag is found.'''
    # Up to three arbitrary leading characters and any whitespace may come
    # before the first tag.
    first_tag = re.match(r'.{0,3}\s*<([^>\s]*)', buf)
    if first_tag is None:
        log.debug('XML tags not found: %s', buf)
        return None

    tag_name = first_tag.group(1).lower()
    # Map root tag names onto the extensions the formats are registered under.
    for long_name, short_name in (('rdf:rdf', 'rdf'),
                                  ('wms_capabilities', 'wms')):
        tag_name = tag_name.replace(long_name, short_name)

    if tag_name not in Formats.by_extension():
        log.warning('Did not recognise XML format: %s', tag_name)
        return Formats.by_extension()['xml']
    format_ = Formats.by_extension()[tag_name]
    log.info('XML variant detected: %s', format_['display_name'])
    return format_
Exemplo n.º 4
0
def score_by_format_field(resource, score_reasons, log):
    '''
    Derive the format and openness score of a resource from its format field.

    The reasoning behind the result is recorded by appending strings to the
    score_reasons list.

    Return values:
      * A tuple: (score, format_display_name)
      * (None, None) if the format field is blank or matches no known format
      * Otherwise the matched format's 'openness' and 'display_name'
    '''
    field_value = resource.format or ''
    if not field_value:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Candidate lookups, tried lazily in order of strictness; the first one
    # that yields a format wins.
    lookups = (
        lambda: Formats.by_display_name().get(field_value),
        lambda: Formats.by_extension().get(field_value.lower()),
        lambda: Formats.by_reduced_name().get(Formats.reduce(field_value)),
    )
    matched = None
    for lookup in lookups:
        matched = lookup()
        if matched:
            break

    if not matched:
        reason = ('Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
                  % field_value)
        score_reasons.append(reason)
        return (None, None)

    score = matched['openness']
    reason = ('Polje formata "%s" ima ocijenu otvorenosti: %s.'
              % (field_value, score))
    score_reasons.append(reason)
    return (score, matched['display_name'])
Exemplo n.º 5
0
def get_xml_variant_without_xml_declaration(buf, log):
    '''Work out which XML-based format a buffer is in, given that it has no
    XML declaration or other boilerplate ahead of its first tag.

    Returns the Format registered under the first tag's (aliased) name, the
    generic XML Format when the tag name is not a known extension, or None
    when the buffer does not start with a tag.'''
    # Tolerates up to three arbitrary characters plus whitespace before "<".
    tag_match = re.match(r'.{0,3}\s*<([^>\s]*)', buf)
    if not tag_match:
        log.debug('XML tags not found: %s', buf)
        return None

    # Lower-case the tag name and shorten the root tags that are registered
    # under a different extension.
    root_tag = (tag_match.group(1).lower()
                .replace('rdf:rdf', 'rdf')
                .replace('wms_capabilities', 'wms'))

    recognised = root_tag in Formats.by_extension()
    if recognised:
        format_ = Formats.by_extension()[root_tag]
        log.info('XML variant detected: %s', format_['display_name'])
        return format_

    log.warning('Did not recognise XML format: %s', root_tag)
    return Formats.by_extension()['xml']
Exemplo n.º 6
0
def is_iati(buf, log):
    '''If this buffer is IATI format, return that format type, else None.

    The buffer counts as IATI when, after at most three arbitrary leading
    characters and optional whitespace, an optional XML declaration and an
    optional doctype, it opens an <iati-activities> or <iati-organisations>
    tag (matched case-insensitively).

    :param buf: the start of the file content
    :param log: logger used to report the outcome
    '''
    xml_re = (r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?'
              r'<iati-(activities|organisations)[^>]*>')
    if re.match(xml_re, buf, re.IGNORECASE):
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # The message has no placeholder, so it must not be given arguments:
    # passing buf here made the log record fail to format.
    log.debug('Not IATI')
    return None
Exemplo n.º 7
0
def is_html(buf, log):
    '''Return the HTML format type if this buffer is HTML, else None.

    The buffer counts as HTML when, after at most three arbitrary leading
    characters and optional whitespace, an optional XML declaration and an
    optional doctype, it opens an <html> tag (matched case-insensitively).'''
    html_start = re.compile(
        r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>',
        re.IGNORECASE)
    if html_start.match(buf) is None:
        log.debug('Not HTML')
        return None
    log.info('HTML tag detected')
    return Formats.by_extension()['html']
Exemplo n.º 8
0
def is_iati(buf, log):
    '''If this buffer is IATI format, return that format type, else None.

    A buffer is treated as IATI when it opens an <iati-activities> or
    <iati-organisations> tag, optionally preceded by an XML declaration and
    a doctype, with at most three arbitrary characters and any whitespace in
    front. Matching ignores case.

    :param buf: the start of the file content
    :param log: logger used to report the outcome
    '''
    iati_pattern = (r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?'
                    r'<iati-(activities|organisations)[^>]*>')
    match = re.match(iati_pattern, buf, re.IGNORECASE)
    if match:
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # Logged without arguments: the message contains no placeholder, and the
    # stray buf argument previously broke formatting of this record.
    log.debug('Not IATI')
    return None
Exemplo n.º 9
0
def is_html(buf, log):
    '''Tell whether this buffer is HTML: return the HTML format type if so,
    else None.

    An <html> tag must open the buffer, optionally preceded by an XML
    declaration and a doctype, with at most three arbitrary characters and
    any whitespace in front. Matching ignores case.'''
    html_pattern = (r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?'
                    r'<html[^>]*>')
    found = re.match(html_pattern, buf, flags=re.IGNORECASE)
    if not found:
        log.debug('Not HTML')
        return None
    log.info('HTML tag detected')
    html_format = Formats.by_extension()['html']
    return html_format
Exemplo n.º 10
0
Arquivo: tasks.py Projeto: tbalaz/test
def score_by_url_extension(resource, score_reasons, log):
    '''
    Looks at the URL for a resource to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.
    The candidate extensions come from extension_variants() and are tried in
    the order it returns them; the first one that is a known format wins.

    Return values:
      * It returns a tuple: (score, format_display_name)
      * (None, None) if the URL has no extension or none of its extension
        variants is a known format
      * Otherwise the matched format's 'openness' and 'display_name'
    '''
    formats_by_extension = Formats.by_extension()
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append('Nepoznata ekstenzija datoteke.')
        return (None, None)
    for extension in extension_variants_:
        # Formats are looked up by lower-case extension; the reasons keep the
        # extension as it appears in the URL.
        extension_key = extension.lower()
        if extension_key in formats_by_extension:
            # Reuse the table fetched above instead of fetching it again.
            format_ = formats_by_extension[extension_key]
            score = format_['openness']
            score_reasons.append(
                'URL ekstenzija "%s" je povezana s formatom "%s" i ima ocjenu: %s.'
                % (extension, format_['display_name'], score))
            return (score, format_['display_name'])
        score_reasons.append(
            'URL ekstenzija "%s" je nepoznat format.' % extension)
    return (None, None)
Exemplo n.º 11
0
 def test_by_extension(self):
     # The format registered under the "json" extension is displayed as "JSON".
     json_format = Formats.by_extension()['json']
     assert_equal(json_format['display_name'], 'JSON')
Exemplo n.º 12
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Detection starts from the mime-type that "magic" reports for the file
    and is then refined by inspecting the content: XML variants, zip
    contents, MS Office documents, JSON/CSV/PSV text, RDFa inside HTML.
    If magic reports no mime-type at all, the Excel check and the BSD "file"
    tool are tried instead.

    Returns Format dict with a key to say if it is contained
    in a zip or something.
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it cannot tell what it is.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Mime-types that are ambiguous or unreliable get a closer look at
        # the file itself first.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        if format_:
            return format_

        # Nothing conclusive from the content, so fall back to the format
        # registered for the mime-type.
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            if mime_type.startswith('text/'):
                # Unregistered text mime-type: is it JSON, CSV or PSV?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                format_ = _sniff_text_data_format(buf, log)

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            if format_['display_name'] == 'TXT':
                # Plain text may really be a more specific data format.
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                text_format = _sniff_text_data_format(buf, log)
                if text_format is not None:
                    format_ = text_format
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_


def _sniff_text_data_format(buf, log):
    '''Return the JSON, CSV or PSV Format if buf looks like one of those
    (checked in that order), else None.'''
    if is_json(buf, log):
        return Formats.by_extension()['json']
    if is_csv(buf, log):
        return Formats.by_extension()['csv']
    if is_psv(buf, log):
        return Formats.by_extension()['psv']
    return None
Exemplo n.º 13
0
        try:
            filenames = zip.namelist()
        finally:
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s', e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s', e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        if extension in Formats.by_extension():
            format_ = Formats.by_extension()[extension]
            if format_['openness'] > top_score:
                top_score = format_['openness']
                top_scoring_extension_counts = defaultdict(int)
            if format_['openness'] == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return Formats.by_display_name()['Zip']

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
Exemplo n.º 14
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    The mime-type reported by "magic" is the starting point; it is refined
    by inspecting the content (XML variants, zip contents, MS Office
    documents, JSON/CSV/PSV text, RDFa inside HTML). When magic reports no
    mime-type, the Excel check and the BSD "file" tool are tried instead.

    Returns Format dict with a key to say if it is contained
    in a zip or something.
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it cannot tell what it is.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Ambiguous or unreliable mime-types: inspect the file itself first.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        if format_:
            return format_

        # Otherwise use the format registered for the mime-type, if any.
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            if mime_type.startswith('text/'):
                # Unregistered text mime-type: is it JSON, CSV or PSV?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                format_ = _sniff_text_data_format(buf, log)

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            if format_['display_name'] == 'TXT':
                # Plain text may really be a more specific data format.
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                text_format = _sniff_text_data_format(buf, log)
                if text_format is not None:
                    format_ = text_format
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_


def _sniff_text_data_format(buf, log):
    '''Return the JSON, CSV or PSV Format if buf looks like one of those
    (checked in that order), else None.'''
    if is_json(buf, log):
        return Formats.by_extension()['json']
    if is_csv(buf, log):
        return Formats.by_extension()['csv']
    if is_psv(buf, log):
        return Formats.by_extension()['psv']
    return None
Exemplo n.º 15
0
            filenames = zip.namelist()
        finally:
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s',
                    e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s',
                    e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(int) # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        if extension in Formats.by_extension():
            format_ = Formats.by_extension()[extension]
            if format_['openness'] > top_score:
                top_score = format_['openness']
                top_scoring_extension_counts = defaultdict(int)
            if format_['openness'] == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return Formats.by_display_name()['Zip']

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
Exemplo n.º 16
0
 def test_by_extension(self):
     # Looking up the "json" extension must give the format shown as "JSON".
     formats = Formats.by_extension()
     display_name = formats["json"]["display_name"]
     assert_equal(display_name, "JSON")