def score_by_format_field(resource, score_reasons, log):
    '''Work out a resource's format and openness score from its format field.

    An explanation of each decision is appended to the score_reasons list.

    Returns a tuple: (score, format_display_name)
     * (None, None) when the format field is blank
     * (None, None) when the format field matches no known format
     * otherwise the matched format's 'openness' and 'display_name' values

    Note: the log argument is accepted but is not used by this function.
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Try the lookups in turn: the display name as given, then the
    # lower-cased value as an extension, then the reduced name.
    matched = Formats.by_display_name().get(declared)
    if not matched:
        matched = Formats.by_extension().get(declared.lower())
    if not matched:
        matched = Formats.by_reduced_name().get(Formats.reduce(declared))

    if not matched:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % declared)
        return (None, None)

    openness = matched['openness']
    score_reasons.append(
        'Polje formata "%s" ima ocijenu otvorenosti: %s.'
        % (declared, openness))
    return (openness, matched['display_name'])
def score_by_format_field(resource, score_reasons, log):
    '''Derive (score, format_display_name) from a resource's format field.

    Each step of the reasoning is recorded by appending a string to
    score_reasons.

    Return values:
     * A tuple: (score, format_display_name)
     * (None, None) if the format field is blank or matches no known format
     * Otherwise the 'openness' and 'display_name' of the matched format

    Note: the log argument is accepted but is not used by this function.
    '''
    field_value = resource.format or ''
    if not field_value:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Lookups are attempted in order; the first truthy hit is used.
    found = Formats.by_display_name().get(field_value)
    if not found:
        found = Formats.by_extension().get(field_value.lower())
    if not found:
        found = Formats.by_reduced_name().get(Formats.reduce(field_value))

    if found:
        openness_score = found['openness']
        score_reasons.append(
            'Polje formata "%s" ima ocijenu otvorenosti: %s.'
            % (field_value, openness_score))
        return (openness_score, found['display_name'])

    score_reasons.append(
        'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
        % field_value)
    return (None, None)
def test_fugue_icons_exist(self):
    # Check that every format with an icon name has a matching .png file
    # in the fugue icon folder.
    #
    # Climb three directory levels from this file (per the original notes:
    # /ckanext/dgu/tests/lib -> /ckanext/dgu/tests -> /ckanext/dgu), then
    # descend to /ckanext/dgu/theme/public/images/fugue.
    tests_lib_dir = os.path.dirname(__file__)
    package_dir = os.path.dirname(os.path.dirname(tests_lib_dir))
    icon_dir = os.path.join(package_dir, 'theme', 'public', 'images', 'fugue')
    assert os.path.isdir(icon_dir)
    available = os.listdir(icon_dir)

    # document.png must always be present in the folder
    assert 'document.png' in available, \
        'document.png not found in %s' % icon_dir

    # Formats with an empty icon name are exempt from the check
    for format_ in Formats.by_display_name().values():
        icon = format_['icon']
        if icon != '':
            expected_file = icon + '.png'
            assert expected_file in available, \
                '%s not found in %s' % (expected_file, icon_dir)
def test_fugue_icons_exist(self):
    # Verify that each format's icon file is present in the fugue folder.
    #
    # The folder is found relative to this file: up three levels (noted in
    # the original as /ckanext/dgu/tests/lib -> tests -> /ckanext/dgu) and
    # then down into theme/public/images/fugue.
    base_dir = os.path.dirname(__file__)
    for _ in range(2):
        base_dir = os.path.dirname(base_dir)
    fugue_dir = os.path.join(base_dir, "theme", "public", "images", "fugue")
    assert os.path.isdir(fugue_dir)
    icon_files = os.listdir(fugue_dir)

    # The generic document icon is required regardless of the formats
    assert "document.png" in icon_files, (
        "document.png not found in %s" % fugue_dir)

    for format_info in Formats.by_display_name().values():
        icon_stem = format_info["icon"]
        if icon_stem == "":
            # no icon configured for this format - nothing to check
            continue
        png_name = icon_stem + ".png"
        assert png_name in icon_files, (
            "%s not found in %s" % (png_name, fugue_dir))
def test_by_display_name(self):
    # Looking up 'JSON' by display name should give a format whose
    # extension is 'json'.
    json_format = Formats.by_display_name()['JSON']
    assert_equal(json_format['extension'], 'json')
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Detection starts from the mime type reported by magic and then refines
    it in stages:
     1. Mime types that need a closer look (XML, zip, MS Word, octet-stream,
        HTML) are handed to dedicated checks; if one of them yields a
        format it is returned straight away.
     2. Otherwise the mime type is looked up in Formats.by_mime_type().
        Unrecognised text/* mime types are tested for JSON, CSV and PSV.
     3. A result of 'TXT' or 'HTML' is examined further for a more
        specific format (JSON, CSV, PSV, XML variants, TTL; or RDFa).
     4. If magic gave no mime type at all, the Excel check and then BSD
        file are tried instead.

    Returns Format dict with a key to say if it is contained in a zip or
    something. e.g. {'display_name': 'CSV', 'container': 'zip', ...}
    or None if it cannot tell what it is.

    Note, log is a logger, either a Celery one or a standard Python logging
    one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # magic is given a utf8-encoded byte string when the path is unicode
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: mime types that warrant a closer look at the content
        if mime_type == 'application/xml':
            # Only the first 5000 characters are inspected
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                # NOTE(review): the result of is_html is used directly as
                # the format, so it presumably returns a format dict or a
                # falsy value - confirm against is_html
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        # A format found by the special cases above is final
        if format_:
            return format_

        # Stage 2: translate the mime type directly
        format_ = Formats.by_mime_type().get(mime_type)
        if not format_:
            if mime_type.startswith('text/'):
                # Unknown text mime type - sniff the first 10000 characters
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        # Stage 3: refine the generic TXT and HTML results
        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Stage 4: magic returned no mime type at all
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() if extension in Formats.by_extension(): format_ = Formats.by_extension()[extension] if format_['openness'] > top_score: top_score = format_['openness'] top_scoring_extension_counts = defaultdict(int) if format_['openness'] == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return Formats.by_display_name()['Zip'] top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0] log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)', top_extension, top_scoring_extension_counts) format_ = Formats.by_extension()[top_extension] # take a copy of the format_ dict to avoid altering the copy held in Formats format_ = copy.deepcopy(format_) format_['container'] = Formats.by_display_name()['Zip']['display_name'] log.info('Zipped file format detected: %s', format_['display_name']) return format_ def is_excel(filepath, log):
top_score = 0 top_scoring_extension_counts = defaultdict(int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() if extension in Formats.by_extension(): format_ = Formats.by_extension()[extension] if format_['openness'] > top_score: top_score = format_['openness'] top_scoring_extension_counts = defaultdict(int) if format_['openness'] == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return Formats.by_display_name()['Zip'] top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0] log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)', top_extension, top_scoring_extension_counts) format_ = Formats.by_extension()[top_extension] # take a copy of the format_ dict to avoid altering the copy held in Formats format_ = copy.deepcopy(format_) format_['container'] = Formats.by_display_name()['Zip']['display_name'] log.info('Zipped file format detected: %s', format_['display_name']) return format_ def is_excel(filepath, log):
def set_sniffed_format(format_display_name):
    '''Set the module-level sniffed_format from a format display name.

    A falsy format_display_name sets sniffed_format to None. Otherwise
    sniffed_format becomes the entry stored under that name in
    Formats.by_display_name() (an unknown name raises from that lookup).
    '''
    global sniffed_format
    if not format_display_name:
        sniffed_format = None
        return
    sniffed_format = Formats.by_display_name()[format_display_name]
def test_by_display_name(self):
    # The format registered under the display name "JSON" should report
    # "json" as its extension.
    formats_by_name = Formats.by_display_name()
    json_entry = formats_by_name["JSON"]
    assert_equal(json_entry["extension"], "json")