def run_bsd_file(filepath, log):
    '''Identify a file by running the BSD command-line tool "file" on it.

    The tool's output is searched for a known "Name of Creating Application"
    value (MS Office documents) and then for the ESRI Shapefile marker.

    Returns the matching Format, or None when the output matches neither.
    '''
    # Creating-application names reported by "file" -> our file extension.
    office_apps = {
        'Microsoft Office PowerPoint': 'ppt',
        'Microsoft PowerPoint': 'ppt',
        'Microsoft Excel': 'xls',
        'Microsoft Office Word': 'doc',
        'Microsoft Word 10.0': 'doc',
        'Microsoft Macintosh Word': 'doc',
    }
    output = check_output(['file', filepath])

    creator = re.search('Name of Creating Application: ([^,]*),', output)
    if creator:
        office_extension = office_apps.get(creator.group(1))
        if office_extension is not None:
            detected = Formats.by_extension()[office_extension]
            log.info('"file" detected file format: %s',
                     detected['display_name'])
            return detected

    if re.search(': ESRI Shapefile', output):
        detected = Formats.by_extension()['shp']
        log.info('"file" detected file format: %s', detected['display_name'])
        return detected

    log.info('"file" could not determine file format of "%s": %s',
             filepath, output)
def score_by_format_field(resource, score_reasons, log):
    '''Work out a resource's format and openness score from its format field.

    An explanation of the outcome is appended to the score_reasons list.

    Return values:
      * A tuple: (score, format_display_name)
      * (None, None) when the format field is blank or matches no known
        format
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Try the lookups from strictest to loosest, stopping at the first hit:
    # exact display name, lower-cased extension, then the reduced name.
    matched = Formats.by_display_name().get(declared)
    if not matched:
        matched = Formats.by_extension().get(declared.lower())
    if not matched:
        matched = Formats.by_reduced_name().get(Formats.reduce(declared))

    if not matched:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % declared)
        return (None, None)

    openness = matched['openness']
    score_reasons.append(
        'Polje formata "%s" ima ocijenu otvorenosti: %s.'
        % (declared, openness))
    return (openness, matched['display_name'])
def get_xml_variant_without_xml_declaration(buf, log):
    '''Identify an XML-based format from a buffer lacking an XML declaration.

    The name of the first tag in the buffer is lower-cased, normalised
    ('rdf:rdf' -> 'rdf', 'wms_capabilities' -> 'wms') and looked up as a file
    extension.

    Returns:
      * the Format whose extension equals the normalised tag name, or
      * the generic 'xml' Format when a tag is found but not recognised, or
      * None when the buffer does not start with a tag.
    '''
    # Raw string so the regex escapes are not read as string escapes.
    # Allows up to three arbitrary leading characters and whitespace before
    # the first '<', then captures the tag name.
    xml_re = r'.{0,3}\s*<([^>\s]*)'
    match = re.match(xml_re, buf)
    if not match:
        log.debug('XML tags not found: %s', buf)
        return None

    top_level_tag_name = match.group(1).lower()
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities', 'wms')

    # Fetch the extension table once rather than on every use.
    formats_by_extension = Formats.by_extension()
    if top_level_tag_name in formats_by_extension:
        format_ = formats_by_extension[top_level_tag_name]
        log.info('XML variant detected: %s', format_['display_name'])
        return format_

    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return formats_by_extension['xml']
def score_by_format_field(resource, score_reasons, log):
    '''Work out a resource's format and openness score from its format field.

    An explanation of the outcome is appended to the score_reasons list.

    Return values:
      * A tuple: (score, format_display_name)
      * (None, None) when the format field is blank or matches no known
        format
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Try the lookups from strictest to loosest, stopping at the first hit:
    # exact display name, lower-cased extension, then the reduced name.
    matched = Formats.by_display_name().get(declared)
    if not matched:
        matched = Formats.by_extension().get(declared.lower())
    if not matched:
        matched = Formats.by_reduced_name().get(Formats.reduce(declared))

    if not matched:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % declared)
        return (None, None)

    openness = matched['openness']
    score_reasons.append(
        'Polje formata "%s" ima ocijenu otvorenosti: %s.'
        % (declared, openness))
    return (openness, matched['display_name'])
def get_xml_variant_without_xml_declaration(buf, log):
    '''Identify an XML-based format from a buffer lacking an XML declaration.

    The name of the first tag in the buffer is lower-cased, normalised
    ('rdf:rdf' -> 'rdf', 'wms_capabilities' -> 'wms') and looked up as a file
    extension.

    Returns:
      * the Format whose extension equals the normalised tag name, or
      * the generic 'xml' Format when a tag is found but not recognised, or
      * None when the buffer does not start with a tag.
    '''
    # Raw string so the regex escapes are not read as string escapes.
    # Allows up to three arbitrary leading characters and whitespace before
    # the first '<', then captures the tag name.
    xml_re = r'.{0,3}\s*<([^>\s]*)'
    match = re.match(xml_re, buf)
    if not match:
        log.debug('XML tags not found: %s', buf)
        return None

    top_level_tag_name = match.group(1).lower()
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities', 'wms')

    # Fetch the extension table once rather than on every use.
    formats_by_extension = Formats.by_extension()
    if top_level_tag_name in formats_by_extension:
        format_ = formats_by_extension[top_level_tag_name]
        log.info('XML variant detected: %s', format_['display_name'])
        return format_

    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return formats_by_extension['xml']
def is_iati(buf, log):
    '''Return the IATI Format if this buffer is in IATI format, else None.

    The buffer counts as IATI when, after an optional XML declaration and an
    optional doctype, its first tag is <iati-activities> or
    <iati-organisations> (matched case-insensitively).
    '''
    # Raw string so the regex escapes are not read as string escapes.
    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # The message needs a placeholder for buf; without one the logging module
    # fails to format the record whenever DEBUG output is enabled.
    log.debug('Not IATI: %s', buf)
    return None
def is_html(buf, log):
    '''Return the HTML Format if this buffer is HTML, else None.

    The buffer counts as HTML when, after an optional XML declaration and an
    optional doctype, its first tag is <html> (matched case-insensitively).
    '''
    # Raw string so the regex escapes are not read as string escapes.
    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('HTML tag detected')
        return Formats.by_extension()['html']
    log.debug('Not HTML')
    return None
def is_iati(buf, log):
    '''Return the IATI Format if this buffer is in IATI format, else None.

    The buffer counts as IATI when, after an optional XML declaration and an
    optional doctype, its first tag is <iati-activities> or
    <iati-organisations> (matched case-insensitively).
    '''
    # Raw string so the regex escapes are not read as string escapes.
    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # The message needs a placeholder for buf; without one the logging module
    # fails to format the record whenever DEBUG output is enabled.
    log.debug('Not IATI: %s', buf)
    return None
def is_html(buf, log):
    '''Return the HTML Format if this buffer is HTML, else None.

    The buffer counts as HTML when, after an optional XML declaration and an
    optional doctype, its first tag is <html> (matched case-insensitively).
    '''
    # Raw string so the regex escapes are not read as string escapes.
    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('HTML tag detected')
        return Formats.by_extension()['html']
    log.debug('Not HTML')
    return None
def score_by_url_extension(resource, score_reasons, log):
    '''Work out a resource's format and openness score from its URL.

    The extension variants of the URL are tried in order and the first one
    that is a known format extension wins. An explanation of the outcome is
    appended to the score_reasons list.

    Return values:
      * A tuple: (score, format_display_name)
      * (None, None) when the URL has no extension or none of its extension
        variants is a known format
    '''
    known_formats = Formats.by_extension()
    candidates = extension_variants(resource.url.strip())
    if not candidates:
        score_reasons.append('Nepoznata ekstenzija datoteke.')
        return (None, None)

    for candidate in candidates:
        key = candidate.lower()
        if key not in known_formats:
            score_reasons.append(
                'URL ekstenzija "%s" je nepoznat format.' % candidate)
            continue
        matched = Formats.by_extension().get(key)
        openness = matched['openness']
        score_reasons.append(
            'URL ekstenzija "%s" je povezana s formatom "%s" i ima ocjenu: %s.'
            % (candidate, matched['display_name'], openness))
        return openness, matched['display_name']

    return (None, None)
def test_by_extension(self):
    # The 'json' extension must resolve to the format displayed as 'JSON'.
    json_format = Formats.by_extension()['json']
    assert_equal(json_format['display_name'], 'JSON')
def sniff_file_format(filepath, log):
    '''Work out the file format of the file at the given filepath.

    libmagic's mime-type is the starting point. Depending on that mime-type
    the result is refined by reading the start of the file and/or running the
    BSD "file" tool, because magic reports some formats under a generic or
    misleading mime-type.

    Arguments:
      filepath - path of the file to inspect (str or unicode)
      log - a logger, either a Celery one or a standard Python logging one

    Returns a Format dict, e.g. {'display_name': 'CSV', ...}, or None if the
    format cannot be determined.
    NOTE(review): the previous docstring said the dict carries a key such as
    'container': 'zip' for zipped content - confirm that in
    get_zipped_format, which is not visible from here.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # magic is given a utf8-encoded byte string rather than a unicode path.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: mime-types that need a closer look before being trusted.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            # Last resort for octet-stream: check the first 500 characters
            # for an HTML tag.
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        # A format found by the closer look is final.
        if format_:
            return format_

        # Stage 2: map the mime-type straight to a known format.
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            # Unmapped text mime-types may still be structured data.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            # Stage 3: TXT and HTML results are refined by looking at the
            # file content.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Magic gave no mime-type at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
try: filenames = zip.namelist() finally: zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict( int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() if extension in Formats.by_extension(): format_ = Formats.by_extension()[extension] if format_['openness'] > top_score: top_score = format_['openness'] top_scoring_extension_counts = defaultdict(int) if format_['openness'] == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return Formats.by_display_name()['Zip'] top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1])
def sniff_file_format(filepath, log):
    '''Work out the file format of the file at the given filepath.

    libmagic's mime-type is the starting point. Depending on that mime-type
    the result is refined by reading the start of the file and/or running the
    BSD "file" tool, because magic reports some formats under a generic or
    misleading mime-type.

    Arguments:
      filepath - path of the file to inspect (str or unicode)
      log - a logger, either a Celery one or a standard Python logging one

    Returns a Format dict, e.g. {'display_name': 'CSV', ...}, or None if the
    format cannot be determined.
    NOTE(review): the previous docstring said the dict carries a key such as
    'container': 'zip' for zipped content - confirm that in
    get_zipped_format, which is not visible from here.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # magic is given a utf8-encoded byte string rather than a unicode path.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: mime-types that need a closer look before being trusted.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            # Last resort for octet-stream: check the first 500 characters
            # for an HTML tag.
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        # A format found by the closer look is final.
        if format_:
            return format_

        # Stage 2: map the mime-type straight to a known format.
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            # Unmapped text mime-types may still be structured data.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            # Stage 3: TXT and HTML results are refined by looking at the
            # file content.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Magic gave no mime-type at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
filenames = zip.namelist() finally: zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict(int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() if extension in Formats.by_extension(): format_ = Formats.by_extension()[extension] if format_['openness'] > top_score: top_score = format_['openness'] top_scoring_extension_counts = defaultdict(int) if format_['openness'] == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return Formats.by_display_name()['Zip'] top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0]
def test_by_extension(self):
    # The "json" extension must resolve to the format displayed as "JSON".
    json_format = Formats.by_extension()["json"]
    assert_equal(json_format["display_name"], "JSON")