def test_query_run_for_text_with_long_lines(self):
    # Files with very long lines are classified differently by typecode
    # and therefore produce a different number of query runs than the
    # same content reflowed on shorter lines.
    long_loc = self.get_test_loc('query/long_lines.txt')
    short_loc = self.get_test_loc('query/not_long_lines.txt')

    from typecode.contenttype import get_type

    assert get_type(long_loc).is_text_with_long_lines
    assert not get_type(short_loc).is_text_with_long_lines

    idx = cache.get_index()
    assert len(Query(long_loc, idx=idx).query_runs) == 3
    assert len(Query(short_loc, idx=idx).query_runs) == 14
def get_file_info(location, **kwargs):
    """
    Return a mapping of file information collected for the file at `location`.
    """
    # TODO: move date and size to the inventory collection step???
    result = OrderedDict(
        date=get_last_modified_date(location) or None,
        size=getsize(location) or 0,
    )
    # the three checksums are returned in request order by multi_checksums
    result.update(
        zip(
            ('sha1', 'md5', 'sha256'),
            multi_checksums(location, ('sha1', 'md5', 'sha256')).values(),
        )
    )

    collector = get_type(location)
    result['mime_type'] = collector.mimetype_file or None
    result['file_type'] = collector.filetype_file or None
    result['programming_language'] = collector.programming_language or None
    # normalize all type flags to plain booleans
    for flag in ('is_binary', 'is_text', 'is_archive',
                 'is_media', 'is_source', 'is_script'):
        result[flag] = bool(getattr(collector, flag))
    return result
def is_pom(location):
    """
    Return True if the file at `location` is highly likely to be a Maven POM.

    The check is name-based first (.pom, pom.xml, project.xml), then
    content-based by looking for POM markers in the first 150 lines.
    """
    if (not filetype.is_file(location)
        or not location.endswith(('.pom', 'pom.xml', 'project.xml',))):
        if TRACE:
            logger.debug('is_pom: not a POM on name: {}'.format(location))
        return

    T = contenttype.get_type(location)
    if T.is_text:
        maven_markers = (
            b'http://maven.apache.org/POM/4.0.0',
            b'http://maven.apache.org/xsd/maven-4.0.0.xsd',
            b'<modelVersion>',
            # somehow we can still parse version 3 poms too
            b'<pomVersion>',
        )
        # check the POM version in the first 150 lines.
        # Read as bytes: files detected as "text" are not guaranteed to be
        # valid UTF-8 and a strict codecs decode would raise
        # UnicodeDecodeError on them.
        with open(location, 'rb') as pom:
            for n, line in enumerate(pom):
                if n > 150:
                    break
                if any(x in line for x in maven_markers):
                    return True

    if TRACE:
        logger.debug('is_pom: not a POM based on type: {}: {}'.format(T, location))
def is_datafile(cls, location, filetypes=tuple()):
    """
    Return True if the file at location is highly likely to be a POM.
    """
    # the generic checks (path patterns, filetypes) may already match
    if super().is_datafile(location, filetypes=filetypes):
        return True

    T = contenttype.get_type(location)
    if not T.is_text:
        return

    maven_declarations = (
        b'http://maven.apache.org/POM/4.0.0',
        b'http://maven.apache.org/xsd/maven-4.0.0.xsd',
        b'<modelVersion>',
        # somehow we can still parse version 3 poms too
        b'<pomVersion>',
    )

    # otherwise, look for a POM declaration in the first 150 lines
    with open(location, 'rb') as pom:
        for line_number, line in enumerate(pom):
            if line_number > 150:
                break
            if any(marker in line for marker in maven_declarations):
                return True
def is_datafile(cls, location, filetypes=tuple(), _bare_filename=False):
    """
    Return True if the file at ``location`` is likely a package data file
    that this parser can handle.

    This implementation is based on:

    - matching the ``location`` as a whole with any one of the
      ``path_patterns`` sequence of patterns defined as a class attributes.
      The path patterns are for POSIX paths.

    - if defined, ensuring that the filetype of the file at ``location``
      contains any of the type listed in the ``filetypes`` class attribute.

    - ``_bare_filename`` is for testing using a bare path that does not
      point to real files.

    Subclasses can override to implement more complex data file recognition.
    """
    # guard clauses instead of nested conditionals
    if not (filetype.is_file(location) or _bare_filename):
        return

    posix_location = as_posixpath(location)
    if not any(fnmatchcase(posix_location, pat) for pat in cls.path_patterns):
        return

    filetypes = filetypes or cls.filetypes
    if not filetypes:
        # a path-pattern match alone is enough when no filetypes are required
        return True

    T = contenttype.get_type(location)
    actual_type = T.filetype_file.lower()
    return any(ft in actual_type for ft in filetypes)
def is_pom(location):
    """
    Return True if the file at location is highly likely to be a POM.
    """
    if (not filetype.is_file(location)
        or not location.endswith(('.pom', 'pom.xml', 'project.xml',))):
        if TRACE:
            logger.debug('is_pom: not a POM on name: {}'.format(location))
        return

    markers = (
        b'http://maven.apache.org/POM/4.0.0',
        b'http://maven.apache.org/xsd/maven-4.0.0.xsd',
        b'<modelVersion>',
        # somehow we can still parse version 3 poms too
        b'<pomVersion>',
    )

    T = contenttype.get_type(location)
    if T.is_text:
        # check for the POM markers in the first 150 lines only
        with io.open(location, 'rb') as pom:
            for line_count, line in enumerate(pom):
                if line_count > 150:
                    break
                if any(marker in line for marker in markers):
                    return True

    if TRACE:
        logger.debug('is_pom: not a POM based on type: {}: {}'.format(T, location))
def dwarf_source_path(location):
    """
    Yield unique paths to compiled source code found in the DWARF debug
    sections of the Elf binary at `location`, for D2D.

    Full paths are yielded first; bare file names (paths with no directory
    part) are yielded last and only when they do not duplicate the file name
    of an already-yielded full path.
    """
    # removed the no-op `location = location` statement
    T = contenttype.get_type(location)
    if not (T.is_elf or T.is_stripped_elf):
        return

    seen_paths = set()
    path_file_names = set()
    bare_file_names = set()
    for dpath in chain(get_dwarf1(location), get_dwarf2(location)):
        if dpath in seen_paths:
            continue
        fn = fileutils.file_name(dpath)
        if fn == dpath:
            # a bare file name with no directory: defer until the end
            bare_file_names.add(fn)
            continue
        else:
            path_file_names.add(fn)
        seen_paths.add(dpath)
        yield dpath

    # only yield file names that do not exist as full paths
    for bfn in sorted(bare_file_names):
        if bfn not in path_file_names and bfn not in seen_paths:
            yield bfn
            seen_paths.add(bfn)
def get_file_infos(location):
    """
    Return a list of dictionaries of informations collected from the file or
    directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    T = contenttype.get_type(location)
    is_file = T.is_file
    is_dir = T.is_dir

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = fileutils.file_extension(location) if is_file else ''
    # falsy values are normalized to None for file-only fields
    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = T.size
    infos['sha1'] = (sha1(location) or None) if is_file else None
    infos['md5'] = (md5(location) or None) if is_file else None
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (T.mimetype_file or None) if is_file else None
    infos['file_type'] = (T.filetype_file or None) if is_file else None
    infos['programming_language'] = (T.programming_language or None) if is_file else None
    infos['is_binary'] = (T.is_binary or None) if is_file else None
    infos['is_text'] = (T.is_text or None) if is_file else None
    infos['is_archive'] = (T.is_archive or None) if is_file else None
    infos['is_media'] = (T.is_media or None) if is_file else None
    infos['is_source'] = (T.is_source or None) if is_file else None
    infos['is_script'] = (T.is_script or None) if is_file else None
    return [infos]
def get_source_file_path_references(location):
    """
    Yield unique references to source file paths extracted from DWARF debug
    symbols from the Elf file at `location`.

    If there are errors when processing Elfs, these are returned as well as
    paths prefixed with 'ERROR: '.
    """
    if not os.path.exists(location):
        return
    T = contenttype.get_type(location)
    if not T.is_elf:
        return

    unique_files = set()
    unique_paths = set()

    def _collect(path):
        # bare file names (no directory part) are tracked separately so
        # duplicates of a full path's file name can be dropped later
        if '/' not in path:
            unique_files.add(path)
        else:
            unique_paths.add(path)

    errors = []
    try:
        with_libdwarf = dwarf.Dwarf(location)
        for path in with_libdwarf.included_source_files:
            _collect(path)
        for path in with_libdwarf.original_source_files:
            _collect(path)
    except Exception as lde:
        msg = str(lde)
        # keep only the message tail starting at 'dwarfdump'
        _, m1, m2 = msg.partition('dwarfdump')
        errors.append(''.join([m1, m2]))

    try:
        with_binutils_nm = dwarf2.get_dwarfs(location)
        for entry in with_binutils_nm:
            _collect(entry.path)
    except Exception as lde:
        # bug fix: the original appended the `str` builtin instead of the
        # error message
        errors.append(str(lde))

    seen_file_names = set(file_name(p) for p in unique_paths)
    for fn in unique_files:
        if fn not in seen_file_names and fn not in ignores:
            unique_paths.add(fn)

    for error in errors:
        yield 'ERROR: ' + error
    for path in sorted(unique_paths):
        yield path
def is_pom(location):
    """
    Return True if the file at location is highly likely to be a POM.
    """
    pom_endings = ('.pom', 'pom.xml', 'project.xml',)
    if not filetype.is_file(location) or not location.endswith(pom_endings):
        return

    T = contenttype.get_type(location)
    # logger.debug('location: %(location)r, T: %(T)r)' % locals())

    # keep the short-circuit order: the pygments filetype is only computed
    # when the file filetype does not already look like XML
    looks_like_xml = (
        'xml' in T.filetype_file.lower()
        or 'sgml' in T.filetype_file.lower()
        or 'xml' in T.filetype_pygment.lower()
        or 'genshi' in T.filetype_pygment.lower()
    )
    if T.is_text and looks_like_xml:
        # check the POM version in the first 100 lines
        with codecs.open(location, encoding='utf-8') as pom:
            for line_number, line in enumerate(pom):
                if line_number > 100:
                    break
                if ('http://maven.apache.org/POM/4.0.0' in line
                        or '<modelVersion>' in line):
                    return True
def get_file_infos(location):
    """
    Return a list of dictionaries of informations collected from the file or
    directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    T = contenttype.get_type(location)
    is_file = T.is_file
    is_dir = T.is_dir

    def for_file(value_func):
        # evaluate only for actual files; normalize falsy results to None
        return is_file and value_func() or None

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    infos['date'] = for_file(lambda: filetype.get_last_modified_date(location))
    infos['size'] = T.size
    infos['sha1'] = for_file(lambda: sha1(location))
    infos['md5'] = for_file(lambda: md5(location))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = for_file(lambda: T.mimetype_file)
    infos['file_type'] = for_file(lambda: T.filetype_file)
    infos['programming_language'] = for_file(lambda: T.programming_language)
    infos['is_binary'] = for_file(lambda: T.is_binary)
    infos['is_text'] = for_file(lambda: T.is_text)
    infos['is_archive'] = for_file(lambda: T.is_archive)
    infos['is_media'] = for_file(lambda: T.is_media)
    infos['is_source'] = for_file(lambda: T.is_source)
    infos['is_script'] = for_file(lambda: T.is_script)
    return [infos]
def recognize_package(location):
    """
    Return a Package object if one was recognized or None for this `location`.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    for package in PACKAGE_TYPES:
        # a metafile name match wins immediately
        if location.endswith(tuple(package.metafiles)):
            return package.recognize(location)

        # each criterion defaults to False when there is nothing to match
        type_matched = bool(package.filetypes) and any(
            t in ftype for t in package.filetypes)
        mime_matched = bool(package.mimetypes) and any(
            m in mtype for m in package.mimetypes)
        extension_matched = bool(package.extensions) and location.lower(
            ).endswith(package.extensions)

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            return package(location=location)
def pom_version(location):
    """
    Return 1 or 2 corresponding to the maven major version of POM style, not
    the POM version) if the file at location is highly likely to be a POM,
    otherwise None.
    """
    if (not filetype.is_file(location)
        or not location.endswith(pom_extensions)):
        return

    T = contenttype.get_type(location)
    # logger.debug('location: %(location)r, T: %(T)r)' % locals())
    if T.is_text and ('xml' in T.filetype_file.lower()
        or 'sgml' in T.filetype_file.lower()
        or 'xml' in T.filetype_pygment.lower()
        or 'genshi' in T.filetype_pygment.lower()):

        # check the POM version in the first 100 lines.
        # Bug fix: the file is opened in binary mode, so the markers must be
        # bytes -- `'x' in bytes_line` raises TypeError on Python 3 and the
        # original could never match.
        with open(location, 'rb') as pom:
            for n, line in enumerate(pom):
                if n > 100:
                    break
                if (b'http://maven.apache.org/POM/4.0.0' in line
                        or b'<modelVersion>' in line):
                    return 2
                elif b'<pomVersion>' in line:
                    return 1
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or directory
    at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # normalize the path encoding per-platform
    location = path_to_bytes(location) if on_linux else path_to_unicode(location)

    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    T = contenttype.get_type(location)

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)

    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        # bytes paths must be decoded back to unicode for output
        name = path_to_unicode(name)
        base_name = path_to_unicode(base_name)
        extension = path_to_unicode(extension)
    infos['name'] = name
    infos['base_name'] = base_name
    infos['extension'] = extension

    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (T.mimetype_file or None) if is_file else None
    infos['file_type'] = (T.filetype_file or None) if is_file else None
    infos['programming_language'] = (T.programming_language or None) if is_file else None
    for flag in ('is_binary', 'is_text', 'is_archive',
                 'is_media', 'is_source', 'is_script'):
        infos[flag] = bool(is_file and getattr(T, flag))
    return infos
def is_rar(location):
    """
    Return True if the file at location is a RAR archive.
    """
    if not os.path.exists(location):
        return

    from typecode import contenttype

    file_type = contenttype.get_type(location).filetype_file
    return file_type.lower().startswith('rar archive')
def is_manifest(cls, location):
    """
    Return True if the file at ``location`` is likely a manifest of this type.

    Sub-classes should override to implement their own manifest recognition.
    """
    if not filetype.is_file(location):
        return

    filename = file_name(location)
    # a file-pattern match alone is sufficient
    if any(fnmatch.fnmatchcase(filename, metaf) for metaf in cls.file_patterns):
        return True

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file
    _base_name, extension = splitext_name(location, is_file=True)
    extension = extension.lower()

    if TRACE:
        logger_debug(
            'is_manifest: ftype:', ftype, 'mtype:', mtype,
            'pygtype:', T.filetype_pygment,
            'fname:', filename, 'ext:', extension,
        )

    # each criterion defaults to False when the class defines nothing to match
    type_matched = bool(cls.filetypes) and any(t in ftype for t in cls.filetypes)
    mime_matched = bool(cls.mimetypes) and any(m in mtype for m in cls.mimetypes)

    extension_matched = False
    if cls.extensions:
        lowered = (e.lower() for e in cls.extensions)
        extension_matched = any(
            fnmatch.fnmatchcase(extension, ext_pat) for ext_pat in lowered)

    if type_matched and mime_matched and extension_matched:
        return True
def parse(location):
    """
    Return a WindowsExecutable package from the file at `location` or None.
    """
    if not filetype.is_file(location):
        return
    T = contenttype.get_type(location)
    if not T.is_winexe:
        return

    infos = pe_info(location)

    version = get_first(infos, 'Full Version', 'ProductVersion',
                        'FileVersion', 'Assembly Version')

    release_date = get_first(infos, 'BuildDate')
    if release_date:
        if len(release_date) >= 10:
            # keep only the date part of a datetime stamp
            release_date = release_date[:10]
        release_date = release_date.replace('/', '-')

    name = get_first(infos, 'ProductName', 'OriginalFilename', 'InternalName')
    copyr = get_first(infos, 'LegalCopyright')

    LegalTrademarks = concat(infos, 'LegalTrademarks', 'LegalTrademarks1',
                             'LegalTrademarks2', 'LegalTrademarks3')
    License = get_first(infos, 'License')

    declared_license = {}
    # Bug fix: the original tested `LegalCopyright = copyr,` -- a one-item
    # tuple that is always truthy -- so declared_license was built even when
    # all three values were empty. Test the actual values instead.
    if copyr or LegalTrademarks or License:
        declared_license = dict(LegalCopyright=copyr,
                                LegalTrademarks=LegalTrademarks,
                                License=License)

    description = concat(infos, 'FileDescription', 'Comments')

    parties = []
    cname = get_first(infos, 'CompanyName', 'Company')
    if cname:
        parties = [Party(type=party_org, role='author', name=cname)]

    homepage_url = get_first(infos, 'URL', 'WWW')

    return WindowsExecutable(
        name=name,
        version=version,
        release_date=release_date,
        copyright=copyr,
        declared_license=declared_license,
        description=description,
        parties=parties,
        homepage_url=homepage_url,
    )
def is_datafile(cls, location, filetypes=tuple()):
    """
    Return True if the file at location is highly likely to be a Windows
    executable (the original docstring claimed "POM": a copy/paste error --
    this checks ``is_winexe``).
    """
    if super().is_datafile(location, filetypes=filetypes):
        return True
    T = contenttype.get_type(location)
    if T.is_winexe:
        return True
def cpp_includes(location, **kwargs):
    """Collect the #includes statements in a C/C++ file."""
    T = contenttype.get_type(location)
    if not T.is_c_source:
        return
    includes = []
    for text_line in analysis.unicode_text_lines(location):
        includes.extend(cpp_includes_re().findall(text_line))
    return dict(cpp_includes=includes)
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)

    if filetype.is_file(location):
        T = contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        if TRACE_DEEP:
            logger.debug('get_handlers: processing %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
            # Bug fix: extension_matched must be reset on every iteration;
            # previously it was unbound on the first iteration (NameError)
            # or stale from the previous handler when a handler had no
            # extensions.
            extension_matched = False
            exts = handler.extensions
            if exts:
                if on_linux and py2:
                    exts = tuple(fileutils.fsencode(e) for e in exts)
                extension_matched = exts and location.lower().endswith(exts)

            if TRACE_DEEP:
                logger.debug(' get_handlers: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())

            if handler.strict and not all([type_matched, mime_matched, extension_matched]):
                logger.debug(' get_handlers: skip strict' % locals())
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    handler_name = handler.name
                    logger.debug(' get_handlers: yielding handler: %(handler_name)r' % locals())
                yield handler, type_matched, mime_matched, extension_matched
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or directory
    at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    T = contenttype.get_type(location)

    # on linux the name parts are bytes and must be decoded for output
    decode = path_to_unicode if on_linux else (lambda x: x)

    name = fileutils.file_name(location)
    base_name, extension = (
        fileutils.splitext(location) if is_file else (name, ''))

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = decode(name)
    infos['base_name'] = decode(base_name)
    infos['extension'] = decode(extension)
    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)
    return infos
def get_elf_needed_library(location, **kwargs):
    """
    Return a list of needed_libraries
    """
    T = contenttype.get_type(location)
    if not T.is_elf:
        return
    # materialize the iterable of needed libraries into a list
    needed = list(Elf(location).needed_libraries)
    return dict(elf_needed_library=needed)
def closure_test_function(*args, **kwargs):
    # collect fresh type info and compare against the expected fixture
    actual = get_type(test_file).to_dict(include_date=False)
    if regen:
        # regeneration mode: overwrite the fixture with the current results
        for key, value in actual.items():
            setattr(test, key, value)
        test.dump()
    expected = test.to_dict(filter_empty=False, filter_extra=True)
    passing = check_types_equal(expected, actual)
    if not passing:
        # embed file:// links to make failing traces easier to inspect
        expected['data file'] = 'file://' + data_file
        expected['test_file'] = 'file://' + test_file
    assert dict(actual) == dict(expected)
def recognize_package(location):
    """
    Return a Package object if one was recognized or None for this `location`.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    for package_type in PACKAGE_TYPES:
        # Note: default to True if there is nothing to match against
        metafiles = package_type.metafiles
        if on_linux:
            # location is a bytes path on linux: encode the metafile names too
            metafiles = (path_to_bytes(m) for m in metafiles)
        if location.endswith(tuple(metafiles)):
            # a metafile name match wins immediately
            logger_debug('metafile matching: package_type is of type:', package_type)
            return package_type.recognize(location)

        # filetype criterion: False when the type defines no filetypes
        if package_type.filetypes:
            type_matched = any(t in ftype for t in package_type.filetypes)
        else:
            type_matched = False
        # mimetype criterion: False when the type defines no mimetypes
        if package_type.mimetypes:
            mime_matched = any(m in mtype for m in package_type.mimetypes)
        else:
            mime_matched = False

        # extension criterion: False when the type defines no extensions
        extensions = package_type.extensions
        if extensions:
            if on_linux:
                extensions = tuple(path_to_bytes(e) for e in extensions)
            extension_matched = location.lower().endswith(extensions)
        else:
            extension_matched = False

        # all three criteria must match for a content-based recognition
        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            logger_debug('all matching: package is of type:', package_type)
            recognized = package_type.recognize(location)
            logger_debug('all matching: recognized as:', repr(recognized))
            return recognized

        logger_debug('no match: package is not of known type:', package_type)
def is_binary(location):
    """
    Return True if the resource at location is a binary file.
    """
    typ = get_type(location)
    # any one of these type flags marks the resource as binary; any() keeps
    # the same short-circuit evaluation order as the original or-chain
    binary_flags = (
        'is_binary',
        'is_archive',
        'is_media',
        'is_office_doc',
        'is_compressed',
        'is_filesystem',
        'is_winexe',
        'is_elf',
        'is_java_class',
        'is_data',
    )
    return any(getattr(typ, flag) for flag in binary_flags)
def get_dwarfs(location):
    """
    Yield tuples with debug information extracted from the DWARF debug
    symbols. Return also the symbol type, the symbol value itself and the line
    number in the source code at where the symbol is used or defined.

    Yields this tuple: (symbol_type, symbol, path_to_source, symbol_source_line)
    """
    T = contenttype.get_type(location)
    if T.is_elf:
        rc, out, err = call_nm(location)
        if rc != 0:
            # read the captured stderr file for diagnostics, closing it
            # properly (the original `open(err).read()` leaked the handle)
            with open(err) as err_file:
                raise Exception(repr(err_file.read()))
        for res in parse(out):
            yield res
def get_elf_needed_library(location):
    """
    Return a list of needed_libraries
    """
    if not os.path.exists(location):
        return
    T = contenttype.get_type(location)
    if not T.is_elf:
        return

    with open(location, 'rb') as f:
        elf = ELFFile(f)
        for section in elf.iter_sections():
            # only the dynamic section carries DT_NEEDED entries
            if isinstance(section, DynamicSection):
                for tag in section.iter_tags():
                    if tag.entry.d_tag == 'DT_NEEDED':
                        yield tag.needed
def get_file_infos(location, as_list=True):
    """
    Return a list of dictionaries of informations collected from the file or
    directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from scancode import utils
    from typecode import contenttype

    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)
    T = contenttype.get_type(location)

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = fileutils.file_extension(location) if is_file else ''
    # falsy values are normalized to None for file/dir-only fields
    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (T.mimetype_file or None) if is_file else None
    infos['file_type'] = (T.filetype_file or None) if is_file else None
    infos['programming_language'] = (T.programming_language or None) if is_file else None
    for flag in ('is_binary', 'is_text', 'is_archive',
                 'is_media', 'is_source', 'is_script'):
        infos[flag] = bool(is_file and getattr(T, flag))

    return [infos] if as_list else infos
def convert_to_utf8(location):
    """
    Convert the file at location to UTF-8 text. Return the location of the
    converted file, or the original location if the file is not text or if
    no encoding could be detected.
    """
    if not contenttype.get_type(location).is_text:
        return location

    # probe the first 4K to guess the encoding; close the handle promptly
    # (the original `open(location, 'rb').read(4096)` leaked it)
    with open(location, 'rb') as probe:
        start = probe.read(4096)
    encoding = chardet.detect(start)
    if encoding:
        encoding = encoding.get('encoding', None)
    if encoding:
        target = os.path.join(fileutils.get_temp_dir('markup'),
                              fileutils.file_name(location))
        # re-decode with the detected encoding, replacing undecodable bytes,
        # and re-encode as UTF-8
        with codecs.open(location, 'rb', encoding=encoding,
                         errors='replace', buffering=16384) as inf:
            with codecs.open(target, 'wb', encoding='utf-8') as outf:
                outf.write(inf.read())
        return target
    else:
        # chardet failed somehow to detect an encoding
        return location
def is_manifest(cls, location):
    """
    Return True if the file at ``location`` is likely a manifest of this
    type, i.e. a Windows executable.
    """
    # the content type is computed up front; is_winexe is only accessed
    # when the location is an actual file (short-circuit)
    file_type = contenttype.get_type(location)
    return filetype.is_file(location) and file_type.is_winexe
def recognize_packages(location):
    """
    Return a list of Package object if any packages were recognized for this
    `location`, or None if there were no Packages found. Raises Exceptions on
    errors.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file
    _base_name, extension = splitext_name(location, is_file=True)
    filename = file_name(location)
    extension = extension.lower()

    if TRACE:
        logger_debug('recognize_packages: ftype:', ftype, 'mtype:', mtype,
            'pygtype:', T.filetype_pygment, 'fname:', filename, 'ext:', extension)

    recognized_packages = []
    for package_type in PACKAGE_TYPES:
        # Note: default to True if there is nothing to match against
        metafiles = package_type.metafiles
        if on_linux and py2:
            # on py2/linux the filename is bytes: encode the patterns too
            metafiles = (fsencode(m) for m in metafiles)
        if any(fnmatch.fnmatchcase(filename, metaf) for metaf in metafiles):
            # a metafile name match wins: recognize and return immediately
            for recognized in package_type.recognize(location):
                if TRACE:
                    logger_debug('recognize_packages: metafile matching: recognized:', recognized)
                if recognized and not recognized.license_expression:
                    # compute and set a normalized license expression
                    recognized.license_expression = recognized.compute_normalized_license()
                    if TRACE:
                        logger_debug('recognize_packages: recognized.license_expression:', recognized.license_expression)
                recognized_packages.append(recognized)
            return recognized_packages

        # filetype criterion: False when the type defines no filetypes
        type_matched = False
        if package_type.filetypes:
            type_matched = any(t in ftype for t in package_type.filetypes)

        # mimetype criterion: False when the type defines no mimetypes
        mime_matched = False
        if package_type.mimetypes:
            mime_matched = any(m in mtype for m in package_type.mimetypes)

        # extension criterion: case-insensitive fnmatch on the extension
        extension_matched = False
        extensions = package_type.extensions
        if extensions:
            if on_linux and py2:
                extensions = (fsencode(e) for e in extensions)
            extensions = (e.lower() for e in extensions)
            extension_matched = any(
                fnmatch.fnmatchcase(extension, ext_pat) for ext_pat in extensions)

        # all three criteria must match for a content-based recognition
        if type_matched and mime_matched and extension_matched:
            if TRACE:
                logger_debug('recognize_packages: all matching')
            try:
                for recognized in package_type.recognize(location):
                    # compute and set a normalized license expression
                    if recognized and not recognized.license_expression:
                        recognized.license_expression = recognized.compute_normalized_license()
                    if TRACE:
                        logger_debug('recognize_packages: recognized', recognized)
                    recognized_packages.append(recognized)
            except NotImplementedError:
                # build a plain package if recognize is not yet implemented
                recognized = package_type()
                if TRACE:
                    logger_debug('recognize_packages: recognized', recognized)
                recognized_packages.append(recognized)
            return recognized_packages

        if TRACE:
            logger_debug('recognize_packages: no match for type:', package_type)
def pe_info(location):
    """
    Return a mapping of common data available for a Windows dll or exe PE
    (portable executable).

    Return None for non-Windows PE files. Return an empty mapping for PE from
    which we could not collect data.

    Also collect extra data found if any, returned as a dictionary under the
    'extra_data' key in the returned mapping.
    """
    if not location:
        return {}

    T = contenttype.get_type(location)
    if not T.is_winexe:
        return {}

    # start from a mapping of all known keys set to None
    result = dict([(k, None,) for k in PE_INFO_KEYS])
    extra_data = result['extra_data'] = {}

    with closing(pefile.PE(location)) as pe:
        if not hasattr(pe, 'FileInfo'):
            # No fileinfo section: we return just empties
            return result

        # >>> pe.FileInfo: this is a list of list of Structure objects:
        # [[<Structure: [VarFileInfo] >, <Structure: [StringFileInfo]>]]
        file_info = pe.FileInfo
        if not file_info or not isinstance(file_info, list):
            if TRACE:
                logger.debug('pe_info: not file_info')
            return result

        # here we have a non-empty list
        file_info = file_info[0]

        if TRACE:
            logger.debug('pe_info: file_info:', file_info)

        # keep only the StringFileInfo structure(s)
        string_file_info = [
            x for x in file_info
            if type(x) == pefile.Structure
            and hasattr(x, 'name')
            and x.name == 'StringFileInfo'
        ]

        if not string_file_info:
            # No stringfileinfo section: we return just empties
            if TRACE:
                logger.debug('pe_info: not string_file_info')
            return result

        string_file_info = string_file_info[0]

        if not hasattr(string_file_info, 'StringTable'):
            # No fileinfo.StringTable section: we return just empties
            if TRACE:
                logger.debug('pe_info: not StringTable')
            return result

        string_table = string_file_info.StringTable
        if not string_table or not isinstance(string_table, list):
            return result

        string_table = string_table[0]

        if TRACE:
            logger.debug('pe_info: Entries keys: '
                + str(set(k for k in string_table.entries)))
            logger.debug('pe_info: Entry values:')
            for k, v in string_table.entries.items():
                logger.debug(' ' + str(k) + ': ' + repr(type(v)) + repr(v))

        for k, v in string_table.entries.items():
            # convert unicode to a safe ASCII representation
            key = text.as_unicode(k).strip()
            value = text.as_unicode(v).strip()
            value = fix_text(value)
            # known keys go in the main mapping; the rest under 'extra_data'
            if key in PE_INFO_KEYSET:
                result[key] = value
            else:
                extra_data[key] = value

    return result
import os

from unittest.case import skipIf
from unittest.case import expectedFailure

from commoncode.testcase import FileBasedTesting
from commoncode.system import on_windows

from typecode.contenttype import get_filetype
from typecode.contenttype import get_type
from typecode.contenttype import get_pygments_lexer
from typecode.contenttype import is_standard_include

# aliases for testing: thin wrappers over get_type(location) attributes.
# Written as named functions rather than assigned lambdas (PEP 8, E731) so
# that tracebacks and reprs carry a useful name.


def get_mimetype_python(location): return get_type(location).mimetype_python
def get_filetype_pygment(location): return get_type(location).filetype_pygment
def get_filetype_file(location): return get_type(location).filetype_file
def get_mimetype_file(location): return get_type(location).mimetype_file
def is_text(location): return get_type(location).is_text
def is_archive(location): return get_type(location).is_archive
def is_media(location): return get_type(location).is_media
def is_winexe(location): return get_type(location).is_winexe
def is_source(location): return get_type(location).is_source
def is_special(location): return get_type(location).is_special
def is_pdf(location): return get_type(location).is_pdf
def is_pdf_with_text(location): return get_type(location).is_pdf_with_text
def is_binary(location): return get_type(location).is_binary
def is_c_source(location): return get_type(location).is_c_source
def is_stripped_elf(location): return get_type(location).is_stripped_elf
def is_elf(location): return get_type(location).is_elf
import os

from unittest.case import skipIf
from unittest.case import expectedFailure

from commoncode.testcase import FileBasedTesting
from commoncode.system import on_windows

from typecode.contenttype import get_filetype
from typecode.contenttype import get_type
from typecode.contenttype import get_pygments_lexer
from typecode.contenttype import is_standard_include

# aliases for testing: thin wrappers over get_type(location) attributes.
# Written as named functions rather than assigned lambdas (PEP 8, E731) so
# that tracebacks and reprs carry a useful name.


def get_mimetype_python(location): return get_type(location).mimetype_python
def get_filetype_pygment(location): return get_type(location).filetype_pygment
def get_filetype_file(location): return get_type(location).filetype_file
def get_mimetype_file(location): return get_type(location).mimetype_file
def is_text(location): return get_type(location).is_text
def is_archive(location): return get_type(location).is_archive
def is_compressed(location): return get_type(location).is_compressed
def is_media(location): return get_type(location).is_media
def is_winexe(location): return get_type(location).is_winexe
def is_source(location): return get_type(location).is_source
def is_special(location): return get_type(location).is_special
def is_pdf(location): return get_type(location).is_pdf
def is_pdf_with_text(location): return get_type(location).is_pdf_with_text
def is_binary(location): return get_type(location).is_binary
def is_c_source(location): return get_type(location).is_c_source
def is_stripped_elf(location): return get_type(location).is_stripped_elf