Пример #1
0
    def test_query_run_for_text_with_long_lines(self):
        """Query runs should differ between long-line and normal text files."""
        from typecode.contenttype import get_type

        long_lines_loc = self.get_test_loc('query/long_lines.txt')
        normal_lines_loc = self.get_test_loc('query/not_long_lines.txt')

        # sanity-check the content type detection first
        assert get_type(long_lines_loc).is_text_with_long_lines
        assert not get_type(normal_lines_loc).is_text_with_long_lines

        index = cache.get_index()
        assert len(Query(long_lines_loc, idx=index).query_runs) == 3
        assert len(Query(normal_lines_loc, idx=index).query_runs) == 14
Пример #2
0
def get_file_info(location, **kwargs):
    """
    Return a mapping of file information collected for the file at `location`.
    """
    info = OrderedDict()

    # TODO: move date and size these to the inventory collection step???
    info['date'] = get_last_modified_date(location) or None
    info['size'] = getsize(location) or 0

    # checksum keys come back in the order requested: sha1, md5, sha256
    for algo, checksum in multi_checksums(location, ('sha1', 'md5', 'sha256')).items():
        info[algo] = checksum

    collector = get_type(location)
    info['mime_type'] = collector.mimetype_file or None
    info['file_type'] = collector.filetype_file or None
    info['programming_language'] = collector.programming_language or None
    # classification flags are forced to plain booleans
    for flag in ('is_binary', 'is_text', 'is_archive',
                 'is_media', 'is_source', 'is_script'):
        info[flag] = bool(getattr(collector, flag))
    return info
Пример #3
0
def is_pom(location):
    """
    Return True if the file at `location` is highly likely to be a Maven POM,
    None otherwise.
    """
    if (not filetype.is_file(location)
    or not location.endswith(('.pom', 'pom.xml', 'project.xml',))):
        if TRACE: logger.debug('is_pom: not a POM on name: {}'.format(location))
        return

    T = contenttype.get_type(location)
    if T.is_text:

        # check the POM version in the first 150 lines.
        # errors='replace' avoids a UnicodeDecodeError crash on files that
        # are detected as text but are not strictly valid UTF-8
        with codecs.open(location, encoding='utf-8', errors='replace') as pom:
            for n, line in enumerate(pom):
                if n > 150:
                    break
                if any(x in line for x in
                       ('http://maven.apache.org/POM/4.0.0',
                        'http://maven.apache.org/xsd/maven-4.0.0.xsd',
                        '<modelVersion>',
                        # somehow we can still parse version 3 poms too
                        '<pomVersion>',)
                       ):
                    return True

    if TRACE: logger.debug('is_pom: not a POM based on type: {}: {}'.format(T, location))
Пример #4
0
    def is_datafile(cls, location, filetypes=tuple()):
        """
        Return True if the file at location is highly likely to be a POM.
        """
        # defer first to the generic path/filetype-based recognition
        if super().is_datafile(location, filetypes=filetypes):
            return True

        if not contenttype.get_type(location).is_text:
            return

        markers = (
            b'http://maven.apache.org/POM/4.0.0',
            b'http://maven.apache.org/xsd/maven-4.0.0.xsd',
            b'<modelVersion>',
            # somehow we can still parse version 3 poms too
            b'<pomVersion>',
        )

        # scan only the first 150 lines for a POM marker
        with open(location, 'rb') as pom:
            for line_number, line in enumerate(pom):
                if line_number > 150:
                    break
                if any(marker in line for marker in markers):
                    return True
Пример #5
0
    def is_datafile(cls, location, filetypes=tuple(), _bare_filename=False):
        """
        Return True if the file at ``location`` is likely a package data file
        that this parser can handle. Recognition is based on:

        - matching ``location`` as a whole against any of the ``path_patterns``
          class attribute patterns (POSIX paths), and

        - when defined, checking that the filetype of the file contains any of
          the types listed in the ``filetypes`` class attribute.

        ``_bare_filename`` is for testing with a bare path that does not point
        to a real file. Subclasses can override to implement more complex data
        file recognition.
        """
        if not (filetype.is_file(location) or _bare_filename):
            return
        posix_loc = as_posixpath(location)
        if not any(fnmatchcase(posix_loc, pattern) for pattern in cls.path_patterns):
            return
        filetypes = filetypes or cls.filetypes
        if not filetypes:
            return True
        actual_type = contenttype.get_type(location).filetype_file.lower()
        return any(ft in actual_type for ft in filetypes)
Пример #6
0
def is_pom(location):
    """
    Return True if the file at location is highly likely to be a POM.
    """
    if (not filetype.is_file(location)
     or not location.endswith(('.pom', 'pom.xml', 'project.xml',))):

        if TRACE: logger.debug('is_pom: not a POM on name: {}'.format(location))
        return

    maven_markers = (
        b'http://maven.apache.org/POM/4.0.0',
        b'http://maven.apache.org/xsd/maven-4.0.0.xsd',
        b'<modelVersion>',
        # somehow we can still parse version 3 poms too
        b'<pomVersion>',
    )

    T = contenttype.get_type(location)
    if T.is_text:
        # look for a POM marker in the first 150 lines only
        with io.open(location, 'rb') as pom:
            for line_number, line in enumerate(pom):
                if line_number > 150:
                    break
                if any(marker in line for marker in maven_markers):
                    return True

    if TRACE: logger.debug('is_pom: not a POM based on type: {}: {}'.format(T, location))
Пример #7
0
def dwarf_source_path(location):
    """
    Yield unique paths to compiled source code found in Elf binaries DWARF
    sections for D2D.

    Full paths are yielded first; bare file names (paths without a directory
    part) are yielded last, and only when they do not also occur as the file
    name of a yielded full path.
    """
    T = contenttype.get_type(location)
    if not (T.is_elf or T.is_stripped_elf):
        return
    seen_paths = set()
    path_file_names = set()
    bare_file_names = set()
    for dpath in chain(get_dwarf1(location), get_dwarf2(location)):
        if dpath in seen_paths:
            continue
        fn = fileutils.file_name(dpath)
        if fn == dpath:
            # a bare file name with no directory part: defer its emission
            bare_file_names.add(fn)
            continue
        path_file_names.add(fn)
        seen_paths.add(dpath)
        yield dpath
    # only yield file names that do not exist as full paths
    for bfn in sorted(bare_file_names):
        if bfn not in path_file_names and bfn not in seen_paths:
            yield bfn
            seen_paths.add(bfn)
Пример #8
0
def get_file_infos(location):
    """
    Return a list of a single mapping of informations collected from the file
    or directory at location: naming, date, size, checksums, content type and
    boolean classification flags.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    T = contenttype.get_type(location)
    is_file = T.is_file
    is_dir = T.is_dir
    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos['sha1'] = is_file and sha1(location) or None
    infos['md5'] = is_file and md5(location) or None
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    # bug fix: the original used `is_file and flag or None` here, which
    # collapses a legitimate False flag to None; bool() keeps real booleans
    # (this matches the other get_file_infos variants in this file)
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)
    return [infos]
Пример #9
0
def get_source_file_path_references(location):
    """
    Yield unique references to source file paths extracted from DWARF debug
    symbols from the Elf file at `location`.

    If there are errors when processing Elfs, these are yielded as well, as
    paths prefixed with 'ERROR: '.
    """
    if not os.path.exists(location):
        return
    T = contenttype.get_type(location)
    if not T.is_elf:
        return
    unique_files = set()
    unique_paths = set()
    errors = []

    def classify(path):
        # a path without any slash is a bare file name
        if '/' not in path:
            unique_files.add(path)
        else:
            unique_paths.add(path)

    try:
        with_libdwarf = dwarf.Dwarf(location)
        for path in with_libdwarf.included_source_files:
            classify(path)
        for path in with_libdwarf.original_source_files:
            classify(path)
    except Exception as lde:
        msg = str(lde)
        # keep only the message from 'dwarfdump' onwards
        _, m1, m2 = msg.partition('dwarfdump')
        errors.append(''.join([m1, m2]))

    try:
        with_binutils_nm = dwarf2.get_dwarfs(location)
        for entry in with_binutils_nm:
            classify(entry.path)
    except Exception as lde:
        # bug fix: the original appended the `str` builtin type itself
        # instead of the error message string
        errors.append(str(lde))

    # promote bare file names that are not already the file name of a known
    # full path and are not in the ignore list
    seen_file_names = set(file_name(p) for p in unique_paths)
    for fn in unique_files:
        if fn not in seen_file_names and fn not in ignores:
            unique_paths.add(fn)

    for error in errors:
        yield 'ERROR: ' + error

    for path in sorted(unique_paths):
        yield path
Пример #10
0
def is_pom(location):
    """
    Return True if the file at location is highly likely to be a POM.
    """
    if (not filetype.is_file(location) or not location.endswith((
            '.pom',
            'pom.xml',
            'project.xml',
    ))):
        return

    T = contenttype.get_type(location)
    # only consider files whose detected type looks like XML-ish text
    if T.is_text and ('xml' in T.filetype_file.lower()
                      or 'sgml' in T.filetype_file.lower()
                      or 'xml' in T.filetype_pygment.lower()
                      or 'genshi' in T.filetype_pygment.lower()):

        # check the POM version in the first 100 lines.
        # errors='replace' avoids a UnicodeDecodeError crash on text files
        # that are not strictly valid UTF-8
        with codecs.open(location, encoding='utf-8', errors='replace') as pom:
            for n, line in enumerate(pom):
                if n > 100:
                    break
                if any(x in line for x in (
                        'http://maven.apache.org/POM/4.0.0',
                        '<modelVersion>',
                )):
                    return True
Пример #11
0
def get_file_infos(location):
    """
    Return a list of a single mapping of informations collected from the file
    or directory at location: naming, date, size, checksums, content type and
    boolean classification flags.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    T = contenttype.get_type(location)
    is_file = T.is_file
    is_dir = T.is_dir
    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos['sha1'] = is_file and sha1(location) or None
    infos['md5'] = is_file and md5(location) or None
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    # bug fix: the original used `is_file and flag or None` here, which
    # collapses a legitimate False flag to None; bool() keeps real booleans
    # (this matches the other get_file_infos variants in this file)
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)
    return [infos]
Пример #12
0
def recognize_package(location):
    """
    Return a Package object if one was recognized or None for this `location`.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    for package in PACKAGE_TYPES:
        # a direct metafile name match wins immediately
        if location.endswith(tuple(package.metafiles)):
            return package.recognize(location)

        # each flag defaults to False when the package type declares nothing
        # to match against
        type_matched = bool(package.filetypes) and any(
            t in ftype for t in package.filetypes)
        mime_matched = bool(package.mimetypes) and any(
            m in mtype for m in package.mimetypes)
        extension_matched = bool(package.extensions) and location.lower(
            ).endswith(package.extensions)

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            return package(location=location)
Пример #13
0
def pom_version(location):
    """
    Return 1 or 2 corresponding to the maven major
    version of POM style, not the POM version) if the file at location is
    highly likely to be a POM, otherwise None.
    """
    if (not filetype.is_file(location)
        or not location.endswith(pom_extensions)):
        return

    T = contenttype.get_type(location)
    # only consider files whose detected type looks like XML-ish text
    if T.is_text and ('xml' in T.filetype_file.lower()
                      or 'sgml' in T.filetype_file.lower()
                      or 'xml' in T.filetype_pygment.lower()
                      or 'genshi' in T.filetype_pygment.lower()):

        # check the POM version in the first 100 lines
        with open(location, 'rb') as pom:
            for n, l in enumerate(pom):
                if n > 100:
                    break
                # bug fix: the file is opened in binary mode so each line is
                # bytes; comparing str literals with `in` raises a TypeError
                # on Python 3 — compare against bytes literals instead
                if (b'http://maven.apache.org/POM/4.0.0' in l
                    or b'<modelVersion>' in l):
                    return 2
                elif b'<pomVersion>' in l:
                    return 1
Пример #14
0
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`: naming, date, size, checksums, counts,
    content type and boolean classification flags.
    """
    # local imports — presumably to defer import cost or avoid cycles;
    # confirm before hoisting to module level
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # paths are handled as bytes on Linux and as unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        # directories get no extension
        base_name = name
        extension = ''

    # convert name parts back to unicode for output on Linux where the
    # working path is bytes
    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    # NOTE: the `x and y or None` idiom maps any falsy value to None;
    # acceptable here since these fields are optional
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    # multi_checksums returns a mapping keyed by checksum name
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    # bool() ensures the flags are real booleans (False for directories)
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
Пример #15
0
def is_rar(location):
    """
    Return True if the file at location is a RAR archive.
    """
    if not os.path.exists(location):
        return
    from typecode import contenttype
    detected = contenttype.get_type(location).filetype_file
    return detected.lower().startswith('rar archive')
Пример #16
0
    def is_manifest(cls, location):
        """
        Return True if the file at ``location`` is likely a manifest of this type.

        Sub-classes should override to implement their own manifest recognition.
        """
        if not filetype.is_file(location):
            return

        # a file name pattern match wins immediately
        filename = file_name(location)
        if any(fnmatch.fnmatchcase(filename, pattern)
               for pattern in cls.file_patterns):
            return True

        T = contenttype.get_type(location)
        actual_type = T.filetype_file.lower()
        actual_mimetype = T.mimetype_file

        _base_name, extension = splitext_name(location, is_file=True)
        extension = extension.lower()

        if TRACE:
            logger_debug(
                'is_manifest: ftype:', actual_type,
                'mtype:', actual_mimetype,
                'pygtype:', T.filetype_pygment,
                'fname:', filename,
                'ext:', extension,
            )

        # each flag defaults to False when the class declares nothing to
        # match against
        type_matched = bool(cls.filetypes) and any(
            t in actual_type for t in cls.filetypes)
        mime_matched = bool(cls.mimetypes) and any(
            m in actual_mimetype for m in cls.mimetypes)

        extension_matched = False
        if cls.extensions:
            lowered_patterns = (e.lower() for e in cls.extensions)
            extension_matched = any(
                fnmatch.fnmatchcase(extension, pattern)
                for pattern in lowered_patterns)

        if type_matched and mime_matched and extension_matched:
            return True
Пример #17
0
def parse(location):
    """
    Return a WindowsExecutable package from the file at `location` or None.
    """
    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    if not T.is_winexe:
        return

    infos = pe_info(location)

    version = get_first(infos, 'Full Version', 'ProductVersion', 'FileVersion',
                        'Assembly Version')
    release_date = get_first(infos, 'BuildDate')
    if release_date:
        # normalize to a YYYY-MM-DD date string
        if len(release_date) >= 10:
            release_date = release_date[:10]
        release_date = release_date.replace('/', '-')

    name = get_first(infos, 'ProductName', 'OriginalFilename', 'InternalName')
    copyr = get_first(infos, 'LegalCopyright')

    # bug fix: the original had a trailing comma (`= copyr,`) making this a
    # 1-tuple which is always truthy, so declared_license was populated even
    # when all of its fields were empty
    LegalCopyright = copyr

    LegalTrademarks = concat(infos, 'LegalTrademarks', 'LegalTrademarks1',
                             'LegalTrademarks2', 'LegalTrademarks3')

    License = get_first(infos, 'License')

    declared_license = {}
    if LegalCopyright or LegalTrademarks or License:
        declared_license = dict(LegalCopyright=copyr,
                                LegalTrademarks=LegalTrademarks,
                                License=License)

    description = concat(infos, 'FileDescription', 'Comments')

    parties = []
    cname = get_first(infos, 'CompanyName', 'Company')

    if cname:
        parties = [Party(type=party_org, role='author', name=cname)]
    homepage_url = get_first(infos, 'URL', 'WWW')

    return WindowsExecutable(
        name=name,
        version=version,
        release_date=release_date,
        copyright=copyr,
        declared_license=declared_license,
        description=description,
        parties=parties,
        homepage_url=homepage_url,
    )
Пример #18
0
    def is_datafile(cls, location, filetypes=tuple()):
        """
        Return True if the file at ``location`` is likely a data file for this
        parser: either recognized by the base class checks, or detected as a
        Windows executable.
        """
        if super().is_datafile(location, filetypes=filetypes):
            return True

        T = contenttype.get_type(location)
        if T.is_winexe:
            return True
Пример #19
0
def cpp_includes(location, **kwargs):
    """Collect the #includes statements in a C/C++ file."""
    T = contenttype.get_type(location)
    if not T.is_c_source:
        return
    # hoist the compiled pattern out of the loop
    pattern = cpp_includes_re()
    collected = [
        included
        for line in analysis.unicode_text_lines(location)
        for included in pattern.findall(line)
    ]
    return dict(cpp_includes=collected)
Пример #20
0
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)

    if filetype.is_file(location):

        T = contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        if TRACE_DEEP:
            logger.debug(
                'get_handlers: processing %(location)s: ftype: %(ftype)s, mtype: %(mtype)s '
                % locals())
        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(
                t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(
                m in mtype for m in handler.mimetypes)
            # bug fix: extension_matched must be reset on every iteration; it
            # was previously unbound for the first handler without extensions
            # (raising UnboundLocalError) and otherwise kept a stale value
            # from the previous handler
            extension_matched = False
            exts = handler.extensions
            if exts:
                if on_linux and py2:
                    exts = tuple(fileutils.fsencode(e) for e in exts)
                extension_matched = exts and location.lower().endswith(exts)

            if TRACE_DEEP:
                logger.debug(
                    '  get_handlers: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s'
                    % locals())

            if handler.strict and not all(
                [type_matched, mime_matched, extension_matched]):
                logger.debug('  get_handlers: skip strict' % locals())
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    handler_name = handler.name
                    logger.debug(
                        '     get_handlers: yielding handler: %(handler_name)r'
                        % locals())
                yield handler, type_matched, mime_matched, extension_matched
Пример #21
0
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`: naming, date, size, checksums, counts,
    content type and boolean classification flags.
    """
    # local imports — presumably to defer import cost or avoid cycles;
    # confirm before hoisting to module level
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    # paths are handled as bytes on Linux and as unicode elsewhere
    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        # directories get no extension
        base_name = name
        extension = ''

    # convert name parts back to unicode for output on Linux where the
    # working path is bytes
    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    # NOTE: the `x and y or None` idiom maps any falsy value to None;
    # acceptable here since these fields are optional
    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    # multi_checksums returns a mapping keyed by checksum name
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    # bool() ensures the flags are real booleans (False for directories)
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
Пример #22
0
def get_elf_needed_library(location, **kwargs):
    """
    Return a mapping with the list of needed libraries of the Elf file at
    `location`, or None if it is not an Elf.
    """
    T = contenttype.get_type(location)
    if not T.is_elf:
        return
    elfie = Elf(location)
    needed = list(elfie.needed_libraries)
    return dict(elf_needed_library=needed)
Пример #23
0
    def closure_test_function(*args, **kwargs):
        """Check that the detected type of test_file matches the test data."""
        results = get_type(test_file).to_dict(include_date=False)

        if regen:
            for key, value in results.items():
                setattr(test, key, value)
            # bug fix: dump once after all attributes are set; the original
            # called dump() inside the loop, re-writing the file per key
            test.dump()

        expected = test.to_dict(filter_empty=False, filter_extra=True)
        passing = check_types_equal(expected, results)

        # this is done to display slightly easier to handle error traces
        if not passing:
            expected['data file'] = 'file://' + data_file
            expected['test_file'] = 'file://' + test_file
            assert dict(results) == dict(expected)
Пример #24
0
def recognize_package(location):
    """
    Return a Package object if one was recognized or None for this `location`.
    """

    if not filetype.is_file(location):
        return

    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file


    for package_type in PACKAGE_TYPES:
        # a direct metafile name match returns immediately
        metafiles = package_type.metafiles
        if on_linux:
            # paths are handled as bytes on Linux
            metafiles = (path_to_bytes(m) for m in metafiles)
        if location.endswith(tuple(metafiles)):
            logger_debug('metafile matching: package_type is of type:', package_type)
            return package_type.recognize(location)

        # NOTE(review): each flag defaults to False when the package type
        # declares nothing to match against, so such a type can only match
        # via its metafiles above — confirm this is intended
        if package_type.filetypes:
            type_matched = any(t in ftype for t in package_type.filetypes)
        else:
            type_matched = False
        if package_type.mimetypes:
            mime_matched = any(m in mtype for m in package_type.mimetypes)
        else:
            mime_matched = False

        extensions = package_type.extensions
        if extensions:
            if on_linux:
                extensions = tuple(path_to_bytes(e) for e in extensions)
            extension_matched = location.lower().endswith(extensions)
        else:
            extension_matched = False

        if type_matched and mime_matched and extension_matched:
            # we return the first match in the order of PACKAGE_TYPES
            logger_debug('all matching: package is of type:', package_type)
            recognized = package_type.recognize(location)
            logger_debug('all matching: recognized as:', repr(recognized))
            return recognized

        logger_debug('no match: package is not of known type:', package_type)
Пример #25
0
def is_binary(location):
    """
    Return True if the resource at location is a binary file.
    """
    resource = get_type(location)
    # the resource counts as binary if any of these classification flags is
    # set; evaluated lazily, like the original `or` chain
    traits = (
        'is_binary',
        'is_archive',
        'is_media',
        'is_office_doc',
        'is_compressed',
        'is_filesystem',
        'is_winexe',
        'is_elf',
        'is_java_class',
        'is_data',
    )
    result = False
    for trait in traits:
        result = result or getattr(resource, trait)
        if result:
            break
    return result
Пример #26
0
def get_dwarfs(location):
    """
    Yield tuples with debug information extracted from the DWARF
    debug symbols. Return also the symbol type, the symbol value itself and
    the line number in the source code at where the symbol is used or defined.

    Yields this tuple:
        (symbol_type, symbol, path_to_source, symbol_source_line)
    """
    T = contenttype.get_type(location)
    if T.is_elf:
        rc, out, err = call_nm(location)
        if rc != 0:
            # fix: use a context manager so the error-output file handle is
            # closed instead of being left to the garbage collector
            with open(err) as errfile:
                raise Exception(repr(errfile.read()))
        for res in parse(out):
            yield res
Пример #27
0
def get_elf_needed_library(location):
    """
    Yield the needed (DT_NEEDED) library names of the Elf file at `location`.
    """
    if not os.path.exists(location):
        return

    if not contenttype.get_type(location).is_elf:
        return
    with open(location, 'rb') as f:
        elf = ELFFile(f)
        dynamic_sections = (
            section for section in elf.iter_sections()
            if isinstance(section, DynamicSection)
        )
        for section in dynamic_sections:
            for tag in section.iter_tags():
                if tag.entry.d_tag == 'DT_NEEDED':
                    yield tag.needed
Пример #28
0
def get_file_infos(location, as_list=True):
    """
    Return a list of dictionaries of informations collected from the file or
    directory at location.

    Return the mapping itself (not wrapped in a list) when `as_list` is False.
    """
    # local imports — presumably to defer import cost or avoid cycles;
    # confirm before hoisting to module level
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from scancode import utils
    from typecode import contenttype

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    # NOTE: the `x and y or None` idiom maps any falsy value to None;
    # acceptable here since these fields are optional
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    # multi_checksums returns a mapping keyed by checksum name
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    # bool() ensures the flags are real booleans (False for directories)
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    if as_list:
        return [infos]
    else:
        return infos
Пример #29
0
def convert_to_utf8(location):
    """
    Convert the file at location to UTF-8 text.
    Return the location of the converted file or None.
    """
    if not contenttype.get_type(location).is_text:
        return location
    # fix: read the detection sample with a context manager; the original
    # `open(...).read(4096)` leaked the file handle
    with open(location, 'rb') as sample:
        start = sample.read(4096)
    encoding = chardet.detect(start)
    if encoding:
        encoding = encoding.get('encoding', None)
        if encoding:
            target = os.path.join(fileutils.get_temp_dir('markup'),
                                  fileutils.file_name(location))
            # re-encode: decode with the detected encoding (replacing
            # undecodable bytes) and write back out as UTF-8
            with codecs.open(location, 'rb', encoding=encoding,
                             errors='replace', buffering=16384) as inf:
                with codecs.open(target, 'wb', encoding='utf-8') as outf:
                    outf.write(inf.read())
            return target
        else:
            # chardet failed somehow to detect an encoding
            return location
Пример #30
0
 def is_manifest(cls, location):
     """
     Return True if the file at ``location`` is likely a manifest of this
     type: a plain file whose content type is a Windows executable.
     """
     file_type = contenttype.get_type(location)
     is_plain_file = filetype.is_file(location)
     return is_plain_file and file_type.is_winexe
Пример #31
0
def recognize_packages(location):
    """
    Return a list of Package objects if any packages were recognized for the
    file at `location`, or None if no package was recognized (or `location`
    is not a file). Raises exceptions on errors.

    Recognition tries each registered type in PACKAGE_TYPES in order and
    stops at the first type that matches, either by metafile name pattern or
    by a combined filetype/mimetype/extension match.
    """

    if not filetype.is_file(location):
        return

    # Detected content type for matching below.
    # NOTE(review): assumes T.filetype_file and T.mimetype_file are strings;
    # a None here would raise on .lower() / the `in` tests — verify upstream.
    T = contenttype.get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file

    _base_name, extension = splitext_name(location, is_file=True)
    filename = file_name(location)
    extension = extension.lower()

    if TRACE:
        logger_debug('recognize_packages: ftype:', ftype, 'mtype:', mtype,
                     'pygtype:', T.filetype_pygment, 'fname:', filename,
                     'ext:', extension)

    recognized_packages = []
    for package_type in PACKAGE_TYPES:
        # Note: default to True if there is nothing to match against
        metafiles = package_type.metafiles
        if on_linux and py2:
            # on Linux + Python 2, paths are bytes: encode patterns to match
            metafiles = (fsencode(m) for m in metafiles)

        # First matching strategy: the file name matches one of the type's
        # metafile patterns (e.g. "package.json", "*.pom").
        if any(fnmatch.fnmatchcase(filename, metaf) for metaf in metafiles):
            for recognized in package_type.recognize(location):
                if TRACE:
                    logger_debug(
                        'recognize_packages: metafile matching: recognized:',
                        recognized)
                if recognized and not recognized.license_expression:
                    # compute and set a normalized license expression
                    recognized.license_expression = recognized.compute_normalized_license(
                    )
                    if TRACE:
                        logger_debug(
                            'recognize_packages: recognized.license_expression:',
                            recognized.license_expression)
                recognized_packages.append(recognized)
            return recognized_packages

        # Second matching strategy: ALL three of filetype, mimetype and
        # extension must match (each test is a substring/glob match).
        type_matched = False
        if package_type.filetypes:
            type_matched = any(t in ftype for t in package_type.filetypes)

        mime_matched = False
        if package_type.mimetypes:
            mime_matched = any(m in mtype for m in package_type.mimetypes)

        extension_matched = False
        extensions = package_type.extensions
        if extensions:
            if on_linux and py2:
                # same bytes-path encoding as for metafiles above
                extensions = (fsencode(e) for e in extensions)

            extensions = (e.lower() for e in extensions)
            extension_matched = any(
                fnmatch.fnmatchcase(extension, ext_pat)
                for ext_pat in extensions)

        if type_matched and mime_matched and extension_matched:
            if TRACE: logger_debug('recognize_packages: all matching')
            try:
                for recognized in package_type.recognize(location):
                    # compute and set a normalized license expression
                    if recognized and not recognized.license_expression:
                        recognized.license_expression = recognized.compute_normalized_license(
                        )
                    if TRACE:
                        logger_debug('recognize_packages: recognized',
                                     recognized)
                    recognized_packages.append(recognized)
            except NotImplementedError:
                # build a plain package if recognize is not yet implemented
                recognized = package_type()
                if TRACE:
                    logger_debug('recognize_packages: recognized', recognized)
                recognized_packages.append(recognized)
            return recognized_packages

        if TRACE:
            logger_debug('recognize_packages: no match for type:',
                         package_type)
    # falls through to an implicit None when no package type matched
Пример #32
0
def pe_info(location):
    """
    Return a mapping of common data available for a Windows dll or exe PE
    (portable executable) file at `location`.
    Return an empty mapping when `location` is empty or is not a Windows PE.
    For a PE from which we could not collect data, return a mapping with all
    PE_INFO_KEYS present and None values.

    Also collect extra data found if any, returned as a dictionary under the
    'extra_data' key in the returned mapping.
    """
    if not location:
        return {}

    T = contenttype.get_type(location)

    if not T.is_winexe:
        return {}

    # Pre-fill every known key with None so callers always get a full mapping.
    result = dict.fromkeys(PE_INFO_KEYS)
    extra_data = result['extra_data'] = {}

    # closing() ensures the PE file handle is released even on errors.
    with closing(pefile.PE(location)) as pe:
        if not hasattr(pe, 'FileInfo'):
            # No fileinfo section: we return just empties
            return result

        # >>> pe.FileInfo: this is a list of list of Structure objects:
        # [[<Structure: [VarFileInfo] >,  <Structure: [StringFileInfo]>]]
        file_info = pe.FileInfo
        if not file_info or not isinstance(file_info, list):
            if TRACE:
                logger.debug('pe_info: not file_info')
            return result

        # here we have a non-empty list
        file_info = file_info[0]
        if TRACE:
            # use lazy %-style args: passing the object as a bare extra
            # argument (as before) made logging fail to render the message
            logger.debug('pe_info: file_info: %r', file_info)

        # NOTE(review): the exact type(x) == ... comparison is kept on
        # purpose to preserve the original matching behavior; switching to
        # isinstance() would also match pefile.Structure subclasses.
        string_file_info = [
            x for x in file_info if type(x) == pefile.Structure
            and hasattr(x, 'name') and x.name == 'StringFileInfo'
        ]

        if not string_file_info:
            # No stringfileinfo section: we return just empties
            if TRACE:
                logger.debug('pe_info: not string_file_info')
            return result

        string_file_info = string_file_info[0]

        if not hasattr(string_file_info, 'StringTable'):
            # No fileinfo.StringTable section: we return just empties
            if TRACE:
                logger.debug('pe_info: not StringTable')
            return result

        string_table = string_file_info.StringTable
        if not string_table or not isinstance(string_table, list):
            return result

        string_table = string_table[0]

        if TRACE:
            logger.debug('pe_info: Entries keys: ' +
                         str(set(k for k in string_table.entries)))

            logger.debug('pe_info: Entry values:')
            for k, v in string_table.entries.items():
                logger.debug('  ' + str(k) + ': ' + repr(type(v)) + repr(v))

        for k, v in string_table.entries.items():
            # convert unicode to a safe ASCII representation
            key = text.as_unicode(k).strip()
            value = text.as_unicode(v).strip()
            value = fix_text(value)
            # known keys go in the main mapping; anything else is extra data
            if key in PE_INFO_KEYSET:
                result[key] = value
            else:
                extra_data[key] = value

    return result
import os

from unittest.case import skipIf
from unittest.case import expectedFailure

from commoncode.testcase import FileBasedTesting
from commoncode.system import on_windows

from typecode.contenttype import get_filetype
from typecode.contenttype import get_type
from typecode.contenttype import get_pygments_lexer
from typecode.contenttype import is_standard_include


# aliases for testing: each alias maps a file path to one attribute of the
# content type object returned by get_type().
def _type_attr_reader(attr_name):
    """Return a function that reads `attr_name` from get_type(location)."""
    def read_attr(location):
        return getattr(get_type(location), attr_name)
    return read_attr


get_mimetype_python = _type_attr_reader('mimetype_python')
get_filetype_pygment = _type_attr_reader('filetype_pygment')
get_filetype_file = _type_attr_reader('filetype_file')
get_mimetype_file = _type_attr_reader('mimetype_file')
is_text = _type_attr_reader('is_text')
is_archive = _type_attr_reader('is_archive')
is_media = _type_attr_reader('is_media')
is_winexe = _type_attr_reader('is_winexe')
is_source = _type_attr_reader('is_source')
is_special = _type_attr_reader('is_special')
is_pdf = _type_attr_reader('is_pdf')
is_pdf_with_text = _type_attr_reader('is_pdf_with_text')
is_binary = _type_attr_reader('is_binary')
is_c_source = _type_attr_reader('is_c_source')
is_stripped_elf = _type_attr_reader('is_stripped_elf')
is_elf = _type_attr_reader('is_elf')
Пример #34
0
import os

from unittest.case import skipIf
from unittest.case import expectedFailure

from commoncode.testcase import FileBasedTesting
from commoncode.system import on_windows

from typecode.contenttype import get_filetype
from typecode.contenttype import get_type
from typecode.contenttype import get_pygments_lexer
from typecode.contenttype import is_standard_include

# aliases for testing: each alias fetches a single attribute from the
# content type object that get_type() builds for a path.
def _attr_of_type(attr_name):
    """Return a function mapping a path to get_type(path).<attr_name>."""
    def fetch(location):
        return getattr(get_type(location), attr_name)
    return fetch


get_mimetype_python = _attr_of_type('mimetype_python')
get_filetype_pygment = _attr_of_type('filetype_pygment')
get_filetype_file = _attr_of_type('filetype_file')
get_mimetype_file = _attr_of_type('mimetype_file')
is_text = _attr_of_type('is_text')
is_archive = _attr_of_type('is_archive')
is_compressed = _attr_of_type('is_compressed')
is_media = _attr_of_type('is_media')
is_winexe = _attr_of_type('is_winexe')
is_source = _attr_of_type('is_source')
is_special = _attr_of_type('is_special')
is_pdf = _attr_of_type('is_pdf')
is_pdf_with_text = _attr_of_type('is_pdf_with_text')
is_binary = _attr_of_type('is_binary')
is_c_source = _attr_of_type('is_c_source')
is_stripped_elf = _attr_of_type('is_stripped_elf')