示例#1
0
    def test(self, input_dir):
        """Tests the model accuracy using source code files.

        ``input_dir`` -- source code files directory.

        Every regular file found recursively under ``input_dir`` whose
        extension (without the leading dot) is listed in ``self.languages``
        is read with ``safe_read_file`` and classified with
        ``self.language_name``. The prediction is then compared with the
        language the file extension maps to.

        Returns a report ``dict`` made of:

        * ``'overall-accuracy'`` -- successful predictions divided by the
          number of tested files (``0.0`` when no file was tested).
        * ``'per-language'`` -- for each language of ``self.languages``:
          ``'nb-files'`` (tested files), ``'accuracy'`` (left to ``0`` when
          the language has no tested file), ``'predicted'`` (number of files
          per predicted language) and ``'predicted-files'`` (file paths per
          predicted language).

        """
        report = {
            'overall-accuracy': 0,
            'per-language': {
                lang: {
                    'nb-files': 0,
                    'accuracy': 0,
                    'predicted': {
                        predicted_lang: 0 for predicted_lang in self.languages
                    },
                    'predicted-files': {
                        predicted_lang: [] for predicted_lang in self.languages
                    }
                } for lang in self.languages
            }
        }

        # Reverse mapping: extension -> language. When an extension is listed
        # for several languages, the last language iterated wins.
        extensions = {
            ext: lang for lang, exts in self.languages.items() for ext in exts}

        # Test files found in input_dir.
        # Only the files that are actually classified are counted, so that
        # the progress log doesn't include directories or unknown extensions.
        nb_processed = 0
        for path in Path(input_dir).glob('**/*'):
            if not path.is_file():
                continue

            lang = extensions.get(path.suffix.lstrip('.'))
            if lang is None:
                continue

            content = safe_read_file(path)
            predicted_lang = self.language_name(content)

            # NOTE(review): a prediction that is not a key of
            # ``self.languages`` raises ``KeyError`` below.
            language_info = report['per-language'][lang]
            language_info['nb-files'] += 1
            language_info['predicted'][predicted_lang] += 1
            language_info['predicted-files'][predicted_lang].append(str(path))

            nb_processed += 1
            LOGGER.debug("[%d] files processed", nb_processed)

        # Fill the report accuracy data
        total_success = 0
        total_files = 0
        for lang in self.languages:

            nb_files = report['per-language'][lang]['nb-files']
            if not nb_files:
                # No tested file for this language: keep the default accuracy
                continue

            # A success is a file predicted as its own extension's language
            nb_success = report['per-language'][lang]['predicted'][lang]
            report['per-language'][lang]['accuracy'] = nb_success / nb_files
            total_success += nb_success
            total_files += nb_files

        report['overall-accuracy'] = (
            total_success / total_files if total_files else 0.0)

        return report
示例#2
0
def _extract(path):
    """Count the tokens of the file located at ``path``.

    The text returned by ``safe_read_file`` is lowercased, then cut into
    tokens with ``split``. Tokens longer than one character that contain
    only digits are ignored; single-character tokens are always kept.

    Returns a ``Counter`` mapping each kept token to its number of
    occurrences.
    """
    lowered = safe_read_file(path).lower()
    return Counter(
        word for word in split(lowered)
        # Keep everything except multi-character all-digit tokens
        if len(word) <= 1 or not word.isdigit()
    )
示例#3
0
def test_safe_read_file():
    """Check that ``utils.safe_read_file`` returns text for random bytes.

    Ten times in a row, ``NB_BYTES`` random bytes are written into a file
    created in a fresh temporary directory, then the file is read back with
    ``utils.safe_read_file``. The returned value must be truthy.
    """
    attempts = 10
    for _ in range(attempts):
        with tempfile.TemporaryDirectory() as tmp_dir:
            target = Path(tmp_dir) / 'example_file'

            # Fill the file with random binary content
            random_bits = random.getrandbits(8*NB_BYTES)
            payload = random_bits.to_bytes(NB_BYTES, 'little')
            target.write_bytes(payload)

            # Reading must not raise and must give a non-empty result
            content = utils.safe_read_file(target)
            assert content
def test_safe_read_file():
    """Check that ``utils.safe_read_file`` returns text for random bytes.

    The check is repeated ten times: ``NB_BYTES`` random bytes are written
    into a file, the file is read with ``utils.safe_read_file`` and the
    result must be truthy.

    The file lives in a temporary directory instead of being a still-open
    ``NamedTemporaryFile``: a named temporary file cannot be opened a second
    time by name on Windows while its handle is open, and
    ``utils.safe_read_file`` only receives the path (presumably it opens the
    file itself -- confirm against its implementation).
    """
    for _ in range(10):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir, 'random_bytes')
            rand_value = random.getrandbits(8 * NB_BYTES)

            # Write random bytes into the file; ``write_bytes`` closes the
            # file before it is read back
            tmp_path.write_bytes(rand_value.to_bytes(NB_BYTES, 'little'))

            text = utils.safe_read_file(tmp_path)

            assert text  # Text retrieved without raising errors