def test(self, input_dir):
    """Evaluate the model accuracy against labelled source code files.

    Each regular file found recursively under ``input_dir`` whose
    extension is listed in ``self.languages`` is read and handed to
    ``self.language_name``. The predicted language is then tallied
    against the language associated with the file extension.

    ``input_dir`` -- source code files directory.

    Returns a report dictionary holding the ``'overall-accuracy'`` and,
    under ``'per-language'``, for every known language: the number of
    files tested, the accuracy, the number of predictions per language
    and the paths of the files behind each prediction.
    """
    languages = self.languages

    def blank_stats():
        # Build fresh containers on every call so that no list or dict
        # is shared between two languages
        return {
            'nb-files': 0,
            'accuracy': 0,
            'predicted': dict.fromkeys(languages, 0),
            'predicted-files': {name: [] for name in languages},
        }

    per_language = {name: blank_stats() for name in languages}
    report = {'overall-accuracy': 0, 'per-language': per_language}

    # Reverse mapping: file extension (without the dot) -> language
    lang_by_ext = {}
    for name, exts in languages.items():
        for ext in exts:
            lang_by_ext[ext] = name

    # Run the prediction on every supported file found in input_dir.
    # Note: the index counts every entry yielded by glob, including the
    # directories and the unsupported files that are skipped below.
    for index, file_path in enumerate(Path(input_dir).glob('**/*'), 1):
        if not file_path.is_file():
            continue

        expected = lang_by_ext.get(file_path.suffix.lstrip('.'))
        if expected is None:
            continue

        guess = self.language_name(safe_read_file(file_path))

        stats = per_language[expected]
        stats['nb-files'] += 1
        stats['predicted'][guess] += 1
        stats['predicted-files'][guess].append(str(file_path))
        LOGGER.debug("[%d] files processed", index)

    # Derive the accuracy figures from the raw counters
    success_count = file_count = 0
    for name, stats in per_language.items():
        tested = stats['nb-files']
        if tested:
            hits = stats['predicted'][name]
            stats['accuracy'] = hits / tested
            success_count += hits
            file_count += tested

    if file_count:
        report['overall-accuracy'] = success_count / file_count
    else:
        report['overall-accuracy'] = 0.0
    return report
def _extract(path):
    """Count the tokens found in the file located at ``path``.

    The content is read with ``safe_read_file``, lower-cased and
    tokenized with ``split``. Tokens longer than one character that
    consist only of digits are discarded; single digits are kept.

    Returns a ``Counter`` mapping each remaining token to its number
    of occurrences.
    """
    content = safe_read_file(path).lower()

    # Keep single characters and anything that is not purely digits
    kept = [
        word for word in split(content)
        if len(word) <= 1 or not word.isdigit()
    ]
    return Counter(kept)
def test_safe_read_file():
    """Check that ``utils.safe_read_file`` copes with arbitrary bytes.

    Ten times in a row, a file filled with ``NB_BYTES`` random bytes is
    created inside a temporary directory and read back: the call must
    not raise and must return a truthy value.
    """
    for _ in range(10):
        with tempfile.TemporaryDirectory() as tmp_dir:
            target = Path(tmp_dir) / 'example_file'

            # Fill the file with NB_BYTES random bytes
            random_bits = random.getrandbits(8 * NB_BYTES)
            payload = random_bits.to_bytes(NB_BYTES, 'little')
            target.write_bytes(payload)

            # Reading must neither raise nor give back an empty result
            decoded = utils.safe_read_file(target)
            assert decoded
def test_safe_read_file():
    """Check that ``utils.safe_read_file`` copes with arbitrary bytes.

    Ten times in a row, ``NB_BYTES`` random bytes are written to a file
    that is then read back with ``utils.safe_read_file``: the call must
    not raise and must return a truthy value.

    The bytes are stored in a regular file inside a temporary directory
    rather than in a ``NamedTemporaryFile``: a named temporary file
    cannot be opened a second time by name while it is still open on
    Windows, which made the previous version of this test
    platform-dependent.
    """
    for _ in range(10):
        with tempfile.TemporaryDirectory() as dirname:
            file_path = Path(dirname) / 'example_file'

            # Write random bytes into the file; write_bytes closes the
            # file, so the content is fully on disk before it is read
            rand_value = random.getrandbits(8 * NB_BYTES)
            file_path.write_bytes(rand_value.to_bytes(NB_BYTES, 'little'))

            # Text retrieved without raising errors
            text = utils.safe_read_file(file_path)
            assert text