def to_repr(prep_config: PrepConfig, token_list: List, n_gramm_splitting_config: Optional[NgramSplitConfig] = None):
    """Convert ``token_list`` into its representation under ``prep_config``.

    Args:
        prep_config: Preprocessing configuration. Its EN_ONLY and CAPS
            parameter values, plus the types returned for it by
            ``get_types_to_be_repr``, are packed into a ``ReprConfig``.
        token_list: Tokens forwarded unchanged to ``to_repr_list``.
        n_gramm_splitting_config: Splitting configuration to use. When falsy
            (e.g. ``None``) the result of
            ``get_global_n_gramm_splitting_config()`` is used instead.

    Returns:
        The value returned by ``to_repr_list`` for the tokens and the
        assembled ``ReprConfig``.
    """
    repr_config = ReprConfig(
        get_types_to_be_repr(prep_config),
        n_gramm_splitting_config or get_global_n_gramm_splitting_config(),
        # Third positional flag: true unless EN_ONLY is exactly 3.
        prep_config.get_param_value(PrepParam.EN_ONLY) != 3,
        # Fourth positional flag: true only when CAPS is exactly 1.
        prep_config.get_param_value(PrepParam.CAPS) == 1,
    )
    return to_repr_list(token_list, repr_config)
def run(dataset: str, repr: str, classifier: str):
    """Create forward-context, backward-context and label files for a dataset.

    Every file under ``<dataset>/REPR_DIR/<repr>`` whose name ends with
    ``parsed.repr`` is passed through the cases creator obtained from
    ``get_cases_creator(classifier)``. For each such file three sibling
    files are written under ``<dataset>/CLASSIFICATION_DIR/<classifier>/<repr>``,
    with ``parsed.repr`` in the relative path replaced by the extensions
    defined on ``ContextsDataset``.

    Args:
        dataset: Dataset directory name under ``DEFAULT_PARSED_DATASETS_DIR``.
        repr: Encoded preprocessing config; validated with
            ``PrepConfig.assert_classification_config``.
        classifier: Classifier name; selects the cases creator and is part
            of the destination path.
    """
    from logrec.classifier.context_datasets import ContextsDataset

    PrepConfig.assert_classification_config(repr)

    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, REPR_DIR, repr)
    # Use the ``repr`` argument rather than a module-level ``args`` object so
    # the function also works when it is imported and called directly.
    dest_dir = os.path.join(path_to_dataset, CLASSIFICATION_DIR, classifier, repr)
    logger.info(f"Writing to {dest_dir}")

    for split_dir in (TRAIN_DIR, TEST_DIR, VALID_DIR):
        os.makedirs(os.path.join(dest_dir, split_dir), exist_ok=True)

    def is_repr_file(file_name):
        return file_name.endswith("parsed.repr")

    # First pass only counts matching files so progress can be reported.
    total_files = sum(file_mapper(full_src_dir, lambda f: 1, is_repr_file))

    cases_creator = get_cases_creator(classifier)
    processed = 0
    for lines, rel_path in file_mapper(full_src_dir, cases_creator, is_repr_file):
        processed += 1
        logger.info(f"Processing {processed} out of {total_files}")

        forward_path = os.path.join(
            dest_dir, re.sub("parsed\\.repr", ContextsDataset.FW_CONTEXTS_FILE_EXT, rel_path))
        backward_path = os.path.join(
            dest_dir, re.sub("parsed\\.repr", ContextsDataset.BW_CONTEXTS_FILE_EXT, rel_path))
        label_path = os.path.join(
            dest_dir, re.sub("parsed\\.repr", ContextsDataset.LABEL_FILE_EXT, rel_path))

        with open(forward_path, 'w') as fw_file, \
                open(backward_path, 'w') as bw_file, \
                open(label_path, 'w') as label_file:
            for line in lines:
                if line:
                    # line[0] and line[1] are joined with spaces into the
                    # forward and backward files; line[2] goes to the label file.
                    label_file.write(f'{line[2]}\n')
                    fw_file.write(f'{" ".join(line[0])}\n')
                    bw_file.write(f'{" ".join(line[1])}\n')
                else:
                    # A falsy case still produces an (empty) line in every
                    # file, keeping the three files line-aligned.
                    label_file.write('\n')
                    fw_file.write('\n')
                    bw_file.write('\n')
def test_to_repr_2_nosep(self):
    # to_repr with SPLIT=2 and ONLY_NUMBERS n-gram splitting, applied to the
    # shared `tokens` fixture.
    config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 2,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(splitting_type=NgramSplittingType.ONLY_NUMBERS)

    actual = to_repr(config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        # quoted section
        '"',
        pl['word_start'], pl['capitals'], 'a', pl['capital'], pl['non_eng'], pl['word_end'],
        '"',
        # '/*' ... '*/' section
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        # '//' section, closed by the olc_end placeholder
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], '8', pl['word_end'],
        pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def preprocess(s, r):
    """Parse string ``s`` and return its representation for encoded config ``r``.

    The string is parsed with ``from_string`` and run through the
    preprocessors listed in ``pp_params["preprocessors"]``. The global
    splitting configuration is then (re)initialised for ``r`` using the
    module defaults, and the parsed tokens are converted with ``to_repr``.

    Args:
        s: Source text to parse.
        r: Encoded preprocessing config accepted by
            ``PrepConfig.from_encoded_string``.

    Returns:
        The result of ``to_repr`` for the parsed tokens.
    """
    raw_tokens = from_string(s)
    preprocessed_tokens = apply_preprocessors(
        raw_tokens,
        pp_params["preprocessors"],
        {'interesting_context_words': []},
    )

    prep_config = PrepConfig.from_encoded_string(r)
    # to_repr is called without an explicit splitting config below, so the
    # global one has to be initialised for this prep config first.
    init_splitting_config(
        DEFAULT_DATASET,
        prep_config,
        DEFAULT_BPE_BASE_REPR,
        DEFAULT_BPE_N_MERGES,
        None,
    )
    return to_repr(prep_config, preprocessed_tokens)
def test_log_no_mark_logs(self):
    # A single LogStatement converted with MARK_LOGS=0 and a default
    # NgramSplitConfig.
    config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 0,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig()

    log_statement = LogStatement(
        SplitContainer.from_single_token('LOGGER'),
        SplitContainer.from_single_token('Info'),
        INFO,
        [StringLiteral([SplitContainer.from_single_token("Hi")])],
    )

    actual = to_repr(config, [log_statement], split_config)

    expected = [
        pl['capitals'], 'logger',
        '.',
        pl['capital'], 'info',
        '(',
        '"', pl['capital'], 'hi', '"',
        ')',
        ';',
    ]
    self.assertEqual(expected, actual)
def test_to_repr_no_no_sep_with_bpe_no_merges(self):
    # to_repr with SPLIT=4 and a BPE splitting config that has no merges and
    # an empty merges cache; 'english' is expected character by character.
    config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges=[],
        merges_cache={},
    )

    actual = to_repr(config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        # quoted section
        '"',
        pl['word_start'], pl['capitals'], 'a', pl['capital'], pl['non_eng'], pl['word_end'],
        '"',
        # '/*' ... '*/' section
        '/*',
        pl['non_eng'],
        # *'english' unpacks to 'e', 'n', 'g', 'l', 'i', 's', 'h'
        pl['word_start'], pl['non_eng'], '_', *'english', pl['word_end'],
        '*/',
        # '//' section, closed by the olc_end placeholder
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], '8', pl['word_end'],
        pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_with_non_eng(self):
    # to_repr with EN_ONLY=0 and custom splittings for 'english' and
    # 'dieselbe'; the expected output contains the words themselves rather
    # than the non_eng placeholder.
    config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    custom_splittings = {
        'english': ['engl', 'ish'],
        'dieselbe': ['die', 'selbe'],
    }
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings=custom_splittings,
    )

    actual = to_repr(config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        'dinero',
        # quoted section
        '"',
        pl['word_start'], pl['capitals'], 'a', pl['capital'], 'wirklich', pl['word_end'],
        '"',
        # '/*' ... '*/' section
        '/*',
        'ц',
        pl['word_start'], 'blanco', '_', 'engl', 'ish', pl['word_end'],
        '*/',
        # '//' section, closed by the olc_end placeholder
        '//',
        pl['word_start'], pl['capitals'], 'die', 'selbe', '8', pl['word_end'],
        pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_with_enonlycontents(self):
    # to_repr with EN_ONLY=2: the string literal made up solely of NonEng
    # words is expected to collapse to a single non_eng_content placeholder
    # between the quotes.
    config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={},
    )

    literal_words = (
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    )
    input_tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(Word.from_("dinero"))]),
        StringLiteral([NonEng(Word.from_(word)) for word in literal_words]),
        NewLine(),
        MultilineComment([
            SplitContainer([NonEng(Word.from_('ц'))]),
            SplitContainer([
                NonEng(Word.from_("blanco")),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        NewLine(),
        Tab(),
        OneLineComment([
            SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")]),
        ]),
    ]

    actual = to_repr(config, input_tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        '"', pl['non_eng_content'], '"',
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], '8', pl['word_end'],
        pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_merges_no_cache(self):
    # BPE splitting of "While" with one merge ('w', 'h') and an empty merges
    # cache.
    config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges={('w', 'h'): 0},
        merges_cache={},
    )
    input_tokens = [SplitContainer.from_single_token("While")]

    actual = to_repr(config, input_tokens, split_config)

    expected = [
        pl['word_start'],
        pl['capital'],
        'wh', 'i', 'l', 'e',
        pl['word_end'],
    ]
    self.assertEqual(expected, actual)
def calc_stats_for_prepconfig(prepconfig, lang_checker, token_list, include_sample=False):
    """Compute language statistics for ``token_list`` under one prep config.

    The tokens are converted with ``to_repr`` using a default
    ``NgramSplitConfig``, flattened with ``to_token_list`` and split on single
    spaces before being handed to ``lang_checker``.

    Args:
        prepconfig: Encoded config accepted by
            ``PrepConfig.from_encoded_string``.
        lang_checker: Object providing ``calc_lang_stats``.
        token_list: Tokens to convert.
        include_sample: Forwarded to ``calc_lang_stats``.

    Returns:
        The value returned by ``lang_checker.calc_lang_stats``.
    """
    prep_config = PrepConfig.from_encoded_string(prepconfig)
    repr_tokens = to_repr(prep_config, token_list, NgramSplitConfig())
    words = to_token_list(repr_tokens).split(' ')
    return lang_checker.calc_lang_stats(words, include_sample=include_sample)
def test(self):
    # For every entry of `test_cases`: element 0 of the value is the expected
    # parse result, element 1 the expected representation for config '104111'.
    for source_text, expectations in test_cases.items():
        parsed = apply_preprocessors(
            from_string(source_text), pp_params["preprocessors"], {})
        self.assertEqual(expectations[0], parsed)

        prep_config = PrepConfig.from_encoded_string('104111')
        repred = to_repr(prep_config, parsed, ngram_split_config)
        self.assertEqual(expectations[1], repred)
def run(dataset: str, preprocessing_params: str, bpe_base_repr: Optional[str], bpe_n_merges: Optional[int], splitting_file: Optional[str]):
    """Preprocess every parsed file of a dataset into its representation.

    Parsed files are read from ``<dataset>/PARSED_DIR``. Each file ending with
    ``.PARSED_FILE_EXTENSION`` is handed to ``preprocess_and_write`` in a
    process pool and written under ``<dataset>/REPR_EXTENSION/<repr>``,
    mirroring the source directory layout. The decoded preprocessing
    parameters are also dumped to ``preprocessing_types.json`` in the
    destination directory.

    Args:
        dataset: Dataset directory name under ``DEFAULT_PARSED_DATASETS_DIR``.
        preprocessing_params: Encoded config accepted by
            ``PrepConfig.from_encoded_string``.
        bpe_base_repr: Forwarded to ``init_splitting_config``.
        bpe_n_merges: Forwarded to ``init_splitting_config``.
        splitting_file: Forwarded to ``init_splitting_config``.

    Exits the process with status 3 when the source directory is missing.
    """
    # Use the ``dataset`` argument rather than a module-level ``args`` object
    # so the function also works when it is imported and called directly.
    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, PARSED_DIR)
    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)
    logger.info(f"Reading parsed files from: {os.path.abspath(full_src_dir)}")

    prep_config = PrepConfig.from_encoded_string(preprocessing_params)
    init_splitting_config(dataset, prep_config, bpe_base_repr, bpe_n_merges, splitting_file)

    repr_name = str(prep_config)
    full_dest_dir = os.path.join(path_to_dataset, REPR_EXTENSION, repr_name)
    full_metadata_dir = os.path.join(path_to_dataset, METADATA_DIR, repr_name)
    logger.info(f"Writing preprocessed files to {os.path.abspath(full_dest_dir)}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(full_dest_dir, exist_ok=True)
    os.makedirs(full_metadata_dir, exist_ok=True)

    with open(os.path.join(full_dest_dir, 'preprocessing_types.json'), "w") as f:
        f.write(jsons.dumps(prep_config))

    # One (source path, destination path, config) job per parsed file.
    parsed_suffix = f".{PARSED_FILE_EXTENSION}"
    params = []
    for root, dirs, files in os.walk(full_src_dir):
        for file in files:
            if not file.endswith(parsed_suffix):
                continue
            dest_sub_dir = os.path.join(full_dest_dir, os.path.relpath(root, full_src_dir))
            os.makedirs(dest_sub_dir, exist_ok=True)
            params.append((os.path.join(root, file),
                           os.path.join(dest_sub_dir, file),
                           prep_config))

    files_total = len(params)
    current_file = 0
    start_time = time.time()
    with Pool() as pool:
        for _ in pool.imap_unordered(preprocess_and_write, params):
            current_file += 1
            logger.info(f"Processed {current_file} out of {files_total}")
            time_elapsed = time.time() - start_time
            # Remaining time is extrapolated linearly from the average time
            # per file so far; current_file is >= 1 here, so no zero division.
            logger.info(
                f"Time elapsed: {time_elapsed:.2f} s, estimated time until completion: "
                f"{time_elapsed / current_file * files_total - time_elapsed:.2f} s")
def test_both_enonly_and_nosplit(self):
    # EN_ONLY=1 combined with SPLIT=0 must raise ValueError, either while
    # building the PrepConfig or while calling to_repr.
    param_values = {
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 0,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    }
    with self.assertRaises(ValueError):
        to_repr(PrepConfig(param_values), [], NgramSplitConfig())
def run(dataset, repr, threshold):
    """Compute logging statistics and dump the list of projects to ignore.

    ``calc_stats`` is run on
    ``<dataset>/CLASSIFICATION_DIR/CLASSIFICATION_TYPE/<repr>``. The projects
    it reports as ignored are logged one by one and written with ``dump_list``
    to ``<dataset>/CLASSIFICATION_DIR/<IGNORED_PROJECTS_FILE_NAME>.<threshold>``.

    Args:
        dataset: Dataset directory name under ``DEFAULT_PARSED_DATASETS_DIR``.
        repr: Encoded preprocessing config; validated with
            ``PrepConfig.assert_classification_config``.
        threshold: Threshold forwarded to ``calc_stats``; also used as the
            suffix of the output file name.
    """
    PrepConfig.assert_classification_config(repr)

    classification_root = os.path.join(
        DEFAULT_PARSED_DATASETS_DIR, dataset, CLASSIFICATION_DIR)
    stats_dir = os.path.join(classification_root, CLASSIFICATION_TYPE, repr)

    logger.info(f"Getting stats for {stats_dir}")
    logger.info(f"Ignoring projects where the percentage of file that contain logging is less than {threshold} %")

    ignored_projects, logged_stats = calc_stats(stats_dir, threshold)

    for index, project in enumerate(ignored_projects):
        logger.info(f"{index}: {project}")
    logger.info("")
    logger.info(logged_stats)

    ignored_list_path = os.path.join(
        classification_root, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    dump_list(ignored_projects, ignored_list_path)

    logger.info(f"Ignored files with threshold {threshold} % were written to {ignored_list_path}")
    logger.info(f"Total ignored projects: {len(ignored_projects)}")
def test_to_repr_0(self):
    # to_repr with EN_ONLY=0, SPLIT=0 and CAPS=0: the expected output keeps
    # words whole and in their original case.
    config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 0,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 0,
    })

    actual = to_repr(config, tokens, NgramSplitConfig())

    expected = [
        '1.1',
        '*',
        'dinero',
        '"', 'AWirklich', '"',
        '/*', 'ц', 'blanco_english', '*/',
        '//', 'DIESELBE8', pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def init_splitting_config(dataset: str, prep_config: PrepConfig, bpe_base_repr: Optional[str], bpe_n_merges: Optional[int], splitting_file: Optional[str]):
    """(Re)initialise the module-level n-gram splitting configuration.

    ``global_n_gramm_splitting_config`` is replaced by a fresh
    ``NgramSplitConfig`` and then filled in according to the SPLIT parameter
    of ``prep_config``:

    * 2: splitting type ``ONLY_NUMBERS``.
    * 3: custom splittings read from ``splitting_file``, splitting type
      ``NUMBERS_AND_CUSTOM``.
    * 4-9: BPE; merges and the merges cache are read from the metadata
      directory of the BPE base dataset/repr.
    * any other value: the fresh default config is left as is.

    Args:
        dataset: Dataset name; used as the BPE base dataset when
            ``bpe_base_repr`` contains no ``/``.
        prep_config: Preprocessing configuration supplying the SPLIT value
            and, if needed, the default BPE base repr.
        bpe_base_repr: Either ``<repr>`` or ``<dataset>/<repr>``. When falsy,
            ``prep_config.get_base_bpe_prep_config()`` is used.
        bpe_n_merges: Number of merges; only honoured for SPLIT 9. For SPLIT
            4-8 a fixed value per SPLIT is used instead.
        splitting_file: File with custom splittings; required for SPLIT 3.

    Raises:
        ValueError: if SPLIT is 9 and ``bpe_n_merges`` is falsy, or SPLIT is 3
            and ``splitting_file`` is falsy.
    """
    global global_n_gramm_splitting_config
    global_n_gramm_splitting_config = NgramSplitConfig()
    # Alias of the freshly created global object; mutations below are visible
    # through the global name.
    config = global_n_gramm_splitting_config

    split_mode = prep_config.get_param_value(PrepParam.SPLIT)

    if split_mode == 2:
        config.set_splitting_type(NgramSplittingType.ONLY_NUMBERS)
        return

    if split_mode == 3:
        if not splitting_file:
            raise ValueError("--splitting-file must be specified")
        config.sc_splittings = read_dict_from_2_columns(
            splitting_file, val_type=list, delim='|')
        config.set_splitting_type(NgramSplittingType.NUMBERS_AND_CUSTOM)
        return

    if split_mode not in (4, 5, 6, 7, 8, 9):
        return

    # --- BPE (SPLIT 4-9) ---
    if not bpe_base_repr:
        bpe_base_repr = prep_config.get_base_bpe_prep_config()

    if split_mode == 9:
        if not bpe_n_merges:
            raise ValueError("--bpe-n-merges must be specified for repr **9**")
    else:
        # SPLIT 4-8 each imply a fixed number of merges.
        fixed_n_merges = {4: 5000, 5: 1000, 6: 10000, 7: 20000, 8: 0}
        bpe_n_merges = fixed_n_merges[split_mode]

    if "/" in bpe_base_repr:
        bpe_base_dataset, bpe_base_repr = bpe_base_repr.split("/")
    else:
        bpe_base_dataset = dataset

    logger.info(f'Using bpe base dataset: {bpe_base_dataset}')
    logger.info(f'Using bpe base repr: {bpe_base_repr}')
    logger.info(f'Using bpe_n_merges: {bpe_n_merges}')

    path_to_merges_dir = os.path.join(
        DEFAULT_PARSED_DATASETS_DIR,
        bpe_base_dataset,
        METADATA_DIR,
        bpe_base_repr,
        BPE_DIR,
        str(bpe_n_merges),
    )
    bpe_merges_file = os.path.join(path_to_merges_dir, 'merges.txt')
    bpe_merges_cache = os.path.join(path_to_merges_dir, 'merges_cache.txt')

    config.merges_cache = read_dict_from_2_columns(bpe_merges_cache, val_type=list)
    config.merges = read_merges(bpe_merges_file)
    config.set_splitting_type(NgramSplittingType.BPE)
def run(dataset: str, preprocessing_params: str, bpe_base_repr: Optional[str], bpe_n_merges: Optional[int], splitting_file: Optional[str], merges_file):
    """Preprocess every parsed file of a dataset into its representation.

    Parsed files are read from ``<dataset>/PARSED_DIR``. Each file ending with
    ``.PARSED_FILE_EXTENSION`` is handed to ``preprocess_and_write`` in a
    process pool. Output goes under ``<dataset>/REPR_EXTENSION`` into a
    directory named from the repr, ``bpe_n_merges`` (empty when falsy) and
    the base name of ``merges_file``, mirroring the source directory layout.
    The decoded preprocessing parameters are also dumped to
    ``preprocessing_types.json`` in the destination directory.

    Args:
        dataset: Dataset directory name under ``DEFAULT_PARSED_DATASETS_DIR``.
        preprocessing_params: Encoded config accepted by
            ``PrepConfig.from_encoded_string``.
        bpe_base_repr: Forwarded to ``init_splitting_config``.
        bpe_n_merges: Forwarded to ``init_splitting_config``; also part of the
            destination directory name.
        splitting_file: Forwarded to ``init_splitting_config``.
        merges_file: Forwarded to ``init_splitting_config``; its base name is
            part of the destination directory name.

    Exits the process with status 3 when the source directory is missing.
    """
    # Use the ``dataset`` argument rather than a module-level ``args`` object
    # so the function also works when it is imported and called directly.
    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, PARSED_DIR)
    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)
    logger.info(f"Reading parsed files from: {os.path.abspath(full_src_dir)}")

    prep_config = PrepConfig.from_encoded_string(preprocessing_params)
    init_splitting_config(dataset, prep_config, bpe_base_repr,
                          bpe_n_merges, splitting_file, merges_file)

    repr_name = str(prep_config)
    full_dest_dir = os.path.join(
        path_to_dataset, REPR_EXTENSION,
        f'{repr_name}_{bpe_n_merges if bpe_n_merges else ""}_{os.path.basename(merges_file)}')
    full_metadata_dir = os.path.join(path_to_dataset, METADATA_DIR, repr_name)
    logger.info(f"Writing preprocessed files to {os.path.abspath(full_dest_dir)}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(full_dest_dir, exist_ok=True)
    os.makedirs(full_metadata_dir, exist_ok=True)

    with open(os.path.join(full_dest_dir, 'preprocessing_types.json'), "w") as f:
        f.write(jsons.dumps(prep_config))

    # One (source path, destination path, config) job per parsed file.
    parsed_suffix = f".{PARSED_FILE_EXTENSION}"
    params = []
    for root, dirs, files in os.walk(full_src_dir):
        for file in files:
            if not file.endswith(parsed_suffix):
                continue
            dest_sub_dir = os.path.join(full_dest_dir, os.path.relpath(root, full_src_dir))
            os.makedirs(dest_sub_dir, exist_ok=True)
            params.append((os.path.join(root, file),
                           os.path.join(dest_sub_dir, file),
                           prep_config))

    files_total = len(params)
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        # Drain the iterator only to drive the work and the progress bar.
        for _ in tqdm(it, total=files_total):
            pass
def gen():
    """Write a CSV comparing how each prep config represents each identifier.

    Identifiers are read one per line from ``path_to_file``. The output,
    written to ``path_to_file_out``, has a header row
    (``config``, every ``PrepParam``, every identifier) followed by one row
    per entry of ``prep_configs``: the encoded config, the human-readable
    value of each parameter, and each identifier preprocessed under that
    config with its tokens joined by spaces. Fields are joined with
    ``DELIMITER``.
    """
    with open(path_to_file, 'r') as f:
        identifiers = [line.rstrip('\n') for line in f]

    csv_lines = [DELIMITER.join(["config"] + list(PrepParam) + identifiers)]

    for prep in prep_configs:
        # Decode once per config instead of once per parameter.
        prep_config = PrepConfig.from_encoded_string(prep)

        csv_line = [prep]
        csv_line.extend(
            PrepConfig.human_readable_values[p][prep_config.get_param_value(p)]
            for p in PrepParam)
        csv_line.extend(
            ' '.join(preprocess(identifier, prep))
            for identifier in identifiers)
        csv_lines.append(DELIMITER.join(csv_line))

    with open(path_to_file_out, 'w') as f:
        f.writelines(f'{line}\n' for line in csv_lines)
def test_to_repr_1_nosep(self):
    # to_repr with SPLIT=1 and a default NgramSplitConfig: the number is
    # expected as the single token '1.1'.
    config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })

    actual = to_repr(config, tokens, NgramSplitConfig())

    expected = [
        '1.1',
        '*',
        pl['non_eng'],
        # quoted section
        '"',
        pl['word_start'], pl['capitals'], 'a', pl['capital'], pl['non_eng'], pl['word_end'],
        '"',
        # '/*' ... '*/' section
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        # '//' section, closed by the olc_end placeholder
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], '8', pl['word_end'],
        pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_no_str_no_com(self):
    # to_repr with COM_STR=2: the expected output ends with one
    # string_literal placeholder followed by two comment placeholders.
    config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 2,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={'english': ['engl', 'ish']},
    )

    actual = to_repr(config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        pl['string_literal'],
        pl['comment'],
        pl['comment'],
    ]
    self.assertEqual(expected, actual)
def test_1(self):
    # BPE splitting of "While" when the merges cache already maps 'while' to
    # the single piece ['while'].
    config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges_cache={'while': ['while']},
    )
    input_tokens = [SplitContainer.from_single_token("While")]

    actual = to_repr(config, input_tokens, split_config)

    expected = [pl['capital'], 'while']
    self.assertEqual(expected, actual)
def run_on_device(config: ClassifierConfig, force_rerun: bool) -> None:
    """Train a classifier described by ``config`` and dump sample test runs.

    Steps, in order: validate the config, set up the model directory and
    logging, build the learner, skip training if the same model already
    exists (unless ``force_rerun``), optionally load pretrained weights,
    save the training config, train, and finally write sample predictions
    to ``test_runs.out`` in the model directory.

    Args:
        config: Classifier configuration (data, architecture, training,
            testing, base model and pretraining type).
        force_rerun: When true, retrain even if the same model was already
            trained.

    Raises:
        ValueError: if exactly one of ``config.base_model`` and
            ``config.pretraining_type`` is set.
    """
    base_model = config.base_model
    pretraining = config.pretraining_type

    PrepConfig.assert_classification_config(config.data.repr)

    if bool(base_model) != bool(pretraining):
        raise ValueError(
            'Base model and pretraining_type params must be both set or both unset!')

    fs = FS.for_classifier(config.data.dataset,
                           config.data.repr,
                           base_model=base_model,
                           pretraining=pretraining,
                           classification_type=config.classification_type)
    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    text_field = fs.load_text_field()
    rnn_learner = create_nn_architecture(fs, text_field, LEVEL_LABEL, config.data,
                                         config.arch, config.min_log_coverage_percent)
    logger.info(rnn_learner)

    same_model_exists = fs.best_model_exists(rnn_learner)
    if same_model_exists and not force_rerun:
        # The first fragment ends with a space so the two sentences do not
        # run together in the log output.
        logger.info(
            f'Model {fs.path_to_classification_model} already trained. Not rerunning training. '
            f'To retrain the model with this parameters, specify --force-rerun flag')
        return
    elif same_model_exists:
        logger.info(
            f"Model {fs.path_to_classification_model} already trained. Forcing rerun.")

    # Loading pretrained weights is best effort: any failure is logged and
    # training proceeds from scratch.
    if pretraining == PretrainingType.FULL:
        try:
            logger.info(f'Trying to load base classifier: {base_model}')
            fs.load_base_model(rnn_learner)
            logger.info('Base classifier model is loaded.')
        except Exception as e:
            logger.warning(e)
            logger.warning('Base classifier model not loaded. Training from scratch')
    elif pretraining == PretrainingType.ONLY_ENCODER:
        try:
            logger.info(f'Trying to load pretarined LM: {base_model}')
            # TODO its a dirty hack. fix it
            fs.lm_cl_pretraining = True
            fs.load_pretrained_langmodel(rnn_learner)
            logger.info("Using pretrained LM")
        except Exception as e:
            logger.warning(e)
            logger.warning('Pretrained LM not loaded. Training from scratch')
    else:
        logger.info("No pretraining. Training classifier from scratch.")

    config_manager.save_config(config.training_config, fs.path_to_model)

    # NOTE(review): `config.training_config` is used above while
    # `config.training` is used here - confirm both attributes exist.
    train(fs, rnn_learner, config.training, config.metrics)

    model = rnn_learner.model
    to_test_mode(model)

    sample_test_runs_file = os.path.join(fs.path_to_model, 'test_runs.out')
    # 6 predictions are shown for the 'level' classification type, 2 otherwise.
    n_predicitions = 6 if config.classification_type == 'level' else 2
    show_tests(fs.test_path, model, text_field, sample_test_runs_file,
               config.data.backwards, n_predicitions, config.testing.n_samples)
    logger.info("Classifier training finished successfully.")
def to_repr_l(lst):
    """Return ``to_repr`` of ``lst`` for the fixed encoded config ``'000010'``.

    A default ``NgramSplitConfig`` is used as the splitting configuration.
    """
    prep_config = PrepConfig.from_encoded_string('000010')
    return to_repr(prep_config, lst, NgramSplitConfig())