def load_lm_filter(source_lang, target_lang, metadata_yaml, source_tokenizer_command, target_tokenizer_command):
    logging.debug("Loading LM filter")
    lm_filter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], source_lang, target_lang,
                                    source_tokenizer_command, target_tokenizer_command)
    stats = DualLMStats(metadata_yaml['clean_mean_perp'],
                        metadata_yaml['clean_stddev_perp'],
                        metadata_yaml['noisy_mean_perp'],
                        metadata_yaml['noisy_stddev_perp'])

    # LM paths in the metadata may be relative to the YAML file: prefer the
    # resolved path when it exists, and fall back to the raw value otherwise
    fullpath_source_lm = os.path.join(metadata_yaml["yamlpath"], metadata_yaml['source_lm'])
    if os.path.isfile(fullpath_source_lm):
        source_lm = fullpath_source_lm
    else:
        source_lm = metadata_yaml['source_lm']

    fullpath_target_lm = os.path.join(metadata_yaml["yamlpath"], metadata_yaml['target_lm'])
    if os.path.isfile(fullpath_target_lm):
        target_lm = fullpath_target_lm
    else:
        target_lm = metadata_yaml['target_lm']

    lm_filter.load(source_lm, target_lm, stats)
    return lm_filter
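# Usage sketch for load_lm_filter() (hypothetical file names and language pair;
# the caller is expected to add a "yamlpath" entry to the parsed metadata,
# pointing at the directory of the YAML file, so relative LM paths resolve):
#
#   with open("training.yaml") as f:
#       metadata = yaml.safe_load(f)
#   metadata["yamlpath"] = os.path.dirname(os.path.abspath("training.yaml"))
#   lm_filter = load_lm_filter("en", "es", metadata, None, None)
#   fluency = lm_filter.score("This is a sentence .", "Esto es una frase .")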
def train_fluency_filter(args):
    # Prepare corpora:
    # The input corpus used for training the classifier is split in 2 parts:
    #  - Training data for the LM
    #  - Validation set for estimating the perplexity of clean text
    # The input noisy corpus is used as the validation set for estimating the
    # perplexity of noisy text.
    logging.info("Training LM-based fluency filter")

    if not (args.noisy_examples_file_sl and args.noisy_examples_file_tl
            and args.lm_file_sl and args.lm_file_tl):
        return None

    inputIsTmp = True
    if args.lm_training_file_sl and args.lm_training_file_tl and args.lm_clean_examples_file_sl and args.lm_clean_examples_file_tl:
        inputIsTmp = False
        lm_train_path_sl = args.lm_training_file_sl
        lm_train_path_tl = args.lm_training_file_tl
        lm_dev_clean_sl = args.lm_clean_examples_file_sl
        lm_dev_clean_tl = args.lm_clean_examples_file_tl
        logging.info("SL LM training corpus: {}".format(lm_train_path_sl))
        logging.info("TL LM training corpus: {}".format(lm_train_path_tl))
        logging.info("SL LM dev clean corpus: {}".format(lm_dev_clean_sl))
        logging.info("TL LM dev clean corpus: {}".format(lm_dev_clean_tl))
        logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl))
        logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl))
    else:
        logging.info("SL & TL LM training corpora have been obtained from the tab-separated input file "
                     "(the same one used for training the Random Forest classifier), "
                     "after randomly removing {} sentences".format(args.lm_dev_size))
        logging.info("SL & TL LM dev clean corpora have been randomly selected from the input file "
                     "(the same one used for training the Random Forest classifier): {} sentences".format(args.lm_dev_size))
        logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl))
        logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl))
        lm_train_path_sl, lm_train_path_tl, lm_dev_clean_sl, lm_dev_clean_tl = shuffle_lm_training_text(args.input, args.lm_dev_size)

    try:
        ff = DualLMFluencyFilter(LMType.CHARACTER, args.source_lang, args.target_lang)
        stats = ff.train(lm_train_path_sl, lm_train_path_tl,
                         lm_dev_clean_sl, lm_dev_clean_tl,
                         args.noisy_examples_file_sl, args.noisy_examples_file_tl,
                         args.lm_file_sl, args.lm_file_tl)
    finally:
        # Remove the temporary splits created by shuffle_lm_training_text()
        if inputIsTmp:
            os.remove(lm_train_path_sl)
            os.remove(lm_train_path_tl)
            os.remove(lm_dev_clean_sl)
            os.remove(lm_dev_clean_tl)
    return stats
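# Sketch of the argparse fields train_fluency_filter() reads (names taken from
# the attribute accesses above; values are hypothetical). When the four lm_*
# corpus paths are unset, training/dev splits are derived from args.input via
# shuffle_lm_training_text() and deleted afterwards:
#
#   args.noisy_examples_file_sl = "noisy.en"   # dev set for noisy perplexity (SL)
#   args.noisy_examples_file_tl = "noisy.es"   # dev set for noisy perplexity (TL)
#   args.lm_file_sl = "lm.en"                  # output path for the SL language model
#   args.lm_file_tl = "lm.es"                  # output path for the TL language model
#   args.lm_training_file_sl = None            # optional pre-built corpora (and _tl,
#   ...                                        # lm_clean_examples_file_sl/_tl)
#   stats = train_fluency_filter(args)         # returns DualLMStats, or None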
def initialization():
    global nline
    global logging_level

    nline = 0
    logging.info("Processing arguments...")

    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description=__doc__)

    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None,
                        help="Tab-separated file to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None,
                        help="Training metadata (YAML file)")

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str,
                        help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str,
                        help="Target language (TL) tokeniser executable absolute path")
    groupO.add_argument("--scol", default=3, type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive,
                        help="Target sentence column (starting in 1)")
    groupO.add_argument('--tmp_dir', default=gettempdir(),
                        help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None,
                        help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5,
                        help="Threshold for the classifier. If an accuracy histogram is present in the metadata, "
                             "the interval with the maximum value is used as the default instead")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5,
                        help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls "
                             "below the threshold are removed (classifier score set to 0), "
                             "unless the option --keep_lm_result is set")
    groupO.add_argument('--keep_lm_result', action='store_true',
                        help="Add an additional column to the results with the language model fluency score "
                             "and do not discard any TU based on that score")
    groupO.add_argument('--score_only', action='store_true', default=False,
                        help="Only output one column, the bicleaner score")
    groupO.add_argument('--disable_hardrules', action='store_true',
                        help="Disable the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true',
                        help="Do not apply hardrules that use language identification")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)

    logging_level = logging.getLogger().level
    if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
        # Get rid of INFO messages when the Moses processes start
        logging.getLogger("MosesTokenizer").setLevel(logging.WARNING)
        logging.getLogger("MosesSentenceSplitter").setLevel(logging.WARNING)
        logging.getLogger("MosesPunctuationNormalizer").setLevel(logging.WARNING)

    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]

        # Paths in the metadata may be relative to the YAML file
        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except:
            args.clf = joblib.load(metadata_yaml["classifier"])
        # args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]

        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])

        # Each histogram bucket covers 0.1 of the score range, so the index of
        # the maximum accuracy gives the ideal threshold
        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold

        # Load LM stuff if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            lm_filter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], args.source_lang, args.target_lang)
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'],
                                metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])

            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                source_lm = fullpath_source_lm
            else:
                source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                target_lm = fullpath_target_lm
            else:
                target_lm = metadata_yaml['target_lm']

            lm_filter.load(source_lm, target_lm, stats)
            args.lm_filter = lm_filter
        else:
            args.lm_filter = None

        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    except:
        print("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)

    # Ensure that the temporary directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    if args.score_only and args.keep_lm_result:
        raise AssertionError("Conflicting arguments: cannot output the bicleaner score only AND keep the language model result")

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
def classifier_process(i, jobs_queue, output_queue, args):
    # Per-worker tokenisers: an external command if given, Moses otherwise
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)

    # Load the LMs for fluency scoring. These attributes only exist when the
    # metadata defines LMs (set via parser.set_defaults), hence getattr
    lm_filter = None
    if getattr(args, 'source_lm', None) and getattr(args, 'target_lm', None):
        lm_filter = DualLMFluencyFilter(args.lm_type, args.source_lang, args.target_lang)
        lm_filter.load(args.source_lm, args.target_lm, args.lm_filter_stats)

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                feats = []
                lm_scores = []

                # Create the following arrays:
                # valid_sentences: boolean, length of input. States whether each sentence
                #                  passed hard rules and LM fluency filtering
                # feats: vector of tuples, input features to the classifier; its length equals
                #        the number of input sentences that passed hard rules + LM fluency filtering
                valid_sentences = []
                for line in filein:
                    parts = line.split("\t")
                    sl_sentence = None
                    tl_sentence = None
                    if len(parts) >= max(args.scol, args.tcol):
                        sl_sentence = parts[args.scol - 1]
                        tl_sentence = parts[args.tcol - 1]
                    else:
                        logging.error("ERROR: scol ({}) or tcol ({}) index above column number ({})".format(args.scol, args.tcol, len(parts)))

                    # If hard rules are disabled, the second operand of the "and" is always true
                    if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args) == False):
                        lm_score = None
                        if lm_filter:
                            lm_score = lm_filter.score(sl_sentence, tl_sentence)
                        if lm_filter and lm_score < args.lm_threshold and not args.keep_lm_result:
                            valid_sentences.append(False)
                        else:
                            features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                            feats.append([float(v) for v in features])
                            lm_scores.append(lm_score)
                            valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)

                predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)
                piter = iter(predictions)
                if lm_filter:
                    lmiter = iter(lm_scores)

                for line, valid_sentence in zip(filein, valid_sentences):
                    if valid_sentence:
                        p = next(piter)
                        if args.score_only:
                            fileout.write("{0:.3f}".format(p[1]))
                        else:
                            fileout.write(line.strip())
                            fileout.write("\t")
                            fileout.write("{0:.3f}".format(p[1]))
                        if lm_filter and args.keep_lm_result:
                            lm_score = next(lmiter)
                            fileout.write("\t")
                            fileout.write("{0:.3f}".format(lm_score))
                        fileout.write("\n")
                    else:
                        if args.score_only:
                            fileout.write("0")
                        else:
                            fileout.write(line.strip("\n"))
                            fileout.write("\t0")
                            if lm_filter and args.keep_lm_result:
                                fileout.write("\t0")
                        fileout.write("\n")

                ojob = (nblock, fileout.name)

            # Both files are closed when the "with" block exits
            if ojob:
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
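# Wiring sketch for classifier_process() (hypothetical; the actual launcher is
# not part of this section). Each worker consumes (block_number, temp_file)
# jobs, and a falsy sentinel makes it exit via the "Exiting worker" branch:
#
#   jobs_queue = multiprocessing.Queue(maxsize=n_workers)
#   output_queue = multiprocessing.Queue()
#   workers = [multiprocessing.Process(target=classifier_process,
#                                      args=(i, jobs_queue, output_queue, args))
#              for i in range(n_workers)]
#   for w in workers:
#       w.start()
#   ...  # split args.input into blocks, write each to a temp file, enqueue jobs
#   for _ in workers:
#       jobs_queue.put(None)  # one sentinel per worker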