def compile_file(self, file_path):
    print("Compiling", file_path, "...")
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    dir_name = os.path.split(file_path)[0]
    output_file_name = os.path.join(dir_name, file_name + "__.xml")
    with open(output_file_name, "w") as output_file:
        tokenizer = Tokenizer(file_path)
        try:
            compiler = CompilationEngine(tokenizer, output_file)
            compiler.compile()
            print("Compilation successful!", file_path, "=>", output_file_name)
        except CompilationError as err:
            tokenizer.close()
            raise CompilationError("ERROR: " + err.message)
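# Usage sketch, assuming a driver object that walks the source files; the class
# name JackAnalyzer, the source_dir variable and the ".jack" extension are
# illustrative assumptions, not part of this module:
#
#   analyzer = JackAnalyzer()
#   for path in glob.glob(os.path.join(source_dir, "*.jack")):
#       analyzer.compile_file(path)   # writes <name>__.xml next to each source file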
def worker_process(i, jobs_queue, output_queue, args):
    source_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
    target_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {}".format(job.__repr__()))
            nblock, filein_name, label = job

            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                for i in filein:
                    srcsen, trgsen = i.split("\t")[:2]
                    trgsen = trgsen.strip()
                    features = feature_extract(srcsen, trgsen, source_tokenizer, target_tokenizer, args)

                    for j in features:
                        fileout.write("{}".format(j))
                        fileout.write("\t")
                    fileout.write("{}".format(label))
                    fileout.write("\n")
                ojob = (nblock, fileout.name)
                fileout.close()
                filein.close()
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            source_tokenizer.close()
            target_tokenizer.close()
            break
def perform_training(args):
    time_start = default_timer()
    logging.debug("Starting process")
    logging.debug("Running {0} workers at {1} rows per block".format(args.processes, args.block_size))

    process_count = max(1, args.processes)
    maxsize = 1000 * process_count

    output_queue = Queue(maxsize=maxsize)
    worker_count = process_count

    # Read input into a named temporary file:
    # we may need to read it multiple times, which would be problematic if it were sys.stdin
    count_input_lines = 0
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
        count_input_lines = count_input_lines + 1
    input.close()

    if count_input_lines < 10000:
        logging.error("Training corpus must be at least 10K sentences long (was {}).".format(count_input_lines))
        sys.exit(1)

    # Load word frequency dictionaries
    if args.source_word_freqs:
        args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs)
    if args.target_word_freqs:
        args.tl_word_freqs = WordZipfFreqDistDoubleLinked(args.target_word_freqs)
    else:
        args.tl_word_freqs = None

    # Train porn removal classifier
    train_porn_removal(args)

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        input_f.seek(0)

        # Shuffle and get length ratio
        noisy_target_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
        total_size, length_ratio, good_sentences, wrong_sentences = build_noisy_set(
            args.input, count_input_lines // 2, count_input_lines // 2,
            args.wrong_examples_file, args.tl_word_freqs, noisy_target_tokenizer)
        noisy_target_tokenizer.close()
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load probabilistic dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    logging.info("Start computing features.")
    features_file = TemporaryFile('w+')

    # Start reducer
    reduce = Process(target=reduce_process, args=(output_queue, features_file))
    reduce.start()

    # Start workers
    jobs_queue = Queue(maxsize=maxsize)
    workers = []
    for i in range(worker_count):
        worker = Process(target=worker_process,
                         args=(i, jobs_queue, output_queue, args))
        worker.daemon = True  # dies with the parent process
        worker.start()
        workers.append(worker)

    # Mapper process (foreground - parent)
    last_block = map_process(good_sentences, args.block_size, jobs_queue, 1, 0)
    good_sentences.close()

    map_process(wrong_sentences, args.block_size, jobs_queue, 0, last_block + 1)
    wrong_sentences.close()

    # Worker termination
    for _ in workers:
        jobs_queue.put(None)

    logging.info("End computing features.")

    for w in workers:
        w.join()

    # Reducer termination
    output_queue.put(None)
    reduce.join()

    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " + os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training.")

    # Use 90% of the input to train and 10% for test
    if args.wrong_examples_file is not None:
        good_examples = int(count_input_lines * 0.9)
        good_examples_test = int(count_input_lines * 0.1)
        wrong_examples = 0
        with args.wrong_examples_file as file:
            wrong_examples = sum(1 for line in file)
        wrong_examples_test = min(good_examples_test, int(wrong_examples * 0.1))
    else:
        good_examples = int(count_input_lines // 2 * 0.9)
        good_examples_test = int(count_input_lines // 2 * 0.1)
        wrong_examples = good_examples
        wrong_examples_test = good_examples_test

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, TemporaryFile("w+") as features_test:
        # Split the computed features into train and test portions:
        # good examples first, then wrong examples, in the order written by the reducer
        nline = 0
        for line in features_file:
            if nline < good_examples:
                features_train.write(line)
            elif nline < good_examples + good_examples_test:
                features_test.write(line)
            elif nline < good_examples + good_examples_test + wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1

        features_train.flush()
        features_test.flush()

        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(
            features_train, features_test, args.classifier_type, args.classifier,
            Features(None, args.disable_features_quest, args.disable_lang_ident).titles)
        features_train.close()
        features_test.close()

    logging.info("End training.")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished.")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f}s.".format(elapsed_time))
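# Usage sketch, assuming a hypothetical initialization() helper that parses the
# command-line options into the argparse namespace perform_training() expects
# (input, processes, block_size, classifier_type, classifier, metadata, ...):
#
#   if __name__ == "__main__":
#       args = initialization()
#       perform_training(args)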