Example #1
def compile_file(self, file_path):
    # Compile a single source file into "<name>__.xml" next to the input.
    print("Compiling", file_path, "...")
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    dir_name = os.path.split(file_path)[0]
    output_file_name = os.path.join(dir_name, file_name + "__.xml")
    with open(output_file_name, "w") as output_file:
        tokenizer = Tokenizer(file_path)
        try:
            compiler = CompilationEngine(tokenizer, output_file)
            compiler.compile()
            print("Compilation successful!", file_path, "=>",
                  output_file_name)
        except CompilationError as err:
            # Re-raise with context; str(err) works whether or not the
            # exception class defines a .message attribute.
            raise CompilationError("ERROR: " + str(err)) from err
        finally:
            # Close the tokenizer on both the success and the error path.
            tokenizer.close()
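
A minimal usage sketch (not from the original project): driving a compile_file method like the one above over a directory of sources. The analyzer argument, the compile_directory helper, and the .jack extension are assumptions for illustration; CompilationError is the exception used above.

import os
import sys

def compile_directory(analyzer, dir_path):
    # Compile every .jack source in dir_path; stop on the first failure.
    for entry in sorted(os.listdir(dir_path)):
        if entry.endswith(".jack"):
            try:
                analyzer.compile_file(os.path.join(dir_path, entry))
            except CompilationError as err:
                print(err, file=sys.stderr)
                return False
    return True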
Example #2
# Standard-library imports used by this snippet; Tokenizer and
# feature_extract come from the surrounding project.
import logging
import os
from tempfile import NamedTemporaryFile


def worker_process(i, jobs_queue, output_queue, args):
    # Each worker builds its own source/target tokenizer pair.
    source_tokenizer = Tokenizer(args.source_tokenizer_command,
                                 args.source_lang)
    target_tokenizer = Tokenizer(args.target_tokenizer_command,
                                 args.target_lang)

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {}".format(job.__repr__()))
            nblock, filein_name, label = job

            with open(filein_name, 'r') as filein, NamedTemporaryFile(
                    mode="w", delete=False) as fileout:
                logging.debug("Filtering: creating temporary file {}".format(
                    fileout.name))
                # The first two tab-separated fields are the source and
                # target sentences.
                for line in filein:
                    srcsen, trgsen = line.split("\t")[:2]
                    trgsen = trgsen.strip()
                    features = feature_extract(srcsen, trgsen,
                                               source_tokenizer,
                                               target_tokenizer, args)

                    # One tab-separated feature vector per line, with the
                    # label appended as the last field.
                    for feat in features:
                        fileout.write("{}".format(feat))
                        fileout.write("\t")
                    fileout.write("{}".format(label))
                    fileout.write("\n")
                # Close explicitly so the data is on disk before the reducer
                # receives the temporary file name.
                ojob = (nblock, fileout.name)
                fileout.close()
                filein.close()
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            # A None job is the termination sentinel sent by the parent.
            logging.debug("Exiting worker")
            source_tokenizer.close()
            target_tokenizer.close()
            break
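
A minimal wiring sketch (not part of the original project) showing how such a worker is typically driven: each job is a (nblock, filein_name, label) tuple, and one None sentinel per worker tells it to exit. run_workers, job_list and n_workers are hypothetical names; args must carry the tokenizer settings used above.

from multiprocessing import Process, Queue

def run_workers(args, job_list, n_workers=4):
    jobs_queue = Queue()
    output_queue = Queue()
    workers = []
    for i in range(n_workers):
        w = Process(target=worker_process,
                    args=(i, jobs_queue, output_queue, args))
        w.daemon = True
        w.start()
        workers.append(w)
    for job in job_list:                  # (nblock, filein_name, label)
        jobs_queue.put(job)
    for _ in workers:                     # one sentinel per worker
        jobs_queue.put(None)
    results = [output_queue.get() for _ in job_list]  # (nblock, tmp_name)
    for w in workers:
        w.join()
    return sorted(results)                # restore block order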
Example #3
# Standard-library imports used by this snippet; the remaining helpers
# (Tokenizer, ProbabilisticDictionary, WordZipfFreqDist, train_porn_removal,
#  train_fluency_filter, build_noisy_set, map_process, reduce_process,
#  worker_process, train_classifier, write_metadata, Features, ...) come
# from the surrounding project.
import logging
import os
import sys
from multiprocessing import Process, Queue
from tempfile import NamedTemporaryFile, TemporaryFile
from timeit import default_timer


def perform_training(args):
    time_start = default_timer()
    logging.debug("Starting process")
    logging.debug("Running {0} workers at {1} rows per block".format(
        args.processes, args.block_size))

    process_count = max(1, args.processes)
    maxsize = 1000 * process_count

    output_queue = Queue(maxsize=maxsize)
    worker_count = process_count

    # Read the input into a named temporary file: it may have to be read
    # several times, which would not work if it were sys.stdin.

    count_input_lines = 0
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
        count_input_lines = count_input_lines + 1
    input.close()

    if count_input_lines < 10000:
        logging.error(
            "Training corpus must be at least 10K sentences long (was {}).".
            format(count_input_lines))
        sys.exit(1)

    # Load word-frequency dictionaries (None when not provided); the source
    # side mirrors the target side so the attribute always exists.
    if args.source_word_freqs:
        args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs)
    else:
        args.sl_word_freqs = None
    if args.target_word_freqs:
        args.tl_word_freqs = WordZipfFreqDistDoubleLinked(
            args.target_word_freqs)
    else:
        args.tl_word_freqs = None

    # Train the porn removal classifier
    train_porn_removal(args)

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        input_f.seek(0)

        # Shuffle and get length ratio
        noisy_target_tokenizer = Tokenizer(args.target_tokenizer_command,
                                           args.target_lang)
        total_size, length_ratio, good_sentences, wrong_sentences = build_noisy_set(
            args.input, count_input_lines // 2, count_input_lines // 2,
            args.wrong_examples_file, args.tl_word_freqs,
            noisy_target_tokenizer)
        noisy_target_tokenizer.close()
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load the probabilistic translation dictionaries (both directions)
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    logging.info("Start computing features.")
    features_file = TemporaryFile('w+')
    # Start reducer
    reduce = Process(target=reduce_process, args=(output_queue, features_file))
    reduce.start()

    # Start workers
    jobs_queue = Queue(maxsize=maxsize)
    workers = []
    for i in range(worker_count):
        worker = Process(target=worker_process,
                         args=(i, jobs_queue, output_queue, args))
        worker.daemon = True  # dies with the parent process
        worker.start()
        workers.append(worker)

    # Mapper process (foreground - parent): good sentences are labelled 1,
    # wrong/noisy sentences 0; block numbering continues across both passes.
    last_block = map_process(good_sentences, args.block_size, jobs_queue, 1, 0)
    good_sentences.close()

    map_process(wrong_sentences, args.block_size, jobs_queue, 0,
                last_block + 1)
    wrong_sentences.close()

    # Worker termination: one None sentinel per worker
    for _ in workers:
        jobs_queue.put(None)

    logging.info("End computing features.")

    for w in workers:
        w.join()

    # Reducer termination
    output_queue.put(None)
    reduce.join()

    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " +
                     os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training.")

    # Use 90% of the input to train and 10% for test
    if args.wrong_examples_file is not None:
        good_examples = int(count_input_lines * 0.9)
        good_examples_test = int(count_input_lines * 0.1)
        wrong_examples = 0
        with args.wrong_examples_file as file:
            wrong_examples = sum(1 for line in file)
        wrong_examples_test = min(good_examples_test,
                                  int(wrong_examples * 0.1))
    else:
        good_examples = int(count_input_lines // 2 * 0.9)
        good_examples_test = int(count_input_lines // 2 * 0.1)
        wrong_examples = good_examples
        wrong_examples_test = good_examples_test

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, TemporaryFile(
            "w+") as features_test:
        # features_file lists all good examples first, then all wrong ones
        # (blocks were mapped in that order), so split each class into its
        # own train/test slice.
        nline = 0
        for line in features_file:
            if nline < good_examples:
                features_train.write(line)
            elif nline < good_examples + good_examples_test:
                features_test.write(line)
            elif nline < good_examples + good_examples_test + wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1

        features_train.flush()
        features_test.flush()

        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(
            features_train, features_test, args.classifier_type,
            args.classifier,
            Features(None, args.disable_features_quest,
                     args.disable_lang_ident).titles)
        features_train.close()
        features_test.close()

    logging.info("End training.")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished.")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f}s.".format(elapsed_time))
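
reduce_process itself is not shown in these examples; the sketch below is an assumption about what a compatible reducer could look like, not the project's implementation. It consumes (nblock, temporary_file_name) pairs from output_queue, appends the blocks to the shared features file in block order, removes the temporary files, and stops at the None sentinel.

import heapq
import os

def reduce_process_sketch(output_queue, output_file):
    pending = []          # out-of-order blocks, keyed by block number
    next_block = 0
    while True:
        job = output_queue.get()
        if job is None:   # sentinel sent after all workers have finished
            break
        heapq.heappush(pending, job)              # job == (nblock, tmp_name)
        while pending and pending[0][0] == next_block:
            _, tmp_name = heapq.heappop(pending)
            with open(tmp_name) as tmp:
                output_file.write(tmp.read())
            os.unlink(tmp_name)                   # worker used delete=False
            next_block += 1
    output_file.flush()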