Example #1
File: predict.py  Project: rajaswa/gector
import pickle
from pprint import pprint

# read_lines is provided by gector's utils.helpers
def predict_for_file(input_file,
                     output_file,
                     transforms_file,
                     model,
                     batch_size=32):
    test_data = read_lines(input_file)
    predictions, pred_tokens = [], []
    cnt_corrections = 0
    batch = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt, total_output_tokens = model.handle_batch(batch)
            predictions.extend(preds)
            pred_tokens.extend(total_output_tokens)
            cnt_corrections += cnt
            batch = []
    if batch:
        preds, cnt, total_output_tokens = model.handle_batch(batch)
        predictions.extend(preds)
        pred_tokens.extend(total_output_tokens)
        cnt_corrections += cnt

    with open(output_file, 'w') as f:
        f.write("\n".join([" ".join(x) for x in predictions]) + '\n')

    pprint(pred_tokens)
    # transforms_file is currently unused; the token-level outputs are
    # pickled below instead of being written as text.
    with open('transforms.pkl', 'wb') as f:
        pickle.dump(pred_tokens, f)

    return cnt_corrections
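
For reference, the token sequences pickled above can be read back as follows. A minimal sketch; 'transforms.pkl' is the hard-coded path from the snippet, and the structure of each entry is whatever handle_batch returned:

import pickle

# Reload the per-batch output tokens that the function above dumps.
with open('transforms.pkl', 'rb') as f:
    pred_tokens = pickle.load(f)
print(len(pred_tokens), 'predicted token sequences loaded')
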
Example #2
def predict_for_file(input_file, output_file, model, batch_size=32):
    test_data = read_lines(input_file)
    predictions = []
    cnt_corrections = 0
    batch = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            # this fork's handle_batch returns a dict instead of a (preds, cnt) tuple
            result = model.handle_batch(batch)
            preds = result['pred_tokens_batch']
            cnt = result['correct_cnt']
            predictions.extend(preds)
            cnt_corrections += cnt
            batch = []
    if batch:
        # same dict interface for the final, partially filled batch
        result = model.handle_batch(batch)
        preds = result['pred_tokens_batch']
        cnt = result['correct_cnt']
        predictions.extend(preds)
        cnt_corrections += cnt

    with open(output_file, 'w') as f:
        f.write("\n".join([" ".join(x) for x in predictions]) + '\n')
    return cnt_corrections
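
This fork assumes handle_batch returns a dict rather than the (preds, cnt) tuple used in the other examples. A hypothetical adapter (DictBatchAdapter is not part of gector) that lets a tuple-returning model satisfy the dict interface:

class DictBatchAdapter:
    # Hypothetical wrapper: exposes a tuple-returning handle_batch
    # under the dict keys Example #2 expects.
    def __init__(self, model):
        self.model = model

    def handle_batch(self, batch):
        preds, cnt = self.model.handle_batch(batch)
        return {'pred_tokens_batch': preds, 'correct_cnt': cnt}
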
Example #3
def predict_for_file(input_file,
                     output_file,
                     model,
                     batch_size=32,
                     to_normalize=False):
    test_data = read_lines(input_file)
    predictions = []
    cnt_corrections = 0
    batch = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt = model.handle_batch(batch)
            predictions.extend(preds)
            cnt_corrections += cnt
            batch = []
    if batch:
        preds, cnt = model.handle_batch(batch)
        predictions.extend(preds)
        cnt_corrections += cnt

    result_lines = [" ".join(x) for x in predictions]
    if to_normalize:
        result_lines = [normalize(line) for line in result_lines]

    with open(output_file, 'w') as f:
        f.write("\n".join(result_lines) + '\n')
    return cnt_corrections
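
In the upstream gector repo this function is driven from a main() that constructs a GecBERTModel. A condensed sketch under that assumption; the vocab and model paths are placeholders:

from gector.gec_model import GecBERTModel

# Placeholder paths; GecBERTModel also takes decoding options
# (iterations, min_error_probability, ...) omitted here.
model = GecBERTModel(vocab_path='data/output_vocabulary',
                     model_paths=['model.th'])
cnt_corrections = predict_for_file('input.txt', 'output.txt', model,
                                   batch_size=32, to_normalize=False)
print(f'Produced {cnt_corrections} corrections')
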
Example #4
import time

import numpy as np
import torch

def predict_for_file(input_file, output_file, model, batch_size=32):
    test_data = read_lines(input_file)
    predictions = []
    cnt_corrections = 0
    batch = []
    rec_sent_times = []
    rec_word_times = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            tot_words = sum(len(item) for item in batch)

            # time the forward pass to measure throughput
            start = time.time()
            preds, cnt = model.handle_batch(batch)
            stop = time.time()
            tot_time = stop - start

            torch.cuda.empty_cache()

            rec_sent_times.append(len(batch) / tot_time)
            rec_word_times.append(tot_words / tot_time)
            predictions.extend(preds)
            cnt_corrections += cnt
            batch = []
    if batch:
        start = time.time()
        preds, cnt = model.handle_batch(batch)
        stop = time.time()
        tot_words = sum(len(item) for item in batch)
        tot_time = stop - start
        rec_sent_times.append(len(batch) / tot_time)
        rec_word_times.append(tot_words / tot_time)

        predictions.extend(preds)
        cnt_corrections += cnt

    print('Mean sent/sec:', np.mean(rec_sent_times), 'Mean words/sec:', np.mean(rec_word_times))
    print('Median sent/sec:', np.median(rec_sent_times), 'Median words/sec:', np.median(rec_word_times))

    with open(output_file, 'w') as f:
        f.write("\n".join([" ".join(x) for x in predictions]) + '\n')
    return cnt_corrections
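
The removed "first batch" debug print hints at why both mean and median are reported: the first batch usually includes CUDA warm-up and skews the mean. A small sketch (throughput_stats is a hypothetical helper) that drops the warm-up batch before averaging:

import numpy as np

def throughput_stats(samples):
    # Hypothetical: exclude the first (warm-up) measurement when possible.
    steady = samples[1:] if len(samples) > 1 else samples
    return np.mean(steady), np.median(steady)
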
Example #5
    def correct_sentence_by_file(self, input_file='./data/predict_for_file/input.txt',
                                 output_file='./data/predict_for_file/output.txt', batch_size=32):
        test_data = read_lines(input_file)
        predictions = []
        cnt_corrections = 0
        batch = []
        for sent in test_data:
            batch.append(self.language_checker(sent).split())
            if len(batch) == batch_size:
                preds, cnt = self.model.handle_batch(batch)
                predictions.extend(preds)
                cnt_corrections += cnt
                batch = []
        if batch:
            preds, cnt = self.model.handle_batch(batch)
            predictions.extend(preds)
            cnt_corrections += cnt

        with open(output_file, 'w') as f:
            f.write("\n".join([" ".join(x) for x in predictions]) + '\n')
        return cnt_corrections
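
The snippet does not define self.language_checker; from its usage it must map a string to a string before tokenization. A hypothetical no-op stub showing the expected contract:

    def language_checker(self, sent):
        # Hypothetical placeholder: the real method presumably cleans or
        # pre-filters the sentence; it must return a string to be .split().
        return sent
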
Example #6
def predict_for_file(input_file, output_file, model, batch_size=32):
    test_data = read_lines(input_file)
    predictions = []
    cnt_corrections = 0
    batch = []
    count = 0
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt = model.handle_batch(batch, count, batch_size)
            predictions.extend(preds)
            cnt_corrections += cnt
            batch = []
            count += 1
    if batch:
        preds, cnt = model.handle_batch(batch, count, batch_size)
        predictions.extend(preds)
        cnt_corrections += cnt

    with open(output_file, 'w') as f:
        f.write("\n".join([" ".join(x) for x in predictions]) + '\n')
    return cnt_corrections

import datetime
import time

def predict_for_file(input_file,
                     output_file,
                     model,
                     batch_size=32,
                     save_logs=0):
    test_data = read_lines(input_file)
    # predictions are streamed to output_file batch by batch, not accumulated
    cnt_corrections = 0
    batch = []
    with open(output_file, 'w') as f:
        f.write("")

    if save_logs:
        with open(output_file + ".log", 'w') as f:
            f.write("")

        with open(output_file + ".check_correction", 'w') as f:
            f.write("")

    predicting_start_time = time.time()

    total_lines = len(test_data)
    processed_lines = 0
    corrected_lines = 0

    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt = model.handle_batch(batch)

            processed_lines += batch_size

            pred_sents = [" ".join(x) for x in preds]

            with open(output_file, 'a') as f:
                f.write("\n".join(pred_sents) + '\n')

            cnt_corrections += cnt

            if save_logs:
                checked_lines = get_corrected_lines_for_batch(batch, preds)
                corrected_lines += sum(checked_lines)
                checked_lines = [str(s) for s in checked_lines]
                with open(output_file + ".check_correction", 'a') as f:
                    f.write("\n".join(checked_lines) + '\n')

                predicting_elapsed_time = time.time() - predicting_start_time
                prediction_duration = datetime.timedelta(
                    seconds=predicting_elapsed_time)

                with open(output_file + ".log", 'w') as f:
                    f.write(
                        generate_text_for_log(processed_lines, total_lines,
                                              corrected_lines,
                                              prediction_duration,
                                              cnt_corrections))

            batch = []
    if batch:
        preds, cnt = model.handle_batch(batch)
        processed_lines += len(batch)
        pred_sents = [" ".join(x) for x in preds]

        with open(output_file, 'a') as f:
            f.write("\n".join(pred_sents) + '\n')

        cnt_corrections += cnt

        if save_logs:
            checked_lines = get_corrected_lines_for_batch(batch, preds)
            corrected_lines += sum(checked_lines)
            checked_lines = [str(s) for s in checked_lines]

            with open(output_file + ".check_correction", 'a') as f:
                f.write("\n".join(checked_lines) + '\n')

            predicting_elapsed_time = time.time() - predicting_start_time
            prediction_duration = datetime.timedelta(
                seconds=predicting_elapsed_time)

            with open(output_file + ".log", 'w') as f:
                f.write(
                    generate_text_for_log(processed_lines, total_lines,
                                          corrected_lines, prediction_duration,
                                          cnt_corrections))

    predicting_elapsed_time = time.time() - predicting_start_time
    prediction_duration = datetime.timedelta(seconds=predicting_elapsed_time)

    print(prediction_duration)

    return cnt_corrections
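
This last variant relies on helpers the snippet does not include. From the usage above (per-line values that are summed, then written one per line), get_corrected_lines_for_batch plausibly flags which sentences the model changed; a sketch consistent only with that usage, not taken from the original source:

def get_corrected_lines_for_batch(batch, preds):
    # Inferred from usage: return 1 for each sentence the model altered
    # and 0 otherwise, so callers can sum them and log one flag per line.
    return [int(src != pred) for src, pred in zip(batch, preds)]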