def predict_for_file(input_file, output_file, transforms_file, model, batch_size=32):
    """Run batched GEC prediction over *input_file* and persist results.

    Writes the corrected sentences (space-joined tokens, one per line) to
    *output_file*, pretty-prints the per-sentence output tokens, and pickles
    them to ``transforms.pkl``.

    Args:
        input_file: path to the file of source sentences (one per line).
        output_file: path the corrected sentences are written to.
        transforms_file: unused here; kept for interface compatibility
            (the write to it is currently disabled).
        model: object whose ``handle_batch(batch)`` returns
            ``(pred_token_lists, correction_count, output_token_lists)``.
        batch_size: number of sentences per model call.

    Returns:
        Total number of corrections reported by the model.
    """
    test_data = read_lines(input_file)
    predictions, pred_tokens = [], []
    cnt_corrections = 0
    batch = []
    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt, total_output_tokens = model.handle_batch(batch)
            predictions.extend(preds)
            pred_tokens.extend(total_output_tokens)
            cnt_corrections += cnt
            batch = []
    if batch:
        # BUG FIX: the original unpacked this result into `edits` but then
        # extended pred_tokens with the stale `total_output_tokens` from the
        # previous full batch (a NameError when the input held fewer than
        # batch_size sentences). Use this batch's own output tokens.
        preds, cnt, total_output_tokens = model.handle_batch(batch)
        predictions.extend(preds)
        pred_tokens.extend(total_output_tokens)
        cnt_corrections += cnt
    with open(output_file, 'w') as f:
        f.write("\n".join([" ".join(x) for x in predictions]) + '\n')
    pprint(pred_tokens)
    # Writing the transforms as text is disabled; they are pickled below instead.
    # with open(transforms_file, 'w') as f:
    #     f.write("\n".join([" ".join(x) for x in pred_edits]) + '\n')
    with open('transforms.pkl', 'wb') as f:
        pickle.dump(pred_tokens, f)
    return cnt_corrections
def predict_for_file(input_file, output_file, model, batch_size=32):
    """Correct every sentence in *input_file* in batches and write the result.

    Args:
        input_file: path to source sentences, one per line.
        output_file: path the corrected sentences are written to.
        model: object whose ``handle_batch(batch)`` returns a dict with
            ``'pred_tokens_batch'`` (token lists) and ``'correct_cnt'``.
        batch_size: number of sentences per model call.

    Returns:
        Total number of corrections reported by the model.
    """

    def _run_batch(chunk):
        # Unpack the model's dict-shaped result into (token lists, count).
        outcome = model.handle_batch(chunk)
        return outcome['pred_tokens_batch'], outcome['correct_cnt']

    predictions = []
    cnt_corrections = 0
    batch = []
    for line in read_lines(input_file):
        batch.append(line.split())
        if len(batch) == batch_size:
            token_lists, fixed = _run_batch(batch)
            predictions.extend(token_lists)
            cnt_corrections += fixed
            batch = []
    if batch:
        token_lists, fixed = _run_batch(batch)
        predictions.extend(token_lists)
        cnt_corrections += fixed
    with open(output_file, 'w') as f:
        f.write("\n".join(" ".join(tokens) for tokens in predictions) + '\n')
    return cnt_corrections
def predict_for_file(input_file, output_file, model, batch_size=32, to_normalize=False):
    """Batch-correct *input_file* and write one corrected sentence per line.

    Args:
        input_file: path to source sentences, one per line.
        output_file: path the corrected sentences are written to.
        model: object whose ``handle_batch(batch)`` returns
            ``(pred_token_lists, correction_count)``.
        batch_size: number of sentences per model call.
        to_normalize: when True, pass each output line through ``normalize``
            before writing.

    Returns:
        Total number of corrections reported by the model.
    """
    predictions = []
    cnt_corrections = 0
    batch = []
    for sentence in read_lines(input_file):
        batch.append(sentence.split())
        if len(batch) < batch_size:
            continue
        token_lists, fixed = model.handle_batch(batch)
        predictions.extend(token_lists)
        cnt_corrections += fixed
        batch = []
    if batch:
        token_lists, fixed = model.handle_batch(batch)
        predictions.extend(token_lists)
        cnt_corrections += fixed
    result_lines = [" ".join(tokens) for tokens in predictions]
    if to_normalize:
        result_lines = [normalize(line) for line in result_lines]
    with open(output_file, 'w') as f:
        f.write("\n".join(result_lines) + '\n')
    return cnt_corrections
def predict_for_file(input_file, output_file, model, batch_size=32):
    """Batch-correct *input_file* while measuring model throughput.

    Records per-batch sentences/sec and words/sec, prints the mean and median
    of both, and writes corrected sentences to *output_file*. Calls
    ``torch.cuda.empty_cache()`` after each full batch.

    Args:
        input_file: path to source sentences, one per line.
        output_file: path the corrected sentences are written to.
        model: object whose ``handle_batch(batch)`` returns
            ``(pred_token_lists, correction_count)``.
        batch_size: number of sentences per model call.

    Returns:
        Total number of corrections reported by the model.
    """
    predictions = []
    cnt_corrections = 0
    batch = []
    sent_rates = []
    word_rates = []

    def _timed_predict(chunk):
        # Run one batch, returning the model output plus its wall-clock time.
        t0 = time.time()
        token_lists, fixed = model.handle_batch(chunk)
        return token_lists, fixed, time.time() - t0

    for sentence in read_lines(input_file):
        batch.append(sentence.split())
        if len(batch) < batch_size:
            continue
        word_total = sum(len(tokens) for tokens in batch)
        token_lists, fixed, elapsed = _timed_predict(batch)
        # Free cached GPU memory between batches so the measurement is not
        # distorted by allocator growth.
        torch.cuda.empty_cache()
        sent_rates.append(len(batch) / elapsed)
        word_rates.append(word_total / elapsed)
        predictions.extend(token_lists)
        cnt_corrections += fixed
        batch = []
    if batch:
        word_total = sum(len(tokens) for tokens in batch)
        token_lists, fixed, elapsed = _timed_predict(batch)
        sent_rates.append(len(batch) / elapsed)
        word_rates.append(word_total / elapsed)
        predictions.extend(token_lists)
        cnt_corrections += fixed
    print('Mean sent/sec:', np.mean(sent_rates), 'Mean words/sec:', np.mean(word_rates))
    print('Median sent/sec:', np.median(sent_rates), 'Median words/sec:', np.median(word_rates))
    with open(output_file, 'w') as f:
        f.write("\n".join(" ".join(tokens) for tokens in predictions) + '\n')
    return cnt_corrections
def correct_sentence_by_file(self, input_file='./data/predict_for_file/input.txt', output_file='./data/predict_for_file/output.txt', batch_size=32):
    """Correct every sentence in *input_file* and write the results.

    Each line is first passed through ``self.language_checker`` before being
    tokenized and batched into ``self.model.handle_batch``.

    Args:
        input_file: path to source sentences, one per line.
        output_file: path the corrected sentences are written to.
        batch_size: number of sentences per model call.

    Returns:
        Total number of corrections reported by the model.
    """
    predictions = []
    cnt_corrections = 0
    batch = []
    for raw_line in read_lines(input_file):
        # Pre-clean with the language checker, then tokenize on whitespace.
        batch.append(self.language_checker(raw_line).split())
        if len(batch) < batch_size:
            continue
        token_lists, fixed = self.model.handle_batch(batch)
        predictions.extend(token_lists)
        cnt_corrections += fixed
        batch = []
    if batch:
        token_lists, fixed = self.model.handle_batch(batch)
        predictions.extend(token_lists)
        cnt_corrections += fixed
    with open(output_file, 'w') as f:
        f.write("\n".join(" ".join(tokens) for tokens in predictions) + '\n')
    return cnt_corrections
def predict_for_file(input_file, output_file, model, batch_size=32):
    """Batch-correct *input_file*, passing the batch index to the model.

    ``model.handle_batch`` receives ``(batch, batch_index, batch_size)``;
    the index counts completed full batches (the final partial batch, if any,
    gets the next index).

    Args:
        input_file: path to source sentences, one per line.
        output_file: path the corrected sentences are written to.
        model: object whose ``handle_batch(batch, count, batch_size)`` returns
            ``(pred_token_lists, correction_count)``.
        batch_size: number of sentences per model call.

    Returns:
        Total number of corrections reported by the model.
    """
    predictions = []
    cnt_corrections = 0
    batch = []
    batch_index = 0
    for sentence in read_lines(input_file):
        batch.append(sentence.split())
        if len(batch) < batch_size:
            continue
        token_lists, fixed = model.handle_batch(batch, batch_index, batch_size)
        predictions.extend(token_lists)
        cnt_corrections += fixed
        batch = []
        batch_index += 1
    if batch:
        token_lists, fixed = model.handle_batch(batch, batch_index, batch_size)
        predictions.extend(token_lists)
        cnt_corrections += fixed
    with open(output_file, 'w') as f:
        f.write("\n".join(" ".join(tokens) for tokens in predictions) + '\n')
    return cnt_corrections
def predict_for_file(input_file, output_file, model, batch_size=32, save_logs=0):
    """Batch-correct *input_file*, appending results incrementally to disk.

    Output is appended batch-by-batch so long runs produce partial results.
    When *save_logs* is truthy, two side files are also maintained:
    ``<output_file>.check_correction`` (one flag per sentence, whether it was
    changed) and ``<output_file>.log`` (progress summary, rewritten — mode
    'w' — after every batch so it always reflects current progress).

    Args:
        input_file: path to source sentences, one per line.
        output_file: path the corrected sentences are appended to.
        model: object whose ``handle_batch(batch)`` returns
            ``(pred_token_lists, correction_count)``.
        batch_size: number of sentences per model call.
        save_logs: when truthy, write the ``.log`` / ``.check_correction``
            side files.

    Returns:
        Total number of corrections reported by the model.
    """
    test_data = read_lines(input_file)
    cnt_corrections = 0
    batch = []
    # Truncate the output (and side files) up front since we append below.
    with open(output_file, 'w') as f:
        f.write("")
    if save_logs:
        with open(output_file + ".log", 'w') as f:
            f.write("")
        with open(output_file + ".check_correction", 'w') as f:
            f.write("")
    predicting_start_time = time.time()
    total_lines = len(test_data)
    processed_lines = 0
    corrected_lines = 0

    def _write_logs(batch_sents, preds):
        # Append per-sentence correction flags and rewrite the progress log.
        nonlocal corrected_lines
        checked_lines = get_corrected_lines_for_batch(batch_sents, preds)
        corrected_lines += sum(checked_lines)
        with open(output_file + ".check_correction", 'a') as f:
            f.write("\n".join(str(s) for s in checked_lines) + '\n')
        prediction_duration = datetime.timedelta(
            seconds=time.time() - predicting_start_time)
        with open(output_file + ".log", 'w') as f:
            f.write(
                generate_text_for_log(processed_lines, total_lines,
                                      corrected_lines, prediction_duration,
                                      cnt_corrections))

    for sent in test_data:
        batch.append(sent.split())
        if len(batch) == batch_size:
            preds, cnt = model.handle_batch(batch)
            processed_lines += batch_size
            cnt_corrections += cnt
            with open(output_file, 'a') as f:
                f.write("\n".join(" ".join(x) for x in preds) + '\n')
            if save_logs:
                _write_logs(batch, preds)
            batch = []
    if batch:
        preds, cnt = model.handle_batch(batch)
        processed_lines += len(batch)
        cnt_corrections += cnt
        with open(output_file, 'a') as f:
            f.write("\n".join(" ".join(x) for x in preds) + '\n')
        # BUG FIX: the original computed get_corrected_lines_for_batch and
        # accumulated corrected_lines for the tail batch even when save_logs
        # was off, inconsistent with the main loop; guard it like the loop does.
        if save_logs:
            _write_logs(batch, preds)
    prediction_duration = datetime.timedelta(
        seconds=time.time() - predicting_start_time)
    print(prediction_duration)
    return cnt_corrections