          ', '.join(str(cm[true][pred] / float(1)) for pred in categories))
# for true in cm:
#     print(true + ', ' + ', '.join(pred + ":" + str(cm[true][pred])
#                                   for pred in cm[true]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Preprocess text before it is used for language model')
    parser.add_argument('--parse-fillin-orig-noarticles', action='store_true')
    parser.add_argument('--show-automatic-parse-candidates-for-prediction',
                        action='store_true')
    parser.add_argument('--all-together', action='store_true')
    args = parser.parse_args()

    orig_sents = get_sentences(
        os.path.join(MANUAL_DATA_PATH, 'fill_in_orig.txt'), sent_tokenize=True)

    if args.parse_fillin_orig_noarticles:
        orig_sents = normalize_special_characters(orig_sents)
        trees = parse_sents(orig_sents)
        with open(os.path.join(MANUAL_DATA_PATH,
                               'fill_in_orig_parsed.txt'), 'w+') as f:
            for t in trees:
                f.write(t.to_string() + '\n')
        exit()

    if args.show_automatic_parse_candidates_for_prediction:
        with codecs.open(os.path.join('code', 'corrector', 'tmp_parses.txt'),
                         'r', 'utf-8') as f:
            # print[...] was a syntax error; print the parse candidates as a list
            print([t for t in f.readlines()])
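# normalize_special_characters is imported from elsewhere and not shown in
# this fragment. A minimal sketch of what such a pre-parsing step plausibly
# does; the PTB-style bracket escapes and quote replacements below are
# assumptions, not the project's actual mapping:
def normalize_special_characters_sketch(sents):
    """Rewrite characters that tend to confuse constituency parsers."""
    replacements = {u'(': u'-LRB-', u')': u'-RRB-',
                    u'\u201c': u'"', u'\u201d': u'"',
                    u'\u2018': u"'", u'\u2019': u"'"}
    normalized = []
    for s in sents:
        for old, new in replacements.items():
            s = s.replace(old, new)  # hypothetical character mapping
        normalized.append(s)
    return normalized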
    # ad-hoc fixes for the clumsy formatting produced during prediction:
    sent = sent.replace('- owned', '-owned')
    sent = sent.replace('- a-year', '-a-year')
    sent = sent.replace('Co. . ', 'Co. ')
    sent = sent.replace('9:30 -10', '9:30-10')
    sent = sent.replace('1/2-inch', '1/2 - inch')
    return sent


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--all-together', action='store_true')
    args = parser.parse_args()

    orig_sents = get_sentences(
        os.path.join(PENN_DATA_PATH, 'penn_test_raw_orig.txt'))

    if args.all_together:
        predict_sents = [
            get_sentences(os.path.join(PENN_DATA_PATH, predict_file))
            for predict_file in ('solution_LOGREG.txt', 'solution_XGB.txt',
                                 'solution_LM_50_05.txt', 'solution_LM_0_0.txt')
        ]
        for annotator in predict_sents:
            assert len(orig_sents) == len(annotator)
        for sent_nb in range(len(orig_sents)):
            compare_multiple_predictions(orig_sents[sent_nb], [
                correct_stupid_formating(annotator[sent_nb])
                for annotator in predict_sents
            ])
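# compare_multiple_predictions is defined elsewhere in the project. A minimal
# sketch of the side-by-side comparison it presumably performs; the output
# format and the difference marker are assumptions:
def compare_multiple_predictions_sketch(orig_sent, predicted_sents):
    """Print the original sentence next to each system's prediction and
    flag the predictions that differ from the original."""
    print(u' ORIG : ' + orig_sent)
    for i, pred in enumerate(predicted_sents):
        marker = u'*' if pred != orig_sent else u' '  # '*' = differs from orig
        print(u'{}PRED{}: {}'.format(marker, i, pred))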
# for l in f_in:
#     sents.extend(tokenizer.tokenize(l.strip()))
#
# print "... removing articles"
# sents = [remove_articles(s) for s in sents if len(s.split()) <= MAXLENGTH]
#
# print "... parsing"
# trees = parser.raw_parse_sents(sents[:200])
# trees = [Tree.from_string(str(t_)) for t in trees for t_ in t]
# print trees
#
# # print "...extracting features"
# # df = extract_features(trees)
#
# # print "...predicting"
# # y = model.predict(df)
#
# # format_output(y, trees)


if __name__ == '__main__':
    sents = get_sentences(sys.argv[1], sent_tokenize=True)
    # sents = normalize_special_characters(sents)

    # parse the original, so we don't have to deal with quirks like 'LLR'?:
    trees = parse_sents(sents)
    # sents = remove_articles(sents)
    # print(sents[:200])
    # trees = parse_sents(sents)
    # for t in trees:
    #     print(t.to_string())
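# parse_sents is imported from elsewhere; the commented-out block above
# (parser.raw_parse_sents, Tree.from_string) suggests it wraps an NLTK-style
# parser interface. A minimal sketch of that wrapper using NLTK's
# CoreNLPParser -- a stand-in for whatever parser the project actually uses,
# assuming a CoreNLP server is already running on localhost:9000:
from nltk.parse.corenlp import CoreNLPParser
from nltk.tree import Tree as NltkTree

def parse_sents_sketch(sents):
    """Parse raw sentences into constituency trees via a CoreNLP server."""
    corenlp = CoreNLPParser(url='http://localhost:9000')
    return [NltkTree.fromstring(str(t))
            for parse_iter in corenlp.raw_parse_sents(sents)
            for t in parse_iter]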
                SETTINGS.get('paths', 'model'),
                'logreg_ovr_binarized_{}_allfeatures_263088.pkl'.format(
                    target)), 'rb'))
        assert m.classes_[0] == target
        predictions[:, i] = m.predict_proba(x)[:, 0]
    return [target_tuple[idx] for idx in np.argmax(predictions, axis=1)]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--model-type', required=True)
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    args = parser.parse_args()

    sents = remove_articles(get_sentences(args.input, sent_tokenize=True))
    model = None
    if args.model_type == 'xgboost':
        sents = remove_articles(get_sentences(args.input, sent_tokenize=False))
        model = pickle.load(open(XGB_MODEL_PATH, 'rb'))
    # NOTE: model is only loaded for --model-type xgboost; any other value
    # leaves model as None and model.predict() below will fail.

    trees = parse_sents(sents)
    assert len(trees) == len(sents)
    # trees = []
    # with codecs.open(os.path.join('code', 'corrector', 'tmp_parses.txt'),
    #                  'r', 'utf-8') as f:
    #     trees = [Tree.from_string(t) for t in f.readlines()]

    test_x = extract_features(trees)
    test_y = model.predict(test_x)
    new_sents = format_predictions_by_machlearn_model(test_y, trees)

    with codecs.open(args.output, 'w+', 'utf-8') as f:
        for sent in new_sents:
            f.write(sent + '\n')
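# The top of this fragment is the tail of a per-class prediction routine that
# loads one one-vs-rest logistic-regression model per target article. A sketch
# of the full loop it plausibly belongs to; the function name, its signature
# and the contents of target_tuple are assumptions -- only the inner body
# lines come from the original:
def predict_articles_sketch(x, target_tuple=('a', 'the', '<none>')):
    """Score each candidate article with its own OVR model and return the
    highest-probability article for every row of the feature matrix x."""
    predictions = np.zeros((x.shape[0], len(target_tuple)))
    for i, target in enumerate(target_tuple):
        m = pickle.load(
            open(os.path.join(
                SETTINGS.get('paths', 'model'),
                'logreg_ovr_binarized_{}_allfeatures_263088.pkl'.format(
                    target)), 'rb'))
        assert m.classes_[0] == target
        predictions[:, i] = m.predict_proba(x)[:, 0]
    return [target_tuple[idx] for idx in np.argmax(predictions, axis=1)]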