示例#1
0
              ', '.join(str(cm[true][pred] / float(1)) for pred in categories))
    #for true in cm:
    #    print(true + ', ' + ', '.join(pred + ":" + str(cm[true][pred]) for pred in cm[true]))


if __name__ == '__main__':
    # Command-line driver: preprocess the manual fill-in data before it is
    # used for the language model.  One mode flag is expected per invocation.
    parser = argparse.ArgumentParser(
        description='Preprocess text before it is used for language model')
    parser.add_argument('--parse-fillin-orig-noarticles', action='store_true')
    parser.add_argument('--show-automatic-parse-candidates-for-prediction',
                        action='store_true')
    parser.add_argument('--all-together', action='store_true')
    args = parser.parse_args()

    # Sentence-tokenized original fill-in data (read in every mode).
    orig_sents = get_sentences(os.path.join(MANUAL_DATA_PATH,
                                            'fill_in_orig.txt'),
                               sent_tokenize=True)

    if args.parse_fillin_orig_noarticles:
        # Normalize, parse, and dump one serialized tree per line.
        orig_sents = normalize_special_characters(orig_sents)
        trees = parse_sents(orig_sents)
        with open(os.path.join(MANUAL_DATA_PATH, 'fill_in_orig_parsed.txt'),
                  'w+') as f:
            for t in trees:
                f.write(t.to_string() + '\n')
        exit()

    if args.show_automatic_parse_candidates_for_prediction:
        with codecs.open(os.path.join('code', 'corrector', 'tmp_parses.txt'),
                         'r', 'utf-8') as f:
            # Fix: the original `print[...]` subscripted the print function
            # (TypeError under Python 3); print the list of raw parse lines.
            print([t for t in f.readlines()])
示例#2
0
    # ad-hoc upravy kvuli blbymu formatovani pri predikci:
    sent = sent.replace('- owned', '-owned')
    sent = sent.replace('- a-year', '-a-year')
    sent = sent.replace('Co. . ', 'Co. ')
    sent = sent.replace('9:30 -10', '9:30-10')
    sent = sent.replace('1/2-inch', '1/2 - inch')

    return sent


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--all-together', action='store_true')
    args = parser.parse_args()

    orig_sents = get_sentences(
        os.path.join(PENN_DATA_PATH, 'penn_test_raw_orig.txt'))

    if args.all_together:
        predict_sents = [
            get_sentences(os.path.join(PENN_DATA_PATH, predict_file))
            for predict_file in ('solution_LOGREG.txt', 'solution_XGB.txt',
                                 'solution_LM_50_05.txt',
                                 'solution_LM_0_0.txt')
        ]
        for annotator in predict_sents:
            assert len(orig_sents) == len(annotator)

        for sent_nb in range(len(orig_sents)):
            compare_multiple_predictions(orig_sents[sent_nb], [
                correct_stupid_formating(annotator[sent_nb])
                for annotator in predict_sents
示例#3
0
#     for l in f_in:
#         sents.extend(tokenizer.tokenize(l.strip()))
#
#     print "... removing articles"
#     sents = [remove_articles(s) for s in sents if len(s.split()) <= MAXLENGTH]
#
#     print "... parsing"
#     trees = parser.raw_parse_sents(sents[:200])
#     trees = [Tree.from_string(str(t_)) for t in trees for t_ in t]
#     print trees
#     #
#     # print "...extracting features"
#     # df = extract_features(trees)
#     #
#     # print "...predicting"
#     # y = model.predict(df)
#     #
#     # format_output(y, trees)
#

if __name__ == '__main__':
    # Read the file named on the command line and split it into sentences.
    input_path = sys.argv[1]
    sents = get_sentences(input_path, sent_tokenize=True)

    # Earlier pipeline stages, kept for reference:
    #sents = normalize_special_characters(sents)
    # parse the original so we do not have to deal with oddities like 'LLR'?: trees = parse_sents(sents)
    #sents = remove_articles(sents)
    #print(sents[:200])
    #trees = parse_sents(sents)
    #for t in trees:
    #    print(t.to_string())
示例#4
0
                    SETTINGS.get('paths', 'model'),
                    'logreg_ovr_binarized_{}_allfeatures_263088.pkl'.format(
                        target)), 'rb'))
        assert m.classes_[0] == target
        predictions[:, i] = m.predict_proba(x)[:, 0]
    return [target_tuple[idx] for idx in np.argmax(predictions, axis=1)]


if __name__ == '__main__':
    # Command-line driver: load a trained model, predict corrections for the
    # input file, and write the reformatted sentences to the output file.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--model-type', required=True)
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    args = parser.parse_args()

    # Article-stripped, sentence-tokenized input.  NOTE(review): the xgboost
    # branch below immediately recomputes `sents` without tokenization, so
    # this value appears unused there — presumably other model types (handled
    # past this excerpt, given `model = None`) rely on it; confirm.
    sents = remove_articles(get_sentences(args.input, sent_tokenize=True))
    model = None
    if args.model_type == 'xgboost':
        # Re-read the input without sentence tokenization for this pipeline.
        sents = remove_articles(get_sentences(args.input, sent_tokenize=False))
        model = pickle.load(open(XGB_MODEL_PATH, 'rb'))
        # Parse every sentence; the assert guarantees a 1:1 sentence/tree map.
        trees = parse_sents(sents)
        assert len(trees) == len(sents)
        # #trees = []
        # #with codecs.open(os.path.join('code', 'corrector', 'tmp_parses.txt'), 'r', 'utf-8') as f:
        # #    trees = [Tree.from_string(t) for t in f.readlines()]
        # Feature extraction -> model prediction -> formatted output sentences.
        test_x = extract_features(trees)
        test_y = model.predict(test_x)
        new_sents = format_predictions_by_machlearn_model(test_y, trees)
        with codecs.open(args.output, 'w+', 'utf-8') as f:
            for sent in new_sents:
                f.write(sent + '\n')