Example No. 1
def __check_errors():
    sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    lstmcrf_aspects_file = 'd:/data/aspect/semeval14/lstmcrf-aspects.txt'
    decnn_aspects_file = 'd:/data/aspect/semeval14/pred-de-cnn-lap.json'
    lstmcrf_opinions_file = 'd:/data/aspect/semeval14/lstmcrf-opinions.txt'
    nrdj_aspects_file = 'd:/data/aspect/semeval14/nrdj-aspects.txt'
    nrdj_opinions_file = 'd:/data/aspect/semeval14/nrdj-opinions.txt'
    rule_aspects_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'

    sents = utils.load_json_objs(sents_file)
    lc_aspects_list = utils.load_json_objs(lstmcrf_aspects_file)
    decnn_sents = utils.load_json_objs(decnn_aspects_file)
    decnn_aspects_list = list()
    for dcs in decnn_sents:
        terms = [t['term'].lower() for t in dcs.get('terms', list())]
        decnn_aspects_list.append(terms)
    nrdj_aspects_list = utils.load_json_objs(nrdj_aspects_file)
    rule_aspects_list = utils.load_json_objs(rule_aspects_file)
    for sent, lc_aspects, decnn_aspects, nrdj_aspects, rule_aspects in zip(
            sents, lc_aspects_list, decnn_aspects_list, nrdj_aspects_list,
            rule_aspects_list):
        terms = [t['term'].lower() for t in sent.get('terms', list())]
        lc_correct = __is_correct(lc_aspects, terms)
        dc_correct = __is_correct(decnn_aspects, terms)
        nrdj_correct = __is_correct(nrdj_aspects, terms)
        rule_correct = __is_correct(rule_aspects, terms)
        if not dc_correct and rule_correct and nrdj_correct:
            print(sent['text'])
            print(terms)
            print('lstm', lc_aspects)
            print('rule', rule_aspects)
            print('nrdj', nrdj_aspects)
            print('decnn', decnn_aspects)
            print()
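The helper __is_correct is not shown in these snippets; judging from how it is used above (a sentence counts as correct only when the predicted terms exactly match the gold terms), a minimal sketch would be:

def __is_correct(terms_sys, terms_true):
    # Hypothetical: correct iff the extracted terms equal the gold terms,
    # ignoring order and duplicates.
    return set(terms_sys) == set(terms_true)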
Example No. 2
def __merge_train_test(train_sents_file, test_sents_file, train_valid_split_file, dst_sents_file, dst_datasplit_file):
    train_sents = utils.load_json_objs(train_sents_file)
    test_sents = utils.load_json_objs(test_sents_file)
    all_sents = train_sents + test_sents
    utils.save_json_objs(all_sents, dst_sents_file)

    train_valid_split_labels = utils.read_lines(train_valid_split_file)[0]
    train_valid_split_labels = [int(v) for v in train_valid_split_labels.split(' ')]
    all_data_split_labels = train_valid_split_labels + [2 for _ in range(len(test_sents))]
    with open(dst_datasplit_file, 'w', encoding='utf-8') as fout:
        fout.write('{}\n'.format(' '.join([str(v) for v in all_data_split_labels])))
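utils.load_json_objs and utils.save_json_objs are used throughout these examples but not shown; a plausible sketch, assuming a JSON-lines format (one object per line):

import json

def load_json_objs(filename):
    # Assumed format: one JSON object per line.
    with open(filename, encoding='utf-8') as f:
        return [json.loads(line) for line in f]

def save_json_objs(objs, filename):
    with open(filename, 'w', encoding='utf-8', newline='\n') as fout:
        for obj in objs:
            fout.write('{}\n'.format(json.dumps(obj)))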
Example No. 3
def check_unseen_terms():
    train_sents_file = 'd:/data/aspect/semeval14/laptops/laptops_train_sents.json'
    train_aspect_terms, train_opinion_terms = __get_all_terms(train_sents_file)
    sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    lstmcrf_aspects_file = 'd:/data/aspect/semeval14/lstmcrf-aspects.txt'
    lstmcrf_opinions_file = 'd:/data/aspect/semeval14/lstmcrf-opinions.txt'
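    # Note: despite the 'aspects'/'opinions' variable names, both NRDJ paths
    # below point to the same opinion-prediction file; the gold terms
    # compared in the loop are opinion terms as well.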
    nrdj_aspects_file = 'd:/data/aspect/semeval14/nrdj-opinions-malt.txt'
    nrdj_opinions_file = 'd:/data/aspect/semeval14/nrdj-opinions-malt.txt'
    rule_aspects_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'

    sents = utils.load_json_objs(sents_file)
    lc_aspects_list = utils.load_json_objs(lstmcrf_aspects_file)
    nrdj_aspects_list = utils.load_json_objs(nrdj_aspects_file)
    rule_aspects_list = utils.load_json_objs(rule_aspects_file)
    terms_true_list, terms_nrdj_list = list(), list()
    n_true, n_nrdj, n_hit = 0, 0, 0
    n_lc, n_lc_hit = 0, 0
    for sent, lc_aspects, nrdj_aspects, rule_aspects in zip(
            sents, lc_aspects_list, nrdj_aspects_list, rule_aspects_list):
        # terms = [t['term'].lower() for t in sent.get('terms', list())]
        terms = [t.lower() for t in sent.get('opinions', list())]
        # terms = [t for t in terms if t in train_aspect_terms]
        # print(terms, nrdj_aspects)
        terms_true_list.append(terms)
        terms_nrdj_list.append(nrdj_aspects)
        n_true += len(terms)
        n_nrdj += len(nrdj_aspects)
        n_hit += utils.count_hit(terms, nrdj_aspects)
        for t in terms:
            if t not in nrdj_aspects:
                print(t)
                print(sent['text'])

        n_lc += len(lc_aspects)
        n_lc_hit += utils.count_hit(terms, lc_aspects)
        # lc_correct = __is_correct(lc_aspects, terms)
        # nrdj_correct = __is_correct(nrdj_aspects, terms)
        # rule_correct = __is_correct(rule_aspects, terms)
        # if not lc_correct and not rule_correct and nrdj_correct:
        #     print(sent['text'])
        #     print(terms)
        #     print(lc_aspects)
        #     print(rule_aspects)
        #     print(nrdj_aspects)
        #     print()

    print(n_true, n_nrdj)
    p, r, f1 = utils.prf1(n_true, n_nrdj, n_hit)
    print(p, r, f1)
    p, r, f1 = utils.prf1(n_true, n_lc, n_lc_hit)
    print(p, r, f1)
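utils.count_hit and utils.prf1 are external as well; hypothetical implementations consistent with the precision/recall/F1 bookkeeping above:

def count_hit(terms_true, terms_sys):
    # Number of predicted terms that match a not-yet-consumed gold term.
    remaining = list(terms_true)
    cnt = 0
    for t in terms_sys:
        if t in remaining:
            cnt += 1
            remaining.remove(t)
    return cnt

def prf1(n_true, n_sys, n_hit):
    p = n_hit / n_sys if n_sys else 0.0
    r = n_hit / n_true if n_true else 0.0
    f1 = 2 * p * r / (p + r) if p + r > 0 else 0.0
    return p, r, f1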
Example No. 4
def __count_rule_extracted_terms():
    # aspect_terms_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'
    # opinion_terms_file = 'd:/data/aspect/semeval14/laptops/laptops-test-opinion-rule-result.txt'
    aspect_terms_file = 'd:/data/aspect/semeval15/restaurants/restaurants-test-aspect-rule-result.txt'
    opinion_terms_file = 'd:/data/aspect/semeval15/restaurants/restaurants-test-opinion-rule-result.txt'

    aspect_terms_list = utils.load_json_objs(aspect_terms_file)
    opinion_terms_list = utils.load_json_objs(opinion_terms_file)

    num_aspect_terms = sum([len(terms) for terms in aspect_terms_list])
    print(num_aspect_terms)

    num_opinion_terms = sum([len(terms) for terms in opinion_terms_list])
    print(num_opinion_terms)
Example No. 5
def __texts_file_from_sents(sents_file, dst_texts_file):
    sents = utils.load_json_objs(sents_file)
    with open(dst_texts_file, 'w', encoding='utf-8') as fout:
        for sent in sents:
            sent_text = sent['text']
            assert '\n' not in sent_text
            fout.write('{}\n'.format(sent_text))
Example No. 6
def __gen_aspect_noun_filter_dict_file(sents_file, tok_texts_file,
                                       pos_tags_file, common_words_file,
                                       dst_file):
    sents = utils.load_json_objs(sents_file)
    tok_texts = utils.read_lines(tok_texts_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    term_sys_cnts, term_hit_cnts = dict(), dict()
    for sent, tok_text, pos_tags in zip(sents, tok_texts, pos_tags_list):
        sent_words = tok_text.split(' ')
        noun_phrases = rules.rec_rule1(sent_words, pos_tags, None)
        term_objs = sent.get('terms', list())
        terms_true = {term_obj['term'].lower() for term_obj in term_objs}
        for n in noun_phrases:
            sys_cnt = term_sys_cnts.get(n, 0)
            term_sys_cnts[n] = sys_cnt + 1
            if n in terms_true:
                hit_cnt = term_hit_cnts.get(n, 0)
                term_hit_cnts[n] = hit_cnt + 1

    common_words = utils.read_lines(common_words_file)
    filter_terms = set(common_words)
    for term, sys_cnt in term_sys_cnts.items():
        hit_cnt = term_hit_cnts.get(term, 0)
        # print(term, hit_cnt, sys_cnt)
        if hit_cnt / sys_cnt < 0.4:
            filter_terms.add(term)

    fout = open(dst_file, 'w', encoding='utf-8', newline='\n')
    for t in filter_terms:
        fout.write('{}\n'.format(t))
    fout.close()
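utils.read_lines and utils.load_pos_tags are likewise assumed; minimal sketches (the POS-tag file format is a guess: one space-separated tag sequence per line, aligned with the tokenized texts):

def read_lines(filename):
    with open(filename, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def load_pos_tags(filename):
    return [line.split(' ') for line in read_lines(filename)]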
Example No. 7
def __dp_se():
    dataset = 'semeval14'
    # dataset = 'semeval15'
    # sub_dataset = 'restaurants'
    sub_dataset = 'laptops'
    sents_file = 'd:/data/aspect/{}/{}/{}_test_sents.json'.format(
        dataset, sub_dataset, sub_dataset)
    tok_texts_file = 'd:/data/aspect/{}/{}/{}_test_texts_tok.txt'.format(
        dataset, sub_dataset, sub_dataset)
    sent_texts_file = 'd:/data/aspect/{}/{}/{}_test_texts.txt'.format(
        dataset, sub_dataset, sub_dataset)
    dep_file = 'd:/data/aspect/{}/{}/{}-test-rule-dep.txt'.format(
        dataset, sub_dataset, sub_dataset)
    pos_file = 'd:/data/aspect/{}/{}/{}-test-rule-pos.txt'.format(
        dataset, sub_dataset, sub_dataset)
    term_hit_rate_file = 'd:/data/aspect/{}/{}/opinion-term-hit-rate.txt'.format(
        dataset, sub_dataset)
    sents = utils.load_json_objs(sents_file)
    seed_opinions = __read_seed_opinions()
    pos_tags_list = utils.load_pos_tags(pos_file)
    dep_tags_list = utils.load_dep_tags_list(dep_file)

    aspect_terms_list, opinion_terms_list = __get_true_terms_se(sents)
    term_vocab = rulescommon.get_term_vocab(term_hit_rate_file, 0.6)

    cnt_hit, cnt_sys, cnt_true = __dp_new(aspect_terms_list,
                                          opinion_terms_list, tok_texts_file,
                                          sent_texts_file, dep_tags_list,
                                          pos_tags_list, seed_opinions,
                                          term_vocab)
    prec = cnt_hit / cnt_sys
    recall = cnt_hit / cnt_true
    print(prec, recall, 2 * prec * recall / (prec + recall), cnt_hit, cnt_sys,
          cnt_true)
Example No. 8
def __load_data(dep_tags_file, pos_tags_file, sents_file,
                train_valid_split_file):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sents = utils.load_json_objs(sents_file)

    assert len(tvs_arr) == len(dep_tags_list)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list,
                                                   pos_tags_list, sents):
        if tvs_label == 0:
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train,
                              sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid,
                              sents_valid)
    return data_train, data_valid
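RuleMineData is not defined in these snippets; given how it is constructed here, a namedtuple stand-in suffices:

from collections import namedtuple

RuleMineData = namedtuple('RuleMineData',
                          ['dep_tags_list', 'pos_tags_list', 'sents'])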
Example No. 9
def __get_all_terms(sents_file):
    sents = utils.load_json_objs(sents_file)
    aspect_terms, opinion_terms = set(), set()
    for s in sents:
        for t in s.get('terms', list()):
            aspect_terms.add(t['term'].lower())
        for t in s.get('opinions', list()):
            opinion_terms.add(t.lower())
    return aspect_terms, opinion_terms
Example No. 10
def __missing_terms():
    opinion_terms_file = 'd:/data/aspect/semeval14/opinion-terms-full.txt'
    opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))
    train_sents = utils.load_json_objs(config.SE15R_FILES['train_sents_file'])
    test_sents = utils.load_json_objs(config.SE15R_FILES['test_sents_file'])
    train_terms = set()
    test_terms = dict()
    for s in train_sents:
        for t in s['opinions']:
            train_terms.add(t.lower())
    for s in test_sents:
        for t in s['opinions']:
            cnt = test_terms.get(t.lower(), 0)
            test_terms[t.lower()] = cnt + 1
            # test_terms.add(t.lower())
    for t, cnt in test_terms.items():
        if t not in train_terms:
            print(t, cnt, t in opinion_terms_vocab)
Example No. 11
def __run_with_mined_rules(mine_helper,
                           rule_patterns_file,
                           term_hit_rate_file,
                           dep_tags_file,
                           pos_tags_file,
                           sent_texts_file,
                           filter_terms_vocab_file,
                           term_hit_rate_thres=0.6,
                           dst_result_file=None,
                           sents_file=None):
    l1_rules, l2_rules = rulescommon.load_rule_patterns_file(
        rule_patterns_file)
    term_vocab = rulescommon.get_term_vocab(term_hit_rate_file,
                                            term_hit_rate_thres)

    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_texts_file)
    filter_terms_vocab = set(utils.read_lines(filter_terms_vocab_file))
    # opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(
            zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = rulescommon.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        for p in l2_rules:
            terms_new = rulescommon.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab,
                l1_terms_new)
            terms.update(terms_new)

        terms_new = mine_helper.get_terms_by_matching(dep_tag_seq, pos_tag_seq,
                                                      sent_text, term_vocab)
        terms.update(terms_new)

        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        # aspect_terms_true = utils.aspect_terms_list_from_sents(sents)
        terms_list_true = mine_helper.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        correct_sent_idxs = __evaluate(terms_sys_list, terms_list_true,
                                       dep_tags_list, pos_tags_list,
                                       sent_texts)
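__write_rule_results is external; a hypothetical version that writes one JSON list of extracted terms per line, so the result file can be read back with utils.load_json_objs (cf. Example No. 1):

import json

def __write_rule_results(terms_sys_list, sent_texts, dst_file):
    assert len(terms_sys_list) == len(sent_texts)
    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        for terms in terms_sys_list:
            fout.write('{}\n'.format(json.dumps(list(terms))))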
Example No. 12
def __gen_opinion_terms_file(sents_file, dst_terms_file):
    sents = utils.load_json_objs(sents_file)
    fout = open(dst_terms_file, 'w', encoding='utf-8')
    for sent in sents:
        terms = sent.get('opinions', list())
        if not terms:
            fout.write('\n')
        else:
            fout.write('{}\n'.format(','.join(terms)))
    fout.close()
Example No. 13
def __check_difficulty():
    rest_sents_file = 'd:/data/aspect/semeval14/restaurants/restaurants_test_sents.json'
    lap_sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'

    rest_sents = utils.load_json_objs(rest_sents_file)
    lap_sents = utils.load_json_objs(lap_sents_file)

    def __count_sps(sents):
        sp_cnt, cnt = 0, 0
        for sent in sents:
            terms = [t['term'] for t in sent.get('terms', list())]
            for t in terms:
                if ' ' in t:
                    sp_cnt += 1
            cnt += len(terms)
        print(sp_cnt / cnt)

    __count_sps(rest_sents)
    __count_sps(lap_sents)
Example No. 14
def __check_opinion_errors():
    terms_sys_list = utils.load_json_objs(
        'd:/onedrive/opinion_terms_bert_output_r.txt')
    terms_sys_nr_list = utils.load_json_objs(
        'd:/onedrive/opinion_terms_bert_output.txt')
    test_sents = utils.load_json_objs(config.SE15R_FILES['test_sents_file'])
    terms_true_list = [s['opinions'] for s in test_sents]
    for s, terms_true, terms_sys, terms_sys_nr in zip(test_sents,
                                                      terms_true_list,
                                                      terms_sys_list,
                                                      terms_sys_nr_list):
        if not terms_true and not terms_sys:
            continue
        terms_true = [t.lower() for t in terms_true]
        if len(terms_true) == len(terms_sys) and __count_hits(
                terms_true, terms_sys) == len(terms_true):
            continue
        print(s['text'])
        print(terms_true, terms_sys, terms_sys_nr)
        print()
Example No. 15
def __get_manual_feat(tok_texts_file, terms_file):
    tok_texts = utils.read_lines(tok_texts_file)
    terms_list = utils.load_json_objs(terms_file)
    feat_list = list()
    for terms_true, tok_text in zip(terms_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = modelutils.label_sentence(words, terms_true)
        feat_seq = np.zeros([len(label_seq), 3], np.int32)
        for i, v in enumerate(label_seq):
            feat_seq[i][v] = 1
        feat_list.append(feat_seq)
    return feat_list
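modelutils.label_sentence is not included; the 3-column one-hot feature above suggests a 3-class BIO scheme. A hypothetical implementation, assuming 0 = outside, 1 = begin, 2 = inside:

def label_sentence(words, terms):
    labels = [0] * len(words)
    words_lower = [w.lower() for w in words]
    for term in terms:
        term_words = term.lower().split(' ')
        n = len(term_words)
        for i in range(len(words_lower) - n + 1):
            # Label the first occurrence of the term with B/I tags.
            if words_lower[i:i + n] == term_words:
                labels[i] = 1
                for j in range(i + 1, i + n):
                    labels[j] = 2
                break
    return labels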
Example No. 16
def __split_training_set(train_sents_file, dst_file, n_dev_samples=None):
    sents = utils.load_json_objs(train_sents_file)
    n_sents = len(sents)
    if n_dev_samples is None:
        valid_data_percent = 0.2
        n_dev_samples = int(n_sents * valid_data_percent)
    perm = np.random.permutation(n_sents)
    valid_idxs = set(perm[:n_dev_samples])
    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        train_valid_labels = ['1' if i in valid_idxs else '0' for i in range(n_sents)]
        fout.write(' '.join(train_valid_labels))
        fout.write('\n')
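A hypothetical invocation, holding out 20% of the SemEval-14 laptops training sentences (the destination path is illustrative):

__split_training_set(
    'd:/data/aspect/semeval14/laptops/laptops_train_sents.json',
    'd:/data/aspect/semeval14/laptops/train_valid_split.txt')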
Example No. 17
def __load_terms_in_train(train_sents_file):
    sents_train = utils.load_json_objs(train_sents_file)
    terms_train = set()
    for sent in sents_train:
        terms = sent.get('terms', None)
        if terms is None:
            continue
        for t in terms:
            terms_train.add(t['term'].lower())
    terms_train = list(terms_train)
    terms_train.sort(key=lambda x: -len(x))
    return terms_train
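The descending-length sort lets a greedy matcher try longer terms first ('battery life' before 'battery'); a hypothetical consumer:

def match_train_terms(text, terms_train):
    # terms_train must be sorted longest-first so multi-word terms win.
    matched = list()
    remaining = text.lower()
    for t in terms_train:
        if t in remaining:
            matched.append(t)
            remaining = remaining.replace(t, ' ')
    return matched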
Example No. 18
def get_data_amazon_ao(vocab, aspect_terms_file, opinion_terms_file,
                       tok_texts_file):
    aspect_terms_list = utils.load_json_objs(aspect_terms_file)
    opinion_terms_list = utils.load_json_objs(opinion_terms_file)
    tok_texts = utils.read_lines(tok_texts_file)
    assert len(aspect_terms_list) == len(tok_texts)
    assert len(opinion_terms_list) == len(tok_texts)

    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = list()
    word_idx_seq_list = list()
    for aspect_terms, opinion_terms, tok_text in zip(aspect_terms_list,
                                                     opinion_terms_list,
                                                     tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, aspect_terms, opinion_terms)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]

    aspects_list_valid = [aspect_terms_list[idx] for idx in idxs_valid]
    opinions_list_valid = [opinion_terms_list[idx] for idx in idxs_valid]
    valid_data = ValidData(label_seq_list_valid, word_idx_seq_list_valid,
                           tok_texts_valid, aspects_list_valid,
                           opinions_list_valid)

    return train_data, valid_data
Example No. 19
def get_data_semeval(train_sents_file, train_tok_text_file,
                     train_valid_split_file, test_sents_file,
                     test_tok_text_file, vocab, n_train, task):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    sents = utils.load_json_objs(train_sents_file)
    # texts = utils.read_lines(train_tok_text_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train = list(), list()
    sents_valid, tok_texts_valid = list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts,
                                     word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]

    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid,
                                  word_span_seqs_valid, vocab, task)

    sents_test = utils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test,
                                 vocab, task)
    return train_data, valid_data, test_data
Example No. 20
def __gen_yelp_path_count_feat(mentions_file, mention_id_idx_file, path_strs, for_pra, dst_file):
    mentions = utils.load_json_objs(mentions_file)
    mention_ids = {m['mention_id'] for m in mentions}
    mention_candidates = utils.load_candidates_for_mentions(config.YELP_CANDIDATES_FILE, mention_ids)
    mention_id_to_idx = utils.load_id_to_idx(mention_id_idx_file)
    biz_id_to_idx = utils.load_id_to_idx(config.YELP_BIZ_ID_TO_IDX_FILE)
    if for_pra:
        commuting_matrix_files = [os.path.join(
            config.YELP_DATA_DIR, 'network/{}_norm.txt'.format(s)) for s in path_strs]
    else:
        commuting_matrix_files = [os.path.join(config.YELP_DATA_DIR, 'network/{}.txt'.format(s)) for s in path_strs]
    gen_path_count_feats_file(config.YELP_DATA_INFO_FILE, mention_candidates, mention_id_to_idx, biz_id_to_idx,
                              commuting_matrix_files, for_pra, dst_file)
Example No. 21
def __gen_filter_terms_vocab_file(mine_helper, dep_tags_file, pos_tags_file,
                                  sents_file, dst_file):
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sents = utils.load_json_objs(sents_file)
    # aspect_terms_list = utils.aspect_terms_list_from_sents(sents)
    terms_list = mine_helper.terms_list_from_sents(sents)
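    # term_filter_rate is assumed to be a module-level constant
    # (cf. the 0.4 hit-rate threshold in __gen_aspect_noun_filter_dict_file).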
    filter_terms_vocab = __get_term_filter_dict(dep_tags_list, pos_tags_list,
                                                terms_list, term_filter_rate,
                                                mine_helper)
    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        for t in filter_terms_vocab:
            fout.write('{}\n'.format(t))
Example No. 22
        def get_flat_feats_test(mentions_file):
            mentions = utils.load_json_objs(mentions_file)
            feats_list = [modelutils.get_ind_flat_feat_test(
                mentions, mention_candidates_dict, mention_id_to_idx, biz_id_idx_dict, mf, bf
            ) for mf, bf in zip(mention_feat_list, biz_feat_list)]

            if mention_cand_feats_list is not None:
                X_bnd_list = [modelutils.get_bnd_feat_test(
                    mentions, mention_candidates_dict, feats) for feats in mention_cand_feats_list]
                feats_list += X_bnd_list

            X = modelutils.concatenate_feats(feats_list)
            y_true = utils.get_y_true(mentions, mention_candidates_dict, 'target_id')
            return modelutils.DataTest(X, y_true)
Example No. 23
def __gen_aspect_opinion_file(sents_file, dst_aspect_file, dst_opinion_file):
    sents = utils.load_json_objs(sents_file)
    aspects_list, opinions_list = list(), list()
    for sent in sents:
        aspects_list.append([t['term'] for t in sent.get('terms', list())])
        opinions_list.append(sent.get('opinions', list()))

    def __write_terms_file(terms_list, dst_file):
        with open(dst_file, 'w', encoding='utf-8') as fout:
            for terms in terms_list:
                fout.write('{}\n'.format(','.join(terms)))

    __write_terms_file(aspects_list, dst_aspect_file)
    __write_terms_file(opinions_list, dst_opinion_file)
Example No. 24
def __dataset_statistics():
    # sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    # sents_file = 'd:/data/aspect/semeval14/restaurants/restaurants_test_sents.json'
    # sents_file = 'd:/data/aspect/semeval14/restaurants/restaurants_train_sents.json'
    # sents_file = 'd:/data/aspect/semeval15/restaurants/restaurants_train_sents.json'
    sents_file = 'd:/data/aspect/semeval15/restaurants/restaurants_test_sents.json'
    sents = utils.load_json_objs(sents_file)
    print(len(sents), 'sentences')
    at_cnt, ot_cnt = 0, 0
    for s in sents:
        at_cnt += len(s.get('terms', list()))
        ot_cnt += len(s.get('opinions', list()))
    print(at_cnt, 'aspect terms')
    print(ot_cnt, 'opinion terms')
Example No. 25
def __dp_hl04():
    reviews = utils.load_json_objs(config.REVIEWS_FILE_HL04)
    sents = utils.load_json_objs(config.SENTS_FILE_HL04)
    review_prod_dict = {r['review_id']: r['file'] for r in reviews}
    prod_set = {v for v in review_prod_dict.values()}
    prod_sents_dict = {v: list() for v in prod_set}
    for i, sent in enumerate(sents):
        prod_sents_dict[review_prod_dict[sent['review_id']]].append(i)

    # seed_opinions = utils.read_lines(config.SEED_OPINIONS_FILE_HL04)
    seed_opinions = __read_seed_opinions()
    pos_tags_list = utils.load_pos_tags(config.SENT_POS_FILE_HL04)
    dep_tags_list = utils.load_dep_tags_list(config.SENT_DEPENDENCY_FILE_HL04)
    assert len(pos_tags_list) == len(sents)
    assert len(dep_tags_list) == len(sents)

    cnt_hit, cnt_sys, cnt_true = __dp_new(sents, dep_tags_list, pos_tags_list,
                                          seed_opinions)
    # cnt_hit, cnt_sys, cnt_true = 0, 0, 0
    # for prod, sent_idxs in prod_sents_dict.items():
    #     print(prod)
    #     # if prod != 'Canon G3.txt':
    #     #     continue
    #
    #     prod_sents = [sents[i] for i in sent_idxs]
    #     prod_pos_tags_list = [pos_tags_list[i] for i in sent_idxs]
    #     prod_dep_tags_list = [dep_tags_list[i] for i in sent_idxs]
    #     n_hit, n_sys, n_true = __dp_new(prod_sents, prod_dep_tags_list, prod_pos_tags_list, seed_opinions)
    #     cnt_hit += n_hit
    #     cnt_sys += n_sys
    #     cnt_true += n_true
    #     # break

    prec = cnt_hit / cnt_sys
    recall = cnt_hit / cnt_true
    print(prec, recall, 2 * prec * recall / (prec + recall), cnt_hit, cnt_sys,
          cnt_true)
Example No. 26
def __semeval_rule_insight():
    train_file = 'd:/data/aspect/semeval14/Laptops_Train.json'
    test_file = 'd:/data/aspect/semeval14/Laptops_Test_Gold.json'
    sents_train = utils.load_json_objs(train_file)
    sents_test = utils.load_json_objs(test_file)

    def __count_terms(sents):
        cnt_dict = dict()
        for sent in sents:
            aspect_terms = sent.get('terms', None)
            if aspect_terms is not None:
                for term in aspect_terms:
                    s = term['term']
                    cnt = cnt_dict.get(s, 0)
                    cnt_dict[s] = cnt + 1
        return cnt_dict

    term_cnts_train = __count_terms(sents_train)
    term_cnts_test = __count_terms(sents_test)
    term_cnt_tups = [(t, cnt) for t, cnt in term_cnts_test.items()]
    term_cnt_tups.sort(key=lambda x: -x[1])
    for t, cnt in term_cnt_tups:
        if t not in term_cnts_train:
            print(t, cnt)
Example No. 27
def gen_ordinary_features_file(mentions_file, candidates_file, review_file,
                               biz_file, dst_file):
    print('generating ordinary features ...')
    mentions = utils.load_json_objs(mentions_file)
    mention_candidates = utils.load_candidates(candidates_file)
    mention_cand_feats = get_ordinary_features(mentions, mention_candidates,
                                               review_file, biz_file,
                                               feats_include)
    fout = open(dst_file, 'w', encoding='utf-8', newline='\n')
    for m in mentions:
        mention_id = m['mention_id']
        candidates = mention_candidates[mention_id]
        feats = mention_cand_feats[mention_id]
        utils.write_candidate_features(mention_id, candidates, feats, fout)
    fout.close()
Example No. 28
def gen_path_count_feats_file(data_info_file, mention_candidates, mention_id_to_idx, biz_id_to_idx,
                              commuting_matrix_files, for_pra, dst_file):
    mention_candidates_idxs = utils.get_mention_cadidate_idxs_dict(
        mention_candidates, mention_id_to_idx, biz_id_to_idx)
    data_info = utils.load_json_objs(data_info_file)[0]
    if for_pra:
        # __get_path_count_features(commuting_matrix_files, mention_candidates_idxs, data_info['mentions'],
        #                           data_info['bizs'], -1)
        features_list = [__get_path_count_features(
            f, mention_candidates_idxs, data_info['mentions'], data_info['bizs'], -1
        ) for f in commuting_matrix_files]
        __write_mention_candidate_features(features_list, mention_candidates, mention_id_to_idx, dst_file)
    else:
        features_list = [__get_path_count_features(
            f, mention_candidates_idxs, data_info['mentions'], data_info['bizs'], NORM_THRES
        ) for f in commuting_matrix_files]
        __write_mention_candidate_features(features_list, mention_candidates, mention_id_to_idx, dst_file)
Example No. 29
def __opinion_rule_insight(dep_tags_file,
                           pos_tags_file,
                           sent_text_file,
                           terms_vocab,
                           dst_result_file=None,
                           sents_file=None):
    print('loading data ...')
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_text_file)
    assert len(dep_tags_list) == len(sent_texts)
    assert len(pos_tags_list) == len(dep_tags_list)
    print('done.')
    opinions_sys_list = list()
    for sent_idx, sent_text in enumerate(sent_texts):
        dep_tags = dep_tags_list[sent_idx]
        pos_tags = pos_tags_list[sent_idx]
        assert len(dep_tags) == len(pos_tags)

        opinion_terms = set()
        # used rule2 and __match_terms to pretrain
        # terms_new = opinionrules.rule1(dep_tags, pos_tags)
        # opinion_terms.update(terms_new)
        terms_new = opinionrules.rule2(dep_tags, pos_tags)
        opinion_terms.update(terms_new)
        # terms_new = opinionrules.rule4(dep_tags, pos_tags)
        # opinion_terms.update(terms_new)
        terms_new = __match_terms(sent_text, terms_vocab)
        opinion_terms.update(terms_new)
        opinions_sys_list.append(opinion_terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(opinions_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        opinions_true_list = list()
        for sent in sents:
            opinions_true_list.append(
                [t.lower() for t in sent.get('opinions', list())])
        correct_sent_idxs = __evaluate(opinions_sys_list, opinions_true_list,
                                       dep_tags_list, pos_tags_list,
                                       sent_texts)
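__match_terms is external; a plausible direct-matching sketch (whole-word match for single words, substring match for phrases):

def __match_terms(sent_text, terms_vocab):
    text_lower = sent_text.lower()
    words = text_lower.split(' ')
    terms = set()
    for t in terms_vocab:
        if ' ' in t:
            if t in text_lower:
                terms.add(t)
        elif t in words:
            terms.add(t)
    return terms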
Example No. 30
def get_data_amazon(vocab, true_terms_file, tok_texts_file, task):
    terms_true_list = utils.load_json_objs(true_terms_file)
    tok_texts = utils.read_lines(tok_texts_file)
    # print(len(terms_true_list), tok_texts_file, len(tok_texts))
    assert len(terms_true_list) == len(tok_texts)

    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = list()
    word_idx_seq_list = list()
    for terms_true, tok_text in zip(terms_true_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, terms_true)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    terms_true_list_valid = [terms_true_list[idx] for idx in idxs_valid]
    aspect_true_list, opinion_true_list = None, None
    if task != 'opinion':
        aspect_true_list = terms_true_list_valid
    if task != 'aspect':
        opinion_true_list = terms_true_list_valid
    valid_data = ValidData(None, label_seq_list_valid, word_idx_seq_list_valid,
                           None, tok_texts_valid, aspect_true_list,
                           opinion_true_list)

    return train_data, valid_data
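Both get_data_amazon and get_data_amazon_ao reserve word index 0 for out-of-vocabulary tokens (vocabulary indices start at 1). A hypothetical embedding matrix consistent with that convention:

import numpy as np

def build_embedding_matrix(vocab, word_vecs, dim):
    # Row 0 stays all-zero for unknown words; rows 1..len(vocab) hold
    # the vectors of the known vocabulary words.
    emb = np.zeros([len(vocab) + 1, dim], np.float32)
    for i, w in enumerate(vocab):
        vec = word_vecs.get(w)
        if vec is not None:
            emb[i + 1] = vec
    return emb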