Example #1
def __load_data(dep_tags_file, pos_tags_file, sents_file, train_valid_split_file):
    # The split file holds a single line of space-separated 0/1 flags,
    # one per sentence: 0 -> training set, 1 -> validation set.
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(sents_file)

    # All four inputs must be aligned one-to-one; zip() below would silently
    # truncate on a length mismatch, so check explicitly.
    assert len(tvs_arr) == len(dep_tags_list) == len(pos_tags_list) == len(sents)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list, pos_tags_list, sents):
        if tvs_label == 0:
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train, sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid, sents_valid)
    return data_train, data_valid
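A minimal usage sketch, assuming the four input files exist in the formats datautils expects; every path below is a hypothetical placeholder:

data_train, data_valid = __load_data(
    'data/dep_tags.txt',             # hypothetical dependency-tag file
    'data/pos_tags.txt',             # hypothetical POS-tag file
    'data/sents.json',               # hypothetical sentence file (JSON objects)
    'data/train_valid_split.txt')    # one line of space-separated 0/1 flags

Since the name is prefixed with a double underscore, the function is module-private and would be called from within the same module.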
Example #2
import pandas as pd  # used for the CSV output below

def gen_term_hit_rate_file(mine_tool, train_sents_file, dep_tags_file, pos_tags_file, dst_file):
    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(train_sents_file)
    terms_list = mine_tool.terms_list_from_sents(sents)
    # Count how often each term occurs in the labeled training sentences ("hits").
    term_hit_cnts = dict()
    for terms in terms_list:
        for t in terms:
            cnt = term_hit_cnts.get(t, 0)
            term_hit_cnts[t] = cnt + 1

    all_terms = set(term_hit_cnts.keys())
    print(len(all_terms), 'terms')
    term_cnts = {t: 0 for t in all_terms}
    # Count how often each term can be found in the sentences by surface matching.
    for dep_tags, pos_tags, sent in zip(dep_tags_list, pos_tags_list, sents):
        sent_text = sent['text'].lower()
        terms = mine_tool.get_terms_by_matching(dep_tags, pos_tags, sent_text, all_terms)
        for t in terms:
            cnt = term_cnts.get(t, 0)
            term_cnts[t] = cnt + 1

    # hit rate = labeled occurrences / matched occurrences
    term_hit_rate_tups = list()
    for t, hit_cnt in term_hit_cnts.items():
        total_cnt = term_cnts.get(t, 0)
        if total_cnt > 0:
            term_hit_rate_tups.append((t, hit_cnt / (total_cnt + 1e-5)))

    term_hit_rate_tups.sort(key=lambda x: -x[1])  # highest hit rate first

    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        pd.DataFrame(term_hit_rate_tups, columns=['term', 'rate']).to_csv(
            fout, float_format='%.4f', index=False)
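A hedged invocation sketch; mine_tool is assumed to be one of the project's mining-tool objects providing terms_list_from_sents() and get_terms_by_matching(), and the paths are placeholders:

gen_term_hit_rate_file(
    mine_tool,
    'data/train_sents.json',
    'data/dep_tags.txt',
    'data/pos_tags.txt',
    'data/term_hit_rates.csv')   # output: CSV with columns 'term' and 'rate'

The resulting file, sorted by hit rate in descending order, is what example #5 reads back through ruleutils.get_term_vocab.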
Example #3
def gen_filter_terms_vocab_file(mine_tool, dep_tags_file, pos_tags_file, sents_file, term_filter_rate, output_file):
    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(sents_file)
    terms_list = mine_tool.terms_list_from_sents(sents)
    # Derive the filter-term vocabulary; term_filter_rate controls the threshold.
    filter_terms_vocab = __get_term_filter_dict(
        dep_tags_list, pos_tags_list, terms_list, term_filter_rate, mine_tool)
    with open(output_file, 'w', encoding='utf-8', newline='\n') as fout:
        for t in filter_terms_vocab:
            fout.write('{}\n'.format(t))
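A usage sketch under the same assumptions (placeholder paths, a project mine_tool object); the 0.1 threshold is purely illustrative:

gen_filter_terms_vocab_file(
    mine_tool,
    'data/dep_tags.txt',
    'data/pos_tags.txt',
    'data/sents.json',
    0.1,                             # hypothetical term_filter_rate
    'data/filter_terms_vocab.txt')   # written one term per line

Example #5 later consumes this file as its filter_terms_vocab_file argument.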
Example #4
def get_data_semeval(train_sents_file, train_tok_text_file,
                     train_valid_split_file, test_sents_file,
                     test_tok_text_file, vocab, n_train, task):
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    sents = datautils.load_json_objs(train_sents_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train = list(), list()
    sents_valid, tok_texts_valid = list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    # 0 -> training set, 1 -> validation set, matching the split-file convention.
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts, word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    # n_train > -1 truncates the training set to the first n_train examples.
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]

    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid,
                                  word_span_seqs_valid, vocab, task)

    sents_test = datautils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    print('loading test data')
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test,
                                 vocab, task)
    return train_data, valid_data, test_data
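A hedged call sketch; vocab is assumed to be an iterable of vocabulary words, task one of 'aspect' or 'opinion' (the values example #6 distinguishes), and n_train=-1 keeps the full training set. Paths are placeholders:

train_data, valid_data, test_data = get_data_semeval(
    'data/train_sents.json',
    'data/train_tok_texts.txt',      # tokenized texts with word spans
    'data/train_valid_split.txt',
    'data/test_sents.json',
    'data/test_tok_texts.txt',
    vocab, -1, 'aspect')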
Example #5
def __run_with_mined_rules(mine_tool, rule_patterns_file, term_hit_rate_file, dep_tags_file, pos_tags_file,
                           sent_texts_file, filter_terms_vocab_file, term_hit_rate_thres=0.6,
                           output_result_file=None, sents_file=None):
    l1_rules, l2_rules = ruleutils.load_rule_patterns_file(rule_patterns_file)
    # Keep only terms whose mined hit rate meets the term_hit_rate_thres threshold.
    term_vocab = ruleutils.get_term_vocab(term_hit_rate_file, term_hit_rate_thres)

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sent_texts = datautils.read_lines(sent_texts_file)
    filter_terms_vocab = set(datautils.read_lines(filter_terms_vocab_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        # Terms found by the L1 patterns are collected separately because the
        # L2 patterns below take them as additional input.
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = ruleutils.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        for p in l2_rules:
            terms_new = ruleutils.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab, l1_terms_new)
            terms.update(terms_new)

        # Also extract terms by directly matching the high hit-rate vocabulary.
        terms_new = mine_tool.get_terms_by_matching(dep_tag_seq, pos_tag_seq, sent_text, term_vocab)
        terms.update(terms_new)

        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if output_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, output_result_file)

    if sents_file is not None:
        sents = datautils.load_json_objs(sents_file)
        terms_list_true = mine_tool.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        correct_sent_idxs = __evaluate(terms_sys_list, terms_list_true, dep_tags_list, pos_tags_list, sent_texts)
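A sketch tying the pipeline together, assuming the rule-pattern, hit-rate, and filter-vocabulary files were produced beforehand (e.g., by examples #2 and #3); all paths are placeholders:

__run_with_mined_rules(
    mine_tool,
    'data/rule_patterns.txt',
    'data/term_hit_rates.csv',       # from gen_term_hit_rate_file
    'data/dep_tags.txt',
    'data/pos_tags.txt',
    'data/sent_texts.txt',
    'data/filter_terms_vocab.txt',   # from gen_filter_terms_vocab_file
    term_hit_rate_thres=0.6,
    output_result_file='data/rule_results.txt',
    sents_file='data/sents.json')    # providing sents_file enables evaluation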
Example #6
import numpy as np

def get_weak_label_data(vocab, true_terms_file, tok_texts_file, task):
    terms_true_list = datautils.load_json_objs(true_terms_file)
    tok_texts = datautils.read_lines(tok_texts_file)
    # The weak labels and the tokenized texts must be aligned one-to-one.
    if len(terms_true_list) != len(tok_texts):
        print(len(terms_true_list), len(tok_texts))
    assert len(terms_true_list) == len(tok_texts)

    # Reserve index 0 for out-of-vocabulary words.
    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = list()
    word_idx_seq_list = list()
    for terms_true, tok_text in zip(terms_true_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, terms_true)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    # Deterministically shuffle and hold out the last 2000 examples for validation.
    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    terms_true_list_valid = [terms_true_list[idx] for idx in idxs_valid]
    # Depending on the task, the true terms act as aspect labels, opinion labels, or both.
    aspect_true_list, opinion_true_list = None, None
    if task != 'opinion':
        aspect_true_list = terms_true_list_valid
    if task != 'aspect':
        opinion_true_list = terms_true_list_valid
    valid_data = ValidData(None, label_seq_list_valid, word_idx_seq_list_valid,
                           None, tok_texts_valid, aspect_true_list,
                           opinion_true_list)

    return train_data, valid_data
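A final usage sketch; vocab is an iterable of words, the true-terms file presumably holds one JSON list of terms per line, aligned line by line with the tokenized texts. Paths are placeholders:

train_data, valid_data = get_weak_label_data(
    vocab,
    'data/weak_label_terms.json',
    'data/tok_texts.txt',
    'aspect')                        # or 'opinion'

Note that the function always holds out 2000 examples for validation, so it assumes the weakly labeled corpus is comfortably larger than that.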