Example #1
def __load_data(dep_tags_file, pos_tags_file, sents_file, train_valid_split_file):
    # The split file holds one line of space-separated labels:
    # 0 marks a training sample, anything else a validation sample.
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(sents_file)

    assert len(tvs_arr) == len(dep_tags_list)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list, pos_tags_list, sents):
        if tvs_label == 0:
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train, sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid, sents_valid)
    return data_train, data_valid
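
The label-driven split above is a generic pattern. A minimal self-contained sketch of the same idea (the helper partition_by_mask is hypothetical, not part of the source project):

def partition_by_mask(mask, *seqs):
    # Label 0 goes to the first group, anything else to the second,
    # mirroring the train/valid split above.
    groups = ([], [])
    for label, items in zip(mask, zip(*seqs)):
        groups[0 if label == 0 else 1].append(items)
    return groups

train_rows, valid_rows = partition_by_mask([0, 1, 0], 'abc', [1, 2, 3])
# train_rows == [('a', 1), ('c', 3)], valid_rows == [('b', 2)]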
Example #2
def __gen_word_cnts_file(tok_texts_file, output_file):
    import pandas as pd

    texts = datautils.read_lines(tok_texts_file)
    word_cnts_dict = dict()
    total_word_cnt = 0
    for i, sent_text in enumerate(texts):
        # Count only even-indexed lines; the file apparently
        # interleaves other content on odd-indexed lines.
        if i % 2 == 1:
            continue
        words = sent_text.split()
        total_word_cnt += len(words)
        for w in words:
            cnt = word_cnts_dict.get(w, 0)
            word_cnts_dict[w] = cnt + 1

    word_cnt_tups = list(word_cnts_dict.items())
    word_cnt_tups.sort(key=lambda x: -x[1])

    word_cnt_rate_tups = list()
    for w, cnt in word_cnt_tups:
        word_cnt_rate_tups.append((w, cnt, cnt / total_word_cnt))
    df = pd.DataFrame(word_cnt_rate_tups, columns=['word', 'cnt', 'p'])
    with open(output_file, 'w', encoding='utf-8', newline='\n') as fout:
        df.to_csv(fout, index=False, float_format='%.5f')
    print(total_word_cnt)
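
The counting loop can be expressed more compactly with collections.Counter; a sketch of the same computation (a rewrite for illustration, not the source's code):

from collections import Counter

def word_count_rates(texts):
    # Count words on even-indexed lines, then attach each word's
    # relative frequency, most frequent first.
    counter = Counter(w for i, t in enumerate(texts)
                      if i % 2 == 0 for w in t.split())
    total = sum(counter.values())
    return [(w, c, c / total) for w, c in counter.most_common()]

print(word_count_rates(['the cat sat', 'SKIPPED', 'the dog']))
# [('the', 2, 0.4), ('cat', 1, 0.2), ('sat', 1, 0.2), ('dog', 1, 0.2)]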
Example #3
def __run_with_mined_rules(mine_tool, rule_patterns_file, term_hit_rate_file, dep_tags_file, pos_tags_file,
                           sent_texts_file, filter_terms_vocab_file, term_hit_rate_thres=0.6,
                           output_result_file=None, sents_file=None):
    l1_rules, l2_rules = ruleutils.load_rule_patterns_file(rule_patterns_file)
    term_vocab = ruleutils.get_term_vocab(term_hit_rate_file, term_hit_rate_thres)

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sent_texts = datautils.read_lines(sent_texts_file)
    filter_terms_vocab = set(datautils.read_lines(filter_terms_vocab_file))
    # opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        # Terms matched by the L1 rules feed into the L2 rules below.
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = ruleutils.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        for p in l2_rules:
            terms_new = ruleutils.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab, l1_terms_new)
            terms.update(terms_new)

        terms_new = mine_tool.get_terms_by_matching(dep_tag_seq, pos_tag_seq, sent_text, term_vocab)
        terms.update(terms_new)

        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if output_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, output_result_file)

    if sents_file is not None:
        sents = datautils.load_json_objs(sents_file)
        # aspect_terms_true = utils.aspect_terms_list_from_sents(sents)
        terms_list_true = mine_tool.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        correct_sent_idxs = __evaluate(terms_sys_list, terms_list_true, dep_tags_list, pos_tags_list, sent_texts)
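
get_term_vocab is not shown here; given the term_hit_rate_thres parameter, it presumably keeps only terms whose hit rate clears the threshold. A sketch under that assumption (the tab-separated file format is also an assumption):

def get_term_vocab_sketch(term_hit_rate_file, thres=0.6):
    # Assumed format: one "term<TAB>hit_rate" pair per line.
    vocab = set()
    with open(term_hit_rate_file, encoding='utf-8') as f:
        for line in f:
            term, rate = line.rstrip('\n').rsplit('\t', 1)
            if float(rate) >= thres:
                vocab.add(term)
    return vocab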
Example #4
def get_weak_label_data(vocab, true_terms_file, tok_texts_file, task):
    terms_true_list = datautils.load_json_objs(true_terms_file)
    tok_texts = datautils.read_lines(tok_texts_file)
    # print(len(terms_true_list), tok_texts_file, len(tok_texts))
    if len(terms_true_list) != len(tok_texts):
        print(len(terms_true_list), len(tok_texts))
    assert len(terms_true_list) == len(tok_texts)

    # Reserve index 0 for out-of-vocabulary words (see .get(w, 0) below).
    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = list()
    word_idx_seq_list = list()
    for terms_true, tok_text in zip(terms_true_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, terms_true)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    # Fixed seed keeps the shuffle reproducible; the last 2000
    # shuffled samples are held out for validation.
    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    terms_true_list_valid = [terms_true_list[idx] for idx in idxs_valid]
    aspect_true_list, opinion_true_list = None, None
    if task != 'opinion':
        aspect_true_list = terms_true_list_valid
    if task != 'aspect':
        opinion_true_list = terms_true_list_valid
    valid_data = ValidData(None, label_seq_list_valid, word_idx_seq_list_valid,
                           None, tok_texts_valid, aspect_true_list,
                           opinion_true_list)

    return train_data, valid_data
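
label_sentence is defined elsewhere; a common choice for this kind of weak labeling is BIO-style tagging, sketched here under that assumption (the 0/1/2 encoding is illustrative, not confirmed by the source):

def label_sentence_bio(words, terms):
    # 0 = outside any term, 1 = first word of a term, 2 = inside a term.
    labels = [0] * len(words)
    for term in terms:
        term_words = term.split()
        n = len(term_words)
        for i in range(len(words) - n + 1):
            if words[i:i + n] == term_words:
                labels[i] = 1
                for j in range(i + 1, i + n):
                    labels[j] = 2
    return labels

print(label_sentence_bio('the battery life is great'.split(), ['battery life']))
# [0, 1, 2, 0, 0]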
Example #5
def get_data_semeval(train_sents_file, train_tok_text_file,
                     train_valid_split_file, test_sents_file,
                     test_tok_text_file, vocab, n_train, task):
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    sents = datautils.load_json_objs(train_sents_file)
    # texts = utils.read_lines(train_tok_text_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train = list(), list()
    sents_valid, tok_texts_valid = list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts,
                                     word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    # A non-negative n_train caps the number of training samples.
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]

    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid,
                                  word_span_seqs_valid, vocab, task)

    sents_test = datautils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    print('get test')
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test,
                                 vocab, task)
    return train_data, valid_data, test_data
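
A hypothetical invocation with placeholder paths (n_train=-1 leaves the training set untruncated, per the n_train > -1 check above):

vocab = datautils.read_lines('data/vocab.txt')  # placeholder path
train_data, valid_data, test_data = get_data_semeval(
    'data/train_sents.json', 'data/train_tok_pos.txt',
    'data/train_valid_split.txt', 'data/test_sents.json',
    'data/test_tok_pos.txt', vocab, n_train=-1, task='aspect')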
Example #6
def __init__(self, opinion_terms_vocab_file):
    self.opinion_terms_vocab = set(
        datautils.read_lines(opinion_terms_vocab_file))
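
datautils.read_lines appears in every example above; it presumably reads a text file and returns its lines. A minimal sketch under that assumption:

def read_lines(filename):
    # Read a UTF-8 file, returning its lines without trailing newlines.
    with open(filename, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]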