def __load_data(dep_tags_file, pos_tags_file, sents_file, train_valid_split_file):
    """Load per-sentence annotations and split them into train and validation sets.

    The first line of ``train_valid_split_file`` is read as whitespace-separated
    integers, one per sentence: ``0`` sends the sentence to the training set,
    any other value sends it to the validation set.

    Args:
        dep_tags_file: path passed to ``datautils.load_dep_tags_list``.
        pos_tags_file: path passed to ``datautils.load_pos_tags``.
        sents_file: path passed to ``datautils.load_json_objs``.
        train_valid_split_file: path of the file whose first line holds the
            split labels.

    Returns:
        A ``(data_train, data_valid)`` tuple of ``RuleMineData`` objects, each
        built from the dependency tags, POS tags and sentences of its split.

    Raises:
        AssertionError: if the split labels, dependency tags, POS tags and
            sentences do not all have the same number of entries.
    """
    tvs_line = datautils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(sents_file)

    # zip() below stops at the shortest input, so a length mismatch in ANY of
    # the four sequences would silently drop sentences. Check all of them.
    n_labels = len(tvs_arr)
    assert n_labels == len(dep_tags_list), (
        'split labels ({}) and dep tags ({}) differ in length'.format(
            n_labels, len(dep_tags_list)))
    assert n_labels == len(pos_tags_list), (
        'split labels ({}) and pos tags ({}) differ in length'.format(
            n_labels, len(pos_tags_list)))
    assert n_labels == len(sents), (
        'split labels ({}) and sentences ({}) differ in length'.format(
            n_labels, len(sents)))

    # Each bucket holds (dep_tags, pos_tags, sents) lists for one split.
    train_bucket = ([], [], [])
    valid_bucket = ([], [], [])
    for tvs_label, dep_tags, pos_tags, sent in zip(
            tvs_arr, dep_tags_list, pos_tags_list, sents):
        bucket = train_bucket if tvs_label == 0 else valid_bucket
        bucket[0].append(dep_tags)
        bucket[1].append(pos_tags)
        bucket[2].append(sent)

    data_train = RuleMineData(*train_bucket)
    data_valid = RuleMineData(*valid_bucket)
    return data_train, data_valid
def gen_term_hit_rate_file(mine_tool, train_sents_file, dep_tags_file, pos_tags_file, dst_file):
    """Write a CSV of per-term hit rates, sorted from highest to lowest rate.

    A term's "hit count" is how many times it appears in the term lists
    returned by ``mine_tool.terms_list_from_sents``. Its "match count" is how
    many times ``mine_tool.get_terms_by_matching`` returns it across the
    sentences. The rate is ``hit / (match + 1e-5)``; terms with a match count
    of zero are left out.

    Args:
        mine_tool: object providing ``terms_list_from_sents`` and
            ``get_terms_by_matching``.
        train_sents_file: path passed to ``datautils.load_json_objs``; each
            loaded object must have a ``'text'`` entry.
        dep_tags_file: path passed to ``datautils.load_dep_tags_list``.
        pos_tags_file: path passed to ``datautils.load_pos_tags``.
        dst_file: path of the CSV to write (columns ``term`` and ``rate``).
    """
    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(train_sents_file)

    # Numerator: occurrences of each term in the per-sentence term lists.
    hit_counts = {}
    for sent_terms in mine_tool.terms_list_from_sents(sents):
        for term in sent_terms:
            hit_counts[term] = hit_counts.get(term, 0) + 1

    vocab = set(hit_counts.keys())
    print(len(vocab), 'terms')

    # Denominator: occurrences of each term found by matching in the text.
    match_counts = dict.fromkeys(vocab, 0)
    for dep_tags, pos_tags, sent in zip(dep_tags_list, pos_tags_list, sents):
        lowered_text = sent['text'].lower()
        matched = mine_tool.get_terms_by_matching(dep_tags, pos_tags, lowered_text, vocab)
        for term in matched:
            match_counts[term] = match_counts.get(term, 0) + 1

    rows = [
        (term, n_hits / (match_counts.get(term, 0) + 1e-5))
        for term, n_hits in hit_counts.items()
        if match_counts.get(term, 0) > 0
    ]
    # A reverse sort is stable, so ties keep their original relative order.
    rows.sort(key=lambda row: row[1], reverse=True)

    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        frame = pd.DataFrame(rows, columns=['term', 'rate'])
        frame.to_csv(fout, float_format='%.4f', index=False)
def gen_filter_terms_vocab_file(mine_tool, dep_tags_file, pos_tags_file, sents_file, term_filter_rate,
                                output_file):
    """Compute the filter-terms vocabulary and write it out, one term per line.

    The vocabulary is whatever ``__get_term_filter_dict`` yields when iterated,
    given the loaded dependency tags, POS tags, the term lists from
    ``mine_tool.terms_list_from_sents`` and ``term_filter_rate``.

    Args:
        mine_tool: object providing ``terms_list_from_sents``; also forwarded
            to ``__get_term_filter_dict``.
        dep_tags_file: path passed to ``datautils.load_dep_tags_list``.
        pos_tags_file: path passed to ``datautils.load_pos_tags``.
        sents_file: path passed to ``datautils.load_json_objs``.
        term_filter_rate: threshold forwarded to ``__get_term_filter_dict``.
        output_file: destination path, written as UTF-8 with ``\\n`` line ends.
    """
    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sents = datautils.load_json_objs(sents_file)

    sent_terms = mine_tool.terms_list_from_sents(sents)
    vocab_to_filter = __get_term_filter_dict(
        dep_tags_list, pos_tags_list, sent_terms, term_filter_rate, mine_tool)

    with open(output_file, 'w', encoding='utf-8', newline='\n') as fout:
        fout.writelines('{}\n'.format(term) for term in vocab_to_filter)
def get_data_semeval(train_sents_file, train_tok_text_file, train_valid_split_file, test_sents_file,
                     test_tok_text_file, vocab, n_train, task):
    """Build the train, validation and test data sets.

    The training sentences are divided using the integers on the first line of
    ``train_valid_split_file``: ``0`` means train, anything else means
    validation. The test set is loaded from its own pair of files.

    Args:
        train_sents_file: path passed to ``datautils.load_json_objs``.
        train_tok_text_file: path passed to ``load_token_pos_file``.
        train_valid_split_file: path of the file whose first line holds the
            whitespace-separated split labels.
        test_sents_file: path passed to ``datautils.load_json_objs``.
        test_tok_text_file: path passed to ``load_token_pos_file``.
        vocab: forwarded to ``data_from_sents_file`` and ``__get_valid_data``.
        n_train: when greater than -1, keep only the first ``n_train``
            training examples; otherwise keep them all.
        task: forwarded to ``data_from_sents_file`` and ``__get_valid_data``.

    Returns:
        ``(train_data, valid_data, test_data)``.
    """
    first_line = datautils.read_lines(train_valid_split_file)[0]
    split_flags = [int(tok) for tok in first_line.split()]
    sents = datautils.load_json_objs(train_sents_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    train_sents, train_texts, train_spans = [], [], []
    valid_sents, valid_texts, valid_spans = [], [], []
    for flag, sent, tok_text, spans in zip(split_flags, sents, tok_texts, word_span_seqs):
        if flag != 0:
            valid_sents.append(sent)
            valid_texts.append(tok_text)
            valid_spans.append(spans)
            continue
        train_sents.append(sent)
        train_texts.append(tok_text)
        train_spans.append(spans)

    train_labels, train_word_idxs = data_from_sents_file(
        train_sents, train_texts, train_spans, vocab, task)
    if n_train > -1:
        # Optionally cap the number of training examples.
        train_labels = train_labels[:n_train]
        train_word_idxs = train_word_idxs[:n_train]
    train_data = TrainData(train_labels, train_word_idxs)

    valid_data = __get_valid_data(valid_sents, valid_texts, valid_spans, vocab, task)

    test_sents = datautils.load_json_objs(test_sents_file)
    test_texts, test_spans = load_token_pos_file(test_tok_text_file)
    print('get test')
    test_data = __get_valid_data(test_sents, test_texts, test_spans, vocab, task)

    return train_data, valid_data, test_data
def __run_with_mined_rules(mine_tool, rule_patterns_file, term_hit_rate_file, dep_tags_file, pos_tags_file,
                           sent_texts_file, filter_terms_vocab_file, term_hit_rate_thres=0.6,
                           output_result_file=None, sents_file=None):
    """Extract terms from every sentence with mined rules plus vocabulary matching.

    For each sentence the result is the union of:
      * terms found by every level-1 rule pattern,
      * terms found by every level-2 rule pattern (which also receive the
        level-1 terms found for that sentence),
      * terms returned by ``mine_tool.get_terms_by_matching`` against the
        vocabulary from ``ruleutils.get_term_vocab``.

    Args:
        mine_tool: forwarded to the ``ruleutils`` pattern finders; must also
            provide ``get_terms_by_matching`` and ``terms_list_from_sents``.
        rule_patterns_file: path passed to ``ruleutils.load_rule_patterns_file``.
        term_hit_rate_file: path passed to ``ruleutils.get_term_vocab``.
        dep_tags_file: path passed to ``datautils.load_dep_tags_list``.
        pos_tags_file: path passed to ``datautils.load_pos_tags``.
        sent_texts_file: path of the sentence texts, read with
            ``datautils.read_lines``.
        filter_terms_vocab_file: path of the filter terms, read with
            ``datautils.read_lines`` and held as a set.
        term_hit_rate_thres: threshold passed to ``ruleutils.get_term_vocab``.
        output_result_file: if not None, results go to ``__write_rule_results``.
        sents_file: if not None, results are scored with ``__evaluate`` against
            the terms derived from this file.

    Returns:
        None.
    """
    l1_rules, l2_rules = ruleutils.load_rule_patterns_file(rule_patterns_file)
    term_vocab = ruleutils.get_term_vocab(term_hit_rate_file, term_hit_rate_thres)

    dep_tags_list = datautils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = datautils.load_pos_tags(pos_tags_file)
    sent_texts = datautils.read_lines(sent_texts_file)
    filter_terms_vocab = set(datautils.read_lines(filter_terms_vocab_file))

    terms_sys_list = []
    per_sentence = zip(dep_tags_list, pos_tags_list, sent_texts)
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(per_sentence):
        found = set()

        # Level-1 hits are tracked separately because level-2 patterns use them.
        l1_found = set()
        for pattern in l1_rules:
            hits = ruleutils.find_terms_by_l1_pattern(
                pattern, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab)
            found.update(hits)
            l1_found.update(hits)

        for pattern in l2_rules:
            hits = ruleutils.find_terms_by_l2_pattern(
                pattern, dep_tag_seq, pos_tag_seq, mine_tool, filter_terms_vocab, l1_found)
            found.update(hits)

        found.update(
            mine_tool.get_terms_by_matching(dep_tag_seq, pos_tag_seq, sent_text, term_vocab))

        terms_sys_list.append(found)
        if sent_idx % 10000 == 0:
            # Coarse progress indicator.
            print(sent_idx)

    if output_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, output_result_file)

    if sents_file is None:
        return

    sents = datautils.load_json_objs(sents_file)
    terms_list_true = mine_tool.terms_list_from_sents(sents)
    gold_texts = [sent['text'] for sent in sents]
    # The value returned by __evaluate is not used here.
    __evaluate(terms_sys_list, terms_list_true, dep_tags_list, pos_tags_list, gold_texts)
def get_weak_label_data(vocab, true_terms_file, tok_texts_file, task, n_valid=2000, seed=3719):
    """Build train and validation data from weakly labeled, tokenized texts.

    Each tokenized text is split on single spaces, labeled with
    ``label_sentence`` using its term list, and mapped to word indices
    (position in ``vocab`` plus one; ``0`` for words not in ``vocab``). The
    examples are then shuffled with a seeded permutation and the last
    ``n_valid`` positions of the permutation become the validation set.

    Args:
        vocab: iterable of words defining the word-index mapping.
        true_terms_file: path passed to ``datautils.load_json_objs``; must
            yield one entry per line of ``tok_texts_file``.
        tok_texts_file: path of the tokenized texts, read with
            ``datautils.read_lines``.
        task: if not ``'opinion'``, the validation term lists are supplied as
            aspect terms; if not ``'aspect'``, they are supplied as opinion
            terms (any other value supplies both).
        n_valid: number of examples held out for validation. If there are
            fewer examples than this, all of them go to validation and the
            training set is empty.
        seed: value passed to ``np.random.seed`` before shuffling. NOTE: this
            reseeds NumPy's global random state, which affects later
            ``np.random`` calls made by the caller.

    Returns:
        ``(train_data, valid_data)``: a ``TrainData`` and a ``ValidData``.

    Raises:
        ValueError: if ``n_valid`` is negative.
        AssertionError: if the two input files have different entry counts.
    """
    if n_valid < 0:
        raise ValueError('n_valid must be non-negative, got {}'.format(n_valid))

    terms_true_list = datautils.load_json_objs(true_terms_file)
    tok_texts = datautils.read_lines(tok_texts_file)
    if len(terms_true_list) != len(tok_texts):
        # Show both counts before the assertion below fails.
        print(len(terms_true_list), len(tok_texts))
    assert len(terms_true_list) == len(tok_texts)

    # Index 0 is reserved for out-of-vocabulary words.
    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}

    label_seq_list = []
    word_idx_seq_list = []
    for terms_true, tok_text in zip(terms_true_list, tok_texts):
        words = tok_text.split(' ')
        label_seq_list.append(label_sentence(words, terms_true))
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    n_total = len(label_seq_list)
    np.random.seed(seed)
    perm = np.random.permutation(n_total)
    # Clamp at 0 so a small data set gives an empty training split explicitly
    # rather than through a negative slice bound.
    n_train = max(0, n_total - n_valid)
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    train_data = TrainData(
        [label_seq_list[idx] for idx in idxs_train],
        [word_idx_seq_list[idx] for idx in idxs_train])

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    terms_true_list_valid = [terms_true_list[idx] for idx in idxs_valid]

    aspect_true_list = terms_true_list_valid if task != 'opinion' else None
    opinion_true_list = terms_true_list_valid if task != 'aspect' else None

    valid_data = ValidData(
        None, label_seq_list_valid, word_idx_seq_list_valid, None,
        tok_texts_valid, aspect_true_list, opinion_true_list)
    return train_data, valid_data