import os

import numpy as np

# project-local modules (import paths assumed); other names referenced below
# (RuleMineData, TrainData, ValidData, label_sentence, load_token_pos_file,
# data_from_sents_file and several __-prefixed helpers) are defined elsewhere
# in this repo
import config
import modelutils
import opinionrules
import rules
import rulescommon
import utils


def __check_errors():
    # print sentences where DE-CNN is wrong but the rule-based and NRDJ
    # predictions are both correct
    sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    lstmcrf_aspects_file = 'd:/data/aspect/semeval14/lstmcrf-aspects.txt'
    decnn_aspects_file = 'd:/data/aspect/semeval14/pred-de-cnn-lap.json'
    lstmcrf_opinions_file = 'd:/data/aspect/semeval14/lstmcrf-opinions.txt'
    nrdj_aspects_file = 'd:/data/aspect/semeval14/nrdj-aspects.txt'
    nrdj_opinions_file = 'd:/data/aspect/semeval14/nrdj-opinions.txt'
    rule_aspects_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'

    sents = utils.load_json_objs(sents_file)
    lc_aspects_list = utils.load_json_objs(lstmcrf_aspects_file)
    decnn_sents = utils.load_json_objs(decnn_aspects_file)
    decnn_aspects_list = list()
    for dcs in decnn_sents:
        terms = [t['term'].lower() for t in dcs.get('terms', list())]
        decnn_aspects_list.append(terms)
    nrdj_aspects_list = utils.load_json_objs(nrdj_aspects_file)
    rule_aspects_list = utils.load_json_objs(rule_aspects_file)

    for sent, lc_aspects, decnn_aspects, nrdj_aspects, rule_aspects in zip(
            sents, lc_aspects_list, decnn_aspects_list, nrdj_aspects_list, rule_aspects_list):
        terms = [t['term'].lower() for t in sent.get('terms', list())]
        lc_correct = __is_correct(lc_aspects, terms)
        dc_correct = __is_correct(decnn_aspects, terms)
        nrdj_correct = __is_correct(nrdj_aspects, terms)
        rule_correct = __is_correct(rule_aspects, terms)
        if not dc_correct and rule_correct and nrdj_correct:
            print(sent['text'])
            print(terms)
            print('lstm', lc_aspects)
            print('rule', rule_aspects)
            print('nrdj', nrdj_aspects)
            print('decnn', decnn_aspects)
            print()

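# __is_correct is defined elsewhere in this repo and is not shown here. Below
# is a minimal sketch consistent with how __check_errors uses it (a prediction
# counts as "correct" only when it matches the gold terms exactly, ignoring
# order and duplicates); the real helper may differ.
def __is_correct_sketch(terms_sys, terms_true):
    return set(terms_sys) == set(terms_true)
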
def __merge_train_test(train_sents_file, test_sents_file, train_valid_split_file,
                       dst_sents_file, dst_datasplit_file):
    train_sents = utils.load_json_objs(train_sents_file)
    test_sents = utils.load_json_objs(test_sents_file)
    all_sents = train_sents + test_sents
    utils.save_json_objs(all_sents, dst_sents_file)

    train_valid_split_labels = utils.read_lines(train_valid_split_file)[0]
    train_valid_split_labels = [int(v) for v in train_valid_split_labels.split(' ')]
    # 0/1 = train/valid (from the existing split file); 2 marks the appended test sentences
    all_data_split_labels = train_valid_split_labels + [2 for _ in range(len(test_sents))]
    with open(dst_datasplit_file, 'w', encoding='utf-8') as fout:
        fout.write('{}\n'.format(' '.join([str(v) for v in all_data_split_labels])))

def check_unseen_terms():
    # note: despite the aspect-named variables, this currently evaluates
    # opinion-term predictions (the nrdj files below point at opinion output,
    # and the gold terms are taken from each sentence's 'opinions' field)
    train_sents_file = 'd:/data/aspect/semeval14/laptops/laptops_train_sents.json'
    train_aspect_terms, train_opinion_terms = __get_all_terms(train_sents_file)

    sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    lstmcrf_aspects_file = 'd:/data/aspect/semeval14/lstmcrf-aspects.txt'
    lstmcrf_opinions_file = 'd:/data/aspect/semeval14/lstmcrf-opinions.txt'
    nrdj_aspects_file = 'd:/data/aspect/semeval14/nrdj-opinions-malt.txt'
    nrdj_opinions_file = 'd:/data/aspect/semeval14/nrdj-opinions-malt.txt'
    rule_aspects_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'

    sents = utils.load_json_objs(sents_file)
    lc_aspects_list = utils.load_json_objs(lstmcrf_aspects_file)
    nrdj_aspects_list = utils.load_json_objs(nrdj_aspects_file)
    rule_aspects_list = utils.load_json_objs(rule_aspects_file)

    terms_true_list, terms_nrdj_list = list(), list()
    n_true, n_nrdj, n_hit = 0, 0, 0
    n_lc, n_lc_hit = 0, 0
    for sent, lc_aspects, nrdj_aspects, rule_aspects in zip(
            sents, lc_aspects_list, nrdj_aspects_list, rule_aspects_list):
        # terms = [t['term'].lower() for t in sent.get('terms', list())]
        terms = [t.lower() for t in sent.get('opinions', list())]
        # terms = [t for t in terms if t in train_aspect_terms]
        # print(terms, nrdj_aspects)
        terms_true_list.append(terms)
        terms_nrdj_list.append(nrdj_aspects)
        n_true += len(terms)
        n_nrdj += len(nrdj_aspects)
        n_hit += utils.count_hit(terms, nrdj_aspects)
        for t in terms:
            if t not in nrdj_aspects:
                print(t)
                print(sent['text'])
        n_lc += len(lc_aspects)
        n_lc_hit += utils.count_hit(terms, lc_aspects)
        # lc_correct = __is_correct(lc_aspects, terms)
        # nrdj_correct = __is_correct(nrdj_aspects, terms)
        # rule_correct = __is_correct(rule_aspects, terms)
        # if not lc_correct and not rule_correct and nrdj_correct:
        #     print(sent['text'])
        #     print(terms)
        #     print(lc_aspects)
        #     print(rule_aspects)
        #     print(nrdj_aspects)
        #     print()

    print(n_true, n_nrdj)
    p, r, f1 = utils.prf1(n_true, n_nrdj, n_hit)
    print(p, r, f1)
    p, r, f1 = utils.prf1(n_true, n_lc, n_lc_hit)
    print(p, r, f1)

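# check_unseen_terms relies on utils.count_hit and utils.prf1, which are not
# shown in this file. These are sketches consistent with their call sites and
# with the inline F1 computation in __dp_se / __dp_hl04 below (assumptions,
# not the actual utils code): count_hit counts predicted terms that appear in
# the gold list, and prf1(n_true, n_sys, n_hit) returns precision, recall, F1.
def count_hit_sketch(terms_true, terms_sys):
    terms_true = set(terms_true)
    return sum(1 for t in terms_sys if t in terms_true)


def prf1_sketch(n_true, n_sys, n_hit):
    p = n_hit / n_sys if n_sys else 0.0
    r = n_hit / n_true if n_true else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f1
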
def __count_rule_extracted_terms():
    # aspect_terms_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'
    # opinion_terms_file = 'd:/data/aspect/semeval14/laptops/laptops-test-opinion-rule-result.txt'
    aspect_terms_file = 'd:/data/aspect/semeval15/restaurants/restaurants-test-aspect-rule-result.txt'
    opinion_terms_file = 'd:/data/aspect/semeval15/restaurants/restaurants-test-opinion-rule-result.txt'
    aspect_terms_list = utils.load_json_objs(aspect_terms_file)
    opinion_terms_list = utils.load_json_objs(opinion_terms_file)
    num_aspect_terms = sum(len(terms) for terms in aspect_terms_list)
    print(num_aspect_terms)
    num_opinion_terms = sum(len(terms) for terms in opinion_terms_list)
    print(num_opinion_terms)

def __texts_file_from_sents(sents_file, dst_texts_file):
    sents = utils.load_json_objs(sents_file)
    with open(dst_texts_file, 'w', encoding='utf-8') as fout:
        for sent in sents:
            sent_text = sent['text']
            assert '\n' not in sent_text
            fout.write('{}\n'.format(sent_text))

def __gen_aspect_noun_filter_dict_file(sents_file, tok_texts_file, pos_tags_file,
                                       common_words_file, dst_file):
    sents = utils.load_json_objs(sents_file)
    tok_texts = utils.read_lines(tok_texts_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)

    term_sys_cnts, term_hit_cnts = dict(), dict()
    for sent_idx, (sent, tok_text, pos_tags) in enumerate(zip(sents, tok_texts, pos_tags_list)):
        sent_words = tok_text.split(' ')
        noun_phrases = rules.rec_rule1(sent_words, pos_tags, None)
        term_objs = sent.get('terms', list())
        terms_true = {term_obj['term'].lower() for term_obj in term_objs}
        for n in noun_phrases:
            sys_cnt = term_sys_cnts.get(n, 0)
            term_sys_cnts[n] = sys_cnt + 1
            if n in terms_true:
                hit_cnt = term_hit_cnts.get(n, 0)
                term_hit_cnts[n] = hit_cnt + 1

    common_words = utils.read_lines(common_words_file)
    filter_terms = set(common_words)
    for term, sys_cnt in term_sys_cnts.items():
        hit_cnt = term_hit_cnts.get(term, 0)
        # print(term, hit_cnt, sys_cnt)
        # filter out noun phrases that match a true aspect term in fewer than
        # 40% of the sentences they are extracted from
        if hit_cnt / sys_cnt < 0.4:
            filter_terms.add(term)

    fout = open(dst_file, 'w', encoding='utf-8', newline='\n')
    for t in filter_terms:
        fout.write('{}\n'.format(t))
    fout.close()

def __dp_se():
    dataset = 'semeval14'
    # dataset = 'semeval15'
    # sub_dataset = 'restaurants'
    sub_dataset = 'laptops'
    sents_file = 'd:/data/aspect/{}/{}/{}_test_sents.json'.format(dataset, sub_dataset, sub_dataset)
    tok_texts_file = 'd:/data/aspect/{}/{}/{}_test_texts_tok.txt'.format(dataset, sub_dataset, sub_dataset)
    sent_texts_file = 'd:/data/aspect/{}/{}/{}_test_texts.txt'.format(dataset, sub_dataset, sub_dataset)
    dep_file = 'd:/data/aspect/{}/{}/{}-test-rule-dep.txt'.format(dataset, sub_dataset, sub_dataset)
    pos_file = 'd:/data/aspect/{}/{}/{}-test-rule-pos.txt'.format(dataset, sub_dataset, sub_dataset)
    term_hit_rate_file = 'd:/data/aspect/{}/{}/opinion-term-hit-rate.txt'.format(dataset, sub_dataset)

    sents = utils.load_json_objs(sents_file)
    seed_opinions = __read_seed_opinions()
    pos_tags_list = utils.load_pos_tags(pos_file)
    dep_tags_list = utils.load_dep_tags_list(dep_file)
    aspect_terms_list, opinion_terms_list = __get_true_terms_se(sents)
    term_vocab = rulescommon.get_term_vocab(term_hit_rate_file, 0.6)

    cnt_hit, cnt_sys, cnt_true = __dp_new(
        aspect_terms_list, opinion_terms_list, tok_texts_file, sent_texts_file,
        dep_tags_list, pos_tags_list, seed_opinions, term_vocab)
    prec = cnt_hit / cnt_sys
    recall = cnt_hit / cnt_true
    print(prec, recall, 2 * prec * recall / (prec + recall), cnt_hit, cnt_sys, cnt_true)

def __load_data(dep_tags_file, pos_tags_file, sents_file, train_valid_split_file):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sents = utils.load_json_objs(sents_file)
    assert len(tvs_arr) == len(dep_tags_list)

    dep_tags_list_train, dep_tags_list_valid = list(), list()
    pos_tags_list_train, pos_tags_list_valid = list(), list()
    sents_train, sents_valid = list(), list()
    for tvs_label, dep_tags, pos_tags, sent in zip(tvs_arr, dep_tags_list, pos_tags_list, sents):
        if tvs_label == 0:
            # 0 marks training sentences; anything else goes to validation
            dep_tags_list_train.append(dep_tags)
            pos_tags_list_train.append(pos_tags)
            sents_train.append(sent)
        else:
            dep_tags_list_valid.append(dep_tags)
            pos_tags_list_valid.append(pos_tags)
            sents_valid.append(sent)

    data_train = RuleMineData(dep_tags_list_train, pos_tags_list_train, sents_train)
    data_valid = RuleMineData(dep_tags_list_valid, pos_tags_list_valid, sents_valid)
    return data_train, data_valid

def __get_all_terms(sents_file):
    sents = utils.load_json_objs(sents_file)
    aspect_terms, opinion_terms = set(), set()
    for s in sents:
        for t in s.get('terms', list()):
            aspect_terms.add(t['term'].lower())
        for t in s.get('opinions', list()):
            opinion_terms.add(t.lower())
    return aspect_terms, opinion_terms

def __missing_terms():
    opinion_terms_file = 'd:/data/aspect/semeval14/opinion-terms-full.txt'
    opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))
    train_sents = utils.load_json_objs(config.SE15R_FILES['train_sents_file'])
    test_sents = utils.load_json_objs(config.SE15R_FILES['test_sents_file'])

    train_terms = set()
    test_terms = dict()
    for s in train_sents:
        for t in s['opinions']:
            train_terms.add(t.lower())
    for s in test_sents:
        for t in s['opinions']:
            cnt = test_terms.get(t.lower(), 0)
            test_terms[t.lower()] = cnt + 1
            # test_terms.add(t.lower())

    # test-set opinion terms never seen in training, with their counts and
    # whether they appear in the opinion term vocabulary
    for t, cnt in test_terms.items():
        if t not in train_terms:
            print(t, cnt, t in opinion_terms_vocab)

def __run_with_mined_rules(mine_helper, rule_patterns_file, term_hit_rate_file, dep_tags_file,
                           pos_tags_file, sent_texts_file, filter_terms_vocab_file,
                           term_hit_rate_thres=0.6, dst_result_file=None, sents_file=None):
    l1_rules, l2_rules = rulescommon.load_rule_patterns_file(rule_patterns_file)
    term_vocab = rulescommon.get_term_vocab(term_hit_rate_file, term_hit_rate_thres)

    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_texts_file)
    filter_terms_vocab = set(utils.read_lines(filter_terms_vocab_file))
    # opinion_terms_vocab = set(utils.read_lines(opinion_terms_file))

    terms_sys_list = list()
    for sent_idx, (dep_tag_seq, pos_tag_seq, sent_text) in enumerate(
            zip(dep_tags_list, pos_tags_list, sent_texts)):
        terms = set()
        l1_terms_new = set()
        for p in l1_rules:
            terms_new = rulescommon.find_terms_by_l1_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab)
            terms.update(terms_new)
            l1_terms_new.update(terms_new)
        # the L2 patterns build on the terms already found by the L1 patterns
        for p in l2_rules:
            terms_new = rulescommon.find_terms_by_l2_pattern(
                p, dep_tag_seq, pos_tag_seq, mine_helper, filter_terms_vocab, l1_terms_new)
            terms.update(terms_new)
        terms_new = mine_helper.get_terms_by_matching(dep_tag_seq, pos_tag_seq, sent_text, term_vocab)
        terms.update(terms_new)
        terms_sys_list.append(terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(terms_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        # aspect_terms_true = utils.aspect_terms_list_from_sents(sents)
        terms_list_true = mine_helper.terms_list_from_sents(sents)
        sent_texts = [sent['text'] for sent in sents]
        correct_sent_idxs = __evaluate(
            terms_sys_list, terms_list_true, dep_tags_list, pos_tags_list, sent_texts)

def __gen_opinion_terms_file(sents_file, dst_terms_file):
    sents = utils.load_json_objs(sents_file)
    fout = open(dst_terms_file, 'w', encoding='utf-8')
    for sent in sents:
        terms = [t for t in sent.get('opinions', list())]
        if not terms:
            fout.write('\n')
        else:
            fout.write('{}\n'.format(','.join(terms)))
    fout.close()

def __check_difficulty():
    rest_sents_file = 'd:/data/aspect/semeval14/restaurants/restaurants_test_sents.json'
    lap_sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    rest_sents = utils.load_json_objs(rest_sents_file)
    lap_sents = utils.load_json_objs(lap_sents_file)

    def __count_sps(sents):
        # fraction of aspect terms that consist of more than one word
        sp_cnt, cnt = 0, 0
        for sent in sents:
            terms = [t['term'] for t in sent.get('terms', list())]
            for t in terms:
                if ' ' in t:
                    sp_cnt += 1
            cnt += len(terms)
        print(sp_cnt / cnt)

    __count_sps(rest_sents)
    __count_sps(lap_sents)

def __check_opinion_errors():
    terms_sys_list = utils.load_json_objs('d:/onedrive/opinion_terms_bert_output_r.txt')
    terms_sys_nr_list = utils.load_json_objs('d:/onedrive/opinion_terms_bert_output.txt')
    test_sents = utils.load_json_objs(config.SE15R_FILES['test_sents_file'])
    terms_true_list = [s['opinions'] for s in test_sents]
    for s, terms_true, terms_sys, terms_sys_nr in zip(
            test_sents, terms_true_list, terms_sys_list, terms_sys_nr_list):
        if not terms_true and not terms_sys:
            continue
        terms_true = [t.lower() for t in terms_true]
        # skip sentences where the predictions match the gold terms exactly
        if len(terms_true) == len(terms_sys) and __count_hits(terms_true, terms_sys) == len(terms_true):
            continue
        print(s['text'])
        print(terms_true, terms_sys, terms_sys_nr)
        print()

def __get_manual_feat(tok_texts_file, terms_file):
    tok_texts = utils.read_lines(tok_texts_file)
    terms_list = utils.load_json_objs(terms_file)
    feat_list = list()
    for terms_true, tok_text in zip(terms_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = modelutils.label_sentence(words, terms_true)
        # one-hot encode the 3-way label sequence
        feat_seq = np.zeros([len(label_seq), 3], np.int32)
        for i, v in enumerate(label_seq):
            feat_seq[i][v] = 1
        feat_list.append(feat_seq)
    return feat_list

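# modelutils.label_sentence is not shown in this file. Given the [len, 3]
# feature shape above, the labels are presumably a 3-way BIO-style scheme
# (0 = outside, 1 = begin, 2 = inside). A minimal sketch under that assumption
# (the real helper likely handles casing and overlaps more carefully):
def label_sentence_sketch(words, terms):
    labels = [0] * len(words)
    for term in terms:
        term_words = term.split(' ')
        for i in range(len(words) - len(term_words) + 1):
            if words[i:i + len(term_words)] == term_words:
                labels[i] = 1
                for j in range(i + 1, i + len(term_words)):
                    labels[j] = 2
                break
    return labels
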
def __split_training_set(train_sents_file, dst_file, n_dev_samples=None):
    sents = utils.load_json_objs(train_sents_file)
    n_sents = len(sents)
    if n_dev_samples is None:
        valid_data_percent = 0.2
        n_dev_samples = int(n_sents * valid_data_percent)

    perm = np.random.permutation(n_sents)
    valid_idxs = set(perm[:n_dev_samples])
    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        train_valid_labels = ['1' if i in valid_idxs else '0' for i in range(n_sents)]
        fout.write(' '.join(train_valid_labels))
        fout.write('\n')

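# A minimal round-trip sketch for the split file written above: it holds a
# single line of space-separated 0/1 flags, one per training sentence, with 1
# marking a held-out validation sentence. This matches how __load_data and
# get_data_semeval parse the file; the function name here is illustrative.
def read_train_valid_split_sketch(split_file):
    with open(split_file, encoding='utf-8') as f:
        return [int(v) for v in f.readline().split()]
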
def __load_terms_in_train(train_sents_file):
    sents_train = utils.load_json_objs(train_sents_file)
    terms_train = set()
    for sent in sents_train:
        terms = sent.get('terms', None)
        if terms is None:
            continue
        for t in terms:
            terms_train.add(t['term'].lower())
    # longest terms first
    terms_train = list(terms_train)
    terms_train.sort(key=lambda x: -len(x))
    return terms_train

def get_data_amazon_ao(vocab, aspect_terms_file, opinion_terms_file, tok_texts_file):
    aspect_terms_list = utils.load_json_objs(aspect_terms_file)
    opinion_terms_list = utils.load_json_objs(opinion_terms_file)
    tok_texts = utils.read_lines(tok_texts_file)
    assert len(aspect_terms_list) == len(tok_texts)
    assert len(opinion_terms_list) == len(tok_texts)

    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}  # index 0 is reserved for unknown words

    label_seq_list = list()
    word_idx_seq_list = list()
    for aspect_terms, opinion_terms, tok_text in zip(aspect_terms_list, opinion_terms_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, aspect_terms, opinion_terms)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    # fixed seed for a reproducible split; the last 2000 permuted samples are
    # held out for validation
    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    aspects_list_valid = [aspect_terms_list[idx] for idx in idxs_valid]
    opinions_list_valid = [opinion_terms_list[idx] for idx in idxs_valid]
    valid_data = ValidData(label_seq_list_valid, word_idx_seq_list_valid, tok_texts_valid,
                           aspects_list_valid, opinions_list_valid)

    return train_data, valid_data

def get_data_semeval(train_sents_file, train_tok_text_file, train_valid_split_file,
                     test_sents_file, test_tok_text_file, vocab, n_train, task):
    tvs_line = utils.read_lines(train_valid_split_file)[0]
    tvs_arr = [int(v) for v in tvs_line.split()]

    sents = utils.load_json_objs(train_sents_file)
    # texts = utils.read_lines(train_tok_text_file)
    tok_texts, word_span_seqs = load_token_pos_file(train_tok_text_file)

    sents_train, tok_texts_train, sents_valid, tok_texts_valid = list(), list(), list(), list()
    word_span_seqs_train, word_span_seqs_valid = list(), list()
    for label, s, t, span_seq in zip(tvs_arr, sents, tok_texts, word_span_seqs):
        if label == 0:
            sents_train.append(s)
            tok_texts_train.append(t)
            word_span_seqs_train.append(span_seq)
        else:
            sents_valid.append(s)
            tok_texts_valid.append(t)
            word_span_seqs_valid.append(span_seq)

    labels_list_train, word_idxs_list_train = data_from_sents_file(
        sents_train, tok_texts_train, word_span_seqs_train, vocab, task)
    if n_train > -1:
        labels_list_train = labels_list_train[:n_train]
        word_idxs_list_train = word_idxs_list_train[:n_train]
    train_data = TrainData(labels_list_train, word_idxs_list_train)

    valid_data = __get_valid_data(sents_valid, tok_texts_valid, word_span_seqs_valid, vocab, task)

    sents_test = utils.load_json_objs(test_sents_file)
    texts_test, word_span_seqs_test = load_token_pos_file(test_tok_text_file)
    test_data = __get_valid_data(sents_test, texts_test, word_span_seqs_test, vocab, task)

    return train_data, valid_data, test_data

def __gen_yelp_path_count_feat(mentions_file, mention_id_idx_file, path_strs, for_pra, dst_file):
    mentions = utils.load_json_objs(mentions_file)
    mention_ids = {m['mention_id'] for m in mentions}
    mention_candidates = utils.load_candidates_for_mentions(config.YELP_CANDIDATES_FILE, mention_ids)
    mention_id_to_idx = utils.load_id_to_idx(mention_id_idx_file)
    biz_id_to_idx = utils.load_id_to_idx(config.YELP_BIZ_ID_TO_IDX_FILE)

    if for_pra:
        commuting_matrix_files = [os.path.join(
            config.YELP_DATA_DIR, 'network/{}_norm.txt'.format(s)) for s in path_strs]
    else:
        commuting_matrix_files = [os.path.join(
            config.YELP_DATA_DIR, 'network/{}.txt'.format(s)) for s in path_strs]

    gen_path_count_feats_file(config.YELP_DATA_INFO_FILE, mention_candidates, mention_id_to_idx,
                              biz_id_to_idx, commuting_matrix_files, for_pra, dst_file)

def __gen_filter_terms_vocab_file(mine_helper, dep_tags_file, pos_tags_file, sents_file, dst_file):
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sents = utils.load_json_objs(sents_file)
    # aspect_terms_list = utils.aspect_terms_list_from_sents(sents)
    terms_list = mine_helper.terms_list_from_sents(sents)
    filter_terms_vocab = __get_term_filter_dict(
        dep_tags_list, pos_tags_list, terms_list, term_filter_rate, mine_helper)
    with open(dst_file, 'w', encoding='utf-8', newline='\n') as fout:
        for t in filter_terms_vocab:
            fout.write('{}\n'.format(t))

def get_flat_feats_test(mentions_file):
    # mention_candidates_dict, mention_id_to_idx, biz_id_idx_dict, mention_feat_list,
    # biz_feat_list and mention_cand_feats_list come from the enclosing scope
    mentions = utils.load_json_objs(mentions_file)
    feats_list = [modelutils.get_ind_flat_feat_test(
        mentions, mention_candidates_dict, mention_id_to_idx, biz_id_idx_dict, mf, bf
    ) for mf, bf in zip(mention_feat_list, biz_feat_list)]
    if mention_cand_feats_list is not None:
        X_bnd_list = [modelutils.get_bnd_feat_test(
            mentions, mention_candidates_dict, feats) for feats in mention_cand_feats_list]
        feats_list += X_bnd_list
    X = modelutils.concatenate_feats(feats_list)
    y_true = utils.get_y_true(mentions, mention_candidates_dict, 'target_id')
    return modelutils.DataTest(X, y_true)

def __gen_aspect_opinion_file(sents_file, dst_aspect_file, dst_opinion_file):
    sents = utils.load_json_objs(sents_file)
    aspects_list, opinions_list = list(), list()
    for sent in sents:
        aspects_list.append([t['term'] for t in sent.get('terms', list())])
        opinions_list.append(sent.get('opinions', list()))

    def __write_terms_file(terms_list, dst_file):
        # one line per sentence, terms comma-separated
        with open(dst_file, 'w', encoding='utf-8') as fout:
            for terms in terms_list:
                fout.write('{}\n'.format(','.join(terms)))

    __write_terms_file(aspects_list, dst_aspect_file)
    __write_terms_file(opinions_list, dst_opinion_file)

def __dataset_statistics():
    # sents_file = 'd:/data/aspect/semeval14/laptops/laptops_test_sents.json'
    # sents_file = 'd:/data/aspect/semeval14/restaurants/restaurants_test_sents.json'
    # sents_file = 'd:/data/aspect/semeval14/restaurants/restaurants_train_sents.json'
    # sents_file = 'd:/data/aspect/semeval15/restaurants/restaurants_train_sents.json'
    sents_file = 'd:/data/aspect/semeval15/restaurants/restaurants_test_sents.json'
    sents = utils.load_json_objs(sents_file)
    print(len(sents), 'sentences')
    at_cnt, ot_cnt = 0, 0
    for s in sents:
        at_cnt += len(s.get('terms', list()))
        ot_cnt += len(s.get('opinions', list()))
    print(at_cnt, 'aspect terms')
    print(ot_cnt, 'opinion terms')

def __dp_hl04():
    reviews = utils.load_json_objs(config.REVIEWS_FILE_HL04)
    sents = utils.load_json_objs(config.SENTS_FILE_HL04)
    review_prod_dict = {r['review_id']: r['file'] for r in reviews}
    prod_set = {v for v in review_prod_dict.values()}
    prod_sents_dict = {v: list() for v in prod_set}
    for i, sent in enumerate(sents):
        prod_sents_dict[review_prod_dict[sent['review_id']]].append(i)

    # seed_opinions = utils.read_lines(config.SEED_OPINIONS_FILE_HL04)
    seed_opinions = __read_seed_opinions()
    pos_tags_list = utils.load_pos_tags(config.SENT_POS_FILE_HL04)
    dep_tags_list = utils.load_dep_tags_list(config.SENT_DEPENDENCY_FILE_HL04)
    assert len(pos_tags_list) == len(sents)
    assert len(dep_tags_list) == len(sents)

    cnt_hit, cnt_sys, cnt_true = __dp_new(sents, dep_tags_list, pos_tags_list, seed_opinions)
    # cnt_hit, cnt_sys, cnt_true = 0, 0, 0
    # for prod, sent_idxs in prod_sents_dict.items():
    #     print(prod)
    #     # if prod != 'Canon G3.txt':
    #     #     continue
    #
    #     prod_sents = [sents[i] for i in sent_idxs]
    #     prod_pos_tags_list = [pos_tags_list[i] for i in sent_idxs]
    #     prod_dep_tags_list = [dep_tags_list[i] for i in sent_idxs]
    #     n_hit, n_sys, n_true = __dp_new(prod_sents, prod_dep_tags_list, prod_pos_tags_list, seed_opinions)
    #     cnt_hit += n_hit
    #     cnt_sys += n_sys
    #     cnt_true += n_true
    #     # break

    prec = cnt_hit / cnt_sys
    recall = cnt_hit / cnt_true
    print(prec, recall, 2 * prec * recall / (prec + recall), cnt_hit, cnt_sys, cnt_true)

def __semeval_rule_insight():
    train_file = 'd:/data/aspect/semeval14/Laptops_Train.json'
    test_file = 'd:/data/aspect/semeval14/Laptops_Test_Gold.json'
    sents_train = utils.load_json_objs(train_file)
    sents_test = utils.load_json_objs(test_file)

    def __count_terms(sents):
        cnt_dict = dict()
        for sent in sents:
            aspect_terms = sent.get('terms', None)
            if aspect_terms is not None:
                for term in aspect_terms:
                    s = term['term']
                    cnt = cnt_dict.get(s, 0)
                    cnt_dict[s] = cnt + 1
        return cnt_dict

    term_cnts_train = __count_terms(sents_train)
    term_cnts_test = __count_terms(sents_test)
    term_cnt_tups = [(t, cnt) for t, cnt in term_cnts_test.items()]
    term_cnt_tups.sort(key=lambda x: -x[1])
    # test-set aspect terms never seen in training, most frequent first
    for t, cnt in term_cnt_tups:
        if t not in term_cnts_train:
            print(t, cnt)

def gen_ordinary_features_file(mentions_file, candidates_file, review_file, biz_file, dst_file):
    print('generating ordinary features ...')
    mentions = utils.load_json_objs(mentions_file)
    mention_candidates = utils.load_candidates(candidates_file)
    mention_cand_feats = get_ordinary_features(
        mentions, mention_candidates, review_file, biz_file, feats_include)
    fout = open(dst_file, 'w', encoding='utf-8', newline='\n')
    for m in mentions:
        mention_id = m['mention_id']
        candidates = mention_candidates[mention_id]
        feats = mention_cand_feats[mention_id]
        utils.write_candidate_features(mention_id, candidates, feats, fout)
    fout.close()

def gen_path_count_feats_file(data_info_file, mention_candidates, mention_id_to_idx,
                              biz_id_to_idx, commuting_matrix_files, for_pra, dst_file):
    mention_candidates_idxs = utils.get_mention_cadidate_idxs_dict(
        mention_candidates, mention_id_to_idx, biz_id_to_idx)
    data_info = utils.load_json_objs(data_info_file)[0]
    if for_pra:
        # __get_path_count_features(commuting_matrix_files, mention_candidates_idxs,
        #                           data_info['mentions'], data_info['bizs'], -1)
        features_list = [__get_path_count_features(
            f, mention_candidates_idxs, data_info['mentions'], data_info['bizs'], -1
        ) for f in commuting_matrix_files]
        __write_mention_candidate_features(features_list, mention_candidates, mention_id_to_idx, dst_file)
    else:
        features_list = [__get_path_count_features(
            f, mention_candidates_idxs, data_info['mentions'], data_info['bizs'], NORM_THRES
        ) for f in commuting_matrix_files]
        __write_mention_candidate_features(features_list, mention_candidates, mention_id_to_idx, dst_file)

def __opinion_rule_insight(dep_tags_file, pos_tags_file, sent_text_file, terms_vocab,
                           dst_result_file=None, sents_file=None):
    print('loading data ...')
    dep_tags_list = utils.load_dep_tags_list(dep_tags_file)
    pos_tags_list = utils.load_pos_tags(pos_tags_file)
    sent_texts = utils.read_lines(sent_text_file)
    assert len(dep_tags_list) == len(sent_texts)
    assert len(pos_tags_list) == len(dep_tags_list)
    print('done.')

    opinions_sys_list = list()
    for sent_idx, sent_text in enumerate(sent_texts):
        dep_tags = dep_tags_list[sent_idx]
        pos_tags = pos_tags_list[sent_idx]
        assert len(dep_tags) == len(pos_tags)

        opinion_terms = set()
        # used rule2 and __match_terms to pretrain
        # terms_new = opinionrules.rule1(dep_tags, pos_tags)
        # opinion_terms.update(terms_new)
        terms_new = opinionrules.rule2(dep_tags, pos_tags)
        opinion_terms.update(terms_new)
        # terms_new = opinionrules.rule4(dep_tags, pos_tags)
        # opinion_terms.update(terms_new)
        terms_new = __match_terms(sent_text, terms_vocab)
        opinion_terms.update(terms_new)
        opinions_sys_list.append(opinion_terms)

        if sent_idx % 10000 == 0:
            print(sent_idx)

    if dst_result_file is not None:
        __write_rule_results(opinions_sys_list, sent_texts, dst_result_file)

    if sents_file is not None:
        sents = utils.load_json_objs(sents_file)
        opinions_true_list = list()
        for sent in sents:
            opinions_true_list.append([t.lower() for t in sent.get('opinions', list())])
        correct_sent_idxs = __evaluate(
            opinions_sys_list, opinions_true_list, dep_tags_list, pos_tags_list, sent_texts)

def get_data_amazon(vocab, true_terms_file, tok_texts_file, task):
    terms_true_list = utils.load_json_objs(true_terms_file)
    tok_texts = utils.read_lines(tok_texts_file)
    # print(len(terms_true_list), tok_texts_file, len(tok_texts))
    assert len(terms_true_list) == len(tok_texts)

    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}  # index 0 is reserved for unknown words

    label_seq_list = list()
    word_idx_seq_list = list()
    for terms_true, tok_text in zip(terms_true_list, tok_texts):
        words = tok_text.split(' ')
        label_seq = label_sentence(words, terms_true)
        label_seq_list.append(label_seq)
        word_idx_seq_list.append([word_idx_dict.get(w, 0) for w in words])

    # fixed seed for a reproducible split; the last 2000 permuted samples are
    # held out for validation
    np.random.seed(3719)
    perm = np.random.permutation(len(label_seq_list))
    n_train = len(label_seq_list) - 2000
    idxs_train, idxs_valid = perm[:n_train], perm[n_train:]

    label_seq_list_train = [label_seq_list[idx] for idx in idxs_train]
    word_idx_seq_list_train = [word_idx_seq_list[idx] for idx in idxs_train]
    train_data = TrainData(label_seq_list_train, word_idx_seq_list_train)

    label_seq_list_valid = [label_seq_list[idx] for idx in idxs_valid]
    word_idx_seq_list_valid = [word_idx_seq_list[idx] for idx in idxs_valid]
    tok_texts_valid = [tok_texts[idx] for idx in idxs_valid]
    terms_true_list_valid = [terms_true_list[idx] for idx in idxs_valid]
    aspect_true_list, opinion_true_list = None, None
    if task != 'opinion':
        aspect_true_list = terms_true_list_valid
    if task != 'aspect':
        opinion_true_list = terms_true_list_valid
    valid_data = ValidData(None, label_seq_list_valid, word_idx_seq_list_valid, None,
                           tok_texts_valid, aspect_true_list, opinion_true_list)

    return train_data, valid_data