# NOTE(review): the statements down to `return` are the tail of a function whose
# `def` line lies above this chunk; they are laid out as a function body.
    # Strip the POS tags from every candidate while keeping its score, so the
    # list holds (score, candidate) pairs.
    candidate_scores = [(score, remove_str_postags(cand)) for score, cand in candidate_scores]
    stoplist = binom_stoplist(0.2)  # 0.5 is a good value
    # Alternative stoplist, currently disabled:
    # stoplist = log_likelihood_stoplist(5)
    candidate_scores = stoplist_filter(candidate_scores, stoplist)
    return candidate_scores


if __name__ == "__main__":
    # Script entry point: split the data, extract candidates from the test
    # corpus and print/plot the evaluation against the test terms.
    terms = load_terms()
    domain_sents = load_analysis()
    train_terms, test_corpus, test_terms = split_train_test(domain_sents, terms)
    candidates = main(train_terms, test_corpus)
    # NOTE(review): the function tail above builds (score, candidate) pairs,
    # while this unpacks each pair as (word, score). If that function is this
    # `main` and stoplist_filter keeps the pair order, the first element kept
    # here would be the score -- confirm.
    candidates = [word for word, score in candidates]
    # [C] number of candidates, [T] number of distinct candidates that are
    # also test terms.
    print "[C]", len(candidates)
    print "[T]", len(set(candidates).intersection(set(test_terms)))
    print "======"
    precision, recall = evaluation.precision_recall(test_terms, candidates)
    print "[P]", round(precision, 3)
    print "[R]", round(recall, 3)
    print "======"
    precision_by_segment = evaluation.precision_by_segments(test_terms, candidates, 4)
    # Segments are numbered from 1 in the output.
    for i, seg_precision in enumerate(precision_by_segment):
        print "[%s] %s" % (i + 1, round(seg_precision, 3))
    recall_list, precision_list = evaluation.precision_at_recall_values(test_terms, candidates)
    evaluation.plot_precision_at_recall_values(recall_list, precision_list)
def main(loglike_threshold): term_corp = load_terms() anal_corp = load_analysis() random.shuffle(anal_corp) train_corp = anal_corp[:int(0.5*len(anal_corp))] train_terms = [] for sent in train_corp: for term in term_corp: if term.lower() in ' '.join(sent).lower(): train_terms.append(term) train_terms = [t for t in train_terms if t] test_corp = anal_corp[int(0.5*len(anal_corp)):] test_terms = [] for sent in test_corp: for term in term_corp: if term.lower() in ' '.join(sent).lower(): test_terms.append(term) test_terms = [remove_str_postags(t) for t in test_terms if t] term_model = make_term_model(train_terms) gen_corp = load_general() gen_model = make_general_model(gen_corp) candidate_scores = [] pos_patterns = term_model['pos_freq'].keys() for pos_seq in pos_patterns: syn_coef = calc_syntactic_coef(pos_seq, term_model) chunks = chunk_sents(pos_seq, test_corp) chunk_freq_dict = defaultdict(int) for chnk in chunks: chunk_freq_dict[chnk] += 1 accepted_phrases = chunk_freq_dict for candidate in accepted_phrases.keys(): cand_freq = chunk_freq_dict[candidate] lex_coef = calc_lexical_coef(candidate, term_model, gen_model) morph_coef = calc_morph_coef(candidate, term_model, gen_model) candidate_coef = cand_freq * syn_coef * lex_coef * morph_coef candidate_scores.append((candidate_coef, candidate),) candidate_scores = sorted(candidate_scores, reverse=True) candidates = [cand for score, cand in candidate_scores] stripped_candidates = [] for cand in candidates: new_cand = remove_str_postags(cand) stripped_candidates.append(new_cand) stoplist = loglike_stoplist(loglike_threshold) accepted_candidates, rejected_candidates = \ filter_out(stripped_candidates, stoplist) precision, recall = \ evaluation.precision_recall(test_terms, accepted_candidates) print '\nNAZAR' print '==========' print '[P]', round(precision, 3) print '[R]', round(recall, 3) print '==========' precision_by_segment = evaluation.precision_by_segments( test_terms, accepted_candidates, 4) for i, seg_precision in 
enumerate(precision_by_segment): print '[%s] %s' % (i, round(seg_precision, 3)) recall_list, precision_list = evaluation.precision_at_recall_values( test_terms, accepted_candidates) evaluation.plot_precision_at_recall_values(recall_list, precision_list) return candidates
# Report on the ranked candidates: [C] is the number of candidates, [T] the
# number of distinct candidates that also appear in `terms`.
# (`sorted_candidates`, `terms`, `candidates` and `domain_corpus` are defined
# before this fragment.)
print '========'
print '[C]', len(sorted_candidates)
print '[T]', len(set(sorted_candidates).intersection(set(terms)))
print '========'
precision, recall = evaluation.precision_recall(terms, sorted_candidates)
print '[P]', round(precision, 3)
print '[R]', round(recall, 3)
print '========'
precision_by_segment = evaluation.precision_by_segments(
    terms, sorted_candidates, 4)
# Segments are numbered from 0 in the output.
for i, seg_precision in enumerate(precision_by_segment):
    print '[%s] %s' % (i, round(seg_precision, 3))
recall_list, precision_list = evaluation.precision_at_recall_values(
    terms, sorted_candidates)
evaluation.plot_precision_at_recall_values(recall_list, precision_list)

# Leading 20% of the ranked candidates, used to build the context-word weights.
# NOTE(review): the slice length is derived from len(candidates), not
# len(sorted_candidates) -- confirm both have the same length.
cvalue_top = [c for c in sorted_candidates[:int(len(candidates) * 0.2)]]
# NOTE(review): ['NC', 'AQ', 'VM'] and 5 are presumably a POS-tag filter and a
# context size for the helpers -- verify against their definitions.
context_words = make_contextword_weight_dict(cvalue_top, domain_corpus,
                                             ['NC', 'AQ', 'VM'], 5)
ncvalue_output = calc_ncvalue(candidates, domain_corpus, context_words,
                              ['NC', 'AQ', 'VM'], 5)
# (candidate, score) pairs ordered by descending score.
sorted_ncvalue = [(cand, score) for cand, score in sorted(
    ncvalue_output.items(), key=lambda x: x[1], reverse=True)]
# Write one "<score with 5 decimals>\t<candidate>" line per candidate,
# UTF-8 encoded.
with open('ncvalue.txt', 'w') as f:
    new_cands = []
    for c in sorted_ncvalue:
        newc = '%.5f\t%s' % (c[1], c[0])
        new_cands.append(newc)
    f.write('\n'.join(new_cands).encode('utf-8'))
def main(pattern, min_freq, loglike_threshold, min_cvalue, use_ncval=False, cval_top=0.2): # STEP 1. # POS-tagged corpus. domain = load_domain() # STEP 2. # Extract matching patterns above frequency threshold. phrase_freq = chunk_sents(domain, pattern, min_freq) # Remove POS tags from candidates. phrase_freq = remove_dict_postags(phrase_freq) # Remove candidates with words in stoplist. stoplist = loglike_stoplist(loglike_threshold) accepted_phrases, discarded_phrases = filter_out(phrase_freq, stoplist) # Order candidates first by number of words, then by frequency. sorted_phrases = build_sorted_phrases(accepted_phrases) # STEP 3. # Calculate C-value, discard if C-value below threshold. cvalue_output = calc_cvalue(sorted_phrases, min_cvalue) cvalue_candidates = [c[0] for c in cvalue_output] known_terms = load_terms() if use_ncval is True: cvalue_top = [c for c in cvalue_candidates[0:int(len(cvalue_candidates) * cval_top)]] context_word_weights = make_contextword_weight_dict( cvalue_top, domain, ['NC', 'AQ', 'VM'], 5) ncvalue_output = calc_ncvalue( cvalue_output, domain, context_word_weights, ['NC', 'AQ', 'VM'], 5) ncvalue_candidates = [c[0] for c in ncvalue_output] precision, recall = \ evaluation.precision_recall(known_terms, ncvalue_candidates) print '\nNC-VALUE' print '==========' print 'PRECISION:', round(precision, 3) print 'RECALL:', round(recall, 3) print '==========' precision_by_segment = evaluation.precision_by_segments( known_terms, ncvalue_candidates, 4) for i, seg_precision in enumerate(precision_by_segment): print '[%s] %s' % (i, round(seg_precision, 3)) recall_list, precision_list = evaluation.precision_at_recall_values( known_terms, cvalue_candidates) evaluation.plot_precision_at_recall_values(recall_list, precision_list) results = ncvalue_candidates else: precision, recall = \ evaluation.precision_recall(known_terms, cvalue_candidates) print '\nC-VALUE' print '==========' print '[P]', round(precision, 3) print '[R]', round(recall, 3) print 
'==========' precision_by_segment = evaluation.precision_by_segments( known_terms, cvalue_candidates, 4) for i, seg_precision in enumerate(precision_by_segment): print '[%s] %s' % (i, round(seg_precision, 3)) recall_list, precision_list = evaluation.precision_at_recall_values( known_terms, cvalue_candidates) evaluation.plot_precision_at_recall_values(recall_list, precision_list) results = cvalue_candidates return results