def compute_features(d_dict, q_dict, c_dict):
    """Compute per-token features for the tokens of ``d_dict``.

    Each argument is a mapping with a ``'words'`` and a ``'lemma'`` entry,
    both sequences of token strings.

    Returns:
        A dict of lists, all with one entry per token of ``d_dict``:

        * ``in_q`` / ``in_c``: 1 when the lower-cased word also occurs among
          the lower-cased words of ``q_dict`` / ``c_dict`` and is neither a
          stopword nor punctuation, else 0.
        * ``lemma_in_q`` / ``lemma_in_c``: the same test on the lemmas.
        * ``tf``: ``0.1 * log(wikiwords.N * wikiwords.freq(word) + 10)``
          rounded to two decimals.
        * ``p_q_relation`` / ``p_c_relation``: the output of
          ``concept_net.p_q_relation`` for ``d_dict`` words against the
          ``q_dict`` / ``c_dict`` words.

    Raises:
        AssertionError: if the feature lists do not all have equal length.
    """

    def overlap(tokens, reference):
        # Flag tokens that appear (case-insensitively) in `reference` and
        # are content tokens (not stopword, not punctuation).
        vocab = {w.lower() for w in reference}
        return [
            int(w.lower() in vocab and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    d_words = d_dict['words']
    d_lemmas = d_dict['lemma']

    in_q = overlap(d_words, q_dict['words'])
    in_c = overlap(d_words, c_dict['words'])
    lemma_in_q = overlap(d_lemmas, q_dict['lemma'])
    lemma_in_c = overlap(d_lemmas, c_dict['lemma'])

    # Round through '%.2f' formatting exactly as before so stored values
    # stay identical.
    tf = [
        float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
        for w in d_words
    ]

    # Function-scope import kept from the original code
    # (presumably avoids an import cycle or a costly load -- TODO confirm).
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_words, q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_words, c_dict['words'])

    # Sanity check: every feature must be aligned with the d_dict tokens.
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) \
        and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)

    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation
    }
def get_relation(self, w1, w2):
    """Look up the relation stored for the pair (``w1``, ``w2``).

    Both terms are lower-cased and their whitespace-separated parts are
    joined with underscores before being used as keys into ``self.data``
    (a mapping of term -> mapping of term -> relation).

    Returns:
        The stored relation, or ``'<NULL>'`` when either input is a
        stopword or no entry exists for the pair.
    """
    if any(is_stopword(term) for term in (w1, w2)):
        return '<NULL>'

    source_key = '_'.join(w1.lower().split())
    target_key = '_'.join(w2.lower().split())

    if source_key in self.data:
        return self.data[source_key].get(target_key, '<NULL>')
    return '<NULL>'
def compute_features(d_dicts, q_dict, c_dicts, q_terms):
    """Compute per-token features for each (document, choice) pair.

    ``d_dicts`` and ``c_dicts`` are iterated in lockstep with ``zip``, so
    surplus entries of the longer one are ignored. Every dict is indexed
    by ``'words'``, a sequence of token strings.

    Returns:
        A dict with one list per pair under ``in_qs``, ``in_cs``, ``tfs``,
        ``p_q_relations`` and ``p_c_relations`` (each inner list has one
        entry per token of the document), plus ``q_es``: one boolean per
        word of ``q_dict`` telling whether that word is in ``q_terms``.

    Raises:
        AssertionError: if the features of one pair differ in length.
    """
    # Function-scope import kept from the original code
    # (presumably avoids an import cycle -- TODO confirm); done once
    # instead of on every loop iteration.
    from conceptnet import concept_net

    # The question does not change across pairs: build its lookup once.
    q_words = q_dict['words']
    q_vocab = {w.lower() for w in q_words}

    in_qs, in_cs, tfs = [], [], []
    p_q_relations, p_c_relations = [], []

    for d_dict, c_dict in zip(d_dicts, c_dicts):
        d_words = d_dict['words']
        c_words = c_dict['words']
        c_vocab = {w.lower() for w in c_words}

        # 1 when the token also occurs in the question / choice and is not
        # a stopword (this variant does not filter punctuation).
        in_q = [int(w.lower() in q_vocab and not is_stopword(w)) for w in d_words]
        in_c = [int(w.lower() in c_vocab and not is_stopword(w)) for w in d_words]

        # NOTE(review): word_frequency(..., 'zh') looks like the wordfreq
        # API for Chinese and word_count like a corpus-size constant --
        # confirm against the module's imports.
        tf = [
            float('%.2f' % (0.1 * math.log(word_count * word_frequency(w.lower(), 'zh') + 5)))
            for w in d_words
        ]

        p_q_relation = concept_net.p_q_relation(d_words, q_words)
        p_c_relation = concept_net.p_q_relation(d_words, c_words)

        # Sanity check: all features are aligned with the document tokens.
        assert len(in_q) == len(in_c) and len(tf) == len(in_q)
        assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)

        in_qs.append(in_q)
        in_cs.append(in_c)
        tfs.append(tf)
        p_q_relations.append(p_q_relation)
        p_c_relations.append(p_c_relation)

    q_es = [w in q_terms for w in q_words]

    return {
        'in_qs': in_qs,
        'in_cs': in_cs,
        'tfs': tfs,
        'p_q_relations': p_q_relations,
        'p_c_relations': p_c_relations,
        'q_es': q_es
    }
def compute_features(q_dict, c_dict):
    """Compute per-token features for the tokens of ``q_dict``.

    Both arguments are mappings with a ``'words'`` and a ``'lemma'``
    entry, each a sequence of token strings.

    Returns:
        A dict of lists with one entry per token of ``q_dict``:

        * ``in_c`` / ``lemma_in_c``: 1 when the lower-cased word / lemma
          also occurs in ``c_dict`` and is neither a stopword nor
          punctuation, else 0.
        * ``tf``: ``wikiwords.freq`` of the lower-cased word, unscaled and
          unrounded.
        * ``q_c_relation``: output of ``concept_net.p_q_relation`` for the
          ``q_dict`` words against the ``c_dict`` words.
        * ``q_is_science_term``: ``is_science_term(word)`` for each word.
        * ``q_is_cand``: 1 for words that are neither punctuation nor
          stopwords, else 0.

    Raises:
        AssertionError: if the feature lists differ in length.
    """

    def overlap(tokens, reference):
        # Flag tokens that appear (case-insensitively) in `reference` and
        # are content tokens (not stopword, not punctuation).
        vocab = {w.lower() for w in reference}
        return [
            int(w.lower() in vocab and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    q_words = q_dict['words']
    c_words = c_dict['words']

    in_c = overlap(q_words, c_words)
    lemma_in_c = overlap(q_dict['lemma'], c_dict['lemma'])

    # Raw frequency on purpose: this variant applies neither the log
    # scaling nor the two-decimal rounding.
    tf = [wikiwords.freq(w.lower()) for w in q_words]

    # Function-scope import kept from the original code
    # (presumably avoids an import cycle -- TODO confirm).
    from conceptnet import concept_net
    q_c_relation = concept_net.p_q_relation(q_words, c_words)

    # Sanity check: every feature must be aligned with the q_dict tokens.
    assert len(lemma_in_c) == len(in_c) and len(tf) == len(in_c)
    assert len(tf) == len(q_c_relation)

    q_is_science_term = [is_science_term(w) for w in q_words]
    q_is_cand = [int(not (is_punc(w) or is_stopword(w))) for w in q_words]

    return {
        'in_c': in_c,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'q_c_relation': q_c_relation,
        'q_is_science_term': q_is_science_term,
        'q_is_cand': q_is_cand
    }
def p_q_relation(self, passage, query):
    """Find, for every passage position, a relation to some query term.

    Both token sequences are lower-cased. Query candidates are the query
    tokens followed by the space-joined pairs of adjacent query tokens,
    de-duplicated and with stopwords removed. For each passage position
    the candidates are tried in that order; for each candidate the single
    passage token is looked up first, then the token joined with its
    successor. The first relation other than ``'<NULL>'`` is kept.

    Candidates are kept in a list rather than a set: iterating a set of
    strings depends on per-process hash randomisation, which made the
    chosen relation vary between runs when several candidates matched.

    Returns:
        A list with one relation string per passage token; ``'<NULL>'``
        where no candidate produced a relation.
    """
    passage = [w.lower() for w in passage]
    query = [w.lower() for w in query]
    # The slice at the last index holds one token, so the final "pair" is
    # just that token again; de-duplication below drops it.
    pairs = [' '.join(query[i:i + 2]) for i in range(len(query))]

    seen = set()
    candidates = []
    for term in query + pairs:
        if term in seen:
            continue
        seen.add(term)
        if not is_stopword(term):
            candidates.append(term)

    ret = []
    for i, token in enumerate(passage):
        token_pair = ' '.join(passage[i:i + 2])
        relation = '<NULL>'
        for term in candidates:
            relation = self.get_relation(token, term)
            if relation != '<NULL>':
                break
            relation = self.get_relation(token_pair, term)
            if relation != '<NULL>':
                break
        ret.append(relation)
    return ret
def compute_features(p_dict, q_dict, c_dict):
    """Compute per-token features for all three inputs against each other.

    Each argument is a mapping with a ``'words'`` and a ``'lemma'`` entry,
    both sequences of token strings. The prefixes ``p``, ``q`` and ``c``
    in the result keys refer to ``p_dict``, ``q_dict`` and ``c_dict``.

    Returns:
        A dict of lists. For every ordered pair ``x``, ``y`` of distinct
        inputs it holds, with one entry per token of ``x``:

        * ``x_in_y`` / ``x_lemma_in_y``: 1 when the lower-cased word /
          lemma also occurs in ``y`` and is neither a stopword nor
          punctuation, else 0.
        * ``x_y_relation``: output of ``concept_net.p_q_relation`` for the
          words of ``x`` against the words of ``y``.

        and for every input ``x``:

        * ``x_tf``: ``0.1 * log(wikiwords.N * wikiwords.freq(word) + 10)``
          rounded to two decimals.

    Raises:
        AssertionError: if a ``tf`` list and the relation lists of the
            same input differ in length.
    """

    def lowered(tokens):
        # Case-insensitive lookup set for a token sequence.
        return {w.lower() for w in tokens}

    def overlap(tokens, vocab):
        # Flag content tokens (not stopword, not punctuation) found in vocab.
        return [
            int(w.lower() in vocab and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    def scaled_tf(words):
        # Rounded through '%.2f' formatting, exactly as before.
        return [
            float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
            for w in words
        ]

    p_words, q_words, c_words = p_dict['words'], q_dict['words'], c_dict['words']
    p_vocab, q_vocab, c_vocab = lowered(p_words), lowered(q_words), lowered(c_words)

    p_in_q = overlap(p_words, q_vocab)
    p_in_c = overlap(p_words, c_vocab)
    q_in_p = overlap(q_words, p_vocab)
    q_in_c = overlap(q_words, c_vocab)
    c_in_p = overlap(c_words, p_vocab)
    c_in_q = overlap(c_words, q_vocab)

    p_lemmas, q_lemmas, c_lemmas = p_dict['lemma'], q_dict['lemma'], c_dict['lemma']
    p_lemma_vocab = lowered(p_lemmas)
    q_lemma_vocab = lowered(q_lemmas)
    c_lemma_vocab = lowered(c_lemmas)

    p_lemma_in_q = overlap(p_lemmas, q_lemma_vocab)
    p_lemma_in_c = overlap(p_lemmas, c_lemma_vocab)
    q_lemma_in_p = overlap(q_lemmas, p_lemma_vocab)
    q_lemma_in_c = overlap(q_lemmas, c_lemma_vocab)
    c_lemma_in_p = overlap(c_lemmas, p_lemma_vocab)
    c_lemma_in_q = overlap(c_lemmas, q_lemma_vocab)

    p_tf = scaled_tf(p_words)
    q_tf = scaled_tf(q_words)
    c_tf = scaled_tf(c_words)

    # Function-scope import kept from the original code
    # (presumably avoids an import cycle -- TODO confirm).
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(p_words, q_words)
    p_c_relation = concept_net.p_q_relation(p_words, c_words)
    q_p_relation = concept_net.p_q_relation(q_words, p_words)
    q_c_relation = concept_net.p_q_relation(q_words, c_words)
    c_p_relation = concept_net.p_q_relation(c_words, p_words)
    c_q_relation = concept_net.p_q_relation(c_words, q_words)

    # Sanity check: relations are aligned with the tokens they describe.
    assert len(p_tf) == len(p_q_relation) and len(p_tf) == len(p_c_relation)
    assert len(q_tf) == len(q_p_relation) and len(q_tf) == len(q_c_relation)
    assert len(c_tf) == len(c_p_relation) and len(c_tf) == len(c_q_relation)

    return {
        'p_in_q': p_in_q,
        'p_in_c': p_in_c,
        'p_lemma_in_q': p_lemma_in_q,
        'p_lemma_in_c': p_lemma_in_c,
        'p_tf': p_tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'q_in_p': q_in_p,
        'q_in_c': q_in_c,
        'q_lemma_in_p': q_lemma_in_p,
        'q_lemma_in_c': q_lemma_in_c,
        'q_tf': q_tf,
        'q_p_relation': q_p_relation,
        'q_c_relation': q_c_relation,
        'c_in_p': c_in_p,
        'c_in_q': c_in_q,
        'c_lemma_in_p': c_lemma_in_p,
        'c_lemma_in_q': c_lemma_in_q,
        'c_tf': c_tf,
        'c_p_relation': c_p_relation,
        'c_q_relation': c_q_relation,
    }
def compute_features(d_dict, q_dict, c_dict, d_id, q_id, c_id, graphs,
                     sentence_graphs):
    """Compute per-token features for ``d_dict`` plus 4lang relations.

    ``d_dict``, ``q_dict`` and ``c_dict`` are mappings with a ``'words'``
    and a ``'lemma'`` entry, both sequences of token strings.
    ``sentence_graphs`` is indexed as
    ``sentence_graphs[d_id]["questions"][q_id]["choice"][c_id]``; ``graphs``
    is passed through to ``compute_4lang_relation`` unchanged.

    Returns:
        A dict with:

        * ``in_q`` / ``in_c`` / ``lemma_in_q`` / ``lemma_in_c``: per token
          of ``d_dict``, 1 when the lower-cased word / lemma also occurs
          in ``q_dict`` / ``c_dict`` and is neither a stopword nor
          punctuation, else 0.
        * ``tf``: ``0.1 * log(wikiwords.N * wikiwords.freq(word) + 10)``
          rounded to two decimals, per word of ``d_dict``.
        * ``p_q_relation`` / ``p_c_relation``: output of
          ``concept_net.p_q_relation`` for the ``d_dict`` words against
          the ``q_dict`` / ``c_dict`` words.
        * ``p_q_`` / ``p_c_`` / ``q_c_four_lang_relation``: results of
          ``compute_4lang_relation`` for the respective dict pairs.
        * ``p_q_`` / ``p_c_`` / ``q_c_four_lang_sentence_relation``:
          results of ``compute_4lang_sentence_relation`` for the
          respective sentence-graph pairs.

    Raises:
        AssertionError: if the per-token feature lists differ in length.
    """

    def overlap(tokens, reference):
        # Flag tokens that appear (case-insensitively) in `reference` and
        # are content tokens (not stopword, not punctuation).
        vocab = {w.lower() for w in reference}
        return [
            int(w.lower() in vocab and not is_stopword(w) and not is_punc(w))
            for w in tokens
        ]

    d_words = d_dict['words']
    d_lemmas = d_dict['lemma']

    in_q = overlap(d_words, q_dict['words'])
    in_c = overlap(d_words, c_dict['words'])
    lemma_in_q = overlap(d_lemmas, q_dict['lemma'])
    lemma_in_c = overlap(d_lemmas, c_dict['lemma'])

    # Rounded through '%.2f' formatting, exactly as before.
    tf = [
        float('%.2f' % (0.1 * math.log(wikiwords.N * wikiwords.freq(w.lower()) + 10)))
        for w in d_words
    ]

    # NOTE(review): a new Utils() is created on every call -- confirm that
    # construction is cheap or share one instance at the call site.
    four_lang_utils = Utils()
    p_q_four_lang_relation = compute_4lang_relation(
        graphs, four_lang_utils, d_dict, q_dict)
    p_c_four_lang_relation = compute_4lang_relation(
        graphs, four_lang_utils, d_dict, c_dict)
    q_c_four_lang_relation = compute_4lang_relation(
        graphs, four_lang_utils, q_dict, c_dict)

    # Name the nested sentence graphs once instead of re-indexing them.
    doc_graph = sentence_graphs[d_id]
    question_graph = doc_graph["questions"][q_id]
    choice_graph = question_graph["choice"][c_id]
    p_q_four_lang_sentence_relation = compute_4lang_sentence_relation(
        doc_graph, question_graph, four_lang_utils)
    p_c_four_lang_sentence_relation = compute_4lang_sentence_relation(
        doc_graph, choice_graph, four_lang_utils)
    q_c_four_lang_sentence_relation = compute_4lang_sentence_relation(
        question_graph, choice_graph, four_lang_utils)

    # Function-scope import kept from the original code
    # (presumably avoids an import cycle -- TODO confirm).
    from conceptnet import concept_net
    p_q_relation = concept_net.p_q_relation(d_words, q_dict['words'])
    p_c_relation = concept_net.p_q_relation(d_words, c_dict['words'])

    # Sanity check: every per-token feature is aligned with d_dict.
    assert len(in_q) == len(in_c) and len(lemma_in_q) == len(in_q) \
        and len(lemma_in_c) == len(in_q) and len(tf) == len(in_q)
    assert len(tf) == len(p_q_relation) and len(tf) == len(p_c_relation)

    return {
        'in_q': in_q,
        'in_c': in_c,
        'lemma_in_q': lemma_in_q,
        'lemma_in_c': lemma_in_c,
        'tf': tf,
        'p_q_relation': p_q_relation,
        'p_c_relation': p_c_relation,
        'p_q_four_lang_relation': p_q_four_lang_relation,
        'p_c_four_lang_relation': p_c_four_lang_relation,
        'q_c_four_lang_relation': q_c_four_lang_relation,
        'p_q_four_lang_sentence_relation': p_q_four_lang_sentence_relation,
        'p_c_four_lang_sentence_relation': p_c_four_lang_sentence_relation,
        'q_c_four_lang_sentence_relation': q_c_four_lang_sentence_relation
    }