def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels):
    """Extract count/tfidf/hash similarity features for a question pair.

    Segments both questions into Sentence objects, then computes the
    vectorizer-based features twice: once on the full sentences and once
    on their notional (content-word) reductions.

    Args:
        wordseg: segmenter exposing ``segment(text)`` whose result carries
            ``basic_words`` — TODO confirm exact contract against caller.
        ql: left question string.
        qr: right question string.
        tfidf_count_hash_vectorModels: fitted vectorizer models passed
            through to ``get_tfidf_count_hash_features``.

    Returns:
        dict mapping feature name -> feature value.
    """
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)

    feature_dict = {}
    # Vectorizer features on the full sentences.
    feature_dict.update(get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels))
    # Same features restricted to notional tokens, namespaced "notion".
    feature_dict.update(get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels, "notion"))
    # NOTE(review): removed leftover debug loop that printed every feature key.
    return feature_dict
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model):
    """Build lexical, vectorizer and sentence-vector features for (ql, qr).

    Both questions are segmented into Sentence objects; each feature
    family is computed by its dedicated helper and merged into one dict.

    Returns:
        dict mapping feature name -> feature value.
    """
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    seg_result = wordseg.segment(ql)
    sent_l.basic_words = seg_result.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    seg_result = wordseg.segment(qr)
    sent_r.basic_words = seg_result.basic_words

    features = {}
    features.update(calc_lexical_features(sent_l, sent_r))
    features.update(get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels))
    features.update(get_sentvec_features(
        sent_word2vec, sent_vocab_dict, sent_model, sent_l, sent_r))
    return features
def extract_features(wordseg, ql, qr):
    """Compute lexical features on the full sentences and their peripheries.

    Each question is segmented into a Sentence; ``get_periph`` receives the
    opposite raw question when deriving each side's periphery.

    Returns:
        dict mapping feature name -> feature value.
    """
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    seg_result = wordseg.segment(ql)
    sent_l.basic_words = seg_result.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    seg_result = wordseg.segment(qr)
    sent_r.basic_words = seg_result.basic_words

    # Periphery of each side is computed against the *other* raw question.
    periph_l = get_periph(sent_l, qr)
    periph_r = get_periph(sent_r, ql)

    features = {}
    features.update(calc_lexical_features(sent_l, sent_r))
    features.update(calc_periph_lexical_features(periph_l, periph_r))
    return features
def extract_features(wordseg, ql, qr, sent_word2vec, word_weights, sent_model):
    """Extract sentence-vector similarity features for a question pair.

    Segments both questions, then computes sentence-embedding features on
    the full sentences and again on their notional (content-word)
    reductions under the "notion" signature.

    Args:
        wordseg: segmenter exposing ``segment(text)`` whose result carries
            ``basic_words`` — TODO confirm exact contract against caller.
        ql: left question string.
        qr: right question string.
        sent_word2vec: word-embedding model for ``get_sentvec_features``.
        word_weights: per-word weights for ``get_sentvec_features``.
        sent_model: sentence model for ``get_sentvec_features``.

    Returns:
        dict mapping feature name -> feature value.
    """
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words

    # NOTE(review): the original also computed l_notion/r_notion via
    # get_notional_tokens but never passed them on; the "notion" features
    # below were computed on the full sentences.  Preserving that behavior
    # here — confirm whether the notional reductions were intended instead.
    feature_dict = {}
    feature_dict.update(get_sentvec_features(
        sent_word2vec, word_weights, sent_model, sent_l, sent_r))
    feature_dict.update(get_sentvec_features(
        sent_word2vec, word_weights, sent_model, sent_l, sent_r, "notion"))
    # Removed leftover debug loop that printed every feature key and value.
    return feature_dict
def extract_features(wordseg, ql, qr, tfidf_count_hash_vectorModels,
                     sent_word2vec, sent_vocab_dict, sent_model,
                     ner_dict, syn_dict):
    """Run the full feature pipeline for a question pair.

    Feature families, merged in order: lexical, periphery-lexical, MT,
    count/tfidf/hash (full + notional), sentence vectors (full + notional),
    and NER/synonym features.

    Returns:
        dict mapping feature name -> feature value.
    """
    def _segmented(text):
        # Build a Sentence whose basic_words come from the segmenter.
        sent = Sentence()
        sent.raw_form = text
        sent.base_form = text
        sent.basic_words = wordseg.segment(text).basic_words
        return sent

    sent_l = _segmented(ql)
    sent_r = _segmented(qr)

    l_notion = get_notional_tokens(sent_l)
    r_notion = get_notional_tokens(sent_r)
    # Periphery of each side is computed against the *other* raw question.
    l_periph = get_periph(sent_l, qr)
    r_periph = get_periph(sent_r, ql)

    feature_dict = {}
    feature_dict.update(calc_lexical_features(sent_l, sent_r))
    feature_dict.update(calc_periph_lexical_features(l_periph, r_periph))
    feature_dict.update(calc_mt_features(sent_l, sent_r))
    feature_dict.update(get_tfidf_count_hash_features(
        sent_l, sent_r, tfidf_count_hash_vectorModels))
    feature_dict.update(get_tfidf_count_hash_features(
        l_notion, r_notion, tfidf_count_hash_vectorModels,
        signature="notion"))
    feature_dict.update(get_sentvec_features(
        sent_word2vec, sent_vocab_dict, sent_model, sent_l, sent_r))
    feature_dict.update(get_sentvec_features(
        sent_word2vec, sent_vocab_dict, sent_model, l_notion, r_notion,
        signature="notion"))
    feature_dict.update(get_ner_features(sent_l, sent_r, ner_dict, syn_dict))
    return feature_dict
def extract_features(wordseg, ql, qr, ner_dict, syn_dict):
    """Extract only the NER/synonym features for a question pair.

    Returns:
        dict mapping feature name -> feature value.
    """
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    seg_result = wordseg.segment(ql)
    sent_l.basic_words = seg_result.basic_words

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    seg_result = wordseg.segment(qr)
    sent_r.basic_words = seg_result.basic_words

    features = {}
    features.update(get_ner_features(sent_l, sent_r, ner_dict, syn_dict))
    return features
def split_ql_qr(wordseg, ql, qr):
    """Tokenize both questions with the segmenter.

    Args:
        wordseg: segmenter exposing ``segment(text)`` whose result carries
            ``basic_words`` — TODO confirm exact contract against caller.
        ql: left question string.
        qr: right question string.

    Returns:
        Tuple ``(tokenized_ql, tokenized_qr)``; each is a list containing a
        single list of token strings (the nesting matches the original
        interface, which callers may rely on).
    """
    sent_l = Sentence()
    sent_l.raw_form = ql
    sent_l.base_form = ql
    wordseg_out = wordseg.segment(ql)
    sent_l.basic_words = wordseg_out.basic_words
    # Comprehension replaces the original manual append loop.
    tokenized_ql = [[item.term for item in sent_l.basic_words]]

    sent_r = Sentence()
    sent_r.raw_form = qr
    sent_r.base_form = qr
    wordseg_out = wordseg.segment(qr)
    sent_r.basic_words = wordseg_out.basic_words
    tokenized_qr = [[item.term for item in sent_r.basic_words]]

    return tokenized_ql, tokenized_qr