def rte_features(rtepair, id, training=True, test_id=0):
    """Build the feature dictionary for one RTE text/hypothesis pair.

    Args:
        rtepair: The pair handed to ``nltk.RTEFeatureExtractor``.
        id: Index of the pair; used directly to look up the training
            TF-IDF similarity and as the base of the test-set index.
        training: If true, ``tfidf_sim`` is read from
            ``tfidf_sim_list_train``; otherwise from ``tfidf_sim_list_test``.
        test_id: Selects the offset applied to ``id`` when indexing the
            test similarity list. Ignored for the lookup when ``training``
            is true.

    Returns:
        dict: word / named-entity overlap counts, hypothesis-only counts,
        negation-word counts for text and hypothesis, and ``tfidf_sim``.
    """
    # Position inside the test similarity list. Split 4 starts at 2410;
    # every other split is offset by 800 per split
    # (presumably 800 pairs per earlier split -- TODO confirm).
    true_test_id = 2410 + id if test_id == 4 else id + test_id * 800

    fx = nltk.RTEFeatureExtractor(rtepair)
    features = {
        'word_overlap': len(fx.overlap('word')),
        'word_hyp_extra': len(fx.hyp_extra('word')),
        'ne_overlap': len(fx.overlap('ne')),
        'ne_hyp_extra': len(fx.hyp_extra('ne')),
        # Negation words that occur in the text / in the hypothesis.
        'neg_txt': len(fx.negwords & fx.text_words),
        'neg_hyp': len(fx.negwords & fx.hyp_words),
    }

    # NOTE: the word2vec similarity features (w2v_ne_sim, w2v_noun_sim,
    # w2v_adj_sim, w2v_verb_sim) are currently disabled for both the
    # training and the test branch.
    if not training:
        features['tfidf_sim'] = tfidf_sim_list_test[true_test_id]
        return features

    features['tfidf_sim'] = tfidf_sim_list_train[id]
    return features
def rte_features(rtepair):
    """Return word and named-entity overlap counts for an RTE pair.

    Args:
        rtepair: The text/hypothesis pair handed to
            ``nltk.RTEFeatureExtractor``.

    Returns:
        dict: four integer counts keyed by ``word_overlap``,
        ``word_hyp_extra``, ``ne_overlap`` and ``ne_hyp_extra``.
    """
    fx = nltk.RTEFeatureExtractor(rtepair)
    return {
        'word_overlap': len(fx.overlap('word')),
        'word_hyp_extra': len(fx.hyp_extra('word')),
        'ne_overlap': len(fx.overlap('ne')),
        'ne_hyp_extra': len(fx.hyp_extra('ne')),
    }
def rte_features(rtepair):
    """Count shared and hypothesis-only items for an RTE pair.

    Args:
        rtepair: The text/hypothesis pair handed to
            ``nltk.RTEFeatureExtractor``.

    Returns:
        dict: integer counts for ``word_overlap``, ``word_hyp_extra``,
        ``ne_overlap`` and ``ne_hyp_extra``.
    """
    # RTEFeatureExtractor builds a bag of words for both the text and the
    # hypothesis, after some stopwords have been removed.
    fx = nltk.RTEFeatureExtractor(rtepair)

    # Overlap (shared items) and difference (items only in the hypothesis),
    # first for plain words, then for named entities.
    shared_words = fx.overlap('word')
    extra_words = fx.hyp_extra('word')
    shared_entities = fx.overlap('ne')
    extra_entities = fx.hyp_extra('ne')

    return {
        'word_overlap': len(shared_words),
        'word_hyp_extra': len(extra_words),
        'ne_overlap': len(shared_entities),
        'ne_hyp_extra': len(extra_entities),
    }
def rte_features(rtepair):
    """Compute overlap features for an RTE text/hypothesis pair.

    Args:
        rtepair: The pair handed to ``nltk.RTEFeatureExtractor``.

    Returns:
        dict: integer counts for ``word_overlap``, ``word_hyp_extra``,
        ``ne_overlap`` and ``ne_hyp_extra``.
    """
    # Extractor API; it drops some high-frequency function words.
    fx = nltk.RTEFeatureExtractor(rtepair)

    # (overlap between text and hypothesis, unrelated content that appears
    # only in the hypothesis) -- for ordinary words ...
    word_counts = (len(fx.overlap('word')), len(fx.hyp_extra('word')))
    # ... and, computed separately, for named entities.
    entity_counts = (len(fx.overlap('ne')), len(fx.hyp_extra('ne')))

    features = {}
    features['word_overlap'], features['word_hyp_extra'] = word_counts
    features['ne_overlap'], features['ne_hyp_extra'] = entity_counts
    return features
def recognizeTextualEntailment():
    """Print the RTE feature-extractor word sets for one development pair.

    Loads the pair at index 33 of ``rte3_dev.xml`` from the NLTK RTE corpus
    and prints, in order: the text words, the hypothesis words, the word
    overlap, the named-entity overlap and the hypothesis-only words.

    Returns:
        None. Output goes to stdout.
    """

    def rte_features(rtepair):
        """Return word / named-entity overlap counts for ``rtepair``.

        Defined here for illustration; it is not called by the enclosing
        function.
        """
        extractor = nltk.RTEFeatureExtractor(rtepair)
        features = {}
        features['word_overlap'] = len(extractor.overlap('word'))
        features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
        features['ne_overlap'] = len(extractor.overlap('ne'))
        features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
        return features

    rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
    extractor = nltk.RTEFeatureExtractor(rtepair)

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare ``print x`` statement is a SyntaxError on Python 3.
    print(extractor.text_words)
    print(extractor.hyp_words)
    print(extractor.overlap('word'))
    print(extractor.overlap('ne'))
    print(extractor.hyp_extra('word'))
def rte_features(rtepair):
    """Build the extended feature dictionary for an RTE pair.

    Args:
        rtepair: The text/hypothesis pair handed to
            ``nltk.RTEFeatureExtractor``, ``cos_sim``.

    Returns:
        dict: the four overlap counts plus
        ``dif_length`` (text token count minus hypothesis token count),
        ``result`` (whatever ``cos_sim(rtepair)`` returns -- presumably a
        cosine similarity; verify against ``cos_sim``) and
        ``neg_`` (difference between the lengths of the ``negtivewords``
        results for the text words and for the hypothesis words).
    """
    fx = nltk.RTEFeatureExtractor(rtepair)

    features = {
        'word_overlap': len(fx.overlap('word')),
        'word_hyp_extra': len(fx.hyp_extra('word')),
        'ne_overlap': len(fx.overlap('ne')),
        'ne_hyp_extra': len(fx.hyp_extra('ne')),
        'dif_length': len(fx.text_tokens) - len(fx.hyp_tokens),
        'result': cos_sim(rtepair),
    }

    # negtivewords() takes a list, so the word sets are converted first.
    text_negatives = negtivewords(list(fx.text_words))
    hyp_negatives = negtivewords(list(fx.hyp_words))
    features['neg_'] = len(text_negatives) - len(hyp_negatives)

    # NOTE: a tree-distance feature ('td', built with treedis.cal /
    # treedis.buildtree on rtepair.text and rtepair.hyp) is disabled.
    return features
def rte_features(rtepair):
    """Return overlap features for an RTE text/hypothesis pair.

    Words (i.e. word types) act as a proxy for information: the features
    count how much the words overlap and how many words occur in the
    hypothesis but not in the text. Named entities are counted separately
    from ordinary words.

    :param rtepair: the pair handed to ``nltk.RTEFeatureExtractor``
    :return: counts keyed by ``word_overlap``, ``word_hyp_extra``,
        ``ne_overlap`` and ``ne_hyp_extra``
    :rtype: dict
    """
    # RTEFeatureExtractor builds a bag of words for both the text and the
    # hypothesis, after some stopwords have been removed.
    fx = nltk.RTEFeatureExtractor(rtepair)

    # Overlap and difference, in the same order as the keys below.
    names = ('word_overlap', 'word_hyp_extra', 'ne_overlap', 'ne_hyp_extra')
    counts = [
        len(fx.overlap('word')),
        len(fx.hyp_extra('word')),
        len(fx.overlap('ne')),
        len(fx.hyp_extra('ne')),
    ]
    return dict(zip(names, counts))
def test2_tre():
    """Inspect one RTE pair and train NLTK's built-in RTE classifier.

    Prints the text words, hypothesis words, word overlap, named-entity
    overlap and hypothesis-only words for the pair at index 33 of
    ``rte2_dev.xml``, then calls ``nltk.classify.rte_classifier``.

    Returns:
        None. Output goes to stdout.
    """

    def rte_features(rtepair):
        """Return word / named-entity overlap counts for ``rtepair``.

        Defined here for illustration; it is not called by the enclosing
        function.
        """
        extractor = nltk.RTEFeatureExtractor(rtepair)
        features = {}
        features['word_overlap'] = len(extractor.overlap('word'))
        features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
        features['ne_overlap'] = len(extractor.overlap('ne'))
        features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
        return features

    rtepair = nltk.corpus.rte.pairs(['rte2_dev.xml'])[33]
    extractor = nltk.RTEFeatureExtractor(rtepair)
    print(extractor.text_words)
    print(extractor.hyp_words)
    print(extractor.overlap('word'))
    print(extractor.overlap('ne'))
    print(extractor.hyp_extra('word'))

    # rte_classifier() expects the name of a training algorithm
    # ('megam', 'GIS' or 'IIS'), not an RTE pair; passing the pair always
    # ended in its "only supports these algorithms" exception.
    # NOTE(review): 'IIS' is chosen because it needs no external binary --
    # confirm this is the intended algorithm.
    nltk.classify.rte_classifier('IIS')
def fun3():
    """Print the RTE feature-extractor word sets for one development pair.

    Loads the pair at index 33 of ``rte3_dev.xml`` from the NLTK RTE corpus
    and prints, in order: the text words, the hypothesis words, the word
    overlap, the named-entity overlap and the hypothesis-only words.

    Returns:
        None. Output goes to stdout.
    """
    # In the RTE feature detector (Example 6-7) words (i.e. word types)
    # serve as a proxy for information: we count the degree of word overlap
    # and the degree to which the hypothesis contains words that are absent
    # from the text (obtained via hyp_extra()). Not all words are equally
    # important -- mentions of named entities such as people, organizations
    # and places are likely to matter more, which is why separate
    # information is extracted for words and for nes (named entities).
    # In addition, some high-frequency function words are filtered out as
    # "stopwords".
    def rte_features(rtepair):
        """Return word / named-entity overlap counts for ``rtepair``.

        Defined here for illustration; it is not called by the enclosing
        function.
        """
        extractor = nltk.RTEFeatureExtractor(rtepair)
        features = {}
        features['word_overlap'] = len(extractor.overlap('word'))
        features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
        features['ne_overlap'] = len(extractor.overlap('ne'))
        features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
        return features

    # To illustrate what these features contain, inspect some attributes of
    # text/hypothesis pair 34 (index 33).
    rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
    extractor = nltk.RTEFeatureExtractor(rtepair)

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare ``print x`` statement is a SyntaxError on Python 3.
    print(extractor.text_words)
    print(extractor.hyp_words)
    print(extractor.overlap('word'))
    print(extractor.overlap('ne'))
    print(extractor.hyp_extra('word'))

    # NOTE(review): a trailing argument-less ``nltk.pos_tag()`` call was
    # dropped here -- pos_tag requires a token list, so the call could only
    # raise TypeError. Pass tokens explicitly if tagging is wanted.
# Train a naive Bayes classifier on the feature sets prepared earlier in the
# file and report its accuracy on the held-out set.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # author's recorded result: 0.749


## Recognizing Textual Entailment (RTE)
def rte_features(rtepair):
    """Return word and named-entity overlap counts for an RTE pair.

    Args:
        rtepair: The text/hypothesis pair handed to
            ``nltk.RTEFeatureExtractor``.

    Returns:
        dict: integer counts keyed by ``word_overlap``, ``word_hyp_extra``,
        ``ne_overlap`` and ``ne_hyp_extra``.
    """
    fx = nltk.RTEFeatureExtractor(rtepair)
    return {
        'word_overlap': len(fx.overlap('word')),
        'word_hyp_extra': len(fx.hyp_extra('word')),
        'ne_overlap': len(fx.overlap('ne')),
        'ne_hyp_extra': len(fx.hyp_extra('ne')),
    }


# Inspect the raw word sets behind those counts for the pair at index 33
# of the RTE3 development file.
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)

# Bags of words for the text and for the hypothesis.
print(extractor.text_words)
print(extractor.hyp_words)
# Ordinary words: shared, then hypothesis-only.
print(extractor.overlap('word'))
print(extractor.hyp_extra('word'))
# Named entities: shared, then hypothesis-only.
print(extractor.overlap('ne'))
print(extractor.hyp_extra('ne'))

## Scaling up to large Datasets
rte_10 = nltk.corpus.reader.rte.RTECorpusReader( "/Users/yuhaomao/Downloads/rte/rte10.xml", "rte_10.xml") rte_30 = nltk.corpus.reader.rte.RTECorpusReader( "/Users/yuhaomao/Downloads/rte/rte10.xml", "rte_30.xml") test_pair_rte10 = rte_corpus.pairs(['/Users/yuhaomao/Downloads/rte/rte10.xml']) test_pair_rte30 = rte_corpus.pairs(['/Users/yuhaomao/Downloads/rte/rte30.xml']) ####################################################### test_pair = rte_corpus.pairs(['rte1_test.xml']) rte_pair = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']) for pair in rte_pair: # print "2222222222",pair.text text_tokenize = [] hyp_token = [] extractor = nltk.RTEFeatureExtractor(pair) text_tokenize.append(list(extractor.text_words)) hyp_token.append(extractor.hyp_words) # print "1111111",type(text_tokenize) # print text_tokenize # print "2222222",type(hyp_token) # print hyp_token # print "......................",rte_corpus.pairs(['rte1_dev.xml'])[8].text # print "......................",rte_corpus.pairs(['rte1_dev.xml'])[8].hyp tokenizer = RegexpTokenizer('[\w.@:/]+|\w+|\$[\d.]+') ####################################################### def cos_sim(post): final_text_tokenize = tokenizer.tokenize(post.text) final_hyp_takenize = tokenizer.tokenize(post.hyp)