Example #1
File: main.py  Project: ocykat/MH18
def main():
    words = []
    sentences = []
    with open('..//dataset//computer.txt', 'r') as f:
        for line in f:
            words += util.split_words(line)
            sentences += util.split_sentences(line)

    # Tf-idf evaluation
    tfidf = Tfidf(words, sample_doc_ids)
    tfidf.calc_tfidf()

    # Show tf-idf values
    sorted_tfidf = util.sort_dict_by_value(tfidf.tfidf)
    for word, score in sorted_tfidf:
        print('Word: {0:25}; tfidf = {1}'.format(word, score))

    # Work out summary
    summary = tfidf.best_sentences(sentences, 100)

    for sentence in summary:
        print(sentence.text)
        print("Score: {0}\n".format(sentence.score))

    print("-----------\nDONE")
Example #2
    def preprocessing(self, is_tfidf=False, is_cosinesimilarty=False, is_Nmf=False):
        all = self.Alllist()
        lst = self.listtostring(all)
        uniq = self.unique_list(lst)

        concerns = self.load_set1()
        concerns = [self.tokenization(c) for c in concerns]
        concerns = [self.stop_words(c) for c in concerns]
        concerns = [self.regular_exp(c) for c in concerns]
        concerns = [self.port_stem(c) for c in concerns]
        concerns = [" ".join(c) for c in concerns]

        op1 = None
        if is_tfidf:
            op1 = Tfidf(concerns).tfidf()

        if is_cosinesimilarty:
            op1 = Cosine(op1).cosinesimilarty()

        if is_Nmf:
            op1 = NMF(op1).non_negative_matrices()

        return op1
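For reference, the tfidf -> cosine similarity -> NMF chain above can be approximated with scikit-learn. This is only a sketch: it assumes concerns is the list of preprocessed strings built above, the number of components is arbitrary, and it factorizes the tf-idf matrix directly rather than whatever op1 holds at that point.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

tfidf_matrix = TfidfVectorizer().fit_transform(concerns)    # documents x terms
similarity = cosine_similarity(tfidf_matrix)                # documents x documents
topics = NMF(n_components=5, init='nndsvda').fit_transform(tfidf_matrix)  # 5 topics, chosen arbitrarily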
Example #3
class Feature:
    def __init__(self):
        self.tfidf = Tfidf()
        self.docs = []
        self.words = []
        self.title = Set()
        self.body = Set()
        self.topics = Set()
        self.places = Set()
        self.matrix = {}
        self.dfs = {}

    def add(self, doc):
        if not doc.topics:
            return
        self.docs.append(doc)
        self.tfidf.add(doc)
        self.title.update(doc.title)
        self.body.update(doc.body)
        self.topics.update(doc.topics)
        self.places.update(doc.places)

    def _compress(self):
        self.title = list(self.title)
        self.body = list(self.body)
        self.topics = sorted(self.topics)
        self.places = sorted(self.places)
        self.words = self.title + self.body
        self.title = []
        self.body = []

    def build(self):
        self._compress()
        print "Building tf-idf matrix"
        for d in self.docs:
            self.matrix[d.id] = {}
        for w in self.words:
            df = self.tfidf.get_df(w)
            if df > 2:
                self.dfs[w] = df
                for d in self.docs:
                    tfidf = self.tfidf.get_tfidf(w, d.id)
                    if tfidf:
                        self.matrix[d.id][w] = tfidf
        self.words = []
Example #4
 def __init__(self):
     self.tfidf = Tfidf()
     self.docs = []
     self.words = []
     self.title = Set()
     self.body = Set()
     self.topics = Set()
     self.places = Set()
     self.matrix = {}
     self.dfs = {}
Example #5
 def reveal_doc(self, text_panel):
     """
     点击“选择文件”按钮调用此函数,获取文件路径,并将文档内容展示在相应的文本框中
     Args:
         text_panel: 'dp': 表示关键词展示模块中的text框
                     'dp_r1': 表示文档相似度对比模块中的第一个text框
                     'dp_r2': 表示文档相似度对比模块中的第二个text框
     """
     if text_panel == 'dp':
         t = self.doc_panel
         title = self.doc_title
     elif text_panel == 'dp_r1':
         t = self.doc_panel_r1
         title = self.doc_title_r1
     elif text_panel == 'dp_r2':
         t = self.doc_panel_r2
         title = self.doc_title_r2
     file_path = filedialog.askopenfilename()
     print('file_path', file_path)
     # Check whether a file was actually selected
     if not file_path:
         messagebox.showinfo(message='Please select a valid file.')
         return
     try:
         # Clear the text panel first
         t.delete('1.0', 'end')
         data = Data(file_path)
         title.set(os.path.split(file_path)[-1])
         print('file:', os.path.split(file_path)[-1])
         t.insert('end', data.raw_text)
         # Load the idf dictionary
         idf = load_idf(self.idf_dir)
         # Compute the tf-idf scores
         tf_idf = Tfidf(data.corpus, idf)
         # Keep this document's Data and Tfidf objects for later keyword extraction or similarity computation
         self.status[text_panel] = {
             'data': data,
             'tfidf': tf_idf
         }
         if text_panel == 'dp':
             # Keyword extraction module: extract 20 keywords by default
             self.topk_var.set(20)
             self.refresh_key_words()
         elif text_panel == 'dp_r1' or text_panel == 'dp_r2':
             # if self.status.get('dp_r1', None) and self.status.get('dp_r2', None):
             #     self.show_similarity()
             # When a new document is chosen in the similarity module, clear the similarity output box
             self.sim_panel.delete(0, 'end')
     except Exception:
         traceback.print_exc()
Example #6
File: utils.py  Project: gjtucker/6885
def jaccard_score_tfidf(locu, four, p1,p2,field):
    #make a tfidf object so that we don't need to re-compute the list of all names every time
    tfidf = Tfidf(locu, four,field)

    name1 = p1[field] 
    name2 = p2[field]
    
    if name1 == "":
        set1 = set()
    else:
        set1 = set(name1.lower().split())
    if name2 == "":
        set2 = set()
    else:
        set2 = set(name2.lower().split())

    i = list(set1.intersection(set2))
    u = list(set1.union(set2))

    #compute idf score (decided to ignore tf)
    iscore = sum([tfidf.get_score(word) for word in i])
    uscore = sum([tfidf.get_score(word) for word in u])

    return 0 if uscore == 0 else float(iscore) / uscore
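jaccard_score_tfidf is an idf-weighted Jaccard: intersection and union are scored by per-word idf rather than by raw counts. A self-contained illustration of the same idea (the idf values below are made up purely for illustration):

# Standalone sketch of idf-weighted Jaccard; the idf dict holds illustrative values only.
idf = {'blue': 2.1, 'bottle': 2.4, 'coffee': 0.7, 'cafe': 1.1}

def idf_jaccard(name1, name2, idf):
    set1, set2 = set(name1.lower().split()), set(name2.lower().split())
    iscore = sum(idf.get(w, 0.0) for w in set1 & set2)   # weight of the intersection
    uscore = sum(idf.get(w, 0.0) for w in set1 | set2)   # weight of the union
    return 0.0 if uscore == 0 else iscore / uscore

print(idf_jaccard('Blue Bottle Coffee', 'blue bottle cafe', idf))  # 4.5 / 6.3 ~= 0.714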
Example #7
 def reveal_doc(self, text_panel):
     """
     点击“选择文件”按钮调用此函数,获取文件路径,并将文档内容展示在相应的文本框中
     Args:
         text_panel: 'dp': 表示关键词展示模块中的text框
                     'dp_r1': 表示文档相似度对比模块中的第一个text框
                     'dp_r2': 表示文档相似度对比模块中的第二个text框
     """
     self.root.wm_attributes('-topmost', 0)
     if text_panel == 'dp':
         t = self.doc_panel
         title = self.doc_title
     elif text_panel == 'dp_r1':
         t = self.doc_panel_r1
         title = self.doc_title_r1
     elif text_panel == 'dp_r2':
         t = self.doc_panel_r2
         title = self.doc_title_r2
     file_path = filedialog.askopenfilename()
     print('file_path', file_path)
     # Check whether a file was actually selected
     if not file_path:
         self.show_error('Please select a valid file.')
         return
     try:
         # Clear the text panel first
         t.delete('1.0', 'end')
         data = Data(file_path)
         title.set(os.path.split(file_path)[-1])
         print('file:', os.path.split(file_path)[-1])
         t.insert('end', data.corpus)
         # Compute the tf-idf scores
         tf_idf = Tfidf(data.corpus, len(data.corpus) // 20)
         # Keep this document's Data and Tfidf objects for later keyword extraction or similarity computation
         self.status[text_panel] = {'data': data, 'tfidf': tf_idf}
         if text_panel == 'dp':
             # Keyword extraction module: extract 20 keywords by default
             self.kw_ours_var.set('Ours')
             self.kw_jieba_var.set('Jieba')
             self.kw_panel_jieba.delete(0, 'end')
             self.kw_panel_ours.delete(0, 'end')
         elif text_panel == 'dp_r1' or text_panel == 'dp_r2':
             # When a new document is chosen in the similarity module, clear the similarity output box
             self.sim_panel.delete(0, 'end')
         self.root.wm_attributes('-topmost', 1)
     except Exception:
         traceback.print_exc()
Example #8
 def parse_cbr(self):
     all_content = []
     with codecs.open(self.path, 'r', 'utf-8-sig') as lines:
         for lin in lines:
             create_time = utility.split_data_by_date(lin)
             lin = lin.strip().split()
             userid, newsid, scan_time, title, create_time_ = int(
                 lin[0]), int(lin[1]), lin[2], lin[3], lin[-1]
             news = News(int(userid), int(newsid), title, scan_time, [],
                         create_time_)
             self.AllNews.append(news)
             content = "".join(lin[4:-1])
             all_content.append(content)
     if self.isTfidf:
         tags = Tfidf(all_content).derive_keyword_zh(keyword_num=5)
         for index in xrange(len(tags)):
             self.AllNews[index].tags = tags[index]
Example #9
                exit()
            else:
                print "language error!"
                exit()
            tfidfText.append(text1)
            vsText.append(text2)
    return tfidfText, vsText, np.array(polarities), categories
    
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "sys.argv[1]: input train corpus"
        print "sys.argv[2]: input test corpus"
        print "sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese"
        exit()
    
    tfidfInstance = Tfidf()
    sentimentInstance = Sentiment()
    
    trCorpus1, trCorpus2, trPolarity, trCategory = readData(sys.argv[1], sys.argv[3])
    teCorpus1, teCorpus2, tePolarity, teCategory = readData(sys.argv[2], sys.argv[3])

    trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1)
    trainVS = sentimentInstance.VSPolarity(trCorpus2)
    testVS = sentimentInstance.VSPolarity(teCorpus2)
    trainMatrix = combineFeature(trainTfidf, trainVS)
    testMatrix = combineFeature(testTfidf, testVS)
    
    print sys.argv[2]
    trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity, testMatrix, tePolarity)

Example #10
	def __init__(self, posting_list_path, data_path):
		super(QuerySearch, self).__init__()
		self.posting_list_path = posting_list_path
		self.data_path = data_path
		self.TfidfObj = Tfidf(self.posting_list_path, self.data_path)
Example #11
class QuerySearch(object):
	"""docstring for QuerySearch"""
	def __init__(self, posting_list_path, data_path):
		super(QuerySearch, self).__init__()
		self.posting_list_path = posting_list_path
		self.data_path = data_path
		self.TfidfObj = Tfidf(self.posting_list_path, self.data_path)
		
	def find_best_doc(self, data_dict, num_docs_retrieval=2):
		'''
			Find sum of all tf-idfs of all words in data_dict for all documents.
		'''
		
		# Dictionary. {docid: (sum of tf-idf of all words from query)}
		lst_tfidf = {}
		for key in (data_dict):
			for docid in data_dict[key]:
				lst_tfidf[docid] = lst_tfidf.get(docid, 0) + self.TfidfObj.tfidf(key, docid)
		
		# Sorting dictionary in descending manner
		# lst_tfidf = {k: v for k, v in sorted(lst_tfidf.items(), key=lambda item: -1*item[1])}
		arr = sorted(lst_tfidf.items(), key=lambda item: -1*item[1])
		return arr[:num_docs_retrieval]

	def search_query(self, query):
		'''
			Assuming query is similar to N:new* | new*
		'''
		temp = query.split(':')
		res = {}
		if(temp[0]=='P'):
			res = {key:val for key, val in self.TfidfObj.data.items()  
				if key.startswith('person_'+temp[-1].split('*')[0])}
		elif(temp[0]=='O'):
			res = {key:val for key, val in self.TfidfObj.data.items()  
				if key.startswith('org_'+temp[-1].split('*')[0])}
		elif(temp[0]=='L'):
			res = {key:val for key, val in self.TfidfObj.data.items()  
				if key.startswith('loc_'+temp[-1].split('*')[0])}
		elif(temp[0]=='N'):
			res = {	
					key:val for key, val in self.TfidfObj.data.items()  
					if (key.startswith('person_'+temp[-1].split('*')[0]) 
					or key.startswith('loc_'+temp[-1].split('*')[0]) 
					or key.startswith('org_'+temp[-1].split('*')[0]))
				}
		else:
			res = {key:val for key, val in self.TfidfObj.data.items()  
				if key.startswith(temp[-1].split('*')[0])}
		# print('Refined posting list to relevant queries')
		return self.find_best_doc(res)

	def search_queries(self, queries, num_docs_retrieval=2):
		res = {}
		for query in queries:
			temp = query.split(':')
			if(len(temp)>1):
				res = dict({key:val for key, val in self.TfidfObj.data.items()  
						if (temp[0]=='P' and key.startswith('person_'+temp[-1].split('*')[0]))
						or (temp[0]=='O' and key.startswith('org_'+temp[-1].split('*')[0]))
						or (temp[0]=='L' and key.startswith('loc_'+temp[-1].split('*')[0]))
						or (temp[0]=='N' and (key.startswith('person_'+temp[-1].split('*')[0]) 
						or key.startswith('loc_'+temp[-1].split('*')[0]) 
						or key.startswith('org_'+temp[-1].split('*')[0])))
					}, **res)
			else:
				res = dict({key:val for key, val in self.TfidfObj.data.items()  
					if key.startswith(temp[-1].split('*')[0])}, **res)
		return self.find_best_doc(res, num_docs_retrieval=num_docs_retrieval)
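A hypothetical usage sketch for the QuerySearch class above (the file paths and query strings are placeholders, not taken from the original project):

# Hypothetical usage; paths and queries are placeholders.
searcher = QuerySearch('posting_list.txt', 'data.txt')
# Prefix query restricted to person_* keys, returning the top documents by summed tf-idf
print(searcher.search_query('P:new*'))
# Several queries at once; matching posting-list entries are merged before scoring
print(searcher.search_queries(['N:lon*', 'O:goog*'], num_docs_retrieval=5))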
Example #12
            else:
                print "language error!"
                exit()
            tfidfText.append(text1)
            vsText.append(text2)
    return tfidfText, vsText, np.array(polarities), categories


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "sys.argv[1]: input train corpus"
        print "sys.argv[2]: input test corpus"
        print "sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese"
        exit()

    tfidfInstance = Tfidf()
    sentimentInstance = Sentiment()

    trCorpus1, trCorpus2, trPolarity, trCategory = readData(
        sys.argv[1], sys.argv[3])
    teCorpus1, teCorpus2, tePolarity, teCategory = readData(
        sys.argv[2], sys.argv[3])

    trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1)
    trainVS = sentimentInstance.VSPolarity(trCorpus2)
    testVS = sentimentInstance.VSPolarity(teCorpus2)
    trainMatrix = combineFeature(trainTfidf, trainVS)
    testMatrix = combineFeature(testTfidf, testVS)

    print sys.argv[2]
    trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity, testMatrix, tePolarity)
Example #13
def convert_to_tfidf(filename):
    with open(filename,"w+") as f:
        query_keyword = Tfidf("query_ids.txt","keyword_ids.txt")
        query_title = Tfidf("query_ids.txt","title_ids.txt")
        query_description = Tfidf("query_ids.txt","desc_ids.txt")
        keyword_title = Tfidf("keyword_ids.txt","title_ids.txt")
        keyword_description = Tfidf("keyword_ids.txt","desc_ids.txt")
        title_description = Tfidf("title_ids.txt","desc_ids.txt")
        data = csv_io.read_train("10percent_5lakh_preprocessed_training_data.txt")
        count = 0
        with open("2lakh_training_data.txt") as f1:
            for line in f1:
                count = count + 1
                sample = csv_io.split(line,[','])
                queryid = sample[7]
                keywordid = ''+sample[8]
                titleid = ''+sample[9]
                descriptionid = ''+sample[10]
                qk_sim = query_keyword.classify(queryid,keywordid)
                qt_sim = query_title.classify(queryid,titleid)
                qd_sim = query_description.classify(queryid,descriptionid)
                kt_sim = keyword_title.classify(keywordid,titleid)
                kd_sim = keyword_description.classify(keywordid,descriptionid)
                td_sim = title_description.classify(titleid,descriptionid)
                sample.append('%.2f' % qk_sim[0][0])
                sample.append('%.2f' % qt_sim[0][0])
                sample.append('%.2f' % qd_sim[0][0])
                sample.append('%.2f' % kt_sim[0][0])
                sample.append('%.2f' % kd_sim[0][0])
                sample.append('%.2f' % td_sim[0][0])
                f.write(",".join(sample))
                f.write("\n")
Example #14
def dummy():
    id1 = '1'
    id2 = '3'
    tfidf = Tfidf("dummy.txt","dummy2.txt")
    return tfidf.classify(id1,id2)
Example #15
from brown import get_indexed
from tfidf import Tfidf

if __name__ == '__main__':
    documents_indexed, word2idx = get_indexed(10000)
    vocab_size = len(word2idx)
    print("Data loaded | Vocab size:", vocab_size, '| Document size:',
          len(documents_indexed))

    model = Tfidf()
    TD = model.fit(documents_indexed, vocab_size)

    idx2word = {idx: word for word, idx in word2idx.items()}
    model.find_closest(['london', 'king', 'italy', 'queen'], TD, word2idx,
                       idx2word)
Example #16
File: problem5.py  Project: samanthaks/ML
    # print("Starting with bigrams...")
    # bigram_perceptron = Bigram(train_ratio=0.8)
    # print("Bigram accuracy", bigram_perceptron.accuracy)

    # PART C: Compare the data representations
    ratios = np.arange(0.05, 1.05, 0.05)
    unigram_accuracies = []
    tfidf_accuracies = []
    bigram_accuracies = []
    for r in ratios:
        unigram_perceptron = Unigram(train_ratio=r)
        unigram_accuracy = unigram_perceptron.accuracy
        unigram_accuracies.append(unigram_accuracy)
        print(r, "unigram_perceptron", unigram_accuracy)

        tfidf_perceptron = Tfidf(train_ratio=r)
        tfidf_accuracy = tfidf_perceptron.accuracy
        tfidf_accuracies.append(tfidf_accuracy)
        print(r, "tfidf_perceptron", tfidf_accuracy)

        bigram_perceptron = Bigram(train_ratio=r)
        bigram_accuracy = bigram_perceptron.accuracy
        bigram_accuracies.append(bigram_accuracy)
        print(r, "bigram_perceptron", bigram_accuracy)

    pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))
    pickle.dump(tfidf_accuracies, open("tfidf_accuracies.pkl", "wb"))
    pickle.dump(bigram_accuracies, open("bigram_accuracies.pkl", "wb"))
    # unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
    # tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
    # bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))
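A possible plotting step for the comparison above (matplotlib assumed; this is a sketch, not part of the original problem5.py):

# Hypothetical plot of the three accuracy curves collected above.
import matplotlib.pyplot as plt

plt.plot(ratios, unigram_accuracies, label='unigram')
plt.plot(ratios, tfidf_accuracies, label='tf-idf')
plt.plot(ratios, bigram_accuracies, label='bigram')
plt.xlabel('train ratio')
plt.ylabel('accuracy')
plt.legend()
plt.savefig('representation_comparison.png')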