def find_new_words(self):
    """
    Call the SmoothNLP phrase-extraction interface for new-word discovery.
    :return: None; the discovered words are stored in self.new_words
    """
    words = extract_phrase(corpus=self.corpus,
                           top_k=self.top_k,
                           chunk_size=self.chunk_size,
                           min_n=self.min_n,
                           max_n=self.max_n,
                           min_freq=self.min_freq)
    self.new_words = words
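# A minimal, self-contained sketch of the kind of wrapper class the method above
# presumably belongs to. The class name NewWordFinder, its constructor, and the
# parameter defaults are assumptions for illustration; only extract_phrase and
# its keyword arguments come from the snippet above.
from smoothnlp.algorithm.phrase import extract_phrase

class NewWordFinder:
    def __init__(self, corpus, top_k=1000, chunk_size=10000,
                 min_n=2, max_n=4, min_freq=5):
        # corpus is a list of raw text strings; the rest are extraction parameters.
        self.corpus = corpus
        self.top_k = top_k
        self.chunk_size = chunk_size
        self.min_n = min_n
        self.max_n = max_n
        self.min_freq = min_freq
        self.new_words = []

    def find_new_words(self):
        # Same call as in the method above.
        self.new_words = extract_phrase(corpus=self.corpus, top_k=self.top_k,
                                        chunk_size=self.chunk_size, min_n=self.min_n,
                                        max_n=self.max_n, min_freq=self.min_freq)

# Example usage (hypothetical file name):
# finder = NewWordFinder(open("corpus.txt", encoding="utf-8").readlines())
# finder.find_new_words()
# print(finder.new_words)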
def smoothnlp_test(input_file, output_file=None):
    # Read the corpus line by line and print the top 1000 candidate phrases.
    with open(input_file, "r") as f:
        text = f.readlines()
    new_words = extract_phrase(corpus=text, top_k=1000, chunk_size=10000,
                               max_n=5, min_freq=5)
    print(new_words)
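# The output_file argument above is never used. A hedged variant that also
# persists the result when output_file is given; the one-phrase-per-line output
# format is an assumption, not taken from the original.
from smoothnlp.algorithm.phrase import extract_phrase

def smoothnlp_test_with_output(input_file, output_file=None):
    with open(input_file, "r") as f:
        text = f.readlines()
    new_words = extract_phrase(corpus=text, top_k=1000, chunk_size=10000,
                               max_n=5, min_freq=5)
    print(new_words)
    if output_file is not None:
        with open(output_file, "w") as fout:
            fout.write("\n".join(new_words))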
def discover_words(corpus):
    # Extract candidate phrases and append a default frequency and POS tag
    # ("100 n") to each entry before writing one entry per line.
    new_phrases = extract_phrase(corpus, top_k=2000)
    new_phrases = [i + ' 100 n' for i in new_phrases]
    with open('C:\\ChinaMobile\\new_words_detect.txt', 'w') as fout:
        fout.write('\n'.join(new_phrases))
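# The "<word> 100 n" entries written above match jieba's user-dictionary format
# ("word freq pos"). Assuming that is the intended use, the file can be loaded
# into jieba as follows; using jieba here is an assumption, not shown in the
# original snippet.
import jieba

jieba.load_userdict('C:\\ChinaMobile\\new_words_detect.txt')
print(jieba.lcut('sample text containing the newly discovered phrases'))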
# -*- coding: utf-8 -*-
from smoothnlp.algorithm.phrase import extract_phrase

from util.html_parser import bulk_doc_json

if __name__ == '__main__':
    doc_content_list = []
    for bulk_json in bulk_doc_json([
        "maozedong",
    ]):
        for doc_json in bulk_json:
            doc_content = doc_json["content"]
            doc_content_list.append(doc_content)
    result = extract_phrase(corpus=doc_content_list, top_k=.99)
    print(result)
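# util.html_parser.bulk_doc_json is project-specific and not shown here. A
# hedged sketch that builds the same doc_content_list from plain .txt files
# instead; the docs/ directory, the glob pattern, and top_k=1000 are assumptions.
import glob
from smoothnlp.algorithm.phrase import extract_phrase

doc_content_list = []
for path in glob.glob("docs/*.txt"):
    with open(path, encoding="utf-8") as f:
        doc_content_list.append(f.read())

result = extract_phrase(corpus=doc_content_list, top_k=1000)
print(result)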
root_dir = "../baidu_academy/data/" files = [f for f in os.listdir(root_dir) if not f.startswith('.')] aggregated_list = [] for each in files: aggregated_list.append(pickle.load(open(root_dir + each, 'rb'))) corpus = [] keywords = [] for each_file in aggregated_list: for each in each_file: corpus.append(each["title"]) corpus.append(each["abstract"]) keywords.extend(each["keyword"].split()) keywords = set(keywords) result = set(extract_phrase(corpus, 200, 200, 4, 7, 20)) print(result) print("Original new word count: " + str(len(result))) print("Words from keywords: " + str(len(keywords))) all_words = set(pickle.load(open("all_words.pkl", "rb"))) with open("words1.txt", "r") as f: for line in f: all_words.add(line.strip()) with open("words2.txt", 'r') as f: for line in f: all_words.add(line.strip()) filtered_words = [] for each in result:
import pandas as pd

# file_path1, file_path2, and news_path are defined elsewhere in the original script.
domain_folder = eachFile(file_path1)
domain_folder = [x.split('test/')[1] + '/' for x in domain_folder]

num_file = []
num_char = []
domain_file = []
for folder in domain_folder:
    files = eachFile(file_path1 + folder) + eachFile(file_path2 + folder)
    num_file.append(len(files))
    corpus = ''
    # Walk every txt file under the same domain folder.
    for txt in files:
        lines = open(txt, 'r', encoding='utf-8').readlines()
        corpus += ''.join([x.split('\n')[0] for x in lines if '【' not in x])
    num_char.append(len(corpus))
    # Extract up to 5000 candidate phrases for this domain and keep the Chinese ones.
    top_k = extract_phrase(corpus, 5000, max_n=5)
    file_name = news_path + folder[:-1] + '.txt'
    domain_file.append(file_name)
    f = open(file_name, 'w')
    for i in top_k:
        if is_chinese(i):
            f.write(i + '\n')
    f.close()

# Rank the domains by corpus size and intersect the phrase lists of the top 5.
df = pd.DataFrame({'domain': domain_file, 'num_file': num_file, 'num_char': num_char})\
    .sort_values('num_char', ascending=False)
top_domain = df['domain'].values[:5]

common_words = set(read_data(top_domain[0]))
for domain in top_domain[1:]:
    common_words &= set(read_data(domain))
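# The snippet above relies on helpers that are not defined here: eachFile,
# is_chinese, and read_data. Minimal sketches of what they might look like,
# offered as assumptions only; the real implementations may differ.
import os

def eachFile(path):
    # List the full paths of the entries directly under `path`.
    return [os.path.join(path, name) for name in os.listdir(path)]

def is_chinese(word):
    # True if every character falls inside the CJK Unified Ideographs range.
    return all('\u4e00' <= ch <= '\u9fff' for ch in word)

def read_data(file_name):
    # Read one phrase per line from a previously written domain file.
    with open(file_name, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]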