Example #1
def find_new_words(self):
    """
    Call SmoothNLP's phrase-extraction interface to discover new words.
    :return:
    """
    words = extract_phrase(corpus=self.corpus,
                           top_k=self.top_k,
                           chunk_size=self.chunk_size,
                           min_n=self.min_n,
                           max_n=self.max_n,
                           min_freq=self.min_freq)
    self.new_words = words
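Example #1 shows only the method body. A minimal sketch of a wrapper class carrying the attributes it references might look like the following; the class name NewWordFinder and the default parameter values are illustrative assumptions, not part of the original example.

from smoothnlp.algorithm.phrase import extract_phrase


class NewWordFinder:
    # Illustrative container for the attributes used by find_new_words above.
    def __init__(self, corpus, top_k=200, chunk_size=10000, min_n=2, max_n=4, min_freq=5):
        self.corpus = corpus          # list of raw text strings
        self.top_k = top_k            # number of candidate phrases to keep
        self.chunk_size = chunk_size  # batch size forwarded to extract_phrase
        self.min_n = min_n            # minimum phrase length
        self.max_n = max_n            # maximum phrase length
        self.min_freq = min_freq      # minimum frequency for a candidate phrase
        self.new_words = []

    def find_new_words(self):
        # Same call as in Example #1: delegate to SmoothNLP's phrase extraction.
        self.new_words = extract_phrase(corpus=self.corpus,
                                        top_k=self.top_k,
                                        chunk_size=self.chunk_size,
                                        min_n=self.min_n,
                                        max_n=self.max_n,
                                        min_freq=self.min_freq)
        return self.new_words


# Usage sketch: finder = NewWordFinder(corpus=list_of_sentences); print(finder.find_new_words())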
Example #2
def smoothnlp_test(input_file, output_file=None):
    # Read the corpus line by line and extract candidate new phrases.
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.readlines()
    new_words = extract_phrase(corpus=text, top_k=1000, chunk_size=10000, max_n=5, min_freq=5)
    print(new_words)
Example #3
def discover_words(corpus):
    new_phrases = extract_phrase(corpus, top_k=2000)
    # Format each phrase as "word frequency pos_tag" (here a fixed frequency of 100 and tag 'n').
    new_phrases = [i + ' 100 n' for i in new_phrases]

    with open('C:\\ChinaMobile\\new_words_detect.txt', 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(new_phrases))
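The ' 100 n' suffix turns each phrase into a "word frequency pos" line, which matches jieba's user-dictionary format. A short sketch of how the generated file could then be consumed, assuming jieba is the downstream tokenizer (the original example does not say which tool reads the file):

import jieba

# Register the discovered phrases as a custom dictionary so the tokenizer keeps them whole.
jieba.load_userdict('C:\\ChinaMobile\\new_words_detect.txt')
print(list(jieba.cut('some text containing the newly discovered phrases')))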
Example #4
# -*- coding: utf-8 -*-

from smoothnlp.algorithm.phrase import extract_phrase

from util.html_parser import bulk_doc_json

if __name__ == '__main__':
    # Collect the "content" field of every parsed document into one corpus list.
    doc_content_list = []
    for bulk_json in bulk_doc_json([
            "maozedong",
    ]):
        for doc_json in bulk_json:
            doc_content = doc_json["content"]
            doc_content_list.append(doc_content)

    # top_k is passed as a float (0.99) here rather than an absolute count.
    result = extract_phrase(corpus=doc_content_list, top_k=.99)
    print(result)
Example #5
import os
import pickle

from smoothnlp.algorithm.phrase import extract_phrase

# Load every pickled data file under the directory (hidden files are skipped).
root_dir = "../baidu_academy/data/"
files = [f for f in os.listdir(root_dir) if not f.startswith('.')]
aggregated_list = []
for each in files:
    with open(root_dir + each, 'rb') as f:
        aggregated_list.append(pickle.load(f))

# Build the corpus from the title and abstract of each record; collect the provided keywords for comparison.
corpus = []
keywords = []
for each_file in aggregated_list:
    for each in each_file:
        corpus.append(each["title"])
        corpus.append(each["abstract"])
        keywords.extend(each["keyword"].split())
keywords = set(keywords)
# Positional arguments correspond to top_k=200, chunk_size=200, min_n=4, max_n=7, min_freq=20
# (same parameter order as the keyword form in Example #1).
result = set(extract_phrase(corpus, 200, 200, 4, 7, 20))
print(result)
print("Original new word count: " + str(len(result)))
print("Words from keywords: " + str(len(keywords)))
all_words = set(pickle.load(open("all_words.pkl", "rb")))

with open("words1.txt", "r") as f:
    for line in f:
        all_words.add(line.strip())

with open("words2.txt", 'r') as f:
    for line in f:
        all_words.add(line.strip())

filtered_words = []
for each in result:
    # Keep only candidates that are not already present in the existing word lists.
    if each not in all_words:
        filtered_words.append(each)
Example #6
import pandas as pd

from smoothnlp.algorithm.phrase import extract_phrase

# eachFile, is_chinese, read_data and the *_path variables are project helpers defined elsewhere.
domain_folder = eachFile(file_path1)
domain_folder = [x.split('test/')[1] + '/' for x in domain_folder]

num_file = []
num_char = []
domain_file = []
for folder in domain_folder:
    files = eachFile(file_path1 + folder) + eachFile(file_path2 + folder)
    num_file.append(len(files))
    corpus = ''
    # Concatenate all txt files under the same domain folder into one corpus string.
    for txt in files:
        with open(txt, 'r', encoding='utf-8') as fin:
            lines = fin.readlines()
        corpus += ''.join([x.split('\n')[0] for x in lines if '【' not in x])
    num_char.append(len(corpus))
    # Extract up to 5000 candidate phrases (max_n=5) for this domain.
    top_k = extract_phrase(corpus, 5000, max_n=5)
    file_name = news_path + folder[:-1] + '.txt'
    domain_file.append(file_name)
    # Keep only pure-Chinese candidates, one phrase per line.
    with open(file_name, 'w', encoding='utf-8') as fout:
        for i in top_k:
            if is_chinese(i):
                fout.write(i + '\n')

df = pd.DataFrame({'domain':domain_file,'num_file':num_file,'num_char':num_char})\
                  .sort_values('num_char',ascending=False)
top_domain = df['domain'].values[:5]
# Phrases that appear in every one of the five largest domains are collected as common (cross-domain) words.
common_words = set(read_data(top_domain[0]))
for domain in top_domain[1:]:
    common_words &= set(read_data(domain))
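The original example stops after computing common_words. As an illustrative follow-up (an assumption, not the author's code), the shared phrases could be removed from each domain file so that only domain-specific candidates remain; read_data, domain_file and common_words are reused from the example above.

# Illustrative follow-up: strip phrases shared by all top domains from every domain file.
for file_name in domain_file:
    domain_specific = [w for w in read_data(file_name) if w not in common_words]
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write('\n'.join(domain_specific))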