import math


def do_category():
    # diff, load_json, count_occurrences, rpos and ridf are assumed to be
    # provided elsewhere in the project.
    print "[1] %s - loading titles" % diff()
    cat_titles = load_json("categories", "all_cats_3000")
    n_docs = sum(len(v) for k, v in cat_titles.iteritems())
    print "[2] %s - loading pos tags, n_docs = %s" % (diff(), n_docs)
    cat_titles_pos = [(cat, t, rpos.value_by_title(cat, t))
                      for cat, ts in cat_titles.iteritems()
                      for t in ts]
    print "[3] %s - extracting words" % diff()

    def calc_idf(w):
        # idf(w) = log(n_docs / df(w)); a word that never occurs gets idf 0.
        n = count_occurrences(w, cat_titles_pos)
        try:
            return (w, math.log(n_docs / float(n)))
        except ZeroDivisionError:
            return (w, 0)

    words = []
    for c_t_p in cat_titles_pos:
        words += c_t_p[2].keys()
    words = set(words)
    print "[4] %s - calculating idfs, n_words = %s" % (diff(), len(words))
    words_idfs = dict(map(calc_idf, words))
    print "[5] %s - saving idfs" % diff()
    ridf.puts("all", words_idfs)
    print "[6] %s - method finished" % diff()
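# A minimal sketch of the document-frequency helper that calc_idf relies on.
# count_occurrences is not defined in this file; this is an assumption about
# its behavior (document frequency over the (cat, title, pos) triples), not
# the project's actual implementation.
def count_occurrences(word, cat_titles_pos):
    # number of documents whose pos-tag dict contains `word`
    return sum(1 for _, _, pos in cat_titles_pos if word in pos)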
def update_symspell_dict(input_file, output_file):
    '''
    @description: build the symspell key-value dictionary
    @param {type}
    @return:
    '''
    dict_tmp = symspell(
        input_file=input_file,
        res_dict=load_json(
            "/home/zixiang/Projects/text_correction/codebase/data/symspell_dict.json"
        ))
    save_json(dict_tmp, output_file)
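# symspell itself is defined elsewhere in the codebase. Judging from how the
# dictionary is consumed below (symspell_dict[d][0] is iterated for candidate
# words), each key is a deletion variant and its value's first element is the
# list of original words producing that variant. A hypothetical sketch of
# building such a structure (build_symspell_dict is an invented name; see the
# get_deletes sketch further below):
def build_symspell_dict(words):
    d = {}
    for w in words:
        for variant in get_deletes(w):
            # d maps: deletion variant -> [[candidate words]]
            d.setdefault(variant, [[]])[0].append(w)
    return d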
def test():
    '''
    @description: test the symspell functionality
    @param {type}
    @return:
    '''
    #dict_tmp = symspell("/home/zixiang/ExtraData/xmnlp/tests/sample_words.txt")
    #save_json(dict_tmp, "/home/zixiang/Projects/text_correction/codebase/data/symspell.json")
    tmp_dict = load_json(config.new_symspell_json)
    #print("tmp_dict", tmp_dict)
    print(cheaksmyspell(tmp_dict, "小孩同学"))
def get_tmp_result():
    global count
    tmp_dict = load_json(
        "/home/zixiang/Projects/text_correction/codebase/data/symspell_dict.json"
    )
    print("symspell_dict loaded")
    #print("tmp_dict", tmp_dict)
    #print(cheaksmyspell(tmp_dict, "新兴肥业"))
    import pandas as pd
    df = pd.read_excel("/home/zixiang/Downloads/热词评测20200123.xlsx",
                       sheet_name=2)
    #print(df["query"])
    df["new_result"] = df["query"].apply(cheaksmyspell_apply)
    print(df.head())
    df.to_excel("/home/zixiang/Downloads/热词评测20200123_new.xlsx",
                index=False,
                columns=["query", "rewrite_query", "new_result"],
                encoding="utf_8")
    print("count (queries affected):", count)
def cheaksmyspell_apply(x):
    '''
    @description: apply symspell-based correction to a single query string
    @param {type}
    @return:
    '''
    global count
    pattern_type = 0
    # note: the dictionary is reloaded from disk on every call
    symspell_dict = load_json(
        "/home/zixiang/Projects/text_correction/codebase/data/symspell_sample_dict.json"
    )
    tmp_condidate = set()
    pattern_match = find_pattern(x)
    if pattern_match is not None:
        print(pattern_match)
        x, pattern_type = pattern_match
    deletes = get_deletes(x)
    # look up the symspell dictionary
    for d in deletes:
        if d in symspell_dict:
            for i in symspell_dict[d][0]:
                tmp_condidate.add(i)
    if len(tmp_condidate) == 0:
        return x
    count += 1
    candidate = str(list(tmp_condidate)[0])
    # re-attach the query template stripped off by find_pattern
    # (怎么预防 = "how to prevent", 什么是/是什么 = "what is",
    #  播放…的新闻 = "play news about …")
    if pattern_type == 0:
        return candidate
    elif pattern_type == 1:
        return "怎么预防" + candidate
    elif pattern_type == 2:
        return "什么是" + candidate
    elif pattern_type == 3:
        return candidate + "怎么预防"
    elif pattern_type == 4:
        return candidate + "是什么"
    elif pattern_type == 5:
        return "播放" + candidate + "的新闻"
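# get_deletes is assumed rather than defined in this file. In the standard
# SymSpell scheme it enumerates the term plus its deletion variants; a minimal
# edit-distance-1 sketch, not necessarily the project's implementation:
def get_deletes(word):
    # the word itself plus every string obtained by deleting one character
    variants = {word}
    for i in range(len(word)):
        variants.add(word[:i] + word[i + 1:])
    return variants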
from functools import reduce  # required on Python 3


def test_pattern_symspell(query: list) -> list:
    '''
    @description: correct an erroneous sentence using the extracted pattern trie
    @param {type}
    @return:
    '''
    trie = load_pattern_trie_tree()
    symspell_dict = load_json(
        "/home/zixiang/Projects/text_correction/codebase/data/symspell_dict.json"
    )
    base_path = "/home/zixiang/DataSets/berttestdata/xiaoaiquerylog"
    tmp_condidate = set()
    isword, word_path, pattern = trie.search(query)
    #logger.debug("pattern {} word_path {}".format(pattern, word_path))
    if not isword and len(word_path) == 0:
        # nothing matched in the trie
        return []
    origin_words_list = [i for i in word_path.split("*") if i]
    # collapse runs of "*" in the pattern into single wildcards
    add_flag = True
    tmp_string = ""
    for i in pattern:
        if i == "*":
            if add_flag:
                tmp_string += i
                add_flag = False
        else:
            tmp_string += i
            add_flag = True
    origin_pattern = tmp_string.replace("*", "{}")
    #logger.debug("origin_pattern {}".format(origin_pattern))
    #logger.debug("origin_word_list {}".format(origin_words_list))
    length = len(origin_words_list)
    tmp_list = [[] for _ in range(length)]
    # handle errors at several wildcard positions at once
    for i in range(length):
        deletes = get_deletes(origin_words_list[i])
        # look up the symspell dictionary
        for d in deletes:
            if d in symspell_dict:
                for j in symspell_dict[d][0]:
                    if j in tmp_list[i]:
                        continue
                    tmp_list[i].append(j)
    if len(tmp_list) == 0:
        return []
    # cartesian product of the per-position candidate lists
    fn = lambda x, code=',': reduce(
        lambda x, y: [str(i) + code + str(j) for i in x for j in y], x)
    res = fn(tmp_list)
    for line in res:
        tmp_condidate.add(origin_pattern.format(*line.split(",")))
    return list(tmp_condidate)
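# The reduce/lambda above builds the cartesian product of the candidate lists
# by joining candidates with commas and splitting them again, which would break
# if a candidate itself contained a comma. An equivalent formulation with
# itertools.product (an alternative sketch, not the project's code):
from itertools import product

def fill_pattern(origin_pattern, tmp_list):
    # instantiate the pattern with every combination of per-position candidates
    return {origin_pattern.format(*combo) for combo in product(*tmp_list)}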
import enchant
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk import data
from nltk.corpus import stopwords
from utils import persistence_path
from io_utils import read, load_json
from ast import literal_eval
# Pos Tagging
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger

STOPS = literal_eval(read(persistence_path() + "/love_the_data/stop_words.txt"))
SPECIAL = literal_eval(read(persistence_path() + "/love_the_data/special_characters.txt"))
LETTER_FREQ = dict(load_json("love_the_data", "english-letter-frequencies")["letters"])
EN_US_DICT = enchant.Dict("en_US")
EN_GB_DICT = enchant.Dict("en_GB")
TAGGER = PerceptronTagger()
PORTER = PorterStemmer()
WN_LEMMATIZER = WordNetLemmatizer()
SENTENCE_DETECTOR = data.load('tokenizers/punkt/english.pickle')

'''
Penn Treebank POS tags:
CC   Coordinating conjunction
CD   Cardinal number
DT   Determiner
EX   Existential there
FW   Foreign word
IN   Preposition or subordinating conjunction
def load_corpus(corpus_path):
    corpus = load_json(corpus_path)
    return corpus
from io_utils import flatten_hash, load_json, save_json
from utils import good_title
from rediss import RFeature
import re

rfeature = RFeature()
pattern = re.compile(r"(Category:|List of|File:).*")
categories = ["biology", "physics", "chemistry"]
r = {}
for cat in categories:
    r[cat] = []
    l = set(good_title(title)
            for title in flatten_hash(load_json("categories", "%s_titles_1" % cat))
            if not pattern.search(title))
    for title, fvector in rfeature.key_value_by_titles(cat, l):
        s = sum(map(lambda x: x[1], fvector))
        if s > 500:
            r[cat].append(title)
save_json("categories", "all_cats_3000", r)
#print len(r)