def cut_document(document):
    """
    Tokenize the summary of every MongoDB document, and also add the movie
    title, director names, actor names, etc. to the word list.
    """
    # tokenize the summary
    if 'summary' in document:
        words = jieba.lcut_for_search(document['summary'])
    else:
        words = []
    # tokenize the title
    extra_words = jieba.lcut_for_search(document['name'])
    # tokenize the director names
    for director in document['directors']:
        extra_words.extend(jieba.lcut_for_search(director))
    # tokenize the actor names
    for actor in document['actors']:
        extra_words.extend(jieba.lcut_for_search(actor))
    words.extend(extra_words)
    words = filter(match_words, words)
    return words
def similarAlg(noteId):
    content = get_note(noteId)  # get the note content dynamically
    selected_note_word_list = jieba.lcut_for_search(content)
    ignore_list = get_ignore_list()
    selected_note_word_list = list(
        filter(lambda x: x not in ignore_list, selected_note_word_list))
    notes = get_categorized_notes() + get_uncategorized_notes(
        60 * 60 * 24 * 300)
    returnList = []
    for note in notes:
        count = 0
        current_word_list = jieba.lcut_for_search(note['content'])
        match_list = []
        for word in current_word_list:
            if word in selected_note_word_list:  # record words shared with the selected note
                if word not in match_list:  # deduplicate
                    count += 1
                    match_list.append(word)
        if count > 0:
            note['count'] = count
            note['match_list'] = match_list
            returnList.append(note)
    returnList = sorted(returnList, key=lambda x: -len(x['match_list']))
    return json.dumps(returnList)
def search():
    form = SearchForm()
    if form.validate_on_submit():
        pprint(form.data)
        history_record(form.query.data)
        if form.trade.data == 'buy':
            is_buy = True
            query = jieba.lcut_for_search(form.query.data)
            ptype = form.ptype.data
            cond = form.cond.data
            r = ps.search_buy(query=query, ptype=ptype, cond=cond)
            results = [hit.to_dict() for hit in r]
            return render_template("SearchResult.html",
                                   results=results,
                                   is_buy=is_buy)
        if form.trade.data == 'sell':
            is_buy = False
            query = jieba.lcut_for_search(form.query.data)
            ptype = form.ptype.data
            cond = form.cond.data
            r = ps.search_sell(query=query, ptype=ptype, cond=cond)
            results = [hit.to_dict() for hit in r]
            return render_template("SearchResult.html",
                                   results=results,
                                   is_buy=is_buy)
    return render_template('search.html', form=form)
def preprocess_query(query):
    begin = -1
    end = -1
    flag = 0
    phrase = []
    for i, item in enumerate(query):
        if item == '\'' and begin == -1:
            begin = i
        elif item == '\'':
            end = i
            flag = 1
            break
        elif item == '\"' and begin == -1:
            begin = i
        elif item == '\"':
            flag = 1
            end = i
            break
    if flag:
        phrase = jieba.lcut_for_search(query[begin + 1:end])
    res = jieba.lcut_for_search(query)
    for item in query:
        res.append(item)
    res = stopping(res)
    return res[:20], phrase
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for dirname in dirnames:  # iterate over the sub-directories
            path1 = os.path.join(root, dirname)
            for trivial1, trivial2, filenames in os.walk(path1):  # iterate over the files in the directory
                for filename in filenames:
                    # print(root, dirnames, filename)
                    print("adding", filename)
                    path = os.path.join(path1, filename)
                    file = open(path, encoding='utf8')
                    page = file.readline()
                    title = file.readline()
                    contents = file.read()
                    file.close()
                    # jieba word segmentation
                    seg_contents = jieba.lcut_for_search(contents)
                    contents = ' '.join(seg_contents)
                    url = page
                    seg_url = jieba.lcut_for_search(page)
                    page = ' '.join(list(set(seg_url) - set(['.', 'http', 'https', '/', ':', '?', '=', 'html', 'shtml', 'www'])))
                    doc = Document()
                    doc.add(StringField("name", filename, Field.Store.YES))
                    doc.add(StringField("path", path, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(TextField('title', title, Field.Store.YES))
                        doc.add(TextField('site', page, Field.Store.YES))
                        doc.add(TextField('url', url, Field.Store.YES))
                        doc.add(TextField('contents', contents, Field.Store.YES))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
def ConsineScore(query):
    Score = {}
    for doc in doc_list:
        Score[doc] = 0
    TotalDoc = len(doc_list)
    query_term_list = jieba.lcut_for_search(query)
    # term-at-a-time processing
    for term in query_term_list:
        # calculate w[t,q] and fetch the posting list
        try:
            posting_list = indexTable[term]
        except:
            continue
        for posting in posting_list:  # for each pair (d, tf) in the posting list
            doc = posting['doc']
            tf = posting['tf']
            df = len(posting_list)
            # compute the tf-idf weight
            weight = (1 + math.log10(tf)) * math.log10(TotalDoc / df)
            if doc.count(u'jkzhu') > 0 and term.count(u'计算机') > 0:
                weight += 100
            Score[doc] += weight  # w(t,q)=1 for fast cosine score
    # cosine normalization
    for doc in doc_list:
        Score[doc] = Score[doc] / doc_length[doc]
    # boolean search for 'not'
    if query.count("not") > 0:
        not_list = jieba.lcut_for_search(query.partition("not")[2])
        for term in not_list:
            try:
                posting_list = indexTable[term]
            except:
                continue
            for posting in posting_list:
                doc = posting['doc']
                Score[doc] = 0
    # rank documents with respect to the query;
    # use a min-heap for selecting the top k out of N
    result = []
    queue = Queue.PriorityQueue(10)
    for term in doc_list:
        # process a new document d with score s
        # if Score[term] == 0:
        #     continue
        if queue.full():
            min_score = queue.get()  # get the current minimum h_min of the heap (O(1))
            if (Score[term], term) > min_score:  # if s > h_min, heap-add((doc, score)) (O(log k))
                queue.put((Score[term], term))
            else:  # if s < h_min, skip to the next document
                queue.put(min_score)
        else:
            queue.put((Score[term], term))
    while queue.empty() == False:
        result.append(queue.get()[1])
    result.reverse()
    return result
def split_result(r, re_filter):
    '''
    Split out keywords.
    :param r:
    :param re_filter:
    :return:
    '''
    title = r.get('title')
    dsc = r.get('goods_desc')
    title = re.sub(re_filter, ' ', title)
    dsc = re.sub(re_filter, ' ', dsc)
    titles1 = jieba.lcut_for_search(title)
    titles3 = jieba.lcut(title, cut_all=True)
    titles = titles1 + titles3
    dscs1 = jieba.lcut_for_search(dsc)
    dscs3 = jieba.lcut(dsc, cut_all=True)
    types = dscs1 + dscs3
    jbs = set(titles + types)
    search_map = dict()
    search_map['id'] = r.get('id')
    search_map['result'] = list()
    if '' in jbs:
        jbs.remove('')
    if ' ' in jbs:
        jbs.remove(' ')
    for jb in jbs:
        flag, result = chinese_to_number(jb)
        # keep it if there is data
        if flag:
            search_map['result'].append(result)
    return search_map
def cutForSearch(self, text):
    if type(text) is list:
        result = list()
        for s in text:
            result.append(jieba.lcut_for_search(s))
        return result
    else:
        return jieba.lcut_for_search(text)
def count_simisc(word, title):
    wd_set = set(jieba.lcut_for_search(word))
    tt_set = set(jieba.lcut_for_search(title))
    simisc = wd_set - tt_set
    try:
        score = 1 - len(simisc) / len(wd_set)
    except:
        score = 1.0
    return score
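# A small, hedged illustration of the overlap score computed by count_simisc
# above: the fraction of the query's token set that also appears in the
# title's token set. Hand-picked token sets are used instead of jieba output
# so the numbers are deterministic, and overlap_score is a hypothetical helper
# named only for this sketch (Python 3 division assumed).
def overlap_score(query_tokens, title_tokens):
    wd_set, tt_set = set(query_tokens), set(title_tokens)
    if not wd_set:
        return 1.0  # count_simisc falls back to 1.0 when the query token set is empty
    return 1 - len(wd_set - tt_set) / len(wd_set)

print(overlap_score(["北京", "大学"], ["北京", "大学", "招生"]))  # 1.0: both query tokens covered
print(overlap_score(["北京", "大学"], ["清华", "大学"]))          # 0.5: one of two query tokens covered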
def releventScore(self, text, ques, tfidf={}):
    def filtWord(li):
        # filter out stop words
        nl = []
        for l in li:
            if l not in STOPWORDS:
                nl.append(l)
        return nl

    def sims(t, q):
        if t in self.dic.keys() and q in self.dic.keys():
            vector1 = self.dic[t]
            vector2 = self.dic[q]
            dot_product = 0.0
            normA = 0.0
            normB = 0.0
            for a, b in zip(vector1, vector2):
                dot_product += a * b
                normA += a**2
                normB += b**2
            if normA == 0.0 or normB == 0.0:
                return 0
            else:
                return dot_product / ((normA * normB)**0.5)
        else:
            l = max([len(t), len(q)])
            if Levenshtein.distance(t, q) < l:
                return (l - Levenshtein.distance(t, q)) / l * 0.7
            else:
                return 0

    ttoks = filtWord(jieba.lcut_for_search(text))
    qtoks = filtWord(jieba.lcut_for_search(ques))
    score = 0
    if len(ttoks) == 0:
        return 0
    for tword in ttoks:
        for qword in qtoks:
            if tword in tfidf.keys():
                rate = tfidf[tword]
            else:
                rate = 1
            if tword == qword:  # exact match
                score += rate * 2.5
            elif sims(tword, qword) > 0.4:  # similar
                score += sims(tword, qword) * rate
    # remove the advantage of length
    return score / len(ttoks) / len(qtoks) * 100
def getTeamKeyword():
    global team_keyword_list
    for team in team_list:
        tempwordlist = []
        tempwordlist.extend(jieba.lcut_for_search(team['name']))
        for member in team['member']:
            tempwordlist.extend(jieba.lcut_for_search(member))
        tempwordlist = list(set(tempwordlist))
        wordlist = []
        for index in range(0, len(tempwordlist)):
            if len(tempwordlist[index]) >= 2:
                wordlist.append(tempwordlist[index])
        team_keyword_list.append(wordlist)
def obj_to_document(obj):
    def conv_to_str(x):
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)),
                          Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
def getanswer(question, data, model, vec_model, oldQuestion, oldAns):
    # print(0)
    if '上海交通大学' in question:
        question = re.sub('上海交通大学', '', question)
    if '?' not in question:
        question = question + '?'
    # print(question)
    serialTalk = checkForEmission(question)
    if serialTalk:
        q1 = jieba.lcut_for_search(oldQuestion)
        q2 = jieba.lcut_for_search(question)
        q = ''
        for word2 in q2:
            if word2 not in q1 and word2 not in ['那', '那么']:
                q = q + word2
        question = oldQuestion[0:-1] + q + q + '?'
        # print('this is new question', question)
    ans = getNormalAnswer(question, data, model, vec_model, serialTalk)
    # print('===+++', ans, len(ans) > 1, serialTalk, ans[0] == oldAns, oldAns)
    ans = ans[1] if len(
        ans) > 1 and serialTalk and ans[0] == oldAns else ans[0]
    # print(2.9)
    if checkQuestion(question):
        cnt = 0
        for qword in question:
            if qword in vec_model.vocab:
                vec1 = vec_model[qword]
                asentence = jieba.lcut_for_search(ans)
                for aword in asentence:
                    if aword in vec_model.vocab:
                        vec2 = vec_model[aword]
                        sim = ifSimilar(vec1, vec2)
                        if sim > 0.95:  # 0.9
                            if qword != aword and qword not in [
                                    '是', '的', ',', '?'
                            ]:
                                cnt = cnt + sim**24  # 30
                continue
            if qword in ans and qword not in ['是', '的', ',', '?']:
                cnt = cnt + 1
        # print(3.1)
        if cnt + 0.1 - len(ans)**2 / 100 > 2.3:
            return '是', question
        else:
            return '否', question
    else:
        # print(3)
        return ans, question
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            path = os.path.join(root, filename)
            file = open(path, encoding='utf8')
            url = file.readline()
            title = file.readline()
            contents = file.read()
            file.close()
            img_url = self.getTxtAttribute(contents, 'img_url')
            img_info = self.getTxtAttribute(contents, 'img_info')
            for i in range(len(img_url)):
                if len(img_info[i]) > 0:
                    title = title
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(
                        StringField('img_url', img_url[i], Field.Store.YES))
                    seg_contents = jieba.lcut_for_search(img_info[i])
                    contents = ' '.join(seg_contents)
                    doc.add(
                        TextField('contents', contents, Field.Store.YES))
                    writer.addDocument(doc)
                else:
                    continue
def fenci(request):
    i = 990
    for news in NewsPiece.objects.all()[990:]:
        i = i + 1
        print(i)
        for tag in news.tag_set.all():
            tag.delete()
        title_tag = jieba.lcut_for_search(news.news_title)
        content_tag = jieba.lcut(news.news_content)
        r_title_tag = []
        r_content_tag = []
        for t in title_tag:
            if len(t) < 2:
                continue
            r_title_tag.append(t)
        for t in content_tag:
            if len(t) < 2:
                continue
            r_content_tag.append(t)
        r_title_tag = list(set(r_title_tag))
        r_content_tag = list(set(r_content_tag))
        for r in r_title_tag:
            news.tag_set.create(name=r)
        for r in r_content_tag:
            news.contenttag_set.create(name=r)
    return HttpResponse('fin')
def search():
    color_data = []
    deletefile('C:/Users/luoyujia/flasky/static/picture/')
    wanted = request.args.get("wanted", type=str)
    seg_list = jieba.lcut_for_search(wanted)  # search-engine mode segmentation
    size = 0
    for keyword in seg_list:
        print(keyword)
        ImgDownload(keyword)  # web crawler function
        print(keyword)
        for num in range(1, 11):
            try:
                im = Image.open('C:/Users/luoyujia/flasky/static/picture/' +
                                str(num) + '.jpg')
                im = im.convert('RGB')
                im.thumbnail((50, 50))
                im.save('C:/Users/luoyujia/flasky/static/test1.jpg')
                list1 = list(
                    colorz("C:/Users/luoyujia/flasky/static/test1.jpg", n=5))
                color_data.append(list1)
                size += 1
            except OSError:
                continue
            print(num)
    print(size)
    print(color_data)
    return render_template("picular.html", data=color_data, size=size)
def fun(self):
    start = time.clock()
    keyword_list = jieba.lcut_for_search(self.searchBar.text())
    # print(keyword_list)
    # If the query contains "专业" (major) and "排名" (ranking), return the
    # major ranking; otherwise perform a normal search.
    if ('专业' in keyword_list) and ('排名' in keyword_list):
        result = db[MONGO_MAJOR_TABLE].find().sort([('count', -1)])
        count = 1
        for r in result[0:42]:
            if count <= 9:
                t = '0' + str(count) + ' ' + r['major']
            else:
                t = str(count) + ' ' + r['major']
            self.resultEdit.append(t)
            count += 1
        end = time.clock()
        ex_time = 'Excute Time:' + str(end - start)
        # print(ex_time)
        self.time.setText(ex_time)
    else:
        # db[MONGO_TABLE].ensure_index([('tf', -1)])
        get_tf(keyword_list)
        result = db[MONGO_TABLE].find().sort([('tf', -1)])
        count = 1
        for r in result[0:30]:
            t = str(count) + ' ' + r['title']
            self.resultEdit.append(t)
            self.resultEdit.append(r['href'])
            count += 1
        end = time.clock()
        ex_time = 'Excute Time:' + str(end - start)
        # print(ex_time)
        self.time.setText(ex_time)
def construct_postings_lists(self, day_before=1, end_date=datetime.today().date()):
    config = configparser.ConfigParser()
    config.read(self.config_path, self.config_encoding)
    # files = News.objects.filter(datetime__day=timezone.now().day - day_before)
    files = News.objects.filter(
        datetime__gte=end_date - timezone.timedelta(days=day_before),
        datetime__lte=end_date)
    total_len = int(config['FORMULA']['avg_l']) * int(config['FORMULA']['n'])
    for file in files:
        title = file.title
        body = file.body
        docid = file.pk
        date_time = file.datetime.strftime('%y-%m-%d %H:%M:%S')
        seg_list = jieba.lcut_for_search(title + '。' + body)
        total_words, cleaned_dict = self.clean_list(seg_list)
        total_len += total_words
        for term, tf_in_doc in cleaned_dict.items():
            if term in title:
                tf_in_doc += int(math.log2(total_words))
            d = Doc(docid, date_time, tf_in_doc, total_words)
            if term in self.postings_lists:
                # if the term is in the dict, append the doc
                self.postings_lists[term][0] += 1  # doc_frequency++
                self.postings_lists[term][1].append(d)
            else:
                # else create a new term and insert the doc
                self.postings_lists[term] = [1, [d]]  # [doc_frequency, [Doc]]
    AVG_L = int(total_len / News.objects.count())
    config.set('FORMULA', 'n', str(News.objects.count()))
    config.set('FORMULA', 'avg_l', str(AVG_L))
    with open(self.config_path, 'w', encoding=self.config_encoding) as configfile:
        config.write(configfile)
    self.write_postings_to_db()
def query(sentence, result=3):
    dic = corpora.Dictionary.load(conf.dictionary)
    tfidf = models.TfidfModel.load(conf.tfidf)
    lda = models.LdaModel.load(conf.lda)
    q_topic = lda[tfidf[dic.doc2bow(jieba.lcut_for_search(sentence))]]
    topics = load_topic_of_post()
    martix = np.zeros((len(topics), conf.num_topics), float)
    for ti, t in enumerate(topics):
        for tj, v in t:
            martix[ti, tj] = v
    q_vec = np.zeros(conf.num_topics, float)
    for ti, v in q_topic:
        q_vec[ti] = v
    pq = []
    i = 0
    while i < len(topics):
        heapq.heappush(pq, (sum((martix[i] - q_vec)**2), i))
        i += 1
    sel = {}
    for s, i in heapq.nsmallest(result, pq):
        sel[i] = s
    print sel
    post = get_post(sel).values()
    post.sort()
    return post
def divide_and_generate(self):
    q_all_list = []
    stop_words = self.stop_words_list("./chineseStopWords.txt")
    qa = self.qa.copy(deep=True)
    qa["Q_Clean"] = qa["QUESTION"].apply(self.remove_punctuation)
    qa["Q_D"] = qa["Q_Clean"].apply(lambda x: " ".join(
        [w for w in jb.lcut_for_search(x) if w not in stop_words]))
    qa["Q_Tag"] = qa["CLINIC"].apply(lambda x: self.clinic_code[x]
                                     if x in self.clinic_code else 0)
    # generate the word cloud
    self.word_cloud(qa)
    tf_idf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
    features = tf_idf.fit_transform(qa.Q_D)
    labels = qa.Q_Tag
    alpha_logger.info(features.shape)
    alpha_logger.info(features)
    N = 2
    for cli, cli_tag in self.clinic_code.items():
        features_chi2 = chi2(features, labels == cli_tag)
        indices = np.argsort(features_chi2[0])
        feature_names = np.array(tf_idf.get_feature_names())[indices]
        uni_grams = [v for v in feature_names if len(v.split(' ')) == 1]
        bi_grams = [v for v in feature_names if len(v.split(' ')) == 2]
        print("# '{}':".format(cli))
        print(" . Most correlated uni-grams:\n . {}".format(
            '\n . '.join(uni_grams[-N:])))
        print(" . Most correlated bi-grams:\n . {}".format(
            '\n . '.join(bi_grams[-N:])))
    alpha_logger.info("相关性展示")
    return qa, features, labels
def search_cut(sentence):
    """
    HMM-based segmentation (search-engine mode).
    :param sentence:
    :return:
    """
    return jieba.lcut_for_search(sentence)
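# Minimal usage sketch for jieba.lcut_for_search, the call every snippet in
# this listing builds on: unlike jieba.cut_for_search, it returns a list
# rather than a generator. The token list in the comment is only indicative;
# the exact output depends on the installed jieba dictionary.
import jieba

tokens = jieba.lcut_for_search("小明硕士毕业于中国科学院计算所")
print(tokens)
# e.g. ['小明', '硕士', '毕业', '于', '中国', '科学', '学院', '科学院', '中国科学院', '计算', '计算所']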
def split_result(r, re_filter):
    '''
    Split out keywords.
    :param r:
    :param re_filter:
    :return:
    '''
    title = r[1]
    title = re.sub(re_filter, ' ', title)
    titles1 = jieba.lcut_for_search(title)
    titles3 = jieba.lcut(title, cut_all=True)
    titles = titles1 + titles3
    jbs = set(titles)
    search_map = dict()
    search_map['id'] = r[0]
    search_map['result'] = list()
    if '' in jbs:
        jbs.remove('')
    if ' ' in jbs:
        jbs.remove(' ')
    for jb in jbs:
        flag, result = chinese_to_number(jb)
        # keep it if there is data
        if flag:
            search_map['result'].append(result)
    return search_map
def search_code_snippet(cls, search_value, start=0, size=10, request_type=None):
    if not isinstance(search_value, list):
        search_value = jieba.lcut_for_search(search_value)
    # print(search_value)
    if ElasticSearchSearvice.is_available():
        results = ElasticSearchSearvice.search_code_snippet(
            fields=['code_name', 'code_des', 'code_tags', 'code_source'],
            index='code_snippets',
            search_values=search_value,
            start=start,
            size=size)
        hits = results['hits']['hits']
        # print(hits)
        total = results['hits']['total']
        pre_process = []
        for hit in hits:
            source = hit['_source']
            search_id = hit['_id']
            # source['id'] = search_id
            source['code_from'] = search_id
            pre_process.append(source)
        # total, pre_process = cls.__single_table_search(results, 'code_snippets', search_value)
        return {'total': total, 'hits': pre_process}
    else:
        print('空的')
        return {'total': 0, 'hints': []}
def add_document(self, fullpath):
    f = open(fullpath, 'r', encoding='utf-8')
    content = ''
    iscode = False
    for line in f:
        if line[0:3] == '```':
            iscode = not iscode
        elif iscode == True or line[0] == '!':
            pass
        else:
            content += line
    content = re.sub('\W', ' ', content).lower().replace('__', '')
    tags = jieba.analyse.extract_tags(content, topK=20)
    wordlist = jieba.lcut_for_search(content)
    while ' ' in wordlist:
        wordlist.remove(' ')
    docid = len(self.paths)
    self.documents.append(Document(docid, fullpath, wordlist, tags))
    self.paths.append(fullpath)
    for word in wordlist:
        if word not in self.worddoc:
            self.worddoc[word] = []
        if docid not in self.worddoc[word]:
            self.worddoc[word].append(docid)
def fit(self, sentences, sort_by_count=False):
    """
    Build the word index.
    :param sentences: <list> list of sentences
    :param sort_by_count: <bool> whether to order words by their counts in the
        sentences. Defaults to False.
    :return: None
    """
    assert not self.fited, "word sequence fit once"
    for word in add_word_list:
        jieba.add_word(word)
    word_count = Counter()
    for sentence in sentences:
        word_count.update(jieba.lcut_for_search(sentence))
    if sort_by_count:
        # sort by count before assigning indices
        # (the original called sorted() and discarded its result)
        for word, _ in sorted(word_count.items(), key=lambda x: x[1]):
            self.word_dict[word] = len(self.word_dict)
    else:
        for word in sorted(word_count.keys()):
            self.word_dict[word] = len(self.word_dict)
    self.fited = True
def get_video_kw_list(self, aid):
    # extract keywords from the video's title, channel, author and tag fields
    video = self.mongo_video.find_one({'aid': aid}, {
        '_id': 0,
        'title': 1,
        'channel': 1,
        'subChannel': 1,
        'author': 1,
        'tag': 1
    })
    kw = []
    for each_key in video:
        # note: the original tested `!= 'keyword' or != 'tag'`, which is always
        # true; `and` matches the evident intent
        if each_key != 'keyword' and each_key != 'tag':
            kw.append(str(video[each_key]).lower())
        elif each_key == 'tag':
            kw += video['tag']
        else:
            kw += video['keyword']
    seg_list = jieba.lcut_for_search(' '.join(kw), True)  # search-engine mode
    # the full author name also counts as a keyword
    if 'author' in video and video['author'].lower() not in seg_list:
        seg_list.append(video['author'].lower())
    while ' ' in seg_list:
        seg_list.remove(' ')
    while '、' in seg_list:
        seg_list.remove('、')
    return list(set(seg_list))
def get_author_kw_list(self, mid):
    # extract keywords from the name and official fields
    author = self.mongo_author.find_one({'mid': mid}, {
        '_id': 0,
        'name': 1,
        'official': 1,
        'keyword': 1
    })
    kw = []
    for each_key in author:
        if each_key != 'keyword':
            kw.append(str(author[each_key]).lower())
        else:
            kw += author['keyword']
    seg_list = jieba.lcut_for_search(' '.join(kw), True)  # search-engine mode
    # the full name also counts as a keyword
    if 'name' in author and author['name'].lower() not in seg_list:
        seg_list.append(author['name'].lower())
    while ' ' in seg_list:
        seg_list.remove(' ')
    while '、' in seg_list:
        seg_list.remove('、')
    return list(set(seg_list))
def _command_filter(command):
    """Filter commands to narrow the range that has to be traversed."""
    # identical command
    same_commands = data_conveyor.filter_command(command)
    if same_commands:
        return same_commands

    # match commands via word segmentation; the limited hardware
    # environment does not allow running NLP locally
    all_commands = data_conveyor.all_command()

    def __filter_by_words(_words):
        actions = []
        for commands, action in all_commands:
            for _word in _words:
                if _word in commands:
                    actions.append(action)
        return data_conveyor.filter_command_by_actions(set(actions))

    # similar match via search-engine-mode segmentation, see https://cuiqingcai.com/5844.html
    cut_words = lcut_for_search(command)
    like_commands = __filter_by_words(cut_words)
    if like_commands:
        return like_commands

    # fuzzy match via full-mode segmentation
    cut_all_words = lcut(command, cut_all=True)
    vague_commands = __filter_by_words(cut_all_words)
    if vague_commands:
        return vague_commands

    # no match
    return []
def query_handler(q):
    '''
    Handle the query; English and Chinese are processed separately.
    '''
    # extract the English and Chinese word sets separately
    en = re.findall(r'\w+', q)
    cnPattern = re.compile(u'[\u4e00-\u9fa5]+')
    cn = re.findall(cnPattern, q)
    # tokenize
    en_tokens = list(set([w for s in en for w in nltk.word_tokenize(s)]))
    cn_tokens = list(set([w for s in cn for w in jieba.lcut_for_search(s)]))
    # remove stop words
    en_cleans = [w for w in en_tokens if w.lower() not in en_stopwd]
    cn_cleans = [w for w in cn_tokens if w.encode('utf-8') not in cn_stopwd]
    if not en_cleans:
        en_cleans = en_tokens
    if not cn_cleans:
        cn_cleans = cn_tokens
    # cn_cleans.append(jieba.analyse.extract_tags(s, topK=1))
    # stem the English tokens
    porter = nltk.PorterStemmer()
    en_result = [porter.stem(t) for t in en_cleans]
    tokens = en_result + cn_cleans
    return tokens
def query(sentence, result=3):
    dic = corpora.Dictionary.load(conf.dictionary)
    tfidf = models.TfidfModel.load(conf.tfidf)
    lda = models.LdaModel.load(conf.lda)
    q_topic = lda[tfidf[dic.doc2bow(jieba.lcut_for_search(sentence))]]
    topics = load_topic_of_post()
    martix = np.zeros((len(topics), conf.num_topics), float)
    for ti, t in enumerate(topics):
        for tj, v in t:
            martix[ti, tj] = v
    q_vec = np.zeros(conf.num_topics, float)
    for ti, v in q_topic:
        q_vec[ti] = v
    pq = []
    i = 0
    while i < len(topics):
        heapq.heappush(pq, (sum((martix[i] - q_vec)**2), i))
        i += 1
    sel = {}
    for s, i in heapq.nsmallest(result, pq):
        sel[i] = s
    print sel
    post = get_post(sel).values()
    post.sort()
    return post
def test_segment():
    """
    Test the basic segmentation methods.
    :return:
    """
    # full mode
    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    common_logger.info("Full Mode:{0}".format("/".join(seg_list)))

    # accurate mode
    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    common_logger.info("Default Mode:{0}".format("/".join(seg_list)))

    # without the HMM model
    seg_list = jieba.cut("他来到了网易杭研大厦", HMM=False)
    common_logger.info("不使用HMM模型:{0}".format("/".join(seg_list)))

    # with the HMM model
    seg_list = jieba.cut("他来到了网易杭研大厦", HMM=True)
    common_logger.info("使用HMM模型:{0}".format("/".join(seg_list)))

    # search-engine mode
    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", HMM=False)
    common_logger.info("搜索引擎模式:{0}".format("/".join(seg_list)))

    # search-engine mode that returns a list; the usual return type is a generator
    seg_list = jieba.lcut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", HMM=True)
    common_logger.info(seg_list)
def result_by_time(self, sentence):
    seg_list = jieba.lcut_for_search(sentence)
    n, cleaned_dict = self.clean_list(seg_list)
    time_scores = {}
    for term in cleaned_dict.keys():
        r = self.fetch_from_db(term)
        if r is None:
            continue
        docs = r[2].split('\n')
        for doc in docs:
            docid, date_time, tf, ld = doc.split('\t')
            if docid in time_scores:
                continue
            news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
            now_datetime = datetime.now()
            td = now_datetime - news_datetime
            docid = int(docid)
            td = (timedelta.total_seconds(td) / 3600)  # hours
            time_scores[docid] = td
    time_scores = sorted(time_scores.items(), key=operator.itemgetter(1))
    if len(time_scores) == 0:
        return 0, []
    else:
        return 1, time_scores
def get_terms(self, query):
    """
    Process the query (still unresolved).
    :param query: query_from_web
    :return: list
    """
    query = query.split()
    terms = [x for x in query if x not in self.stopwords]
    str_terms = ''.join(map(str, terms)).encode('UTF-8')
    return jieba.lcut_for_search(str_terms)
def generatePeopleB(dictionary, nameList):
    for name in nameList:
        nameSplitSign = splitSign(name)  # has been added in a6 a7
        for name2 in nameSplitSign:
            jieba_cut_name_list = jieba.lcut_for_search(name2)
            if len(jieba_cut_name_list) > 1:
                for name3 in jieba_cut_name_list:
                    if name3 == name2:
                        # e.g. '克里斯' is cut into '克里斯' and '克里';
                        # the full form does not need to be added again
                        continue
                    addIntoDict(dictionary, name3, "b2")
def generateInfoB(dictionary, infoList):
    for info in infoList:
        # e.g. 2015-07-24(中国大陆)
        infoSplitSign = splitSign(info)
        for info2 in infoSplitSign:
            addIntoDict(dictionary, info2, "b5")
            jieba_cut_info_list = jieba.lcut_for_search(info2)
            if len(jieba_cut_info_list) > 1:
                for info3 in jieba_cut_info_list:
                    if info3 == info2:
                        # '中国大陆' yields '中国大陆', '中国' and '大陆';
                        # the full form does not need to be added again
                        continue
                    addIntoDict(dictionary, info3, "b5")
def generateTitleB(dictionary, titleList, alias=False):
    for title in titleList:
        if alias:
            title = dealWithAlias(title)
        titleSplitSign = splitSign(title)  # has been added in a2
        for title2 in titleSplitSign:
            jieba_cut_title_list = jieba.lcut_for_search(title2)
            if len(jieba_cut_title_list) > 1:
                for title3 in jieba_cut_title_list:
                    if title3 == title2:
                        # e.g. '克里斯' is cut into '克里斯' and '克里';
                        # the full form does not need to be added again
                        continue
                    addIntoDict(dictionary, title3, "b1")
def generateIntroC(dictionary, introList):
    jieba_cut_intro_list = []
    longIntro = u""
    for intro in introList:
        jieba_cut_intro_list += jieba.lcut_for_search(intro)
        longIntro += intro
    jieba_cut_intro_list = [y for y in jieba_cut_intro_list if not isSign(y)]
    jieba_cut_intro_set = set(jieba_cut_intro_list)
    for word in jieba_cut_intro_set:
        # for every word, search its term frequency
        pattern = re.compile(word)
        tf = len(re.findall(pattern, longIntro))
        # print word, tf
        addIntoDict(dictionary, word, "c2", tf)
def make_inverted_index(filename, read_buff_size, output_file_record_size, web_record_numbers=100000):
    '''
    :param filename: web page data file, including the .txt suffix
    :param read_buff_size: size of each block when reading the file block by block
    :param output_file_record_size: number of news records per output file
    :param web_record_numbers: total number of web page records in the input file;
        only used to display progress, nothing else
    :return: inverted index file
    '''
    # read the file, segment words, and write the inverted index to multiple files
    block_read = read_block(read_buff_size, filename)
    punct = set(u'''/+%#:!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒
    ﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢
    々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻
    ︽︿﹁﹃﹙﹛﹝({“‘-—_…''')
    Letters_and_numbers = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    buff_dir = filename[:-4] + '_buff'
    # create a directory for the intermediate files produced while building the
    # inverted index; it is removed once the build is finished
    if os.path.exists(buff_dir):
        pass
    else:
        os.mkdir(buff_dir)
    file_numbers = 1
    while True:
        print "process :cuting word +making inverted_index files---->>>>", file_numbers * (output_file_record_size) * 1.0 / web_record_numbers
        spimi = SPIMI_Invert(buff_dir + '/' + str(file_numbers) + '.txt')
        count = 0
        while True:
            doc_id, content = block_read.pop_token()
            if content == '' or count == output_file_record_size:
                break
            content_list = jieba.lcut_for_search(content)
            spimi.push_id(doc_id)
            for j in range(len(content_list)):
                if content_list[j] not in punct and content_list[j] not in Letters_and_numbers:
                    spimi.push_word(content_list[j])
            del content_list, doc_id, content
            count += 1
        spimi.push_word('')  # an empty word means: write the file
        file_numbers += 1
        if content == '':
            break
    print ("process :cuting word +making inverted_index files---->>>>Finish")
    # merge the inverted index files
    merged_filename = merge_inverted_files.merge_file([str(i) for i in range(1, file_numbers)], read_buff_size, buff_dir + '/')
    print "process:mergeing inverted index files----->Finish"
    # build the term -> inverted-index-position dictionary from the inverted index file
    Dictionary.establish_ditionary(buff_dir + '/' + merged_filename + '.txt', read_buff_size, buff_dir + '/' + "Dictionary.txt")
    shutil.copy(buff_dir + '/' + merged_filename + '.txt', filename[:-4] + '_inverted_index.txt')  # move the files
    shutil.copy(buff_dir + '/' + "Dictionary.txt", filename[:-4] + '_index_Dictionary.txt')
    shutil.rmtree(buff_dir)  # remove the temporary directory
    del merged_filename, buff_dir, punct, Letters_and_numbers
def inverted_index(filename, read_buff_size, output_file_record_size):
    '''
    :param filename: web page data file, including the .txt suffix
    :param read_buff_size: size of each block when reading the file block by block
    :param output_file_record_size: number of news records per output file
    :return: inverted index file
    '''
    # read the file, segment words, and write the inverted index to multiple files
    block_read = read_block(read_buff_size, filename)
    punct = set(u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒
    ﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢
    々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻
    ︽︿﹁﹃﹙﹛﹝({“‘-—_…''')
    file_numbers = 1
    while True:
        spimi = SPIMI_Invert(filename[:-4] + str(file_numbers) + '.txt')
        count = 0
        while True:
            doc_id, content = block_read.pop_token()
            if content == '' or count == output_file_record_size:
                break
            content_list = jieba.lcut_for_search(content)
            spimi.push_id(doc_id)
            for j in range(len(content_list)):
                if content_list[j] not in punct:
                    spimi.push_word(content_list[j])
            del content_list, doc_id, content
            count += 1
        spimi.push_word('')  # an empty word means: write the file
        if content == '':
            break
        file_numbers += 1
    # merge the inverted index files
    merged_filename = merge_inverted_files.merge_file([str(i) for i in range(1, file_numbers)], buff_size, filename[:-4])
    # build the term -> inverted-index-position dictionary from the inverted index file
    Dictionary.establish_ditionary(filename[:-4] + merged_filename + '.txt', buff_size, filename[:-4] + "Dictionary.txt")
    # load the stored dictionary
    Dictionary.dictionary(filename[:-4] + "Dictionary.txt", filename[:-4] + merged_filename + '.txt', 1024 * 1024)
def ConsineScore(query):
    query.decode(sys.stdin.encoding or locale.getpreferredencoding(True))
    Score = {}
    for doc in doc_list:
        Score[doc] = 0
    TotalDoc = len(doc_list)
    query_term_list = jieba.lcut_for_search(query)
    for term in query_term_list:
        try:
            posting_list = indexTable[term]
        except:
            continue
        for posting in posting_list:
            doc = posting['doc']
            tf = posting['tf']
            df = len(posting_list)
            weight = (1 + math.log10(tf)) * math.log10(TotalDoc / df)
            Score[doc] += weight
    for doc in doc_list:
        Score[doc] = Score[doc] / doc_length[doc]
    # return the top ten pages as a list of (doc_id, score)
    return sorted(Score.iteritems(), key=lambda d: d[1], reverse=True)[:10]
def getResultDictionaryListFromSentence(sentence):
    words = sentence.split("+")
    words_results = []
    longest_result_length = 0
    some_new_jieba_words = []
    for word in words:
        one_result = getResultDictionaryListFromOneSingleWord(word, return_dictionaryList=False)
        if len(one_result) == 0:
            some_new_jieba_words += jieba.lcut_for_search(word)
            continue
        words_results.append(one_result)
        if len(one_result) > longest_result_length:
            longest_result_length = len(one_result)
    if len(some_new_jieba_words) > 0:
        for word in some_new_jieba_words:
            one_result = getResultDictionaryListFromOneSingleWord(word, return_dictionaryList=False)
            words_results.append(one_result)
            if len(one_result) > longest_result_length:
                longest_result_length = len(one_result)
    URLOrderDict = collections.OrderedDict()
    for movie_pos in range(longest_result_length):
        for word_pos in range(len(words_results)):
            if movie_pos >= len(words_results[word_pos]):
                continue
            if URLOrderDict.get(words_results[word_pos][movie_pos]) is None:
                URLOrderDict[words_results[word_pos][movie_pos]] = 1
            else:
                URLOrderDict[words_results[word_pos][movie_pos]] += 1
    final_dictionaryList = []
    for appearance in reversed(range(len(words_results))):
        for url in URLOrderDict:
            if len(final_dictionaryList) > AMOUNT_OF_FINAL_RESULT:
                break
            if URLOrderDict[url] == appearance + 1:
                print url, URLOrderDict[url]
                final_dictionaryList.append(getDictionaryFromURL(url))
                line = json.dumps(dict(final_dictionaryList[-1]), ensure_ascii=False, sort_keys=True)
                print line
    return final_dictionaryList
    ('我们中出了一个叛徒', ('中', '出')),
]
for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-" * 40)

# quit()
jieba.add_word('石墨烯')

seg_list = jieba.cut(p, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # full mode

seg_list = jieba.cut(p, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

seg_list = jieba.cut(p)  # accurate mode by default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search(p)  # search-engine mode
print(", ".join(seg_list))

print jieba.suggest_freq(('好', '我'))
print jieba.suggest_freq(('走', '了'))

print ','.join(jieba.lcut(p))
print ','.join(jieba.lcut_for_search(p))
print ','.join(['%s/%s' % (i, j) for i, j in pseg.lcut(p)])
def cutword(self, text):
    return jieba.lcut_for_search(text)
def separatewords(self, text):
    # splitter = re.compile(r'\W*')
    return jieba.lcut_for_search(text)
def rankpage(request):
    if 'query' in request.GET:
        start = time.clock()
        tmp = ConsineScore(request.GET['query'])
        print request.GET['query']
        query_term_list = jieba.lcut_for_search(request.GET['query'])
        filename = tmp[0]
        filename = filename[1:]
        filename = r'SearchEngine' + filename + r'.txt'
        filename = filename.replace(r'|', r'_')
        filename = filename.replace(r'?', r'_')
        timg = tmp[0]
        timg = timg[1:]
        timg = timg + r'.jpg'
        timg = timg.replace(r'|', r'_')
        timg = timg.replace(r'?', r'_')
        timg = r"/static" + timg
        with open(filename, "r") as f:
            text = f.read()
        d = eval(text)
        teacher = []
        for k, v in d.iteritems():
            try:
                if k in ["name", "department", "homepage"]:
                    if k == "name":
                        teacher.append(("姓名", v))
                    elif k == "department":
                        teacher.append(("院系", v))
                    else:
                        teacher.append(("主页:", v))
            except:
                teacher.append((k, 'None'))
        count = len(tmp)
        # build the result links
        result = []
        for term in tmp:
            link = title = content = ""
            link = term[1:]
            link = link.replace(r'|', r'/')
            link = r'http:/' + link
            filename = term
            filename = filename[1:]
            filename = r'SearchEngine' + filename
            filename = filename.replace(r'|', r'_')
            filename = filename.replace(r'?', r'_')
            fo = open(filename, "r")
            doc_text = fo.read()
            fo.close()
            if doc_text.find("博客") < doc_text.find("主页"):
                if doc_text.find("博客") > 0:
                    title = doc_text[:doc_text.find("博客")]
                    title = title + "博客"
                elif doc_text.find("主页") > 0:
                    title = doc_text[:doc_text.find("主页")]
                    title = title + "主页"
                else:
                    title = link
            else:
                if doc_text.find("主页") > 0:
                    title = doc_text[:doc_text.find("主页")]
                    title = title + "主页"
                elif doc_text.find("博客") > 0:
                    title = doc_text[:doc_text.find("博客")]
                    title = title + "博客"
                else:
                    title = link
            while doc_text.count("主页") > 1:
                doc_text = doc_text.partition("主页")[2]
            content = doc_text.partition("主页")[2]
            content = content.decode('utf-8')[:200].encode('utf-8') + '...'
            result.append({'link': link, 'title': title, 'content': content})
        query = request.GET['query']
        end = time.clock()
        runtime = end - start
        return render_to_response('searchpage.html', locals())
    else:
        return render_to_response('startSearch.html', locals())
def add_school_seq(t):
    t['school_seq'] = jieba.lcut_for_search(t['school'])