Example #1
    def cut_document(document):
        """
        对 mongodb 中的每个文档的 summary 做分词
        同时在单词表中添加电影名、导演名、演员名等
        """

        # 对 summary 进行分词
        if 'summary' in document:
            words = jieba.lcut_for_search(document['summary'])
        else:
            words = []

        # Tokenize the title
        extra_words = jieba.lcut_for_search(document['name'])

        # Tokenize director names
        for director in document['directors']:
            extra_words.extend(jieba.lcut_for_search(director))

        # Tokenize actor names
        for actor in document['actors']:
            extra_words.extend(jieba.lcut_for_search(actor))

        words.extend(extra_words)

        words = filter(match_words, words)

        return words
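Note that in Python 3, filter() returns a lazy iterator, so callers that need a list should wrap it. Below is a self-contained sketch of the same flow; the document fields and the match_words predicate are assumptions for illustration, not the project's actual schema.

import jieba

def match_words_sketch(word):
    # hypothetical predicate: drop whitespace and single-character tokens
    return len(word.strip()) > 1

def cut_document_sketch(document):
    words = jieba.lcut_for_search(document.get('summary', ''))
    words.extend(jieba.lcut_for_search(document.get('name', '')))
    for person in document.get('directors', []) + document.get('actors', []):
        words.extend(jieba.lcut_for_search(person))
    return [w for w in words if match_words_sketch(w)]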
Example #2
def similarAlg(noteId):
    content = get_note(noteId)  # fetch the note content dynamically

    selected_note_word_list = jieba.lcut_for_search(content)
    ignore_list = get_ignore_list()
    selected_note_word_list = list(
        filter(lambda x: x not in ignore_list, selected_note_word_list))

    notes = get_categorized_notes() + get_uncategorized_notes(
        60 * 60 * 24 * 300)
    returnList = []
    for note in notes:
        count = 0
        current_word_list = jieba.lcut_for_search(note['content'])
        match_list = []
        for word in current_word_list:
            if word in selected_note_word_list:  # record words shared with the selected note
                if word not in match_list:  # de-duplicate
                    count += 1
                    match_list.append(word)
        if count > 0:
            note['count'] = count
            note['match_list'] = match_list
            returnList.append(note)

    returnList = sorted(returnList, key=lambda x: -len(x['match_list']))
    return json.dumps(returnList)
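The inner matching loop above can also be expressed with a set intersection; a minimal sketch (not the project's code) using the same jieba tokenizer:

import jieba

def common_words(selected_words, note_content, ignore_list=()):
    # tokens shared between the selected note and another note, ignoring stop words
    current = set(jieba.lcut_for_search(note_content))
    matches = current & (set(selected_words) - set(ignore_list))
    return len(matches), sorted(matches)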
Example #3
def search():
    form = SearchForm()
    if form.validate_on_submit():
        pprint(form.data)
        history_record(form.query.data)

        if form.trade.data == 'buy':
            is_buy = True
            query = jieba.lcut_for_search(form.query.data)
            ptype = form.ptype.data
            cond = form.cond.data
            r = ps.search_buy(query=query, ptype=ptype, cond=cond)
            results = [hit.to_dict() for hit in r]
            return render_template("SearchResult.html",
                                   results=results,
                                   is_buy=is_buy)

        if form.trade.data == 'sell':
            is_buy = False
            query = jieba.lcut_for_search(form.query.data)
            ptype = form.ptype.data
            cond = form.cond.data
            r = ps.search_sell(query=query, ptype=ptype, cond=cond)
            results = [hit.to_dict() for hit in r]
            return render_template("SearchResult.html",
                                   results=results,
                                   is_buy=is_buy)

    return render_template('search.html', form=form)
Example #4
def preprocess_query(query):
    begin = -1
    end = -1
    flag = 0
    phrase = []
    for i, item in enumerate(query):
        if item == '\'' and begin == -1:
            begin = i
        elif item == '\'':
            end = i
            flag = 1
            break
        elif item == '\"' and begin == -1:
            begin = i
        elif item == '\"':
            flag = 1
            end = i
            break
    if flag:
        phrase = jieba.lcut_for_search(query[begin + 1:end])

    res = jieba.lcut_for_search(query)
    for item in query:
        res.append(item)
    res = stopping(res)
    return res[:20], phrase
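For the quoted-phrase step, a regex is a shorter alternative to the manual character scan; a sketch with the same intent (it only handles the first single- or double-quoted span):

import re
import jieba

def quoted_phrase_tokens(query):
    # tokens of the first 'single' or "double" quoted phrase, or [] if none
    m = re.search(r'[\'"]([^\'"]+)[\'"]', query)
    return jieba.lcut_for_search(m.group(1)) if m else []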
Example #5
    def indexDocs(self, root, writer):   
        for root,dirnames,filenames in os.walk(root):
            for dirname in dirnames:  # walk the sub-directories
                path1 = os.path.join(root,dirname)
                for trivial1 , trivial2 , filenames in os.walk(path1):  # walk the files under the directory
                    for filename in filenames:
                        #print(root,dirnames,filename)
                        print("adding", filename)
                        # try:
                        path = os.path.join(path1, filename)
                        file = open(path, encoding='utf8')
                        page = file.readline()
                        title = file.readline()
                        contents = file.read()
                        file.close()

                        # jieba word segmentation
                        seg_contents = jieba.lcut_for_search(contents)
                        contents = ' '.join(seg_contents)
                        url = page
                        seg_url = jieba.lcut_for_search(page)
                        page = ' '.join(list(set(seg_url)-set(['.','http','https','/',':','?','=','html','shtml','www'])))

                        doc = Document()
                        doc.add(StringField("name", filename, Field.Store.YES))
                        doc.add(StringField("path", path, Field.Store.YES))
                        if len(contents) > 0:
                            doc.add(TextField('title', title, Field.Store.YES))
                            doc.add(TextField('site', page, Field.Store.YES))
                            doc.add(TextField('url',url,Field.Store.YES))
                            doc.add(TextField('contents', contents, Field.Store.YES))
                        else:
                            print("warning: no content in %s" % filename)
                        writer.addDocument(doc)
Example #6
def ConsineScore(query):
	Score = {}
	for doc in doc_list:
		Score[doc] = 0
	TotalDoc = len(doc_list)	
	query_term_list = jieba.lcut_for_search(query)

	#Term-at-a-Time Processing
	for term in query_term_list:
		#calculate w[t,q] and fetch posting list 
		try:					
			posting_list = indexTable[term];
		except:
			continue
		for posting in posting_list:		#for each pair(d,tf) in posting list 	
			doc = posting['doc']
			tf = posting['tf']
			df = len(posting_list)
			#compute tf-idf weight
			weight = (1+math.log10(tf))*math.log10(TotalDoc/df)
			if (doc.count(u'jkzhu')>0 and term.count(u'计算机')>0):
				weight += 100
			Score[doc] += weight			# w(t,q)=1 for fast Cosine Score

	# Cosine Normalization
	for doc in doc_list:
		Score[doc] = Score[doc]/doc_length[doc];

	#boolean search for 'not'
	if query.count("not")>0:
		not_list = jieba.lcut_for_search(query.partition("not")[2])
		for term in not_list:
			try:
				posting_list = indexTable[term]
			except:
				continue
			for posting in posting_list:
				doc = posting['doc']
				Score[doc] = 0

	#rank documents with respect to the query
	
	#use Min Heap for Selecting top k out of N
	result = []
	queue = Queue.PriorityQueue(10)
	for term in doc_list:		#process a new document d with score s
		#if Score[term]==0:
		#	continue
		if queue.full():	
			min_score = queue.get();		#get current minimum h_min of heap (O(1))
			if (Score[term],term)>min_score:	#if s>h_min heap-add((doc,score)) (O(logk))
				queue.put((Score[term],term))
			else:							#if s<h_min skip to next document
				queue.put(min_score)		
		else:
			queue.put((Score[term],term))
	while queue.empty()==False:
		result.append(queue.get()[1])
	result.reverse()
	return result
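The top-k selection at the end can also be written with heapq.nlargest, which keeps the whole heap bookkeeping in one call; a sketch, not the snippet's original code:

import heapq

def top_k_docs(score, doc_list, k=10):
    # documents with the k highest scores, best first
    return heapq.nlargest(k, doc_list, key=lambda d: score[d])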
Example #7
def split_result(r, re_filter):
    '''
    Split a record into keywords.
    :param r: source record (dict with title and goods_desc)
    :param re_filter: regex of characters to strip out
    :return: dict with the record id and the extracted results
    '''
    title = r.get('title')
    dsc = r.get('goods_desc')
    title = re.sub(re_filter, ' ', title)
    dsc = re.sub(re_filter, ' ', dsc)

    titles1 = jieba.lcut_for_search(title)
    titles3 = jieba.lcut(title, cut_all=True)
    titles = titles1 + titles3

    dscs1 = jieba.lcut_for_search(dsc)
    dscs3 = jieba.lcut(dsc, cut_all=True)
    types = dscs1 + dscs3

    jbs = set(titles + types)
    search_map = dict()
    search_map['id'] = r.get('id')
    search_map['result'] = list()

    if '' in jbs:
        jbs.remove('')
    if ' ' in jbs:
        jbs.remove(' ')
    for jb in jbs:
        flag, result = chinese_to_number(jb)
        # if data was extracted
        if flag:
            search_map['result'].append(result)
    return search_map
Example #8
 def cutForSearch(self, text):
     if type(text) is list:
         result = list()
         for s in text:
             result.append(jieba.lcut_for_search(s))
         return result
     else:
         return jieba.lcut_for_search(text)
Example #9
 def count_simisc(word, title):
     wd_set = set(jieba.lcut_for_search(word))
     tt_set = set(jieba.lcut_for_search(title))
     simisc = wd_set - tt_set
     try:
         score = 1 - len(simisc) / len(wd_set)
     except:
         score = 1.0
     return score
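A usage sketch for the helper above (illustrative strings; the return value is the share of query tokens that also appear in the title, between 0 and 1):

score = count_simisc('上海 天气 预报', '上海今日天气预报')
print(score)  # fraction of query tokens covered by the title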
Example #10
    def releventScore(self, text, ques, tfidf={}):
        def filtWord(li):
            # filter out stop words
            nl = []
            for l in li:
                if l not in STOPWORDS:
                    nl.append(l)
            return nl

        def sims(t, q):
            if t in self.dic.keys() and q in self.dic.keys():
                vector1 = self.dic[t]
                vector2 = self.dic[q]
                dot_product = 0.0
                normA = 0.0
                normB = 0.0
                for a, b in zip(vector1, vector2):
                    dot_product += a * b
                    normA += a**2
                    normB += b**2
                if normA == 0.0 or normB == 0.0:
                    return 0
                else:
                    return dot_product / ((normA * normB)**0.5)
            else:
                l = max([len(t), len(q)])
                if Levenshtein.distance(t, q) < l:
                    return (l - Levenshtein.distance(t, q)) / l * 0.7
                else:
                    return 0

        ttoks = filtWord(jieba.lcut_for_search(text))
        qtoks = filtWord(jieba.lcut_for_search(ques))

        score = 0
        if len(ttoks) == 0:
            return 0
        for tword in ttoks:
            for qword in qtoks:

                if tword in tfidf.keys():
                    rate = tfidf[tword]
                else:
                    rate = 1

                if tword == qword:
                    # exact match
                    score += rate * 2.5
                elif sims(tword, qword) > 0.4:
                    # similar
                    score += sims(tword, qword) * rate
        # remove advantage of length
        return score / len(ttoks) / len(qtoks) * 100
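The sims helper above computes cosine similarity by hand; the same dot-product math, written as a standalone sketch:

def cosine(v1, v2):
    # cosine similarity of two equal-length numeric vectors, 0.0 if either is all zeros
    dot = sum(a * b for a, b in zip(v1, v2))
    na = sum(a * a for a in v1) ** 0.5
    nb = sum(b * b for b in v2) ** 0.5
    return dot / (na * nb) if na and nb else 0.0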
Example #11
def getTeamKeyword():
    global team_keyword_list
    for team in team_list:
        tempwordlist = []
        tempwordlist.extend(jieba.lcut_for_search(team['name']))
        for member in team['member']:
            tempwordlist.extend(jieba.lcut_for_search(member))
        tempwordlist = list(set(tempwordlist))
        wordlist = []
        for index in range(0, len(tempwordlist)):
            if len(tempwordlist[index]) >= 2:
                wordlist.append(tempwordlist[index])
        team_keyword_list.append(wordlist)
Example #12
def obj_to_document(obj):
    def conv_to_str(x):
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                res.add(
                    TextField(k, ' '.join((str(x) for x in set(v))),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)), Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
Example #13
def getanswer(question, data, model, vec_model, oldQuestion, oldAns):
    #print(0)
    if '上海交通大学' in question:
        question = re.sub('上海交通大学', '', question)
    if '?' not in question:
        question = question + '?'
    #print(question)
    serialTalk = checkForEmission(question)
    if serialTalk:
        q1 = jieba.lcut_for_search(oldQuestion)
        q2 = jieba.lcut_for_search(question)
        q = ''
        for word2 in q2:
            if word2 not in q1 and word2 not in ['那', '那么']:
                q = q + word2
        question = oldQuestion[0:-1] + q + q + '?'
        #print('this is new question',question)
    ans = getNormalAnswer(question, data, model, vec_model, serialTalk)
    #print('===+++',ans,len(ans)>1,serialTalk,ans[0]==oldAns,oldAns)
    ans = ans[1] if len(
        ans) > 1 and serialTalk and ans[0] == oldAns else ans[0]
    #print(2.9)
    if checkQuestion(question):
        cnt = 0
        for qword in question:
            if qword in vec_model.vocab:
                vec1 = vec_model[qword]
                asentence = jieba.lcut_for_search(ans)
                for aword in asentence:
                    if aword in vec_model.vocab:
                        vec2 = vec_model[aword]
                        sim = ifSimilar(vec1, vec2)
                        if sim > 0.95:  # 0.9
                            if qword != aword and qword not in [
                                    '是', '的', ',', '?'
                            ]:
                                cnt = cnt + sim**24  # 30
                                continue
            if qword in ans and qword not in ['是', '的', ',', '?']:
                cnt = cnt + 1
        #print(3.1)

        if cnt + 0.1 - len(ans)**2 / 100 > 2.3:
            return '是', question
        else:
            return '否', question
    else:
        #print(3)
        return ans, question
Example #14
    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(root, filename)
                file = open(path, encoding='utf8')
                url = file.readline()
                title = file.readline()
                contents = file.read()
                file.close()
                img_url = self.getTxtAttribute(contents, 'img_url')
                img_info = self.getTxtAttribute(contents, 'img_info')
                for i in range(len(img_url)):
                    if len(img_info[i]) > 0:
                        title = title
                        doc = Document()

                        doc.add(StringField('title', title, Field.Store.YES))
                        doc.add(StringField('url', url, Field.Store.YES))
                        doc.add(
                            StringField('img_url', img_url[i],
                                        Field.Store.YES))
                        seg_contents = jieba.lcut_for_search(img_info[i])
                        contents = ' '.join(seg_contents)
                        doc.add(
                            TextField('contents', contents, Field.Store.YES))
                        writer.addDocument(doc)
                    else:
                        continue
Example #15
def fenci(request):
    i = 990
    for news in NewsPiece.objects.all()[990:]:
        i = i + 1
        print(i)
        for tag in news.tag_set.all():
            tag.delete()
        title_tag = jieba.lcut_for_search(news.news_title)
        content_tag = jieba.lcut(news.news_content)
        r_title_tag = []
        r_content_tag = []
        for t in title_tag:
            if len(t) < 2:
                continue
            r_title_tag.append(t)
        for t in content_tag:
            if len(t) < 2:
                continue
            r_content_tag.append(t)
        r_title_tag = list(set(r_title_tag))
        r_content_tag = list(set(r_content_tag))
        for r in r_title_tag:
            news.tag_set.create(name=r)
        for r in r_content_tag:
            news.contenttag_set.create(name=r)
    return HttpResponse('fin')
Example #16
def search():
    color_data = []
    deletefile('C:/Users/luoyujia/flasky/static/picture/')
    wanted = request.args.get("wanted", type=str)
    seg_list = jieba.lcut_for_search(wanted)  # search-engine mode
    size = 0
    for keyword in seg_list:
        print(keyword)
        ImgDownload(keyword)  # web crawler function
        print(keyword)
        for num in range(1, 11):
            try:
                im = Image.open('C:/Users/luoyujia/flasky/static/picture/' +
                                str(num) + '.jpg')
                im = im.convert('RGB')
                im.thumbnail((50, 50))
                im.save('C:/Users/luoyujia/flasky/static/test1.jpg')
                list1 = list(
                    colorz("C:/Users/luoyujia/flasky/static/test1.jpg", n=5))
                color_data.append(list1)
                size += 1
            except OSError:
                continue
            print(num)
    print(size)
    print(color_data)
    return render_template("picular.html", data=color_data, size=size)
Example #17
 def fun(self):
     start=time.clock()
     keyword_list=jieba.lcut_for_search(self.searchBar.text())
     # print(keyword_list)
     # If the query contains both 专业 (major) and 排名 (ranking), return the ranking list;
     # otherwise run a normal search
     if ('专业' in keyword_list) and ('排名' in keyword_list):
         result=db[MONGO_MAJOR_TABLE].find().sort([('count', -1)])
         count = 1
         for r in result[0:42]:
             if count<=9:
                 t = '0' + str(count) + '  ' + r['major']
             else:
                 t =  str(count) + '  ' + r['major']
             self.resultEdit.append(t)
             count += 1
         end = time.clock()
         ex_time = 'Excute Time:' + str(end - start)
         # print(ex_time)
         self.time.setText(ex_time)
     else:
         # db[MONGO_TABLE].ensure_index([('tf', -1)])
         get_tf(keyword_list)
         result = db[MONGO_TABLE].find().sort([('tf', -1)])
         count = 1
         for r in result[0:30]:
             t = str(count) + '  ' + r['title']
             self.resultEdit.append(t)
             self.resultEdit.append(r['href'])
             count += 1
         end = time.clock()
         ex_time = 'Excute Time:' + str(end - start)
         # print(ex_time)
         self.time.setText(ex_time)
Example #18
    def construct_postings_lists(self, day_before=1, end_date=datetime.today().date()):
        config = configparser.ConfigParser()
        config.read(self.config_path, self.config_encoding)

        # files = News.objects.filter(datetime__day=timezone.now().day - day_before)
        files = News.objects.filter(
            datetime__gte=end_date - timezone.timedelta(days=day_before),
            datetime__lte=end_date)
        total_len = int(config['FORMULA']['avg_l']) * int(config['FORMULA']['n'])
        for file in files:
            title = file.title
            body = file.body
            docid = file.pk
            date_time = file.datetime.strftime('%y-%m-%d %H:%M:%S')
            seg_list = jieba.lcut_for_search(title + '。' + body)

            total_words, cleaned_dict = self.clean_list(seg_list)

            total_len += total_words

            for term, tf_in_doc in cleaned_dict.items():
                if term in title:
                    tf_in_doc += int(math.log2(total_words))
                d = Doc(docid, date_time, tf_in_doc, total_words)
                if term in self.postings_lists:  # if term in dict, append doc
                    self.postings_lists[term][0] += 1   # doc_frequency++
                    self.postings_lists[term][1].append(d)
                else:  # else create new term and insert doc
                    self.postings_lists[term] = [1, [d]]  # [doc_frequency, [Doc]]
        AVG_L = int(total_len / News.objects.count())
        config.set('FORMULA', 'n', str(News.objects.count()))
        config.set('FORMULA', 'avg_l', str(AVG_L))
        with open(self.config_path, 'w', encoding=self.config_encoding) as configfile:
            config.write(configfile)
        self.write_postings_to_db()
Example #19
def query(sentence, result = 3):
	dic = corpora.Dictionary.load(conf.dictionary)
	tfidf = models.TfidfModel.load(conf.tfidf)
	lda = models.LdaModel.load(conf.lda)
	q_topic = lda[tfidf[dic.doc2bow(jieba.lcut_for_search(sentence))]]
	topics = load_topic_of_post()
	martix = np.zeros((len(topics), conf.num_topics), float)
	for ti, t in enumerate(topics):
		for tj,v in t:
			martix[ti,tj] = v
	q_vec = np.zeros(conf.num_topics, float)
	for ti,v in q_topic:
		q_vec[ti] = v
	pq = []
	i = 0
	while i < len(topics):
		heapq.heappush(pq, (sum((martix[i] - q_vec)**2), i))
		i+=1
	sel = {}
	for s, i in heapq.nsmallest(result, pq):
		sel[i] = s
	print(sel)
	post = get_post(sel).values()
	post.sort()
	return post
Example #20
    def divide_and_generate(self):
        q_all_list = []
        stop_words = self.stop_words_list("./chineseStopWords.txt")
        qa = self.qa.copy(deep=True)
        qa["Q_Clean"] = qa["QUESTION"].apply(self.remove_punctuation)
        qa["Q_D"] = qa["Q_Clean"].apply(lambda x: " ".join(
            [w for w in jb.lcut_for_search(x) if w not in stop_words]))
        qa["Q_Tag"] = qa["CLINIC"].apply(lambda x: self.clinic_code[x]
                                         if x in self.clinic_code else 0)
        # generate a word cloud
        self.word_cloud(qa)

        tf_idf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
        features = tf_idf.fit_transform(qa.Q_D)
        labels = qa.Q_Tag
        alpha_logger.info(features.shape)
        alpha_logger.info(features)

        N = 2
        for cli, cli_tag in self.clinic_code.items():
            features_chi2 = chi2(features, labels == cli_tag)
            indices = np.argsort(features_chi2[0])
            feature_names = np.array(tf_idf.get_feature_names())[indices]
            uni_grams = [v for v in feature_names if len(v.split(' ')) == 1]
            bi_grams = [v for v in feature_names if len(v.split(' ')) == 2]
            print("# '{}':".format(cli))
            print("  . Most correlated uni-grams:\n       . {}".format(
                '\n       . '.join(uni_grams[-N:])))
            print("  . Most correlated bi-grams:\n       . {}".format(
                '\n       . '.join(bi_grams[-N:])))

        alpha_logger.info("相关性展示")
        return qa, features, labels
Example #21
def search_cut(sentence):
    """
    HMM的切割方式
    :param sentence:
    :return:
    """
    return jieba.lcut_for_search(sentence)
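For reference, lcut_for_search accepts an HMM keyword, so the behaviour mentioned in the docstring can be toggled explicitly (a quick sketch):

import jieba

sentence = '小明硕士毕业于中国科学院计算所'
print(jieba.lcut_for_search(sentence, HMM=True))   # with HMM for out-of-vocabulary words
print(jieba.lcut_for_search(sentence, HMM=False))  # dictionary words only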
Example #22
def split_result(r, re_filter):
    '''
    Split a record into keywords.
    :param r: source record (sequence with id and title)
    :param re_filter: regex of characters to strip out
    :return: dict with the record id and the extracted results
    '''
    title = r[1]
    title = re.sub(re_filter,' ', title)

    titles1 = jieba.lcut_for_search(title)
    titles3 = jieba.lcut(title, cut_all=True)
    titles = titles1 + titles3

    jbs = set(titles)
    search_map = dict()
    search_map['id'] = r[0]
    search_map['result'] = list()

    if '' in jbs:
        jbs.remove('')
    if ' ' in jbs:
        jbs.remove(' ')
    for jb in jbs:
        flag, result = chinese_to_number(jb)
        # if data was extracted
        if flag:
            search_map['result'].append(result)
    return search_map
Example #23
 def search_code_snippet(cls,
                         search_value,
                         start=0,
                         size=10,
                         request_type=None):
     if not isinstance(search_value, list):
         search_value = jieba.lcut_for_search(search_value)
     # print(search_value)
     if ElasticSearchSearvice.is_available():
         results = ElasticSearchSearvice.search_code_snippet(
             fields=['code_name', 'code_des', 'code_tags', 'code_source'],
             index='code_snippets',
             search_values=search_value,
             start=start,
             size=size)
         hits = results['hits']['hits']
         # print(hits)
         total = results['hits']['total']
         pre_process = []
         for hit in hits:
             source = hit['_source']
             search_id = hit['_id']
             # source['id'] = search_id
             source['code_from'] = search_id
             pre_process.append(source)
         # total, pre_process = cls.__single_table_search(results, 'code_snippets', search_value)
         return {'total': total, 'hits': pre_process}
     else:
         print('空的')  # "empty": the Elasticsearch service is unavailable
         return {'total': 0, 'hits': []}  # keep the key consistent with the success branch
Example #24
    def add_document(self, fullpath):
        f = open(fullpath, 'r', encoding='utf-8')
        content = ''
        iscode = False
        for line in f:
            if line[0:3] == '```':
                iscode = not iscode
            elif iscode == True or line[0] == '!':
                pass
            else:
                content += line

        content = re.sub(r'\W', ' ', content).lower().replace('__', '')
        tags = jieba.analyse.extract_tags(content, topK=20)

        wordlist = jieba.lcut_for_search(content)
        while ' ' in wordlist:
            wordlist.remove(' ')

        docid = len(self.paths)
        self.documents.append(Document(docid, fullpath, wordlist, tags))
        self.paths.append(fullpath)

        for word in wordlist:
            if word not in self.worddoc:
                self.worddoc[word] = []
            if docid not in self.worddoc[word]:
                self.worddoc[word].append(docid)
Example #25
    def fit(self, sentences, sort_by_count=False):
        """
        创建词序列
        :param sentences: <list> 句子列表
        :param sort_by_count: <bool> 是否根据句子中单词的数量进行排序。
                                     默认为 False。
        :return: 无
        """
        assert not self.fited, "word sequence fit once"
        for word in add_word_list:
            jieba.add_word(word)

        word_count = Counter()
        for sentence in sentences:
            word_count.update(jieba.lcut_for_search(sentence))

        if sort_by_count:
            # assign indices in order of word frequency
            for word, _ in sorted(word_count.items(), key=lambda x: x[1]):
                self.word_dict[word] = len(self.word_dict)
        else:
            for word in sorted(word_count.keys()):
                self.word_dict[word] = len(self.word_dict)

        self.fited = True
Example #26
    def get_video_kw_list(self, aid):
        # keywords are extracted from the video's stored fields
        video = self.mongo_video.find_one({'aid': aid}, {
            '_id': 0,
            'title': 1,
            'channel': 1,
            'subChannel': 1,
            'author': 1,
            'tag': 1
        })
        kw = []
        for each_key in video:
            if each_key != 'keyword' and each_key != 'tag':
                kw.append(str(video[each_key]).lower())
            elif each_key == 'tag':
                kw += video['tag']
            else:
                kw += video['keyword']
        seg_list = jieba.lcut_for_search(' '.join(kw), True)  # search-engine mode, HMM enabled

        # the full author name also counts as a keyword
        if 'author' in video and video['author'].lower() not in seg_list:
            seg_list.append(video['author'].lower())

        while ' ' in seg_list:
            seg_list.remove(' ')
        while '、' in seg_list:
            seg_list.remove('、')
        return list(set(seg_list))
Example #27
    def get_author_kw_list(self, mid):
        # keywords are extracted from name and official
        author = self.mongo_author.find_one({'mid': mid}, {
            '_id': 0,
            'name': 1,
            'official': 1,
            'keyword': 1
        })
        kw = []
        for each_key in author:
            if each_key != 'keyword':
                kw.append(str(author[each_key]).lower())
            else:
                kw += author['keyword']
        seg_list = jieba.lcut_for_search(' '.join(kw), True)  # search-engine mode, HMM enabled

        # the full name also counts as a keyword
        if 'name' in author and author['name'].lower() not in seg_list:
            seg_list.append(author['name'].lower())

        while ' ' in seg_list:
            seg_list.remove(' ')
        while '、' in seg_list:
            seg_list.remove('、')
        return list(set(seg_list))
Example #28
    def _command_filter(command):
        """过滤指令,缩小遍历范围"""
        # 相同指令
        same_commands = data_conveyor.filter_command(command)
        if same_commands:
            return same_commands

        # match commands by word segmentation; limited hardware rules out local NLP
        all_commands = data_conveyor.all_command()

        def __filter_by_words(_words):
            actions = []
            for commands, action in all_commands:
                for _word in _words:
                    if _word in commands:
                        actions.append(action)
            return data_conveyor.filter_command_by_actions(set(actions))

        # search-mode segmentation, see https://cuiqingcai.com/5844.html
        cut_words = lcut_for_search(command)
        like_commands = __filter_by_words(cut_words)
        if like_commands:
            return like_commands

        # full-mode (fuzzy) segmentation
        cut_all_words = lcut(command, cut_all=True)
        vague_commands = __filter_by_words(cut_all_words)
        if vague_commands:
            return vague_commands

        # no match
        return []
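A quick comparison of the two segmentation granularities used above (illustrative command string; the exact tokens depend on jieba's dictionary):

import jieba

command = '打开客厅的灯'
print(jieba.lcut_for_search(command))     # search-engine mode
print(jieba.lcut(command, cut_all=True))  # full mode: every word the dictionary can find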
Example #29
def query_handler(q):
    '''
    Handle a query.
    English and Chinese are processed separately.
    '''
    '''extract the English and Chinese word sets separately'''
    en = re.findall(r'\w+', q)
    cnPattern = re.compile(u'[\u4e00-\u9fa5]+')
    cn = re.findall(cnPattern, q)
    '''tokenize'''
    en_tokens = list(set([w for s in en for w in nltk.word_tokenize(s)]))
    cn_tokens = list(set([w for s in cn for w in jieba.lcut_for_search(s)]))
    '''remove stop words'''
    en_cleans = [w for w in en_tokens if w.lower() not in en_stopwd]
    cn_cleans = [w for w in cn_tokens if w.encode('utf-8') not in cn_stopwd]
    if not en_cleans:
        en_cleans = en_tokens
    if not cn_cleans:
        cn_cleans = cn_tokens

    # cn_cleans.append(jieba.analyse.extract_tags(s, topK=1))
    '''stem the English tokens'''
    porter = nltk.PorterStemmer()
    en_result = [porter.stem(t) for t in en_cleans]
    tokens = en_result + cn_cleans
    return tokens
Example #30
def query(sentence, result=3):
    dic = corpora.Dictionary.load(conf.dictionary)
    tfidf = models.TfidfModel.load(conf.tfidf)
    lda = models.LdaModel.load(conf.lda)
    q_topic = lda[tfidf[dic.doc2bow(jieba.lcut_for_search(sentence))]]
    topics = load_topic_of_post()
    martix = np.zeros((len(topics), conf.num_topics), float)
    for ti, t in enumerate(topics):
        for tj, v in t:
            martix[ti, tj] = v
    q_vec = np.zeros(conf.num_topics, float)
    for ti, v in q_topic:
        q_vec[ti] = v
    pq = []
    i = 0
    while i < len(topics):
        heapq.heappush(pq, (sum((martix[i] - q_vec)**2), i))
        i += 1
    sel = {}
    for s, i in heapq.nsmallest(result, pq):
        sel[i] = s
    print(sel)
    post = get_post(sel).values()
    post.sort()
    return post
Example #31
def test_segment():
    """
    测试简单分词方法。
    :return:
    """
    # 全模式
    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    common_logger.info("Full Mode:{0}".format("/".join(seg_list)))

    # accurate (default) mode
    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    common_logger.info("Default Mode:{0}".format("/".join(seg_list)))

    # without the HMM model
    seg_list = jieba.cut("他来到了网易杭研大厦", HMM=False)
    common_logger.info("不使用HMM模型:{0}".format("/".join(seg_list)))

    # with the HMM model
    seg_list = jieba.cut("他来到了网易杭研大厦", HMM=True)
    common_logger.info("使用HMM模型:{0}".format("/".join(seg_list)))

    # search-engine mode
    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", HMM=False)
    common_logger.info("搜索引擎模式:{0}".format("/".join(seg_list)))

    # search-engine mode returning a list; cut_for_search normally returns a generator
    seg_list = jieba.lcut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", HMM=True)
    common_logger.info(seg_list)
Example #32
 def result_by_time(self, sentence):
     seg_list = jieba.lcut_for_search(sentence)
     n, cleaned_dict = self.clean_list(seg_list)
     time_scores = {}
     for term in cleaned_dict.keys():
         r = self.fetch_from_db(term)
         if r is None:
             continue
         docs = r[2].split('\n')
         for doc in docs:
             docid, date_time, tf, ld = doc.split('\t')
             if docid in time_scores:
                 continue
             news_datetime = datetime.strptime(
                 date_time, "%Y-%m-%d %H:%M:%S")
             now_datetime = datetime.now()
             td = now_datetime - news_datetime
             docid = int(docid)
             td = (timedelta.total_seconds(td) / 3600)  # hour
             time_scores[docid] = td
     time_scores = sorted(time_scores.items(), key=operator.itemgetter(1))
     if len(time_scores) == 0:
         return 0, []
     else:
         return 1, time_scores
Example #33
 def get_terms(self, query):
     """ processing query ,unsolved
     :param query: query_from_web
     :return:list
     """
     query = query.split()
     terms = [x for x in query if x not in self.stopwords]
     str_terms = ''.join(map(str, terms)).encode('UTF-8')
     return jieba.lcut_for_search(str_terms)
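Note that ''.join concatenates the remaining chunks with no separators and round-trips through bytes before cutting, which discards the original word boundaries. A variant that keeps them is sketched below (stopwords is assumed to be a set of strings):

import jieba

def get_terms_sketch(query, stopwords):
    tokens = []
    for chunk in query.split():
        if chunk not in stopwords:
            tokens.extend(jieba.lcut_for_search(chunk))
    return tokens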
Example #34
def generatePeopleB(dictionary, nameList):
    for name in nameList:
        nameSplitSign = splitSign(name)  # has been added in a6 a7
        for name2 in nameSplitSign:
            jieba_cut_name_list = jieba.lcut_for_search(name2)
            if len(jieba_cut_name_list) > 1:
                for name3 in jieba_cut_name_list:
                    if name3 == name2:  # e.g. 克里斯 cuts into 克里斯 and 克里; 克里斯 itself is already added
                        continue
                    addIntoDict(dictionary, name3, "b2")
Example #35
def generateInfoB(dictionary, infoList):
    for info in infoList:
        # 2015-07-24(中国大陆)
        infoSplitSign = splitSign(info)
        for info2 in infoSplitSign:
            addIntoDict(dictionary, info2, "b5")
            jieba_cut_info_list = jieba.lcut_for_search(info2)
            if len(jieba_cut_info_list) > 1:
                for info3 in jieba_cut_info_list:
                    if info3 == info2:  # e.g. 中国大陆 yields 中国大陆, 中国 and 大陆
                        continue
                    addIntoDict(dictionary, info3, "b5")
Example #36
def generateTitleB(dictionary, titleList, alias=False):
    for title in titleList:
        if alias:
            title = dealWithAlias(title)
        titleSplitSign = splitSign(title)  # has been added in a2
        for title2 in titleSplitSign:
            jieba_cut_title_list = jieba.lcut_for_search(title2)
            if len(jieba_cut_title_list) > 1:
                for title3 in jieba_cut_title_list:
                    if title3 == title2:  # e.g. 克里斯 cuts into 克里斯 and 克里; 克里斯 itself is already added
                        continue
                    addIntoDict(dictionary, title3, "b1")
Example #37
def generateIntroC(dictionary, introList):
    jieba_cut_intro_list = []
    longIntro = u""
    for intro in introList:
        jieba_cut_intro_list += jieba.lcut_for_search(intro)
        longIntro += intro
    jieba_cut_intro_list = [y for y in jieba_cut_intro_list if not isSign(y)]
    jieba_cut_intro_set = set(jieba_cut_intro_list)
    for word in jieba_cut_intro_set:  # for every word, search its term frequency
        pattern = re.compile(re.escape(word))  # escape in case the token contains regex metacharacters
        tf = len(re.findall(pattern, longIntro))
        # print word, tf
        addIntoDict(dictionary, word, "c2", tf)
Example #38
def make_inverted_index(filename,read_buff_size,output_file_record_size,web_record_numbers=100000):
    '''
    :param filename: web page data file, including the .txt suffix
    :param read_buff_size: block size used when reading the file in chunks
    :param output_file_record_size: number of news records per output file
    :param web_record_numbers: total number of records in the input, only used to report progress
    :return: inverted index file
    '''
    # read the file, segment words, and write the inverted index across multiple files
    block_read=read_block(read_buff_size,filename)
    punct = set(u'''/+%#:!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒
    ﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢
    々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻
    ︽︿﹁﹃﹙﹛﹝({“‘-—_…''')
    Letters_and_numbers=set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    buff_dir=filename[:-4]+'_buff'  # folder for intermediate files produced while building the index; removed when finished
    if os.path.exists(buff_dir):
        pass
    else:
        os.mkdir(buff_dir)
    file_numbers=1
    while True:
        print "process :cuting word +making inverted_index files---->>>>",file_numbers*(output_file_record_size)*1.0/web_record_numbers
        spimi=SPIMI_Invert(buff_dir+'/'+str(file_numbers)+'.txt')
        count=0
        while True:
            doc_id,content=block_read.pop_token()
            if content==''or count==output_file_record_size:
                break
            content_list=jieba.lcut_for_search(content)
            spimi.push_id(doc_id)
            for j in range(len(content_list)):
                if  content_list[j] not in punct and content_list[j] not in Letters_and_numbers :
                    spimi.push_word(content_list[j])
            del content_list,doc_id,content
            count+=1
        spimi.push_word('')  # an empty word signals: flush to file
        file_numbers+=1
        if content=='':
            break
    print ("process :cuting word +making inverted_index files---->>>>Finish")
    # merge the inverted index files
    merged_filename=merge_inverted_files.merge_file([str(i) for i in range(1,file_numbers)],read_buff_size,buff_dir+'/')
    print("process:merging inverted index files----->Finish")
    # build the term -> posting-position dictionary from the merged inverted index file
    Dictionary.establish_ditionary(buff_dir+'/'+merged_filename+'.txt',read_buff_size,buff_dir+'/'+"Dictionary.txt")
    shutil.copy(buff_dir+'/'+merged_filename+'.txt',filename[:-4]+'_inverted_index.txt')  # move the files out
    shutil.copy(buff_dir+'/'+"Dictionary.txt",filename[:-4]+'_index_Dictionary.txt')
    shutil.rmtree(buff_dir)  # delete the temp folder
    del merged_filename,buff_dir,punct,Letters_and_numbers
Example #39
def inverted_index(filename,read_buff_size,output_file_record_size):
    '''
    :param filename: web page data file, including the .txt suffix
    :param read_buff_size: block size used when reading the file in chunks
    :param output_file_record_size: number of news records per output file
    :return: inverted index file
    '''
    '''
    read the file, segment words, and write the inverted index across multiple files
    '''
    block_read=read_block(read_buff_size,filename)
    punct = set(u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒
    ﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢
    々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻
    ︽︿﹁﹃﹙﹛﹝({“‘-—_…''')
    file_numbers=1
    while True:
        spimi=SPIMI_Invert(filename[:-4]+str(file_numbers)+'.txt')
        count=0
        while True:
            doc_id,content=block_read.pop_token()
            if content==''or count==output_file_record_size:
                break
            content_list=jieba.lcut_for_search(content)
            spimi.push_id(doc_id)
            for j in range(len(content_list)):
                if  content_list[j] not in punct:
                    spimi.push_word(content_list[j])
            del content_list,doc_id,content
            count+=1
        spimi.push_word('')  # an empty word signals: flush to file
        if content=='':
            break
        file_numbers+=1
    '''
    merge the inverted index files
    '''
    merged_filename=merge_inverted_files.merge_file([str(i) for i in range(1,file_numbers)],read_buff_size,filename[:-4])
    '''
    build the term -> posting-position dictionary from the inverted index file
    '''
    Dictionary.establish_ditionary(filename[:-4]+merged_filename+'.txt',read_buff_size,filename[:-4]+"Dictionary.txt")
    '''
    load the stored dictionary
    '''
    Dictionary.dictionary(filename[:-4]+"Dictionary.txt",filename[:-4]+merged_filename+'.txt',1024*1024)
Example #40
def ConsineScore(query):
	query = query.decode(sys.stdin.encoding or locale.getpreferredencoding(True))  # decode() returns a new string, so keep the result
	Score = {}
	for doc in doc_list:
		Score[doc] = 0
	TotalDoc = len(doc_list)
	query_term_list = jieba.lcut_for_search(query)
	for term in query_term_list:
		try:
			posting_list = indexTable[term];
		except:
			continue
		for posting in posting_list:
			doc = posting['doc']
			tf = posting['tf']
			df = len(posting_list)
			weight = (1+math.log10(tf))*math.log10(TotalDoc/df)
			Score[doc] += weight;
	for doc in doc_list:
		Score[doc] = Score[doc]/doc_length[doc];
	return sorted(Score.iteritems(), key=lambda d:d[1], reverse = True)[:10]	# return the top-10 pages: list[(doc_id, score), ...]
Example #41
def getResultDictionaryListFromSentence(sentence):
    words = sentence.split("+")
    words_results = []
    longest_result_length = 0
    some_new_jieba_words = []
    for word in words:
        one_result = getResultDictionaryListFromOneSingleWord(word, return_dictionaryList=False)
        if len(one_result) == 0:
            some_new_jieba_words += jieba.lcut_for_search(word)
            continue
        words_results.append(one_result)
        if len(one_result) > longest_result_length:
            longest_result_length = len(one_result)
    if len(some_new_jieba_words) > 0:
        for word in some_new_jieba_words:
            one_result = getResultDictionaryListFromOneSingleWord(word, return_dictionaryList=False)
            words_results.append(one_result)
            if len(one_result) > longest_result_length:
                longest_result_length = len(one_result)

    URLOrderDict = collections.OrderedDict()
    for movie_pos in range(longest_result_length):
        for word_pos in range(len(words_results)):
            if movie_pos >= len(words_results[word_pos]):
                continue
            if URLOrderDict.get(words_results[word_pos][movie_pos]) is None:
                URLOrderDict[words_results[word_pos][movie_pos]] = 1
            else:
                URLOrderDict[words_results[word_pos][movie_pos]] += 1
    final_dictionaryList = []
    for appearance in reversed(range(len(words_results))):
        for url in URLOrderDict:
            if len(final_dictionaryList) > AMOUNT_OF_FINAL_RESULT:
                break
            if URLOrderDict[url] == appearance+1:
                print url, URLOrderDict[url]
                final_dictionaryList.append(getDictionaryFromURL(url))
                line = json.dumps(dict(final_dictionaryList[-1]), ensure_ascii=False, sort_keys=True)
                print line
    return final_dictionaryList
Example #42
('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-"*40)

# quit()
jieba.add_word('石墨烯')
seg_list = jieba.cut(p, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut(p, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

seg_list = jieba.cut(p)  # 默认是精确模式
print(", ".join(seg_list))

seg_list = jieba.cut_for_search(p)  # 搜索引擎模式
print(", ".join(seg_list))
print jieba.suggest_freq(('好','我'))
print jieba.suggest_freq(('走','了'))


print ','.join(jieba.lcut(p))
print ','.join(jieba.lcut_for_search(p))

print ','.join(['%s/%s'%(i,j) for i,j in pseg.lcut(p)])
Example #43
 def cutword(self, text):
     return jieba.lcut_for_search(text)
Example #44
 def separatewords(self, text):
     # splitter = re.compile(r'\W*')
     return jieba.lcut_for_search(text)
Example #45
def rankpage(request): 
	if 'query' in request.GET:
		start = time.clock()
		tmp = ConsineScore(request.GET['query'])
		print(request.GET['query'])

		query_term_list = jieba.lcut_for_search(request.GET['query'])
		
		filename = tmp[0]
		filename = filename[1:]
		filename = r'SearchEngine'+filename+r'.txt';
		filename = filename.replace(r'|',r'_')
		filename = filename.replace(r'?',r'_')

		timg = tmp[0]
		timg = timg[1:]
		timg = timg + r'.jpg';
		timg = timg.replace(r'|',r'_')
		timg = timg.replace(r'?',r'_')
		timg = r"/static" + timg

		with open(filename,"r") as f:
			text=f.read()
		d=eval(text)
		teacher = []
		for k,v in d.iteritems():
			try:
				if k in ["name","department","homepage"]:
					if k=="name":
						teacher.append(("姓名",v))
					elif k=="department":
						teacher.append(("院系",v))
					else:
						teacher.append(("主页:",v))
			except:
				teacher.append((k,'None'))

		count = len(tmp)
		# process the result page links
		result = []				
		for term in tmp:
			link = title = content = ""


			link = term[1:]
			link = link.replace(r'|',r'/')
			link = r'http:/'+link

			filename = term
			filename = filename[1:]
			filename = r'SearchEngine'+filename;
			filename = filename.replace(r'|',r'_')
			filename = filename.replace(r'?',r'_')

			fo = open(filename, "r")
			doc_text = fo.read();
			fo.close();

			if doc_text.find("博客")<doc_text.find("主页"):
				if doc_text.find("博客")>0:
					title = doc_text[:doc_text.find("博客")]
					title = title + "博客"
				elif doc_text.find("主页")>0:
					title = doc_text[:doc_text.find("主页")]
					title = title + "主页"
				else:
					title = link
			else:
				if doc_text.find("主页")>0:
					title = doc_text[:doc_text.find("主页")]
					title = title + "主页"
				elif doc_text.find("博客")>0:
					title = doc_text[:doc_text.find("博客")]
					title = title + "博客"
				else:
					title = link

			while doc_text.count("主页")>1:
				doc_text = doc_text.partition("主页")[2]
			content = doc_text.partition("主页")[2]

			content = content.decode('utf-8')[:200].encode('utf-8') + '...'

			result.append({'link':link,'title':title,'content':content})

		query = request.GET['query']

		end = time.clock()
		runtime = end-start
		return render_to_response('searchpage.html',locals()) 
	else:
		return render_to_response('startSearch.html',locals()) 
Example #46
 def add_school_seq(t):
     t['school_seq'] = jieba.lcut_for_search(t['school'])