示例#1
0
#!/usr/bin/python
#-*- coding:utf-8 -*-

"""
select feature from a file
"""
import sys
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.feature_selection import SelectKBest,chi2
from wcut.jieba.norm import norm_cut,load_industrydict
from collections import Counter
load_industrydict([0,2,7])


class FeatureSelect(object):
    """Hold the tokenised lines of a corpus for later feature selection.

    Workflow outlined by the original author:
    1. load data
    2. cut word
    3. load vocabulary
    4. init_vectorizer

    NOTE(review): ``load_data`` relies on ``self.cut_word``, which is not
    defined in the part of the class shown here -- confirm it is provided
    further down or by a subclass.
    """

    def __init__(self):
        self.vectorizer = None
        self.vocabulary = None
        # Declared here so the attribute exists (as an empty list) even
        # before load_data() has been called.
        self.data = []

    def load_data(self, infile):
        """Read *infile* line by line and store the cut lines in ``self.data``.

        Any previously loaded data is discarded first. The file is opened in
        binary mode, so each raw line (trailing newline included) is handed
        to ``self.cut_word`` unchanged; whatever that call returns is
        appended to ``self.data`` in file order.

        :param infile: path of the file to read.
        """
        # Reset before opening so a failed open leaves an empty list behind
        # rather than stale results from an earlier call.
        self.data = []
        with open(infile, 'rb') as inf:
            self.data.extend(self.cut_word(raw_line) for raw_line in inf)
示例#2
0
"""
"""
from __future__ import division
import sys
import os
import time
import logging
import logging.config

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure the root logger's output format: timestamp, level name, message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Set the root logger threshold to INFO explicitly (done as a separate call
# rather than through basicConfig's own level argument).
logging.root.setLevel(level=logging.INFO)
from wcut.jieba.norm import norm_cut, load_industrydict
from wcut import jieba

load_industrydict([0])
dir_path = os.path.dirname(os.path.abspath(__file__))
import esm

# Load the advertising word list (one entry per line, surrounding whitespace
# stripped) from conf/advertiseWord.txt next to this module.
# The file is read inside a `with` block so the handle is closed as soon as
# the list is built instead of being left open until garbage collection.
with open('%s/conf/advertiseWord.txt' % dir_path) as _adver_file:
    adver = [line.strip() for line in _adver_file]

# Enter every advertising word into an esm index.
adver_esm = esm.Index()
for word in adver:
    adver_esm.enter(word)
# fix() is called once, after all words have been entered
# (presumably finalises the index for querying -- confirm against esm docs).
adver_esm.fix()

示例#3
0
文件: QA_Extract.py 项目: spikems/qa
# -*- coding:utf-8 -*-
import re
import os
import xlrd
from read_data import WordsTrend
from conf.extract_word_conf import singleton
from conf.extract_word_conf import is_include
from conf.extract_word_conf import remove_include_words
from wcut.jieba.norm import norm_cut, load_industrydict
load_industrydict([0, 2])
project_path = os.path.dirname(os.path.realpath(__file__))  # 获取当前文件夹的路径


# QA系统的提取关键词部分
# @singleton
class NewQAExtractWord(object):
    def __init__(self):
        # brand_dict = {'汽车品牌名称':'所属的汽车名'}   区别同义词,三个都是类似
        self.brand_dict, self.product_dict, self.component_dict, self.attribute_dict, self.evaluation_dict, \
        self.service_dict,self.rela_dict = self.read_dm_data()  # 读取数据库数据

        #读取疑问词
        self.common_word_list, self.re_word_list, self.query_word_type_dict = self.read_query_data(
            path=(project_path + '/conf/query_word.xlsx'))

        self.re_formula_dict = self.assemble_re_formula()
        self.synonym = {}
        for sub in [
                self.brand_dict,
                self.product_dict,
                self.component_dict,
示例#4
0
from os.path import join, dirname, abspath
import os
import logging
import csv
import sys
import numpy as np
import gzip
from datetime import datetime
from optparse import OptionParser
import simhash
from pyspark import SparkContext
from wcut.jieba import suggest_freq
from wcut.jieba.norm import norm_seg, load_industrydict

#test 2
load_industrydict([2, 7])


#
#
#
def cutline(input):
    '''
    cut a input string, return utf-8 string
    '''
    result = norm_seg(input)
    wordsList = []
    for w in result:
        if w.word.strip() == '' or w.flag.strip() == '':
            continue
        wordsList.append(w.word)