#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Select features from a file."""
import sys
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.feature_selection import SelectKBest,chi2
from wcut.jieba.norm import norm_cut,load_industrydict
from collections import Counter

# Import-time side effect: load_industrydict is invoked with [0, 2, 7]
# (presumably dictionary ids -- confirm against the wcut package).
load_industrydict([0, 2, 7])


class FeatureSelect(object):
    """Feature-selection helper.

    Workflow listed by the original author:
        1. load data
        2. cut word
        3. load vocabulary
        4. init_vectorizer
    """

    def __init__(self):
        # Both attributes start unset.
        self.vocabulary = None
        self.vectorizer = None

    def load_data(self, infile):
        """Read *infile* line by line and store the processed lines.

        The file is opened in binary mode; every line is passed through
        ``self.cut_word`` and the results are collected, in file order,
        in ``self.data`` (which is reset to an empty list first).

        NOTE(review): ``cut_word`` is not defined in the visible part of
        this class -- confirm its contract where it is declared.
        """
        self.data = []
        with open(infile, 'rb') as handle:
            self.data.extend(self.cut_word(raw_line) for raw_line in handle)
""" """ from __future__ import division import sys import os import time import logging import logging.config logger = logging.getLogger(__name__) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) from wcut.jieba.norm import norm_cut, load_industrydict from wcut import jieba load_industrydict([0]) dir_path = os.path.dirname(os.path.abspath(__file__)) import esm #加载广告词表 adver = [ line.strip() for line in open('%s/conf/advertiseWord.txt' % dir_path).readlines() ] adver_esm = esm.Index() for word in adver: adver_esm.enter(word) adver_esm.fix()
# -*- coding:utf-8 -*- import re import os import xlrd from read_data import WordsTrend from conf.extract_word_conf import singleton from conf.extract_word_conf import is_include from conf.extract_word_conf import remove_include_words from wcut.jieba.norm import norm_cut, load_industrydict load_industrydict([0, 2]) project_path = os.path.dirname(os.path.realpath(__file__)) # 获取当前文件夹的路径 # QA系统的提取关键词部分 # @singleton class NewQAExtractWord(object): def __init__(self): # brand_dict = {'汽车品牌名称':'所属的汽车名'} 区别同义词,三个都是类似 self.brand_dict, self.product_dict, self.component_dict, self.attribute_dict, self.evaluation_dict, \ self.service_dict,self.rela_dict = self.read_dm_data() # 读取数据库数据 #读取疑问词 self.common_word_list, self.re_word_list, self.query_word_type_dict = self.read_query_data( path=(project_path + '/conf/query_word.xlsx')) self.re_formula_dict = self.assemble_re_formula() self.synonym = {} for sub in [ self.brand_dict, self.product_dict, self.component_dict,
from os.path import join, dirname, abspath
import os
import logging
import csv
import sys
import numpy as np
import gzip
from datetime import datetime
from optparse import OptionParser
import simhash
from pyspark import SparkContext
from wcut.jieba import suggest_freq
from wcut.jieba.norm import norm_seg, load_industrydict

# test 2
# Import-time side effect: load_industrydict is invoked with [2, 7]
# (presumably dictionary ids -- confirm against the wcut package).
load_industrydict([2, 7])


def cutline(input):
    """Segment *input* with ``norm_seg`` and collect its non-blank words.

    A segment is kept only when both its ``word`` and its ``flag`` are
    non-empty after stripping whitespace; the kept words are gathered,
    in order, in ``wordsList``.

    NOTE(review): the original docstring says a utf-8 string is returned,
    but no return statement is part of the visible source -- confirm
    against the full file.
    """
    wordsList = [
        token.word
        for token in norm_seg(input)
        if token.word.strip() != '' and token.flag.strip() != ''
    ]