def _build_feature_extractor(self, mode, files):
    """Fit a character n-gram feature extractor on the questions in *files*.

    :param mode: ``'ngram'`` for a binary (1,2)-char CountVectorizer, or
        ``'tfidf'`` for a sublinear (1,2)-char TfidfVectorizer.
    :param files: iterable of file paths; one question per line, UTF-8
        encoded bytes.
    :raises ValueError: if *mode* is not one of the supported values.
        (Previously an unknown mode was silently ignored, leaving
        ``self.feature_extractor`` unset and failing later at use time.)

    Side effect: sets ``self.feature_extractor`` to the fitted vectorizer.
    """
    print('Build feature extraction...')
    corpus = []
    for path in files:
        with open(path, 'r') as f:
            for line in f:
                # Drop tabs/spaces and the trailing newline, then decode
                # the raw Python-2 byte string to unicode.
                question = line.replace('\t', '').replace(
                    ' ', '').strip('\n').decode('utf-8')
                question = QueryUtils.static_remove_cn_punct(str(question))
                # self.cut returns a space-joined token string; the
                # vectorizers below re-analyze it at the char level anyway.
                corpus.append(self.cut(question))
    if mode == 'ngram':
        bigram_vectorizer = CountVectorizer(
            ngram_range=(1, 2), min_df=0.0, max_df=1.0, analyzer='char',
            stop_words=[',', '?', '我', '我要'], binary=True)
        self.feature_extractor = bigram_vectorizer.fit(corpus)
    elif mode == 'tfidf':
        print_cn('use {0}'.format(mode))
        tfidf_vectorizer = TfidfVectorizer(
            analyzer='char', ngram_range=(1, 2), max_df=1.0, min_df=1,
            sublinear_tf=True)
        self.feature_extractor = tfidf_vectorizer.fit(corpus)
    else:
        # Fail fast instead of leaving self.feature_extractor undefined.
        raise ValueError('unknown feature extractor mode: {0}'.format(mode))
def _prepare_data(self, files):
    """Read labelled question files and build the training matrices.

    :param files: list of file paths; ``files[i]`` holds the questions for
        label ``self.named_labels[i]``, one question per line (UTF-8 bytes).
    :returns: ``(embeddings, labels, queries)`` where ``embeddings`` is a
        numpy array of word2vec embeddings (one row per unique question),
        ``labels`` is the multi-hot matrix produced by the fitted
        MultiLabelBinarizer, and ``queries`` is the parallel list of unique
        question strings.

    Side effect: sets ``self.mlb`` to the fitted MultiLabelBinarizer.

    NOTE(review): ``queries``, ``labels`` and ``embeddings`` are kept as
    parallel lists — their ordering must stay in lockstep, which is why a
    duplicate question only appends to ``labels[index]`` and never adds a
    new embedding row.
    """
    print('prepare data...')
    embeddings = list()
    queries = list()
    queries_ = dict()
    labels = list()
    mlb = MultiLabelBinarizer()
    # First pass: collect the raw questions per label.
    for index in xrange(len(files)):
        path = files[index]
        label = self.named_labels[index]
        queries_[label] = list()
        with open(path, 'r') as f:
            for line in f:
                # line = json.loads(line.strip().decode('utf-8'))
                # question = line['question']
                # Strip tabs/spaces and the trailing newline, then decode
                # the raw Python-2 byte string to unicode.
                question = line.replace('\t', '').replace(
                    ' ', '').strip('\n').decode('utf-8')
                question = QueryUtils.static_remove_cn_punct(str(question))
                tokens = QueryUtils.static_jieba_cut(question)
                # print_cn(tokens)
                # Skip lines that segment to nothing (e.g. punctuation-only).
                if len(tokens) == 0:
                    continue
                # cc=self.check_zero_tokens(tokens)
                # if not cc:
                #     continue
                queries_[label].append(question)
    # print len(queries_)
    # Second pass: deduplicate questions across labels, merging labels for
    # questions that appear in more than one file.
    for label, questions in queries_.iteritems():
        for question in questions:
            if question in queries and label not in labels[queries.index(
                    question)]:
                # print_cn(question)
                # Already seen under another label: just add this label.
                index = queries.index(question)
                labels[index].append(label)
            else:
                # print_cn(question)
                # First occurrence: record the question, its label set, and
                # its embedding (computed once per unique question).
                queries.append(question)
                labels.append([label])
                tokens = self.cut(question).split(' ')
                embedding = self.get_w2v_emb(tokens)
                embeddings.append(embedding)
    embeddings = np.array(embeddings)
    embeddings = np.squeeze(embeddings)
    self.mlb = mlb.fit(labels)
    labels = self.mlb.transform(labels)
    # print (embeddings.shape, len(queries))
    # print_cn(labels.shape)
    return embeddings, labels, queries
def cut(input_):
    """Tokenize *input_* with jieba accurate mode.

    Chinese punctuation is removed first; returns the tokens as a list.
    """
    cleaned = QueryUtils.static_remove_cn_punct(input_)
    return [token for token in jieba.cut(cleaned, cut_all=False)]
def cut(self, input_):
    """Segment *input_* with jieba full mode and return one string.

    Chinese punctuation is stripped first; the segments are joined with
    single spaces and passed through ``_uniout.unescape`` so the result is
    a readable UTF-8 string.
    """
    cleaned = QueryUtils.static_remove_cn_punct(input_)
    pieces = jieba.cut(cleaned, cut_all=True)
    joined = " ".join(pieces)
    return _uniout.unescape(str(joined), 'utf8')