def _prepare_data(self, files):
    """Load labelled queries from *files* and build their feature embeddings.

    Each file is expected to hold one example per line in the form
    ``<question>#<label>`` (utf-8 encoded; tabs and spaces are stripped
    before splitting — TODO confirm against the data producer).

    :param files: iterable of file paths to read.
    :returns: tuple ``(embeddings, labels, queries)`` where ``embeddings``
        is a squeezed numpy array of feature vectors, ``labels`` is a list
        of integer indices into ``self.named_labels``, and ``queries`` is
        the list of cleaned question strings.
    """
    print('prepare data...')
    embeddings = []
    queries = []
    labels = []
    # Iterate the paths directly instead of indexing with xrange(len(...)).
    for path in files:
        with open(path, 'r') as f:
            for raw_line in f:
                # Normalize whitespace, decode, then split "<question>#<label>".
                parts = raw_line.replace('\t', '').replace(
                    ' ', '').strip('\n').decode('utf-8').split('#')
                question = QueryUtils.static_simple_remove_punct(
                    str(parts[0]))
                # Label index is looked up against the configured label set;
                # raises ValueError for an unknown label (fail fast on bad data).
                label = self.named_labels.index(
                    str(parts[1].encode('utf-8')))
                queries.append(question)
                labels.append(label)
                # Feature extractor expects a sequence of token strings.
                tokens = [self.cut(question)]
                embedding = self.feature_extractor.transform(
                    tokens).toarray()
                embeddings.append(embedding)
    # Drop the singleton dimension produced by per-example transform().
    embeddings = np.squeeze(np.array(embeddings))
    return embeddings, labels, queries
def cut(self, input_):
    """Segment *input_* into whitespace-joined tokens.

    Punctuation is removed first, then jieba performs precise-mode
    segmentation; the joined result is passed through ``_uniout.unescape``
    to yield a readable utf-8 token string.
    """
    cleaned = QueryUtils.static_simple_remove_punct(input_)
    segmented = jieba.cut(cleaned, cut_all=False)
    joined = " ".join(segmented)
    return _uniout.unescape(str(joined), 'utf8')