def _extrParamW2V(self, model, reprList, ques):
    """ Extract parameters from the question using a word2vec model """
    if model is None:
        raise Exception("W2VM Not Found")
    parsedQues = ut.parseSentence(ques)
    res = []
    for word in parsedQues.split(' '):
        replWord = ut.replNum(word)
        if replWord not in model.vocab:
            continue
        # Collect the similarity to every representative word in the vocabulary
        sims = []
        for w in reprList:
            replRepr = ut.replNum(w)
            if replRepr in model.vocab:
                sims.append(model.similarity(replRepr, replWord))
        if not sims:
            continue
        # Aggregate the similarities according to the configured method
        if self.W2VCalcMethod == 'max':
            d = (word, max(sims))
        elif self.W2VCalcMethod == 'avg':
            d = (word, sum(sims) / len(sims))
        else:
            raise Exception('Unknown W2VCalcMethod: ' + self.W2VCalcMethod)
        print '{} : {}'.format(d[0].encode('utf-8'), d[1])
        # Keep only words whose score exceeds the threshold
        if d[1] > self.THR:
            res.append(d)
    return res
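# Hedged usage sketch for _extrParamW2V. Assumptions not in the source:
# `clf` is an instance of this class with THR and W2VCalcMethod already set,
# and the per-category model and representative words were built beforehand;
# the feature name 'price' is hypothetical.
#
#   model = clf.indvW2VM[cat]                  # built by _buildIndvW2VM
#   reprList = clf.reprDict[cat]['price']
#   hits = clf._extrParamW2V(model, reprList, u'How much does shipping cost?')
#   # hits is a list of (word, similarity) tuples whose score exceeds THR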
def _buildIndvW2VM(self, cat, corpus):
    """ Build the category's word2vec model from its corpus """
    sentences = [ut.replNum(ut.parseSentence(x)).split(' ') for x in corpus]
    self.indvW2VM[cat] = gensim.models.Word2Vec(
        sentences, min_count=1, size=100, workers=12)
    self.save()
def _buildAllW2VM(self, allCorpus):
    """ Build a word2vec model from the whole corpus.

    NOTE: _buildIndvW2VM builds one model per category, whereas allW2VM
    is shared among all categories, so it rarely needs to be rebuilt.
    """
    sentences = []
    for v in allCorpus.values():
        sentences.extend(
            [ut.replNum(ut.parseSentence(x)).split(' ') for x in v])
    self.allW2VM = gensim.models.Word2Vec(
        sentences, min_count=1, size=100, workers=12)
    self.save()
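# Hedged sketch of how the two builders might be driven. Assumption (not in
# the source): `allCorpus` maps a category name to a list of raw sentences.
#
#   allCorpus = {u'shipping': [u'When will it arrive?', ...],
#                u'refund':   [u'I want my money back', ...]}
#   for cat, corpus in allCorpus.items():
#       clf._buildIndvW2VM(cat, corpus)    # one model per category
#   clf._buildAllW2VM(allCorpus)           # one shared model for all categories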
def _learnParam(self, cat, feat, rawReprList):
    """ Learn the representative words of each feature """
    reprList = ut.parseSentence(' '.join(rawReprList)).split(' ')
    if cat not in self.reprDict:
        self.reprDict[cat] = {}
    if feat not in self.reprDict[cat]:
        self.reprDict[cat][feat] = []
    self.reprDict[cat][feat].extend(reprList)
    self.save()
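# Hedged sketch of registering representative words; the category and
# feature names below are hypothetical.
#
#   clf._learnParam(u'shipping', u'date', [u'tomorrow', u'next week'])
#   # reprDict now holds {u'shipping': {u'date': [parsed words...]}}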
def build(self, allCorpus):
    """ Build the classifier model from the corpus """
    # Make the question and category lists used by sklearn.
    # NOTE: each category has a different amount of corpus, so we truncate
    # every category to the size of the smallest one.
    cntPerCat = min(map(len, allCorpus.values()))
    quesList = sum([x[0:cntPerCat] for x in allCorpus.values()], [])
    catList = sum([[x] * cntPerCat for x in allCorpus.keys()], [])
    # Shuffle the question and category lists together to build a better model
    combined = list(zip(quesList, catList))
    random.shuffle(combined)
    quesList[:], catList[:] = zip(*combined)
    self.categories = allCorpus.keys()
    # We use TfidfVectorizer with unigrams and bigrams
    self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    Xlist = self.vectorizer.fit_transform(
        [ut.replNum(ut.parseSentence(x)) for x in quesList])
    Ylist = [self.categories.index(x) for x in catList]
    print 'build prepared'
    # Grid-search for the best C parameter
    svc_param = {'C': np.logspace(-2, 0, 20)}
    print 'build start!'
    gs_svc = GridSearchCV(LinearSVC(), svc_param, cv=5, n_jobs=8)
    gs_svc.fit(Xlist, Ylist)
    print gs_svc.best_params_
    print 'score : ' + str(gs_svc.best_score_)
    print 'make model using C parameter...'
    # Wrap the best LinearSVC in a calibrated classifier so that
    # predict() can return per-category probabilities
    svm = LinearSVC(C=gs_svc.best_params_['C'])
    self.clfModel = CalibratedClassifierCV(base_estimator=svm)
    # Build and save the model
    self.clfModel.fit(Xlist, Ylist)
    self.save()
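# Hedged sketch of the full build step; assumes the same `allCorpus`
# shape as above.
#
#   clf.build(allCorpus)
#   # fits the TF-IDF vectorizer, grid-searches C for LinearSVC,
#   # then fits a CalibratedClassifierCV and saves everything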
def predict(self, ques):
    """ Predict the category of the question """
    if type(ques) is not unicode:
        ques = ques.decode('utf-8')
    if self.vectorizer is None or self.categories is None or self.clfModel is None:
        raise Exception('contextClf Not built yet')
    parsedQues = ut.parseSentence(ques)
    testX = self.vectorizer.transform([ut.replNum(parsedQues)])
    # Predict the probability of each category
    predList = self.clfModel.predict_proba(testX)
    # Build (category, probability) tuples and sort them by probability
    res = [(self.categories[x], predList[0][x])
           for x in range(len(self.categories))]
    sortedRes = sorted(res, key=operator.itemgetter(1), reverse=True)
    return sortedRes
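# Hedged sketch of prediction; the question text and category names are
# illustrative only.
#
#   ranked = clf.predict(u'Where is my package?')
#   # ranked == [(u'shipping', 0.8...), (u'refund', 0.1...), ...]
#   topCat, topProb = ranked[0]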