def _get_model(self, feature):
    """
    Build the train and test feature matrices for one named feature.

    Recognised feature names are "skipgrams", "#tokens", "wordpairs",
    "modals", "ngrams", "doc2vec", "#chunks" and "#args".

    :param feature: feature name; for "skipgrams", "wordpairs" and "ngrams"
        it is also the key used to look up the feature budget in
        self.max_features
    :return: tuple (model, train_matrix, test_matrix). model is the fitted
        vectorizer for "skipgrams", "wordpairs" and "ngrams", the trained
        doc2vec model for "doc2vec", and None for the counting features
        ("#tokens", "modals", "#chunks", "#args"). For an unrecognised
        feature name a bare None is returned instead of a tuple.
    """

    def select_and_transform(vec, train_docs, test_docs):
        # Fit on the training documents only, keep the best-scoring columns
        # according to chi2 against self.y_train, then re-transform both
        # splits with the restricted vectorizer.
        fitted = vec.fit_transform(train_docs)
        # k is passed by keyword: current scikit-learn releases declare it
        # keyword-only, so the positional form raises TypeError there.
        selector = SelectKBest(chi2, k=self.max_features[feature])
        selector.fit(fitted, self.y_train)
        vec.restrict(selector.get_support())
        return vec, vec.transform(train_docs), vec.transform(test_docs)

    if feature == "skipgrams":
        return select_and_transform(
            skipgrams.SkipgramVectorizer(),
            self.train_unified,
            self.test_unified,
        )

    if feature == "#tokens":
        train_matrix = token_counter.countTokens(self.train_unified)
        test_matrix = token_counter.countTokens(self.test_unified)
        return None, train_matrix, test_matrix

    if feature == "wordpairs":
        # Unlike skipgrams, word pairs are built from self.train/self.test
        # rather than the *_unified data.
        return select_and_transform(
            wordpairs.WordpairVectorizer(), self.train, self.test
        )

    if feature == "modals":
        # The class name is spelled this way at the call site; presumably it
        # matches the definition in the modality module - verify there.
        vec = modality.ModelVectozier()
        train_matrix = vec.check_modality(self.train_raw)
        test_matrix = vec.check_modality(self.test_raw)
        return None, train_matrix, test_matrix

    if feature == "ngrams":
        # TfidfVectorizer caps the vocabulary itself via max_features, so no
        # separate selection step is applied here.
        vec = TfidfVectorizer(
            ngram_range=(1, 2), max_features=self.max_features[feature]
        )
        train_matrix = vec.fit_transform(self.train_unified)
        test_matrix = vec.transform(self.test_unified)
        return vec, train_matrix, test_matrix

    if feature == "doc2vec":
        # NOTE: the model is retrained on every call; it is neither loaded
        # from nor saved to disk.
        model = doc2vec.train_model(doc2vec.prep_data(self.train_unified))
        train_matrix = doc2vec.get_train_X(model, len(self.train_unified))
        test_matrix = doc2vec.transform(model, self.test_unified)
        return model, train_matrix, test_matrix

    if feature == "#chunks":
        vec = chunk_counter.ChunkcountVectorizer()
        train_matrix = vec.count_chunks(self.train_raw)
        test_matrix = vec.count_chunks(self.test_raw)
        return None, train_matrix, test_matrix

    if feature == "#args":
        vec = chunk_counter.ChunkcountVectorizer()
        train_matrix = vec.count_args(self.train_raw)
        test_matrix = vec.count_args(self.test_raw)
        return None, train_matrix, test_matrix

    # Unknown feature name: kept as a plain None so existing callers see the
    # same result as before.
    return None
def _get_model(self, feature):
    """
    Build the train and test feature matrices for one named feature.

    Recognised feature names are "skipgrams", "#tokens", "wordpairs",
    "modals", "ngrams", "doc2vec", "#chunks" and "#args".

    :param feature: feature name; for "skipgrams", "wordpairs" and "ngrams"
        it is also the key used to look up the feature budget in
        self.max_features
    :return: tuple (model, train_matrix, test_matrix). model is the fitted
        vectorizer for "skipgrams", "wordpairs" and "ngrams", the trained
        doc2vec model for "doc2vec", and None for the counting features
        ("#tokens", "modals", "#chunks", "#args"). For an unrecognised
        feature name a bare None is returned instead of a tuple.
    """

    def select_and_transform(vec, train_docs, test_docs):
        # Fit on the training documents only, keep the best-scoring columns
        # according to chi2 against self.y_train, then re-transform both
        # splits with the restricted vectorizer.
        fitted = vec.fit_transform(train_docs)
        # k is passed by keyword: current scikit-learn releases declare it
        # keyword-only, so the positional form raises TypeError there.
        selector = SelectKBest(chi2, k=self.max_features[feature])
        selector.fit(fitted, self.y_train)
        vec.restrict(selector.get_support())
        return vec, vec.transform(train_docs), vec.transform(test_docs)

    if feature == "skipgrams":
        return select_and_transform(
            skipgrams.SkipgramVectorizer(),
            self.train_unified,
            self.test_unified,
        )

    if feature == "#tokens":
        train_matrix = token_counter.countTokens(self.train_unified)
        test_matrix = token_counter.countTokens(self.test_unified)
        return None, train_matrix, test_matrix

    if feature == "wordpairs":
        # Unlike skipgrams, word pairs are built from self.train/self.test
        # rather than the *_unified data.
        return select_and_transform(
            wordpairs.WordpairVectorizer(), self.train, self.test
        )

    if feature == "modals":
        # The class name is spelled this way at the call site; presumably it
        # matches the definition in the modality module - verify there.
        vec = modality.ModelVectozier()
        train_matrix = vec.check_modality(self.train_raw)
        test_matrix = vec.check_modality(self.test_raw)
        return None, train_matrix, test_matrix

    if feature == "ngrams":
        # TfidfVectorizer caps the vocabulary itself via max_features, so no
        # separate selection step is applied here.
        vec = TfidfVectorizer(
            ngram_range=(1, 2), max_features=self.max_features[feature]
        )
        train_matrix = vec.fit_transform(self.train_unified)
        test_matrix = vec.transform(self.test_unified)
        return vec, train_matrix, test_matrix

    if feature == "doc2vec":
        # NOTE: the model is retrained on every call; it is neither loaded
        # from nor saved to disk.
        model = doc2vec.train_model(doc2vec.prep_data(self.train_unified))
        train_matrix = doc2vec.get_train_X(model, len(self.train_unified))
        test_matrix = doc2vec.transform(model, self.test_unified)
        return model, train_matrix, test_matrix

    if feature == "#chunks":
        vec = chunk_counter.ChunkcountVectorizer()
        train_matrix = vec.count_chunks(self.train_raw)
        test_matrix = vec.count_chunks(self.test_raw)
        return None, train_matrix, test_matrix

    if feature == "#args":
        vec = chunk_counter.ChunkcountVectorizer()
        train_matrix = vec.count_args(self.train_raw)
        test_matrix = vec.count_args(self.test_raw)
        return None, train_matrix, test_matrix

    # Unknown feature name: kept as a plain None so existing callers see the
    # same result as before.
    return None