def _predict(self, json_test):
    """Turn raw prediction input into the feature table fed to the model.

    The input is run through ``self.pre_process`` with ``istrain=False``,
    the ``"text_pos_sentences"`` column is vectorized with the vectorizer
    stored in ``self.text_model``, and the resulting features are appended
    column-wise. The raw text columns are then dropped and every column is
    renamed to its position rendered as a string ("0", "1", ...).

    Args:
        json_test: Raw input accepted by ``self.pre_process``.

    Returns:
        The table returned by ``self.pre_process`` (presumably a pandas
        DataFrame -- confirm) extended with the text features, minus the
        ``text``, ``text_pos`` and ``text_pos_sentences`` columns, with
        positional string column names.

    Raises:
        AttributeError: If ``self.text_model`` has not been set yet.
    """
    test = self.pre_process(json_test, istrain=False)

    # Only the bag-of-words features of the text are active. Author
    # (word2vec), title (bag-of-words), tag-count and forum-id features are
    # disabled, so no helper models are built for them here.
    bow_vectorizer = BagOfWordsVectorizer()
    bow_vectorizer.set_vectorizer(self.text_model)
    text_features = bow_vectorizer.transform(test["text_pos_sentences"], "text")

    # NOTE(review): concat with axis=1 aligns rows on the index; this assumes
    # `test` and `text_features` share the same index -- confirm.
    test = pd.concat([test, text_features], axis=1)
    test = test.drop(['text', 'text_pos', 'text_pos_sentences'], axis=1)

    # Positional names; presumably so the columns match those produced at
    # training time regardless of the original feature names -- confirm.
    test.columns = [str(position) for position in range(len(test.columns))]
    return test
def fit(self, json_train, n_estimators=10, is_xgb=True):
    """Fit the text vectorizer and the classifier on the training input.

    The input is run through ``self.pre_process`` with ``istrain=True``. A
    bag-of-words vectorizer is fitted on the ``"text_pos_sentences"`` column
    and stored in ``self.text_model``; its features are appended column-wise.
    The ``istroll`` column is used as the label, the raw text columns are
    dropped, the remaining columns are renamed to their position rendered as
    a string, and the chosen classifier is fitted and stored in
    ``self.model``.

    Args:
        json_train: Raw input accepted by ``self.pre_process``.
        n_estimators: Number of estimators passed to the classifier.
        is_xgb: When truthy, an ``XGBClassifier`` with ``max_depth=10`` is
            trained; otherwise a ``RandomForestClassifier`` with
            ``n_jobs=-1``.

    Returns:
        None. The fitted objects are stored on ``self.text_model`` and
        ``self.model``.
    """
    train = self.pre_process(json_train, istrain=True)

    # Only the bag-of-words features of the text are active. Author
    # (word2vec), title (bag-of-words), tag-count and forum-id features are
    # disabled, so no helper models are built for them here.
    bow_vectorizer = BagOfWordsVectorizer()
    # The meaning of 1000 is defined by BagOfWordsVectorizer.fit; presumably
    # the vocabulary / feature cap -- confirm.
    bow_vectorizer.fit(train["text_pos_sentences"], 1000)
    text_features = bow_vectorizer.transform(train["text_pos_sentences"], "text")
    self.text_model = bow_vectorizer.get_vectorizer()

    # NOTE(review): concat with axis=1 aligns rows on the index; this assumes
    # `train` and `text_features` share the same index -- confirm.
    train = pd.concat([train, text_features], axis=1)

    # Separate the label, then remove it and the raw text columns from the
    # feature table.
    label = train['istroll']
    train = train.drop(
        ['istroll', 'text', 'text_pos', 'text_pos_sentences'], axis=1
    )
    print(train.columns)

    # Positional names, mirroring the renaming applied at prediction time.
    train.columns = [str(position) for position in range(len(train.columns))]

    if is_xgb:
        self.model = XGBClassifier(n_estimators=n_estimators, max_depth=10)
    else:
        self.model = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=-1
        )

    print(train.shape)
    self.model.fit(train, label)