def create_disaster_pipeline(disaster_csv_path, category_name):
    """Train and evaluate a single-category text pipeline.

    Reads the CSV at ``disaster_csv_path``, takes the ``message`` column as
    input and the ``category_name`` column as target, holds out 30% of the
    rows, fits a CountVectorizer -> TfidfTransformer -> RandomForestClassifier
    pipeline on the remainder and passes the held-out labels and predictions
    to ``display_results``.

    :param disaster_csv_path: path handed to ``ut.read_csv``
    :param category_name: name of the target column
    :return: None
    """

    def _message_tokens(text):
        # Chain the text-cleaning helpers through pt.pipe and apply the result.
        chain = (pt.pipe
                 | __normalize_text__
                 | __tokenize_text__
                 | __remove_stopwords__
                 | __lemmatize_text__)
        return chain(text)

    frame = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    messages = frame['message'].values
    targets = frame[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(
        messages, targets, test_size=0.3)

    print('Creating pipeline...')
    steps = [
        ('vect', st.CountVectorizer(tokenizer=_message_tokens)),
        ('tfidf', st.TfidfTransformer()),
        ('clf', en.RandomForestClassifier()),
    ]
    pipeline = pi.Pipeline(steps)

    print('Fitting pipeline...')
    pipeline.fit(x_train, y_train)

    print('Predicting with pipeline...')
    y_pred = pipeline.predict(x_test)

    print('Displaying results...')
    display_results(y_test, y_pred)
def get_data():
    """Load the SMS spam corpus and return TF-IDF features for a train/test split.

    Reads the tab-separated file ``SMSSpamCollection`` from the working
    directory, maps the labels ``ham``/``spam`` to 0/1, holds out 10% of the
    rows (``random_state=1``) and vectorizes the messages with character
    n-grams (``char_wb``, lengths 1 to 4) followed by TF-IDF weighting. Both
    transformers are fitted on the training part only.

    :return: ``(training_data, y_train), (testing_data, y_test),
        count_vector, transformer``
    """
    df = pd.read_table('SMSSpamCollection', sep='\t', header=None,
                       names=['label', 'sms_message'])
    df['label'] = df.label.map({'ham': 0, 'spam': 1})

    X_train, X_test, y_train, y_test = train_test_split(
        df['sms_message'], df['label'], random_state=1, test_size=0.1)

    # ngram_range must be a tuple: recent scikit-learn releases validate
    # estimator parameters when fitting and reject a list here.
    count_vector = text.CountVectorizer(ngram_range=(1, 4), analyzer='char_wb')

    # Fit on the training data only, then reuse the vocabulary for the test data.
    training_data = count_vector.fit_transform(X_train)
    testing_data = count_vector.transform(X_test)

    # TF-IDF layer on top of the raw counts.
    transformer = text.TfidfTransformer()
    training_data = transformer.fit_transform(training_data)
    testing_data = transformer.transform(testing_data)

    return (training_data, y_train), (testing_data, y_test), count_vector, transformer
def build_model(X_train, X_test, y_train, y_test):
    """Train a linear-kernel SVC on TF-IDF features and score the test split.

    The count vectorizer and TF-IDF transformer are fitted on ``X_train`` and
    applied to ``X_test``; both feature matrices then go through
    ``feature_selection(..., 'all')`` before the model is fitted.

    :param X_train: training texts
    :param X_test: test texts
    :param y_train: training labels
    :param y_test: test labels
    :return: tuple ``(classifier, df_out)`` where ``classifier`` is a dict
        with keys ``model``, ``counter`` and ``transformer`` and ``df_out`` is
        a DataFrame built from ``y_test`` with added ``pred``, ``target`` and
        ``match`` columns
    """
    count_vect = sktext.CountVectorizer()
    tfidf_transformer = sktext.TfidfTransformer()

    train_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(X_train))
    test_tfidf = tfidf_transformer.transform(count_vect.transform(X_test))

    train_features, test_features = feature_selection(
        train_tfidf, y_train, test_tfidf, 'all')

    model = svm.SVC(C=10, kernel='linear')
    model.fit(train_features, y_train)
    LOGGER.info("Model trained")

    # Score the trained model on the held-out data.
    predicted = model.predict(test_features)
    df_out = pd.DataFrame(y_test)
    df_out['pred'] = predicted
    df_out['target'] = y_test
    df_out['match'] = df_out['pred'] == df_out['target']

    return (
        {
            "model": model,
            "counter": count_vect,
            "transformer": tfidf_transformer,
        },
        df_out,
    )
def count_vectorizer(df, col_name, vocab=None):
    """
    Vectorize a text column into un-normalized TF-IDF scores.

    The column is lower-cased, stripped of NLTK English stop words and
    counted, either against the supplied vocabulary or against the one
    learned from the column itself. The counts are then TF-IDF weighted
    with ``norm=None``.

    :param df: source data frame
    :type df: pd.DataFrame
    :param col_name: name of the text column in ``df``
    :type col_name: string
    :param vocab: optional mapping of term to column index
    :type vocab: dict
    :return: dense TF-IDF matrix (the ``toarray()`` result) and the
        vocabulary used (``vocab`` if given, else the learned one)
    :rtype: tuple
    """
    english_stop_words = nltk.corpus.stopwords.words('english')
    counter = sklearntext.CountVectorizer(
        lowercase=True,
        stop_words=english_stop_words,
        vocabulary=vocab,
    )
    counts = counter.fit_transform(df[col_name])

    weighter = sklearntext.TfidfTransformer(norm=None)
    weights = weighter.fit_transform(counts)

    vocabulary = counter.vocabulary_ if vocab is None else vocab
    return weights.toarray(), vocabulary
def testmodel(test):
    """Classify one raw text with the pickled vectorizer and model.

    The text is segmented with ``jieba.lcut``, filtered through
    ``stopwords`` and re-joined with spaces before being vectorized by the
    loaded count vectorizer. The counts are normalized by a fresh
    ``TfidfTransformer`` with IDF weighting disabled.

    :param test: raw text to classify
    :return: the result of ``model.predict`` for the single document
    """
    cv, cv_train, model = load_pickle()

    tokens = stopwords(jieba.lcut(test))
    tests = [' '.join(tokens)]
    print(tests)

    counts = cv.transform(tests)
    normalizer = ft.TfidfTransformer(use_idf=False)
    features = normalizer.fit_transform(counts)
    return model.predict(features)
def testmodel(self, model, tfidf_train, cv):
    """Print the model's predictions for two hard-coded sample articles.

    Each sample is segmented with ``jieba.lcut``, filtered through
    ``self.stopwords`` and joined with spaces, then vectorized with ``cv``
    and normalized by a fresh ``TfidfTransformer`` with IDF disabled.

    :param model: fitted classifier exposing ``predict``
    :param tfidf_train: not used by this method
    :param cv: fitted count vectorizer exposing ``transform``
    :return: None (the predictions are printed)
    """
    sports_sample = (
        '首页|体育新闻欧足联启用新分析体系|欧洲杯数据狂魔遗憾出局2016-07-04 09:10:00随着比利时被威尔士淘汰,'
        '欧洲杯一夜之间送别了两位在身价榜单上位列前十的球员:阿扎尔、德布劳内。值得一提的是,'
        '在欧足联官方的金足奖数据分析体系里,截至发稿,这两位球星都在前三之列,是夺取赛事官方MVP的热门人选。'
        '德布劳内的发挥再次证明:比利时成也靠他、败也因他。欧洲杯激战至今,德布劳内是大赛的数据狂魔之一。助攻榜单上,'
        '他3次助攻,位列第3,仅次于4次助攻的拉姆塞和阿扎尔。威胁传球次数,他有23次,领先19次的帕耶和17次的厄齐尔排名第一。'
        '射门次数榜单,他21次与贝尔并列第二,仅次于C罗一人。由于本次欧洲杯,官方MVP的评选,是欧足联启用了一套全新的数据分析体系,'
        '代入数据进行演算而直接得出排名,因此直到本战之前,德布劳内都是MVP即时榜单上的第一名,只是在本战后,被贝尔超越,沦为第二'
    )
    finance_sample = (
        '首页|财经中心|财经频道6月中国公路物流运价指数降幅收窄2016-07-04 19:51:00北京7月4日电 (记者 刘长忠)记者4日'
        '从中国物流与采购联合会获悉,6月中国公路物流运价指数为101.3点,比上月回落1.5%,但比年初回升2.8%。数据显示,进入6月,'
        '公路物流需求较前期小幅回升。一方面,工业物流需求保持平稳增长,其中采矿业、高耗能行业等传统行业增速虽有所回落,但原油、'
        '橡胶等进口量较前期明显回升;另一方面,消费品物流需求继续保持平稳较快增长,特别是农副产品、食品、纺织品等物流需求加快增长。'
        '分品种来看,钢材、有色金属等大宗商品物流需求趋弱;农副产品、食品、纺织品等物流需求上升较快。中国物流与采购联合会分析人士称,'
        '总体来看,未来整车及零担公路物流需求将延续小幅回暖的走势,运量也有望继续回升。公路物流运价指数较前期可能延续回升走势,'
        '回升幅度难有较大提升,预计总体将与上年同期水平基本持平。(完)'
    )

    # Segment and filter both samples first, then build the space-joined docs.
    filtered = [self.stopwords(jieba.lcut(sample))
                for sample in (sports_sample, finance_sample)]
    tests = [' '.join(words) for words in filtered]

    counts = cv.transform(tests)
    normalizer = ft.TfidfTransformer(use_idf=False)
    features = normalizer.fit_transform(counts)
    print(model.predict(features))
def train():
    """
    Builds the SVM based on training data.

    Loads the ``'train'`` split through ``__init__.load_data``, fits a
    CountVectorizer -> TfidfTransformer -> SGDClassifier (hinge loss, L2
    penalty) pipeline on it, prints the elapsed fitting time and evaluates
    the fitted pipeline on the same data through ``__init__.evaluate``.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()
    classifier = linear_model.SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, tol=1e-3,
                                            random_state=42)

    # Chain the processing steps so they are applied in order on fit/predict.
    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer),
               ('tfidf', transformer),
               ('clf-sgdc', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # Parenthesized form: the bare `print '...'` statement is a SyntaxError
    # on Python 3, while this single-argument call works on Python 2 and 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)
    return text_clf
def create_disaster_sequence(disaster_csv_path, category_name):
    """Train and evaluate a single-category classifier step by step.

    Same stages as a vectorizer/TF-IDF/random-forest pipeline, but each
    stage is fitted and applied explicitly: 30% of the rows are held out,
    the vectorizer and TF-IDF transformer are fitted on the training part
    only, and the held-out labels and predictions go to ``display_results``.

    :param disaster_csv_path: path handed to ``ut.read_csv``
    :param category_name: name of the target column
    :return: None
    """

    def _message_tokens(message):
        # The stemming step (__stem_text__) is left out of the chain;
        # lemmatization is the last step.
        chain = (pt.pipe
                 | __normalize_text__
                 | __tokenize_text__
                 | __remove_stopwords__
                 | __lemmatize_text__)
        return chain(message)

    frame = ut.read_csv(disaster_csv_path)

    print('Getting Data...')
    messages = frame['message'].values
    targets = frame[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(
        messages, targets, test_size=0.3)

    print('Tokenizing and count vectorizing...')
    vect = st.CountVectorizer(tokenizer=_message_tokens)

    print('Tfidf transforming...')
    tfidf = st.TfidfTransformer()
    classifier = en.RandomForestClassifier()

    print('Fitting classifier on train...')
    train_features = tfidf.fit_transform(vect.fit_transform(x_train))
    classifier.fit(train_features, y_train)

    print('Running classifier on test...')
    test_features = tfidf.transform(vect.transform(x_test))
    y_pred = classifier.predict(test_features)

    print('Displaying results...')
    display_results(y_test, y_pred)
def parseLogs(inputFile, outputFile):
    """Rank the lines of a text file by their mean TF-IDF term weight.

    Every line of ``inputFile`` is treated as one document. A line's score
    is the sum of its TF-IDF weights divided by the number of terms with a
    non-zero weight. Lines are written to ``outputFile`` in descending score
    order, each prefixed with its zero-padded 1-based line number.

    :param inputFile: path of the file to read
    :param outputFile: path of the ranking file to write (overwritten)
    :return: None
    """
    vectorizer = ext.CountVectorizer(tokenizer=get_tokens,
                                     stop_words='english')
    with open(inputFile) as in_file:
        lines = [line.rstrip() for line in in_file]

    doc_matrix = vectorizer.fit_transform(lines)
    tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
    weights = tf_idf_transformer.transform(doc_matrix).toarray()

    perLineScore = []
    for row in weights:
        term_count = len(row.nonzero()[0])
        # A line with no weighted terms (blank, or stop words only) would
        # divide by zero and produce NaN; give it the lowest score instead.
        perLineScore.append(row.sum() / term_count if term_count else 0.0)

    # 1-based line numbers for EVERY line. The previous
    # range(1, len(lines)) stopped one short, so the last line of the file
    # was silently dropped from the ranking.
    line_numbers = list(range(1, len(lines) + 1))
    df = pd.DataFrame({'d1': lines, 'd2': perLineScore}, index=line_numbers)
    df = df.sort_values(by=['d2'], ascending=False)

    with open(outputFile, 'w') as outFile:
        for index, row in df.iterrows():
            outFile.write("{0:0=3d} {1}\n".format(index, row['d1']))
def fit_insights(insightids, response, c):
    """
    Create a naive Bayes model based on the insight ids and response vector
    given.

    :param insightids: ids passed to ``get_insights_by_id``
    :param response: target values, one per fetched insight
    :param c: passed through to ``get_insights_by_id``
        (presumably a database cursor or connection -- confirm with caller)
    :return: the fitted CountVectorizer -> TfidfTransformer -> MultinomialNB
        pipeline
    """
    data = get_insights_by_id(insightids, c)
    nlp_pl = pipeline.Pipeline([
        ('vect', text.CountVectorizer()),
        ('tfidf', text.TfidfTransformer(use_idf=True)),
        ('clf', naive_bayes.MultinomialNB()),
    ])
    # Fit on the insights just fetched. The original fitted on an undefined
    # name `insights` and never used `data`.
    text_clf = nlp_pl.fit(data, response)
    return text_clf
def compute_freq_mat(self, input_texts):
    """Turn texts into a term-frequency or TF-IDF matrix.

    The count vectorizer stored on ``self.count_vect`` is reused when one is
    set; otherwise a new one is created with ``self.ngram_range``, fitted on
    ``input_texts`` and stored. IDF weighting is switched off only when
    ``self.feature_type`` equals ``"tf"``. The weighting transformer is
    always fitted on the matrix it transforms.

    :param input_texts: iterable of documents
    :return: the transformed matrix
    """
    if not self.count_vect:
        self.count_vect = sk.CountVectorizer(ngram_range=self.ngram_range)
        counts = self.count_vect.fit_transform(input_texts)
    else:
        counts = self.count_vect.transform(input_texts)

    wants_idf = self.feature_type != "tf"
    weighting = sk.TfidfTransformer(use_idf=wants_idf).fit(counts)
    return weighting.transform(counts)
def wordbow(self):
    """Build the bag-of-words features and hand them to ``self.trainmodel``.

    Counts are fitted on the module-level ``train`` data and normalized by a
    ``TfidfTransformer`` with IDF weighting disabled.

    :return: None
    """
    # Convert the training texts to a bag-of-words model.
    cv = ft.CountVectorizer()
    cv_train = cv.fit_transform(train)

    normalizer = ft.TfidfTransformer(use_idf=False)
    tfidf_train = normalizer.fit_transform(cv_train)

    self.trainmodel(cv_train, tfidf_train, cv)
def calculate_features(tr_tweets, te_tweets, targets_tr, targets_te,
                       use_tfidf=False, w_sentiment=True, w_handcrafted=True,
                       **bow_kwargs):
    """
    Calculate all features, combine together into two arrays: one for tr and
    one for te

    :param tr_tweets: pandas Series of strings, raw texts to convert (from train set)
    :param te_tweets: pandas Series of strings, raw texts to convert (from test set)
    :param targets_tr: pandas Series of strings, target classes (from train set)
    :param targets_te: pandas Series of strings, target classes (from test set)
    :param use_tfidf: bool, whether to convert BoW to TF-IDF
    :param w_sentiment: bool, whether to include sentiment analysis inferences as features
    :param w_handcrafted: bool, whether to include handcrafted features
    :param bow_kwargs: extra keyword arguments forwarded to ``bag_of_words``
    :return: tuple: numpy array of training data, numpy array of test data,
        list of feature names
    """
    # Preprocess tweets (tokenise, stem, remove stop words)
    tr_tokens = preprocessing.preprocess_tweets(tr_tweets)
    te_tokens = preprocessing.preprocess_tweets(te_tweets)

    # Join the preprocessed tokens back into single strings, as required for
    # input to CountVectorizer.
    tr_tweet_proc = tr_tokens.apply(lambda _tokens: ' '.join(_tokens))
    te_tweet_proc = te_tokens.apply(lambda _tokens: ' '.join(_tokens))

    # Calculate bag-of-words representation.
    # NOTE(review): the np.hstack calls below assume bag_of_words returns
    # dense arrays -- confirm against its implementation.
    x_tr, x_te, feature_names = bag_of_words(tr_tweet_proc, te_tweet_proc,
                                             targets_tr, targets_te,
                                             **bow_kwargs)

    if use_tfidf:
        # Convert BoW to TF-IDF. TfidfTransformer returns scipy sparse
        # matrices, which np.hstack cannot combine with the dense feature
        # blocks added below, so convert back to dense arrays right away.
        tfidfer = text_sk.TfidfTransformer()
        x_tr = tfidfer.fit_transform(x_tr).toarray()
        x_te = tfidfer.transform(x_te).toarray()

    if w_handcrafted:
        # Add handcrafted features
        x_tr_hc, feature_names_hc = hand_crafted_features(tr_tweets,
                                                          get_names=True)
        x_tr = np.hstack((x_tr, x_tr_hc))
        x_te = np.hstack((x_te, hand_crafted_features(te_tweets)))
        feature_names.extend(feature_names_hc)

    if w_sentiment:
        # Add inferred sentiment features. The training-set features are
        # computed once and reused rather than inferred a second time.
        x_tr_sent, sent_feat_names = infer_sentiment(tr_tweets,
                                                     get_names=True)
        x_tr = np.hstack((x_tr, x_tr_sent))
        x_te = np.hstack((x_te, infer_sentiment(te_tweets)))
        feature_names.extend(sent_feat_names)

    return x_tr, x_te, feature_names
def tfidf_transform(text, feature_words):
    """Vectorize ``text`` and return either its vocabulary or its TF-IDF matrix.

    :param text: iterable of documents
    :param feature_words: when truthy, return the feature names learned by
        the count vectorizer instead of the matrix
    :return: list of feature names if ``feature_words`` is truthy, otherwise
        the TF-IDF matrix produced by ``TfidfTransformer``
    """
    count_vector = txt.CountVectorizer()
    # fit_transform, not fit: fit() returns the vectorizer itself, which
    # cannot be fed to TfidfTransformer.
    counts = count_vector.fit_transform(text)

    if feature_words:
        # Feature names live on the vectorizer, not on the TF-IDF
        # transformer. Newer scikit-learn exposes get_feature_names_out and
        # has removed get_feature_names, so use whichever is available.
        name_getter = getattr(count_vector, 'get_feature_names_out', None)
        if name_getter is None:
            name_getter = count_vector.get_feature_names
        return list(name_getter())

    # Transform the counts into TF-IDF weights.
    return txt.TfidfTransformer().fit_transform(counts)
def tfIDfVectorizer(df):
    """TF-IDF weight a data frame of term counts and split off its labels.

    The ``class`` column is taken as the label vector. It is dropped from
    ``df`` in place, together with a column named ``''`` if one exists, and
    the remaining columns are TF-IDF transformed.

    :param df: data frame holding a ``class`` column; modified in place
        (presumably the other columns are term counts -- confirm with caller)
    :return: tuple ``(matrix, y)``: dense TF-IDF array and the ``class`` column
    """
    y = df['class']

    unwanted = ['class', ''] if '' in df.columns else ['class']
    # axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
    # The in-place drop is kept because callers may rely on df being modified.
    df.drop(unwanted, axis=1, inplace=True)

    trfm = txt.TfidfTransformer()
    trfm.fit(df)
    # toarray() gives the same ndarray as the former todense().A.
    return trfm.transform(df).toarray(), y
def wordbow():
    """Split the corpus, build bag-of-words features and train the model.

    The module-level ``train`` texts and ``classes`` labels are split 70/30
    with ``random_state=7``. Counts are fitted on the training part and
    normalized by a ``TfidfTransformer`` with IDF weighting disabled.

    :return: whatever ``train_model`` returns
    """
    # Bag of words, then TF-IDF.
    x_train, x_test, y_train, y_test = train_test_split(
        train, classes, test_size=0.3, random_state=7)

    cv = ft.CountVectorizer()
    cv_x_train = cv.fit_transform(x_train)

    normalizer = ft.TfidfTransformer(use_idf=False)
    tfidf_x_train = normalizer.fit_transform(cv_x_train)

    return train_model(tfidf_x_train, y_train, x_test, y_test, cv, cv_x_train)
def make_BOW_features(X):
    '''
    Build TF-IDF feature vectors for a list of sentences.

    Each sentence becomes one row holding the tf-idf value of every word
    that occurs in it; the idf part is computed over the entire input.

    X: list of sentences
    Returns the matrix produced by TfidfTransformer.fit_transform.
    '''
    word_counts = sktext.CountVectorizer(min_df=1).fit_transform(X)
    return sktext.TfidfTransformer().fit_transform(word_counts)
def to_tfidf_vectors(texts: list, tokenizer: tokenizers.BaseTokenizer()):
    """
    Convert texts into dense TF-IDF vectors.

    Every text is tokenized with ``tokenizer`` and re-joined with spaces so
    that CountVectorizer can split it again; the counts are then TF-IDF
    weighted.

    :param texts: list of str
    :param tokenizer: callable returning the tokens of one text
    :return: dense array with one row per text
    """
    # NOTE(review): the annotation above instantiates BaseTokenizer at
    # definition time; left as is to keep the signature unchanged.
    spaced_docs = [' '.join(tokenizer(doc)) for doc in texts]
    term_counts = sklearn_text.CountVectorizer().fit_transform(spaced_docs)
    weighted = sklearn_text.TfidfTransformer().fit_transform(term_counts)
    return weighted.toarray()
def getTfidf(data, is_train=True):
    """Return TF-IDF features, fitting or re-loading the transformers.

    In training mode a unigram+bigram CountVectorizer (English stop words,
    lower-casing) and a TfidfTransformer are fitted on ``data[0]`` and
    pickled to ``bow.model`` and ``tfidf.model`` in the working directory.
    Otherwise both are loaded from those files and applied to ``data[0]``.

    :param data: indexable; ``data[0]`` holds the texts and ``data[1]`` is
        passed as ``y`` when fitting the vectorizer
    :param is_train: fit and persist the models when True, load them when
        False
    :return: the TF-IDF matrix for ``data[0]``
    """
    if is_train:
        bow = sk_txt.CountVectorizer(ngram_range=(1, 2),
                                     stop_words='english',
                                     lowercase=True)
        tfidf = sk_txt.TfidfTransformer()
        bow_t = bow.fit_transform(data[0], data[1])
        tfidf_t = tfidf.fit_transform(bow_t)
        # Context managers close (and flush) the files deterministically;
        # the bare open() calls used before leaked the handles.
        with open('bow.model', 'wb') as bow_file:
            pickle.dump(bow, bow_file)
        with open('tfidf.model', 'wb') as tfidf_file:
            pickle.dump(tfidf, tfidf_file)
    else:
        # NOTE: pickle.load runs arbitrary code from the file; only load
        # model files written by this function.
        with open('bow.model', 'rb') as bow_file:
            bow = pickle.load(bow_file)
        with open('tfidf.model', 'rb') as tfidf_file:
            tfidf = pickle.load(tfidf_file)
        bow_t = bow.transform(data[0])
        tfidf_t = tfidf.transform(bow_t)
    return tfidf_t
def go2trainAction(self):
    """Train the "Action" classifier from the files under resources/Action/.

    Documents are loaded with ``sd.load_files`` (shuffled, ``random_state=8``),
    segmented with ``jieba.cut`` and joined with spaces. The fitted
    vectorizer, TF-IDF transformer, model and category names are stored on
    ``self.cvAction``, ``self.ttAction``, ``self.Actionmodel`` and
    ``self.categoriesAction``.

    :return: None
    """
    corpus = sd.load_files('resources/Action/', encoding='utf8',
                           shuffle=True, random_state=8)
    segmented_docs = [" ".join(jieba.cut(document))
                      for document in corpus.data]
    labels = corpus.target
    self.categoriesAction = np.array(corpus.target_names)

    # Build the TF-IDF matrix from 1-grams. Terms are split on whitespace,
    # which jieba has already inserted between words.
    self.cvAction = ft.CountVectorizer(ngram_range=(1, 1))
    counts = self.cvAction.fit_transform(segmented_docs)

    self.ttAction = ft.TfidfTransformer()
    weighted = self.ttAction.fit_transform(counts)

    # Model training: MultinomialNB is used because the sample distribution
    # in the TF-IDF matrix better matches a multinomial distribution.
    self.Actionmodel = nb.MultinomialNB()
    self.Actionmodel.fit(weighted, labels)
def get_trained_count_and_tfidf_model(texts: list, tokenizer: tokenizers.BaseTokenizer()):
    """
    Fit a count vectorizer and a TF-IDF transformer on the given texts.

    Every text is tokenized with ``tokenizer`` and re-joined with spaces
    before fitting.

    :param texts: list of str
    :param tokenizer: callable returning the tokens of one text
    :return: tuple ``(count_model, tfidf_model)``, both fitted
    """
    # NOTE(review): the annotation above instantiates BaseTokenizer at
    # definition time; left as is to keep the signature unchanged.
    spaced_docs = [' '.join(tokenizer(doc)) for doc in texts]

    count_model = sklearn_text.CountVectorizer()
    term_counts = count_model.fit(spaced_docs).transform(spaced_docs)

    tfidf_model = sklearn_text.TfidfTransformer()
    tfidf_model.fit(term_counts)

    return count_model, tfidf_model
def to_tfidf_vectors(texts: list, tokenizer: tokenizers.BaseTokenizer()):
    """
    Convert texts into dense TF-IDF vectors.

    :param texts: list of str
    :param tokenizer: word segmenter; called on each text to obtain tokens
    :return: array of vectors, one row per text holding its vectorized form
    """
    # NOTE(review): the annotation above instantiates BaseTokenizer at
    # definition time; left as is to keep the signature unchanged.
    spaced_docs = [' '.join(tokenizer(doc)) for doc in texts]

    # Term-frequency matrix: element a[i][j] is the count of term j in text i.
    counter = sklearn_text.CountVectorizer()
    term_counts = counter.fit_transform(spaced_docs)

    # Compute the tf-idf weight of every term.
    weighter = sklearn_text.TfidfTransformer()
    return weighter.fit_transform(term_counts).toarray()
def get_trained_count_and_tfidf_model(texts: list, tokenizer: tokenizers.BaseTokenizer()):
    """
    Fit a count vectorizer and a TF-IDF transformer on the given texts.

    :param texts: list of str
    :param tokenizer: word segmenter; called on each text to obtain tokens
    :return: tuple ``(count_model, tfidf_model)``, both fitted
    """
    # NOTE(review): the annotation above instantiates BaseTokenizer at
    # definition time; left as is to keep the signature unchanged.
    spaced_docs = [' '.join(tokenizer(doc)) for doc in texts]

    # Term-frequency matrix: element a[i][j] is the count of term j in text i.
    count_model = sklearn_text.CountVectorizer()
    count_model.fit(spaced_docs)
    term_counts = count_model.transform(spaced_docs)

    # Learn the tf-idf weighting from the counts.
    tfidf_model = sklearn_text.TfidfTransformer().fit(term_counts)

    return count_model, tfidf_model
def build_vectors(sentences, vacabulary_size):
    """Print count, TF-IDF and hashed vectorizations of ``sentences``.

    Demonstration helper: nothing is returned, every intermediate result is
    printed.

    :param sentences: iterable of documents
    :param vacabulary_size: not used by this function
    :return: None
    """
    vectorizer = skyfe.CountVectorizer()
    trans = vectorizer.fit_transform(sentences)

    # Newer scikit-learn exposes get_feature_names_out and has removed
    # get_feature_names, so use whichever is available.
    name_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = vectorizer.get_feature_names
    fname = list(name_getter())

    print(trans)
    print(trans.toarray())
    print(fname)

    # Enable tf-idf.
    transformer = skyfe.TfidfTransformer()
    tfidf = transformer.fit_transform(trans)
    print(tfidf.toarray())
    # The TF-IDF result is a sparse matrix with no feature names of its own
    # (calling get_feature_names on it raised AttributeError); its columns
    # are those of the count vectorizer.
    print(fname)

    # Hashed features.
    vectorizer2 = skyfe.HashingVectorizer(n_features=6, norm=None)
    trans = vectorizer2.fit_transform(sentences)
    print(trans.toarray())
def predict_self(model, x_test, y_test, cv, score):
    """Evaluate the model on the test split and summarize the outcome.

    :param model: fitted classifier exposing ``predict``
    :param x_test: test texts
    :param y_test: expected labels
    :param cv: fitted count vectorizer exposing ``transform``
    :param score: value passed through unchanged under the ``score`` key
        (per the original note: precision, recall and F1 score)
    :return: dict with ``score``, ``true_count`` (number of correct
        predictions), ``all_count`` (total number of predictions) and
        ``true_score`` (accuracy in percent)
    """
    # Test the model: vectorize, normalize without IDF, then predict.
    counts = cv.transform(x_test)
    normalizer = ft.TfidfTransformer(use_idf=False)
    pred_y = model.predict(normalizer.fit_transform(counts))

    hits = [flag for flag in (pred_y == y_test) if flag]
    correct = len(hits)
    total = pred_y.size

    return {
        'score': score,
        'true_count': correct,
        'all_count': total,
        'true_score': (correct / total) * 100
    }
def __init__(self, train_dict, cat1, cat2):
    """Train a two-category linear SVM on SVD-reduced TF-IDF features.

    Documents of ``cat1`` are labelled 1 and those of ``cat2`` are labelled
    -1. The fitted vectorizer, TF-IDF transformer, SVD and classifier are
    kept on the instance as ``count_vect``, ``tfidf_transformer``, ``svd``
    and ``clf``.

    :param train_dict: mapping from category to an object with ``data`` and
        ``filenames`` attributes
    :param cat1: key of the positive category
    :param cat2: key of the negative category
    """
    self.cat1 = cat1
    self.cat2 = cat2

    first, second = train_dict[cat1], train_dict[cat2]

    self.count_vect = text.CountVectorizer(min_df=1, stop_words='english',
                                           analyzer='word',
                                           tokenizer=util.my_tokenizer)
    self.tfidf_transformer = text.TfidfTransformer()
    self.svd = TruncatedSVD(n_components=50, algorithm='arpack')

    # Counts -> TF-IDF -> 50 SVD components, all fitted on both categories.
    counts = self.count_vect.fit_transform(first.data + second.data)
    weighted = self.tfidf_transformer.fit_transform(counts)
    reduced = self.svd.fit_transform(weighted)

    y_train = [1] * len(first.filenames) + [-1] * len(second.filenames)

    self.clf = svm.LinearSVC(random_state=42, class_weight='balanced')
    self.clf.fit(reduced, y_train)
def train():
    """Builds the random forest based on training data.

    Loads the ``'train'`` split through ``__init__.load_data``, fits a
    CountVectorizer -> TfidfTransformer -> RandomForestClassifier
    (10 estimators) pipeline on it, prints the elapsed fitting time and
    evaluates the fitted pipeline on the same data through
    ``__init__.evaluate``.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()
    classifier = ensemble.RandomForestClassifier(n_estimators=10)

    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer),
               ('tfidf', transformer),
               ('clf-rf', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # Parenthesized form: the bare `print '...'` statement is a SyntaxError
    # on Python 3, while this single-argument call works on Python 2 and 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)
    return text_clf
def gen_key_word_dict_list(string_list, max_key_words=10, stop_word_list=None):
    """Extract the highest-weighted key words of every document.

    :param string_list: list of documents
    :param max_key_words: maximum number of key words kept per document
    :param stop_word_list: words to ignore; ``None`` or an empty list
        selects scikit-learn's built-in English stop words
    :return: list with one dict per document mapping key word to weight;
        only words with a positive weight are included
    """
    from sklearn.feature_extraction import text
    import numpy as np

    # None replaces the former mutable [] default; an explicit [] still
    # selects the English stop words as before.
    if stop_word_list is None or stop_word_list == []:
        stop_word_list = text.ENGLISH_STOP_WORDS
    # Recent scikit-learn validates `stop_words` and accepts a list but not
    # the frozenset that ENGLISH_STOP_WORDS is.
    stop_words = sorted(stop_word_list)

    vectorizer = text.TfidfVectorizer(stop_words=stop_words)
    transformer = text.TfidfTransformer()
    # NOTE(review): TfidfVectorizer already yields TF-IDF weights, so the
    # transformer below weights them a second time. Kept unchanged so the
    # returned weights stay the same -- confirm whether this is intended.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(string_list))

    # Newer scikit-learn exposes get_feature_names_out and has removed
    # get_feature_names, so use whichever is available.
    name_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = vectorizer.get_feature_names
    words = list(name_getter())

    weight = tfidf.toarray()
    result_list = []
    for w in weight:
        temp_dict = {}
        # Slicing caps the walk at the vocabulary size; indexing up to
        # max_key_words raised IndexError for small vocabularies.
        for idx in np.argsort(-w)[:max_key_words]:
            if w[idx] <= 0:
                # Weights are sorted in descending order: nothing positive
                # remains.
                break
            temp_dict[words[idx]] = w[idx]
        result_list.append(temp_dict)
    return result_list
def init_fit_eval(self):
    """
    Initializes, fits, and evaluates a Random Forest Classifier on the
    disaster data.

    One CountVectorizer -> TfidfTransformer -> RandomForestClassifier
    pipeline is built per entry of ``self.__categories_columns__``, fitted
    on a fresh 75/25 split and stored in ``self.__pipelines__``. The test
    predictions go to ``self.__add_to_summary__`` and the mean of
    ``self.__accuracies__`` is printed at the end.
    """
    print('Creating Sparse Vector Pipeline...')
    X = self.__disaster__['message'].values
    num_cats = len(self.__categories_columns__)

    for position, category in enumerate(self.__categories_columns__, start=1):
        Y = self.__disaster__[category]
        x_train, x_test, y_train, y_test = ms.train_test_split(
            X, Y, test_size=0.25)

        steps = [
            ('vect', te.CountVectorizer(tokenizer=self.__tokenize_tweet__)),
            ('tfidf', te.TfidfTransformer()),
            ('clf', en.RandomForestClassifier()),
        ]
        current_pipeline = pi.Pipeline(steps)
        # Registered before fitting, as in the original ordering.
        self.__pipelines__[category] = current_pipeline

        current_pipeline.fit(x_train, y_train)
        y_pred = current_pipeline.predict(x_test)
        self.__add_to_summary__(category, y_test, y_pred)

        ut.printover('Fitted ' + str(position) + ' out of ' + str(num_cats) + ' models')

    overall = st.mean(self.__accuracies__.values())
    print('\nDone fitting. Overall Accuracy: ' + str(overall))
def train():
    """
    Builds the classifier based on training data.

    Loads the ``'train'`` split through ``__init__.load_data``, fits a
    CountVectorizer -> TfidfTransformer -> LogisticRegression (lbfgs solver)
    pipeline on it, prints the elapsed fitting time and evaluates the fitted
    pipeline on the same data through ``__init__.evaluate``.

    :return: the fitted pipeline
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()
    classifier = linear_model.LogisticRegression(solver='lbfgs')

    # Chain the processing steps so they are applied in order on fit/predict.
    text_clf = pipeline.Pipeline(steps=[('vect', vectorizer),
                                        ('tfidf', transformer),
                                        ('clf-lr', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # Parenthesized form: the bare `print '...'` statement is a SyntaxError
    # on Python 3, while this single-argument call works on Python 2 and 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)
    return text_clf