from data.import_data import import_data


def get_data_(train_limit, AuthorProc, DataProc, labels_enc_with_data, encoder_size):
    # cross_val_predict (test_sklearnclassifier) does the train/test/cv
    # split internally
    tr, te = import_data()
    text = list(tr.text) + list(te.text)
    author = list(tr.author) + list(te.author)

    label_enc = AuthorProc()
    data_enc = DataProc(encoder_size=encoder_size)

    if train_limit is None:
        train_limit = len(text)

    y = label_enc.fit_transform(author[:train_limit])
    if labels_enc_with_data:
        # The data encoder yields (features, label) pairs; unzip them.
        X, y = zip(*data_enc.fit_transform(text[:train_limit], y))
    else:
        X = data_enc.fit_transform(text[:train_limit])
    return X, y
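# Hedged usage sketch: AuthorProc and DataProc are injected by the caller; the
# concrete stand-ins below (sklearn's LabelEncoder for AuthorProc and a toy
# hashing vectorizer for DataProc) are assumptions for illustration only, not
# the project's real encoders.
if __name__ == '__main__':
    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    class HashingProc:
        """Toy DataProc stand-in: hashes tokens into count buckets."""

        def __init__(self, encoder_size):
            self.encoder_size = encoder_size

        def fit_transform(self, texts):
            X = np.zeros((len(texts), self.encoder_size))
            for i, t in enumerate(texts):
                for tok in t.split():
                    X[i, hash(tok) % self.encoder_size] += 1
            return X

    X, y = get_data_(train_limit=None, AuthorProc=LabelEncoder,
                     DataProc=HashingProc, labels_enc_with_data=False,
                     encoder_size=1024)
    print(X.shape, len(y))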
        # (Tail of a predict-style method, likely
        # ProbabilisticSVMClassifier.predict.) Split the input into
        # sentences, featurize each one, standardise with the stored means
        # and deviations, then classify with the underlying SVM.
        sens = self.pipeline_factory(x)
        sens = list(sens)
        features = np.array([self.get_features(s) for s in sens])
        features = (features - self.means) / self.devs
        out = self.svm.predict(features)
        return out


myc = []

if __name__ == '__main__':
    tr, te = import_data()

    author_enum = {'HPL': 0, 'EAP': 1, 'MWS': 2}
    classed_auths = [author_enum[a] for a in tr.author]

    myc = ProbabilisticSVMClassifier(beta_method=True, stem=True, lemma=True,
                                     n_jobs=3, kernel='rbf')
    y_train_pred = cross_val_predict(myc, tr.text, classed_auths, cv=3)
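    # Hedged follow-up (not in the original fragment, which is truncated):
    # out-of-fold predictions from cross_val_predict are typically scored
    # with sklearn's accuracy_score / confusion_matrix, e.g.:
    from sklearn.metrics import accuracy_score, confusion_matrix

    print('CV accuracy:', accuracy_score(classed_auths, y_train_pred))
    print(confusion_matrix(classed_auths, y_train_pred))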
import matplotlib.pyplot as plt
import numpy as np
# noinspection PyUnresolvedReferences
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import KernelPCA
from sklearn.feature_extraction.text import CountVectorizer

from data.import_data import import_data

train, ts = import_data()

# Store the entire training text in a list.
text = list(train.text.values)
authors = np.array(list(train.author.values))

tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    # stop_words='english',
    decode_error='ignore')
tf = tf_vectorizer.fit_transform(text)
# print(tf)

rbf_pca = KernelPCA(n_components=3, kernel="sigmoid", gamma=0.04)
x_reduced = rbf_pca.fit_transform(tf)
# print(x_reduced)

fig = plt.figure()
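# Hedged completion (an assumption, not in the original, which stops right
# after creating the figure): the Axes3D import above suggests a 3D scatter
# of the KernelPCA projection, coloured by author, was intended. One way to
# draw it:
ax = fig.add_subplot(111, projection='3d')
for name in ('HPL', 'EAP', 'MWS'):
    mask = authors == name
    ax.scatter(x_reduced[mask, 0], x_reduced[mask, 1], x_reduced[mask, 2],
               label=name, s=8)
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_zlabel('component 3')
ax.legend()
plt.show()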
def do_import_data(request):
    settings = request.registry.settings
    dbmaker = request.registry.dbmaker
    db_session = dbmaker()
    import_data(settings, db_session)
    return u'Importing Data...'
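# Hedged wiring sketch (assumptions, not from the original): in Pyramid, a
# view that returns a plain string like do_import_data is typically
# registered with the 'string' renderer. The 'import_data' route name and
# '/import' pattern here are made up for illustration; the real app would
# also have to attach `dbmaker` to the registry during configuration.
from pyramid.config import Configurator


def make_app():
    config = Configurator()
    config.add_route('import_data', '/import')
    config.add_view(do_import_data, route_name='import_data',
                    renderer='string')
    return config.make_wsgi_app()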