def test_NaiveBayes_NLTK():
    from thesis.Data import Data_loader

    # load data
    data = Data_loader().get_data()

    # test the NLTK Naive Bayes classifier
    clf_nltk = NaiveBayes_nltk()
    clf_nltk.classify(data_vectorized=data)
    clf_nltk.predict(data_vectorized=data)
class Hypothese(object):

    fitness = 0
    name = 'Hypothese'
    classifier = None
    vectorizer = None
    data_cleaner = None
    data_loader = None

    # default parameters follow the convention-over-configuration principle
    def __init__(self, data_loader=None, samples=1000, red_method='tsne',
                 vectorizer='word2vec', w2v_dim=300):
        if data_loader is None:
            self.data_loader = Data_loader()
        else:
            self.data_loader = data_loader

        self.num_of_samples = samples
        self.red_method = red_method
        self.w2v_dim = w2v_dim
        self.vectorizer = vectorizer

        # initial variant
        self.classifier = LinearSVM()

    def run(self):
        # replace the vectorizer name (string) with the actual vectorizer object
        self.vectorizer = v.get_Vectorizer(vectorizer=self.vectorizer,
                                           num_of_samples=self.num_of_samples,
                                           reduction_methode=self.red_method,
                                           w2v_dimension=self.w2v_dim)

        # dependency injection for the provided data
        data_vectorized = self.vectorizer.vectorize(self.data_loader.get_data())

        # reduce the dimensionality of the training and testing data with t-SNE
        # no improvement; accuracy only 50-60 %
        # data_vectorized['x_train_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_train_v'])
        # data_vectorized['x_test_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_test_v'])

        self.classifier.classify(data_vectorized)
        self.classifier.predict(data_vectorized)

    def calc_fitness(self):
        pass

    def mutate(self):
        pass

    def compare_to(self):
        pass
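# Usage sketch (an addition, not part of the original code): construct and
# evaluate a single Hypothese with the default parameters; it assumes
# Data_loader is importable from thesis.Data, as in the tests in this repo.
if __name__ == '__main__':
    from thesis.Data import Data_loader

    hyp = Hypothese(data_loader=Data_loader(), samples=1000, red_method='tsne',
                    vectorizer='word2vec', w2v_dim=300)
    # vectorize the data, then train and evaluate the initial LinearSVM classifier
    hyp.run()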
def test_LinearSVM():
    # test the SVM with tf-idf vectorized data
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    data = Data_loader().get_data()

    # use a local name that does not shadow the thesis.Vectorizer module
    vectorizer = vec.get_Vectorizer(vectorizer='tfidf')
    # vectorizer = vec.get_Vectorizer(vectorizer='word2vec')

    clf = LinearSVM()

    vectorized_data = vectorizer.vectorize(data=data)

    clf.classify(vectorized_data)
    clf.predict(vectorized_data)
def __init__(self):
    # set up a unique logger
    logging.info(' start ------------------------------------------------------------------')

    # possible variants
    # self.vectorizer = ['word2vec', 'tfidf']
    # self.samples = [1000]  # number of examples
    # self.methods = ['pca', 'tsne']
    # self.dim = [50, 100, 300]

    self.vectorizer = ['word2vec']
    self.num_of_samples_to_print = [500]
    self.methods = ['tsne']
    self.dim = [300]
    self.preprocessing = ['stopwords']

    self.Data_loader = Data_loader()
    # self.Data_cleaner = Data_cleaner()

    # initialize the population with hypotheses to train and evaluate
    self.population = []

    # how many samples should be plotted
    for s in self.num_of_samples_to_print:
        # run with each vectorizer defined
        for vec in self.vectorizer:
            for m in self.methods:
                # only word2vec comes in different vector dimensions
                if vec == "word2vec":
                    for d in self.dim:
                        logging.info('initialize w2v hypothesis')
                        self.population.append(
                            Hypothese(data_loader=self.Data_loader, samples=s,
                                      red_method=m, w2v_dim=d, vectorizer=vec))
                else:
                    logging.info('initialize tfidf hypothesis')
                    self.population.append(
                        Hypothese(data_loader=self.Data_loader, samples=s,
                                  red_method=m, vectorizer=vec))
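# Sketch (an addition, not part of the original code): once the population has
# been built, each hypothesis can be evaluated with the run() method defined on
# Hypothese; how the surrounding class actually consumes the population is not
# shown in this excerpt.
#
# for hyp in self.population:
#     hyp.run()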
def test_NaiveBayes_sklearn():
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    # load data
    data = Data_loader().get_data()

    # create a vectorizer
    tfidf_vec = vec.get_Vectorizer(vectorizer='tfidf')

    # create a classifier
    clf = NaiveBayes_sklearn()

    # vectorize the data
    vectorized_data = tfidf_vec.vectorize(data=data)

    # train the classifier
    clf.classify(vectorized_data)

    # run inference with the classifier
    clf.predict(vectorized_data)
def use_word2vec_with_movie_reviews():
    clf = cls.LinearSVM()

    # samples per sentiment for cluster plotting
    samples = 10000

    # t-SNE related parameters
    perplexity = 80
    # learning_rates = np.logspace(2, 3, 5)
    learning_rates = [1000]

    # how to reduce the dimensionality of the word vectors / document vectors
    reduction_methode = 'tsne'
    # filter the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    # bias for the difference of the averaged document vectors:
    # how big should the difference between negative and positive features be?
    # biases = np.array([0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02,
    #                    0.01, 0.009, 0.008, 0.007, 0.006])
    biases = np.array([0.09])

    accuracies = np.zeros(len(biases))
    extracted_dim = np.zeros(len(biases))

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)

    # cache the vectorized features for faster parameter search
    import thesis.IO_Organizer as saver
    feature_filename = 'w2v_google'
    try:
        logging.info('Try to load vectorized features')
        vectorized_data_full = saver.load_features('dict_' + feature_filename)
        logging.info('Features loaded from file')
    except Exception:
        logging.info('Feature file not found, vectorizing reviews')
        data = Data_loader().get_data()
        word2vec = vec.get_Vectorizer(vectorizer='word2vec')
        vectorized_data_full = word2vec.vectorize(data=data)
        saver.save_features(vectorized_data_full, feature_filename)

    for learning_rate in learning_rates:
        for i, bias in enumerate(biases):
            logging.info(bias)

            # create a working copy
            vectorized_data = dict(vectorized_data_full)

            ############## plot most informative dimensions ##############
            # plot_sentiment_distribution(vectorized_data['train_neg_v'],
            #                             vectorized_data['train_pos_v'],
            #                             source='feats')

            # reduce the dimensionality of our document vectors
            # vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            # plotting
            plot_each_review_dimension(vectorized_data=vectorized_data, bias=bias)

            # extract the most significant dimensions of our document vectors
            if extract_dim:
                vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            #### testing purposes: shrink the whole data set to 2d
            # process the data batch-wise to avoid running out of memory
            batchsize = 4000
            for key in ('x_train_v', 'x_test_v', 'train_neg_v', 'train_pos_v'):
                reduced_to_2d = []
                for x in batch(vectorized_data[key], batchsize):
                    reduced_to_2d.extend(shrink_dim_to_2d(x))
                vectorized_data[key] = reduced_to_2d
            ####

            shrink_dim_and_plot_2d_clusters(
                neg_v=vectorized_data['train_neg_v'],
                pos_v=vectorized_data['train_pos_v'],
                reduction_methode=reduction_methode,
                bias=bias,
                perplexity=perplexity,
                learning_rate=learning_rate,
                normalize=normalize,
                extract_dim=extract_dim,
                truncate_by_svd=truncate_by_svd,
                source='feat')

            # select num_of_samples randomly
            # we need to subsample, or we run into a memory error
            # neg_samples_v = random.sample(vectorized_data['train_neg_v'], k=samples)
            # pos_samples_v = random.sample(vectorized_data['train_pos_v'], k=samples)
            # shrink_dim_and_plot_2d_clusters(neg_v=neg_samples_v,
            #                                 pos_v=pos_samples_v,
            #                                 reduction_methode=reduction_methode,
            #                                 bias=bias,
            #                                 perplexity=perplexity,
            #                                 learning_rate=learning_rate,
            #                                 normalize=normalize,
            #                                 extract_dim=extract_dim,
            #                                 truncate_by_svd=truncate_by_svd,
            #                                 source='feat')

            extr_dim = len(vectorized_data['x_train_v'][0])
            extracted_dim[i] = extr_dim

            # vectorized_data = vec.delete_relevant_dimensions(vectorized_data)

            ######## linear svm ################
            cl = cls.LinearSVM()
            cl.classify(vectorized_data)
            cl.predict(vectorized_data)

            cl = LinearSVC()
            cl.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            pred = cl.predict(vectorized_data['x_test_v'])
            acc = accuracy_score(y_true=vectorized_data['y_test'], y_pred=pred)
            logging.info('acc: ' + str(acc))
            accuracies[i] = acc

            del vectorized_data

            # vis.plot_hyperplane(clf=cl, X=vectorized_data['x_train_v'], Y=vectorized_data['y_train'])

            # ######### RandomForestClassifier #########
            # target_names = ['negative', 'positive']
            #
            # clf = RandomForestClassifier(n_jobs=2)
            # clf.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            # prediction = clf.predict(vectorized_data['x_test_v'])
            # logging.info(classification_report(vectorized_data['y_test'], prediction,
            #                                    target_names=target_names))

            # ######## LogisticRegression #############
            # from sklearn.linear_model import LogisticRegression
            # import pandas as pd
            #
            # lr = LogisticRegression()
            # lr.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            # prediction = lr.predict_proba(vectorized_data['x_test_v'])
            #
            # logging.info('LR acc: ' + str(lr.score(vectorized_data['x_test_v'], vectorized_data['y_test'])))
            # # metrics.accuracy_score(vectorized_data['y_test'], prediction)

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)
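# Sketch (an addition, not part of the original code): batch() and shrink_dim_to_2d()
# are used above but not defined in this excerpt. Plausible minimal versions, assuming
# batch() yields fixed-size chunks and shrink_dim_to_2d() projects a chunk of document
# vectors to 2d with t-SNE; the project's real helpers may well differ.
def batch(iterable, batchsize):
    # yield successive chunks of at most `batchsize` items
    for start in range(0, len(iterable), batchsize):
        yield iterable[start:start + batchsize]


def shrink_dim_to_2d(vectors):
    # project a batch of high-dimensional vectors to 2d (t-SNE is an assumption)
    from sklearn.manifold import TSNE
    return TSNE(n_components=2).fit_transform(np.asarray(vectors))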
# define the logfile and the log level in one place so all modules can reuse it
# we log to stdout and to ./logs/Gridsearch.log
logger = logging.getLogger()

hdlr = logging.FileHandler('./logs/' + 'Gridsearch' + '.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)

# console output
hdlr_console = logging.StreamHandler()
hdlr_console.setFormatter(formatter)
logger.addHandler(hdlr_console)

logger.setLevel(logging.INFO)

from thesis.Data import Data_loader

data = Data_loader().get_data()

# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = nltk.TweetTokenizer()


def tokenize(text):
    """Tokenize a text with the TweetTokenizer and Porter-stem each token."""
    # tokens = nltk.word_tokenize(text)
    tokens = tokenizer.tokenize(text)
    stemmer = nltk.PorterStemmer()  # create the stemmer once, not per token
    stems = []
    for item in tokens:
        stems.append(stemmer.stem(item))
        # stems.append(item)
    return stems
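# Sketch (an addition, not part of the original code): one plausible use of tokenize()
# is as the custom tokenizer of a tf-idf vectorizer. The wiring below and the
# 'x_train'/'x_test' keys of `data` are assumptions, not taken from this excerpt.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=tokenize)
x_train_v = tfidf.fit_transform(data['x_train'])  # key name is an assumption
x_test_v = tfidf.transform(data['x_test'])        # key name is an assumption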