def startAnalysis(folder, S1_path, S2_path):
    fetcher = PageFetcher()
    S1 = fetcher.fetchPages(folder, S1_path)
    S2 = fetcher.fetchPages(folder, S2_path)

    # We use a document representation based on the TF-IDF model. A single
    # model is fitted over both sources so that the two page sets share one
    # vocabulary and their vectors are directly comparable during alignment
    # (fitting separately per source would put them in different spaces).
    TF_IDF = Vectorizer()
    combined = TF_IDF.fit_transform(S1 + S2)
    S1_HTML = combined[:len(S1)]
    S2_HTML = combined[len(S1):]

    pageAllignament = PageAllignament()
    S1S2_Pairs = pageAllignament.allignSources(S1_HTML, S2_HTML)

    print("Stats of: " + str(S1_path) + " and " + str(S2_path))
    evaluation_pipeline(S1S2_Pairs)
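# PageAllignament.allignSources is not shown in this collection. The sketch
# below is an illustration only, under the assumption that pages are paired
# greedily by TF-IDF cosine similarity; the function name allign_by_cosine
# is hypothetical.
from sklearn.metrics.pairwise import cosine_similarity


def allign_by_cosine(S1_HTML, S2_HTML):
    # similarities[i, j] = cosine similarity between page i of source 1 and
    # page j of source 2 (both matrices live in the same TF-IDF space).
    similarities = cosine_similarity(S1_HTML, S2_HTML)
    # Pair each source-1 page with its most similar source-2 page.
    return [(i, int(similarities[i].argmax()))
            for i in range(similarities.shape[0])]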
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    # Initialize the TF-IDF vectorizer.
    if debug:
        print('[Job Vectorization 2/5] Initializing Vectorizer \n')
    vectorizer = Vectorizer()

    # Fit the vectorizer and transform the job texts in one step.
    if debug:
        print('[Job Vectorization 3/5] Transforming/Vectorizing data \n')
    tfidf_jobs = vectorizer.fit_transform(df_jobs['text'])

    if debug:
        print('[Job Vectorization 4/5] Saving vectorizer to {path} \n'.format(
            path=vectorizer_path))
    vectorizer.save_vectorizer(vectorizer_path)

    if debug:
        print('[Job Vectorization 5/5] Saving tfidf to {path} \n'.format(
            path=tfidfs_path))
    vectorizer.save_tfidfs(tfidf_jobs, tfidfs_path)
def main():
    # Load the training data.
    with timer("train data load"):
        df = load_data_from_gcs()

    # Preprocessing.
    with timer("preprocess"):
        df = preprocess(df)

    vectorizer = Vectorizer()
    X_train = df.drop(columns="price")
    y_train = df["price"]

    with timer("training"):
        X_train = vectorizer.fit_transform(X_train)

        # Train the model.
        base_params = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': {'type': 'adam', 'lr': 5e-5},
            'batch_size': 64,
        }
        model = ModelMLP(base_params)
        model.fit(X_train, y_train)

    with timer("save model"):
        # Save the model and the vectorization pipeline.
        vectorizer.save_vectorizer()
        model.save_model()
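# Several of these snippets depend on a Vectorizer class defined elsewhere in
# their respective codebases. A minimal sketch of such a wrapper, assuming it
# delegates to sklearn's TfidfVectorizer; the pickle-based save format is an
# assumption, while the method names mirror the calls used above.
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer


class Vectorizer(object):
    def __init__(self):
        self._tfidf = TfidfVectorizer()

    def fit_transform(self, documents):
        # Learn the vocabulary and IDF weights, return the TF-IDF matrix.
        return self._tfidf.fit_transform(documents)

    def transform(self, documents):
        # Map new documents into the already-fitted TF-IDF space.
        return self._tfidf.transform(documents)

    def save_vectorizer(self, path='vectorizer.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(self._tfidf, f)

    def save_tfidfs(self, tfidfs, path='tfidfs.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(tfidfs, f)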
import os
import cPickle

import numpy
from sklearn import cross_validation, metrics, naive_bayes, svm
from sklearn.utils import check_arrays

# Project-local dependencies: datasettings, Vectorizer, SENTIMENT_MAP,
# parse_training_corpus and parse_imdb_corpus are defined elsewhere in
# this codebase.


class Trainer(object):
    """Trains the classifier with training data and does cross validation.
    """

    def __init__(self):
        """Initializes the data structures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()
        # A list of already hand-classified tweets to train our classifier.
        self.data = None
        # A list containing the classification of each individual tweet
        # in the tweets list.
        self.classification = None
        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Initializes all types of training data we have.
        """
        corpus_file = open(os.path.join(datasettings.DATA_DIRECTORY,
                                        'full-corpus.csv'))
        classification, tweets = parse_training_corpus(corpus_file)

        # The IMDB review corpora are parsed here but are not currently
        # merged into the training set.
        reviews_positive = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'positive'))
        num_positive_reviews = len(reviews_positive)
        class_positive = ['positive'] * num_positive_reviews

        reviews_negative = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'negative'))
        num_negative_reviews = len(reviews_negative)
        class_negative = ['negative'] * num_negative_reviews

        self.data = tweets
        self.classification = classification

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.
        """
        # Map each sentiment to the value specified in SENTIMENT_MAP; any
        # sentiment that is not part of the map gets the value 0.
        classification_vector = numpy.array(map(
            lambda s: SENTIMENT_MAP.get(s.lower(), 0),
            self.classification))
        feature_vector = self.vectorizer.fit_transform(self.data)
        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Builds the sentiment dictionary and a vector of weights for tweets.
        """
        # Load the AFINN word lists; AFINN-111 entries override AFINN-96
        # entries for words that appear in both files.
        wordDict = {}
        for filename in ('AFINN-96.txt', 'AFINN-111.txt'):
            fileIn = open(os.path.join(datasettings.DATA_DIRECTORY, filename))
            for line in fileIn:
                temp = line.split('\t')
                wordDict[temp[0]] = int(temp[1])
            fileIn.close()

        # Score each tweet as the sum of the valences of its words.
        word_dict_vector = []
        for tweet in self.data:
            total = 0
            for word in tweet.split():
                if word in wordDict:
                    total += wordDict[word]
            word_dict_vector.append(total)
        return word_dict_vector

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.
        """
        return self.vectorizer.transform(test_data)

    def score_func(self, true, predicted):
        """Score function for the validation.
        """
        return metrics.precision_recall_fscore_support(
            true, predicted,
            labels=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
            ],
            average='macro')

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
""" self.scores = [] X, y = check_arrays(self.feature_vector, self.classification_vector, sparse_format='csr') cv = cross_validation.check_cv( k, self.feature_vector, self.classification_vector, classifier=True) for train, test in cv: self.classifier1.fit(self.feature_vector[train], self.classification_vector[train]) self.classifier2.fit(self.feature_vector[train], self.classification_vector[train]) self.classifier3.fit(self.feature_vector[train], self.classification_vector[train]) classification1 = self.classifier1.predict( self.feature_vector[test]) classification2 = self.classifier2.predict( self.feature_vector[test]) classification3 = self.classifier3.predict( self.feature_vector[test]) classification = [] for predictions in zip(classification1, classification2, classification3): neutral_count = predictions.count(0) positive_count = predictions.count(1) negative_count = predictions.count(-1) if (neutral_count == negative_count and negative_count == positive_count): classification.append(predictions[0]) elif (neutral_count > positive_count and neutral_count > negative_count): classification.append(0) elif (positive_count > neutral_count and positive_count > negative_count): classification.append(1) elif (negative_count > neutral_count and negative_count > positive_count): classification.append(-1) classification = numpy.array(classification) self.scores.append(self.score_func(y[test], classification)) def train_and_validate(self, cross_validate=False, mean=False, serialize=False): """Trains the SVC with the training data and validates with the test data. We do a K-Fold cross validation with K = 10. """ self.classification_vector, self.feature_vector = self.initial_fit() self.classifier1 = naive_bayes.MultinomialNB() self.classifier2 = naive_bayes.BernoulliNB() self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1', C=1000,dual=False, tol=1e-3) if cross_validate: self.cross_validate(k=cross_validate) else: self.classifier1.fit(self.feature_vector, self.classification_vector) self.classifier2.fit(self.feature_vector, self.classification_vector) self.classifier3.fit(self.feature_vector, self.classification_vector) if serialize: classifiers_file = open(os.path.join( datasettings.DATA_DIRECTORY, 'classifiers.pickle'), 'wb') cPickle.dump([self.classifier1, self.classifier2, self.classifier3], classifiers_file) vectorizer_file = open(os.path.join( datasettings.DATA_DIRECTORY, 'vectorizer.pickle'), 'wb') cPickle.dump(self.vectorizer, vectorizer_file) return self.scores def build_ui(self, mean=False): """Prints out all the scores calculated. """ for i, score in enumerate(self.scores): print "Cross Validation: %d" % (i + 1) print "*" * 40 if mean: print "Mean Accuracy: %f" % (score) else: print "Precision\tRecall\t\tF-Score" print "~~~~~~~~~\t~~~~~~\t\t~~~~~~~" precision = score[0] recall = score[1] f_score = score[2] print "%f\t%f\t%f" % (precision, recall, f_score) print
        else:
            res += pred
        # next_hidden = sess.run(tensors['next_hidden'], feed_dict=feed_dict)
        # initial_state = np.vstack((initial_state, next_hidden))[1:]
    return res


if __name__ == '__main__':
    print 'Loading data...'
    with open('../../data/smalldata.txt', 'r') as f:
        data = [line.strip() for line in f]

    vectorizer = Vectorizer(seq_length=25)
    print 'Fitting Vectorizer...'
    X_data, y_data = vectorizer.fit_transform(data)
    with open('vectorizer.pkl', 'w') as f:
        pickle.dump(vectorizer, f)

    N, seq_length, input_dim = X_data.shape
    hidden_dim = 128
    output_dim = input_dim

    # Placeholders for the input sequences, targets and the LSTM state.
    X = tf.placeholder(tf.float32, [None, seq_length, input_dim], 'X')
    y = tf.placeholder(tf.float32, [None, output_dim], 'y')
    initial_state = tf.placeholder(tf.float32, [None, 2 * hidden_dim],
                                   'initial_state')

    lstm, next_hidden = lstm_layer(X, input_dim, seq_length, hidden_dim,
                                   output_dim, initial_state, 'lstm')
    with tf.name_scope('predictions'):
        output_nonlinearity=softmax,
        update=nesterov_momentum,
        update_learning_rate=0.1,
        update_momentum=0.9,
        # update=adam,
        # update_learning_rate=0.01,
        max_epochs=10000,
        on_epoch_finished=[SaveBestModel('rnn', vectorizer)],
        batch_iterator_train=BatchIterator(batch_size),
        train_split=TrainSplit(eval_size=0.0),
        regression=False,
        verbose=2)
    return net


if __name__ == '__main__':
    print 'Loading data...'
    with open('data/data.txt', 'r') as f:
        data = [line.strip() for line in f]

    vectorizer = Vectorizer(seq_length=25)
    print 'Fitting Vectorizer...'
    X, y = vectorizer.fit_transform(data)
    with open('vectorizer.pkl', 'w') as f:
        pickle.dump(vectorizer, f)

    print 'Training Model...'
    net = build_net(vectorizer)
    try:
        net.fit(X, y)
    except KeyboardInterrupt:
        pass
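# Both sequence-model scripts above pickle their fitted vectorizer. A short
# sketch of reloading it later (e.g. for sampling), assuming the same pickle
# module and file name used above.
import pickle

with open('vectorizer.pkl', 'r') as f:
    vectorizer = pickle.load(f)
# The restored object exposes the same interface, so new text can be
# vectorized with the vocabulary learned at training time.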