def save_vectors_file():
    data = load_data(FLAGS.data_path)
    vectorizer = Vectorizer()
    logging.info('getting vectors')
    img_vectors = []
    genders = []
    for img_path, gender_id in tqdm(data.items()):
        try:
            img_array = get_img(img_path)
            vector = vectorizer.get_vector(img_array)
            img_vectors.append(vector)
            genders.append(gender_id)
        except Exception as e:
            logging.warning('exception: {}'.format(e))
    vectorizer.close()
    dim_reduction_technique = get_dim_reduction_technique(
        FLAGS.dim_reduction_technique)
    reduced, model = dim_reduction_technique(img_vectors, FLAGS.n_dimensions)
    save_pkl_file(model, FLAGS.reducter_path)
    save_pkl_file((reduced, genders), FLAGS.vectors_path)
def classify(self):
    # Classifies unknown forum posts
    if not self.fit:
        print("Fitting must be performed before classifying")
        return
    vectorizer = Vectorizer(self.dictionary.dictionary)
    input_file = input(
        "Enter the name of the .txt file containing the unknown posts (including file-ending): ")
    try:
        with open(input_file, "r") as file:
            vectors = vectorizer.vectorize(self.preprocessor.preprocess(file))
    except FileNotFoundError:
        if input("File not found. Press enter to try again or type 'm' and press enter "
                 "to return to menu.").lower() == "m":
            return
        self.classify()
        return
    with open("result.txt", "w") as result_file:
        for line in self.classifier.classify(vectors):
            result_file.write(label_list[line] + "\n")
    print("Result saved in result.txt. "
          "The predicted label of each post is printed on the corresponding line of the document.")
def setUp(self):
    self.vec = Vectorizer(layer=-1, backend='gpu', cores=32)
    # Generate a list of images
    base_image = os.path.expanduser('~') + '/SaturnServer/test_resources/map_image'
    self.imagenames = []
    for i in range(1, self.vec.cores + 1):
        self.imagenames.append("{}{}.jpg".format(base_image, i))
def cosineScore(self, vector1, vector2):
    # calculate dot product
    dotProduct = self.getDotProduct(vector1, vector2)
    # get magnitudes
    magnitudes = Vectorizer.getMagnitude(vector1) * Vectorizer.getMagnitude(vector2)
    if magnitudes == 0:
        # the smallest possible positive value; avoids a divide-by-zero error
        magnitudes = sys.float_info.epsilon
    return 1 - (dotProduct / magnitudes)
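# Hedged sketch (not from the original source): a self-contained check of the
# cosine-distance formula above, using plain Python. `dot` and `magnitude` are
# hypothetical stand-ins for getDotProduct/getMagnitude.
import math
import sys

def dot(v1, v2):
    return sum(a * b for a, b in zip(v1, v2))

def magnitude(v):
    return math.sqrt(dot(v, v))

def cosine_score(v1, v2):
    m = magnitude(v1) * magnitude(v2)
    if m == 0:
        m = sys.float_info.epsilon  # avoid divide-by-zero for all-zero vectors
    return 1 - dot(v1, v2) / m

assert abs(cosine_score([1, 0], [1, 0]) - 0.0) < 1e-9  # identical direction -> 0
assert abs(cosine_score([1, 0], [0, 1]) - 1.0) < 1e-9  # orthogonal -> 1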
class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models
    '''

    def __init__(self, folder='model', modeltype='kpca', topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder, steps=['hashing', 'tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        # BUGFIX: string identity checks ('is') replaced with equality ('==')
        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf', gamma=1., n_components=topics)
        if self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self, X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''
        # transform list of strings into sparse BoW matrix
        X = self.bow.transform(X)
        # X = self.bow['tfidf_transformer'].fit_transform(
        #     self.bow['count_vectorizer'].fit_transform(X))
        # depending on the model, train
        if self.modeltype == 'kpca':
            Xc = self.model.fit_transform(X)
        if self.modeltype == 'nmf':
            Xc = self.model.fit_transform(X)

    def predict(self, X):
        '''
        predicts cluster assignment from list of strings

        INPUT
        X   list of strings
        '''
        # BUGFIX: 'X is not list' is always True; use isinstance instead
        if not isinstance(X, list):
            X = [X]
        X = self.bow.transform(X)
        # X = self.bow['tfidf_transformer'].transform(
        #     self.bow['count_vectorizer'].transform(X))
        if self.modeltype == 'kpca':
            return self.model.transform(X)
        if self.modeltype == 'nmf':
            return self.model.transform(X)
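# Hedged usage sketch (not from the original source): assumes a Bag-of-Words
# Vectorizer has already been trained under 'model/'; documents are illustrative.
docs = ["the cat sat on the mat", "stocks rallied after strong earnings"]
tm = TopicEmbeddingModel(folder='model', modeltype='nmf', topics=2)
tm.fit(docs)
print(tm.predict("the dog sat on the rug"))  # one row of topic activations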
def __init__(self):
    # vectorizer class,
    # based on composition instead of inheritance principles
    self.vectorizer = Vectorizer()
    # weights learned and used by the model
    self.weights = np.array([])
    self.tag_enums = []
    self.tag_dict = {}
def _get_token_similarity(query_string, pred_string):
    query_tokens = Tokenizer.tokenize(query_string)
    pred_tokens = Tokenizer.tokenize(pred_string)
    pred_vec = dict(zip(pred_tokens, Vectorizer.vectorize_tokens(pred_tokens)))
    query_vec = dict(zip(query_tokens, Vectorizer.vectorize_tokens(query_tokens)))
    ret = {}
    for k, v in query_vec.items():
        dist = cdist([v], np.stack(list(pred_vec.values()), axis=0), metric="cosine")[0]
        idx = dist.argsort()[:2]
        ret.update({k: list(np.asarray(list(pred_vec.keys()))[idx])})
    return ret
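# Hedged sketch (not from the original source): a self-contained toy analogue
# of _get_token_similarity, replacing Tokenizer/Vectorizer with a hypothetical
# embedding lookup so the nearest-2 mapping is easy to see.
import numpy as np
from scipy.spatial.distance import cdist

toy_embeddings = {
    "virus": np.array([1.0, 0.0]),
    "viral": np.array([0.9, 0.1]),
    "vaccine": np.array([0.0, 1.0]),
    "immunization": np.array([0.1, 0.9]),
}

def token_similarity(query_tokens, pred_tokens):
    pred_vec = {t: toy_embeddings[t] for t in pred_tokens}
    ret = {}
    for k in query_tokens:
        dist = cdist([toy_embeddings[k]],
                     np.stack(list(pred_vec.values()), axis=0),
                     metric="cosine")[0]
        idx = dist.argsort()[:2]  # two closest predicted tokens per query token
        ret[k] = list(np.asarray(list(pred_vec.keys()))[idx])
    return ret

print(token_similarity(["virus"], ["viral", "vaccine", "immunization"]))
# {'virus': ['viral', 'immunization']}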
def main(filename, category_filename, answer_col, predictor_col, hidden_nodes):
    df = pd.read_csv(filename, usecols=[answer_col, predictor_col])
    categories = pd.read_csv(category_filename, usecols=[predictor_col])[predictor_col].values
    vectorizer = Vectorizer(df, categories, predictor_col, answer_col)
    vectorizer.format(0.6, 0.2)
    batch_size = 1000
    epochs = 50
    learning_rate = 1e-3
    model = build_and_train(vectorizer, batch_size, epochs, learning_rate, hidden_nodes)
    validate(model, vectorizer)
    joblib.dump(model, filename + '.joblib')
def main():
    img = get_img(FLAGS.img_path)
    vectorizer = Vectorizer()
    vector = vectorizer.get_vector(img)
    vectorizer.close()
    reducter = load_pkl_file(FLAGS.reducter_path)
    reduced = reducter.transform([vector])
    model = load_pkl_file(FLAGS.model_path)
    output = model.predict(reduced)[0]
    print('result: {}'.format(output))
def startAnalysis(folder, S1_path, S2_path):
    fetcher = PageFetcher()
    S1 = fetcher.fetchPages(folder, S1_path)
    S2 = fetcher.fetchPages(folder, S2_path)
    # We use a document representation based on the TF-IDF model.
    # BUGFIX: fit a single vocabulary over both sources so their vectors are
    # comparable (the original fit each source separately, giving two
    # incompatible feature spaces).
    TF_IDF = Vectorizer()
    S_all = TF_IDF.fit_transform(S1 + S2)
    S1_HTML = S_all[:len(S1)]
    S2_HTML = S_all[len(S1):]
    pageAllignament = PageAllignament()
    S1S2_Pairs = pageAllignament.allignSources(S1_HTML, S2_HTML)
    print("Stats of: " + str(S1_path) + " and " + str(S2_path))
    evaluation_pipeline(S1S2_Pairs)
def get_most_similar_title(query_title, df, top_n=5):
    logger.info(f"Query: \t\t {query_title}")
    v0, tokens = Vectorizer.vectorize_sent(query_title, get_tokens=True)
    logger.info(f"Processed Query: {' '.join(tokens)}\n")
    dist = cdist([v0], np.stack(df.title_vect.values, axis=0), metric='cosine')[0]
    idx = dist.argsort()[:top_n]
    values = df[[
        "title", "abstract", "publish_time", "authors", "journal", "source_x", "url"
    ]].loc[idx].to_dict("records")
    ret = dict({
        "query": query_title,
        "processed_query": ' '.join(tokens),
        "pred": {}
    })
    for n, i, each in zip(range(1, top_n + 1), idx, values):
        tok_sim = _get_token_similarity(" ".join(tokens), each["title"])
        ret["pred"].update({
            n: {
                "score": round((1.0 - dist[i]), 5),
                "title": each["title"],
                "abstract": each["abstract"],
                "publish_time": each["publish_time"],
                "authors": each["authors"],
                "journal": each["journal"],
                "source_x": each["source_x"],
                "url": each["url"],
                # "token_similarity": tok_sim
            }
        })
    return ret
def create_tf_idf(file_path):
    reader = TrainingTextReader(file_path)
    keywords = KeywordExtractor(reader.articles[10], 'useless.txt')
    vector_index = Vectorizer(keywords.article_sents_tokened)
    freq_mat = vector_index.frequencyMatrix
    normalized_vector = VectorNormalizer(freq_mat)
    norm_mat = normalized_vector.l2_norm_matrice
    tf_idf = InverseDocumentFrequency(norm_mat)
    return tf_idf.tf_idf_matrice
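# Hedged worked example (not from the original source): the classic TF-IDF
# weighting that a pipeline like the one above builds up, on a toy two-document
# corpus. The exact normalization in the original classes may differ.
import math

docs = [["the", "cat", "sat"], ["the", "dog", "sat"]]
N = len(docs)

def tf_idf(term, doc):
    tf = doc.count(term) / len(doc)               # term frequency in this document
    df = sum(1 for d in docs if term in d)        # number of documents containing the term
    return tf * math.log(N / df)                  # rare terms get higher weight

print(tf_idf("cat", docs[0]))  # > 0: "cat" is distinctive for doc 0
print(tf_idf("the", docs[0]))  # 0.0: "the" appears in every document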
def test_regression__vectorizer_layer_minus_one_behaves_same(self):
    # GIVEN a layer to test
    layer_under_test = -1
    # AND a vectorizer that uses that layer
    vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path, backend='cpu')
    # AND an expected output
    expected_output = [0.0016, 0.9883, 0.0099, 0.00]
    #
    # WHEN extracting the attributes from an image
    print 'This test has not stalled; it takes 20-40 seconds on a fast-ish computer (%s)' \
          % strftime("%H:%M:%S", gmtime())
    actual_output = roundArray(vec.get_attribute_vector(image_loc))
    #
    # THEN the output is as expected
    self.assertEqual(expected_output, actual_output,
                     'The output %s does not match the expected output of %s'
                     % (str(actual_output), str(expected_output)))
def main():
    with timer("model loading"):
        # load the model and the preprocessing pipeline
        model = ModelMLP()
        model.load_model()
        vectorizer = Vectorizer()
        vectorizer.load_vectorizer()
    with timer("data loading"):
        # load the data to run predictions on
        df = load_data_from_gcs()
    with timer("preprocess"):
        df = preprocess(df)
    with timer("predict"):
        X = df.drop(columns="price")
        X = vectorizer.transform(X)
        pred = model.predict(X)
    print(pred[:10])
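# Hedged sketch (not from the original source): a minimal `timer` context
# manager with the behavior the `with timer("...")` blocks above assume.
from contextlib import contextmanager
import time

@contextmanager
def timer(name):
    start = time.time()
    yield
    print('[{}] done in {:.1f} s'.format(name, time.time() - start))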
def _load_data(self, data_dir, word_tokens, pristine_input, pristine_output,
               batch_size, seq_length, seq_step):
    try:
        with open(os.path.join(data_dir, 'input.txt'), encoding='utf-8') as input_file:
            text = input_file.read()
    except FileNotFoundError:
        print_red("No input.txt in data_dir")
        sys.exit(1)

    skip_validate = True
    try:
        with open(os.path.join(data_dir, 'validate.txt')) as validate_file:
            text_val = validate_file.read()
        skip_validate = False
    except FileNotFoundError:
        pass  # Validation text optional

    # Find some good default seed string in our source text.
    self.seeds = find_random_seeds(text)
    # Include our validation texts with our vectorizer
    all_text = text if skip_validate else '\n'.join([text, text_val])
    self.vectorizer = Vectorizer(all_text, word_tokens, pristine_input, pristine_output)

    data = self.vectorizer.vectorize(text)
    x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
    print('x.shape:', x.shape)
    print('y.shape:', y.shape)
    if skip_validate:
        return x, y, None, None

    data_val = self.vectorizer.vectorize(text_val)
    x_val, y_val = shape_for_stateful_rnn(data_val, batch_size, seq_length, seq_step)
    print('x_val.shape:', x_val.shape)
    print('y_val.shape:', y_val.shape)
    return x, y, x_val, y_val
def start(self):
    bag_of_words, words = TermFrequency(self.trained).create_vocabulary()
    v = Vectorizer(self.trained, self.classify, words, bag_of_words)
    tfidf_trained = v.tfidf_for_tweets_trained
    evaluations = v.evaluations
    tfidf_to_classify = v.tfidf_for_tweets_to_classify
    models = Models(tfidf_trained, evaluations, tfidf_to_classify)
    prediction = models.svm_linear()
    return prediction
def main():
    with timer("train data load"):
        # load the training data
        df = load_data_from_gcs()
    with timer("preprocess"):
        df = preprocess(df)
    vectorizer = Vectorizer()
    X_train = df.drop(columns="price")
    y_train = df["price"]
    with timer("training"):
        X_train = vectorizer.fit_transform(X_train)
        # train the model
        base_params = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': {'type': 'adam', 'lr': 5e-5},
            'batch_size': 64,
        }
        model = ModelMLP(base_params)
        model.fit(X_train, y_train)
    with timer("save model"):
        # save the model and the pipeline
        vectorizer.save_vectorizer()
        model.save_model()
def preprocess_and_fit(self):
    # Method that preprocesses data, indexes all words, vectorizes posts
    # and finally trains and tests the classifier
    processed = []
    processed_test = []
    for category in self.categories:
        processed.append(self.preprocessor.preprocess('training' + str(category) + ".txt"))
        processed_test.append(self.preprocessor.preprocess('testing' + str(category) + ".txt"))

    # Word indexing
    for category in processed:
        # indexes all words into dictionary
        self.dictionary.index_words(category)
    print("Words indexed. Dictionary size: ", len(self.dictionary.dictionary), " words")

    # Vectorization
    vectorizer = Vectorizer(self.dictionary.dictionary)  # initializes vectorizer-object with dictionary
    vector_start = time.time()
    print("Vectorizing...")
    training_vectors = []
    testing_vectors = []
    for category in processed:
        training_vectors.append(vectorizer.vectorize(category))
    for category in processed_test:
        testing_vectors.append(vectorizer.vectorize(category))
    vector_time = time.time() - vector_start
    print("Vectorization completed in ", ("%.2f" % vector_time), "seconds")

    # Training and evaluation
    self.classifier.train(training_vectors)
    self.fit = True
    self.classifier.evaluate(testing_vectors)
def test_regression__vectorizer_layer_minus_four_behaves_same(self):
    # GIVEN a layer to test
    layer_under_test = -4
    # AND a vectorizer that uses that layer
    vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path, backend='cpu')
    # AND an expected output stored in a file
    expected_output_file_path = os.path.expanduser('~') + '/SaturnServer/test_resources/layer4results.txt'
    #
    # WHEN extracting the attributes from an image
    print 'This test has not stalled; it takes 20-40 seconds on a fast-ish computer (%s)' \
          % strftime("%H:%M:%S", gmtime())
    actual_output = roundArray(vec.get_attribute_vector(image_loc))
    #
    # THEN each element of the actual output array must match each element of the expected results
    with open(expected_output_file_path, 'r') as expected_output_file:
        element_no = 0
        for expected_element in expected_output_file:
            self.assertEqual(float(expected_element), actual_output[element_no],
                             'The output (element %d) %s does not match the expected output of %s'
                             % (element_no, str(actual_output[element_no]), str(expected_element)))
            element_no += 1
def calculate_cooccurrence(config):
    with open(config.input_filepath, "rb") as f:
        corpus = pickle.load(f)
    vectorizer = Vectorizer.from_corpus(
        corpus=corpus,
        vocab_size=config.vocab_size
    )
    cooccurrence = CooccurrenceEntries.setup(
        corpus=corpus,
        vectorizer=vectorizer
    )
    cooccurrence.build(
        window_size=config.window_size,
        num_partitions=config.num_partitions,
        chunk_size=config.chunk_size,
        output_directory=config.cooccurrence_dir
    )
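# Hedged usage sketch (not from the original source): a hypothetical config
# object with exactly the fields calculate_cooccurrence reads above. The field
# names mirror the attribute accesses; the values are illustrative only.
from types import SimpleNamespace

config = SimpleNamespace(
    input_filepath="corpus.pkl",
    vocab_size=50000,
    window_size=10,
    num_partitions=4,
    chunk_size=100000,
    cooccurrence_dir="cooccurrence/",
)
# calculate_cooccurrence(config)  # requires corpus.pkl to exist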
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    # initializing the tf-idf vectorizer
    if debug:
        print('[Job Vectorization 2/5] Initializing Vectorizer \n')
    vectorizer = Vectorizer()
    if debug:
        print('[Job Vectorization 3/5] Transforming/Vectorizing data \n')
    tfidf_jobs = vectorizer.fit_transform(df_jobs['text'])  # fitting and transforming the vector
    if debug:
        print('[Job Vectorization 4/5] Saving vectorizer to {path} \n'.format(path=vectorizer_path))
    vectorizer.save_vectorizer(vectorizer_path)
    if debug:
        print('[Job Vectorization 5/5] Saving tfidf to {path} \n'.format(path=tfidfs_path))
    vectorizer.save_tfidfs(tfidf_jobs, tfidfs_path)
ix_to_rel = {i: r for i, r in enumerate(rel_set)}

num_words = len(word_set)
num_tags = len(tag_set)
num_rels = len(rel_set)

ROOT_TAG = "root"
WORD_SIZE = 100
TAG_SIZE = 30
HIDDEN_SIZE = 100
NUM_EPOCHS = 3

word_vectorizer = Vectorizer(WordExtractor(sents), None, "parser_word", WORD_SIZE,
                             filler=ZeroFiller(WORD_SIZE),
                             ce_enabled=False, tf_enabled=False)
tag_vectorizer = Vectorizer(TagExtractor(sents), None, "parser_pos", TAG_SIZE,
                            filler=ZeroFiller(TAG_SIZE),
                            ce_enabled=False, tf_enabled=False)
parser = SyntaxParser(num_words, WORD_SIZE, num_tags, TAG_SIZE,
                      WORD_SIZE + TAG_SIZE, HIDDEN_SIZE, num_rels)
optimizer = optim.SGD(parser.parameters(), lr=0.1)
loss_function = nn.NLLLoss()
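# Hedged note (not from the original source): nn.NLLLoss, as used above,
# expects log-probabilities, so SyntaxParser presumably ends in log_softmax.
# Minimal self-contained check with modern PyTorch:
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(3, 5)          # 3 tokens, 5 candidate labels
targets = torch.tensor([0, 2, 4])   # gold label per token
loss = nn.NLLLoss()(F.log_softmax(logits, dim=1), targets)
print(loss.item())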
    sorted_article_list = []
    for article, score in relevance_sorted_articles:
        print "Id: ", article.id_num
        print "Link: ", article.link
        print "Description: ", article.description
        print article.title, ":", score
        print '\n\n\n'
        sorted_article_list.append(article)
    return sorted_article_list

#==================FINDING TRENDING ARTICLES=================
trending_articles = findTrending(PICKELED_RECENT_ARTICLES_ALL_TOPICS)
vectorizer = Vectorizer()
vectorized_trending_articles = vectorizer.vectorize(trending_articles)
setArticleVectors(trending_articles, vectorized_trending_articles)

# for article in trending_articles:
#     print article.description, article.vector

dimensions = str(len(vectorized_trending_articles)) + " x " + str(len(vectorized_trending_articles[0]))
print "Term document matrix with " + dimensions + ": \n", vectorized_trending_articles

#==================KMEANS STARTS HERE=======================
# print "Calculating kmeans..."
# kmeans_calculator = KMeansClusterer()
PREPROCESSOR = Preprocessor(thesaurus_path)  # if no thesaurus path is passed, no substitution is performed
print('Running preprocessing')
PREPROCESSOR.load_text([text_path])
whitelist = PREPROCESSOR.investigate_whitelist(thesaurus_path)
print('Saving')
PREPROCESSOR.save(auto_text_path)

PARSER = Parser()
print('Running dependency parsing...')
PARSER.t2f([auto_text_path + '/' + root + '.text'], kytea_model=kytea_path, eda_model=eda_path)
print('Saving the results')
PARSER.save(tree_path)  # save the dependency-parsed output to files

print("Loading the index...")
VECTORIZER = Vectorizer(index_path, t=1, list=whitelist)  # load the index
print('Loading the trees')
vectors = VECTORIZER.get_vector([tree_path + '/' + root + '.eda'], filter=3)  # build the vectors
print(vectors)
print('Saving the vectors')
VECTORIZER.save(vectors, [vector_path])  # save the vectors

#-----
# Compare the stored TF-IDF corpus vectors with the query vector tfidf_vectors
#----
print('Loading the TF-IDF corpus vectors')
tfidf_corpus_vectors = VECTORIZER.load(sorted(glob.glob(tfidf_DB_path + '/*.vector')))
print(tfidf_corpus_vectors)
print('Running dependency parsing...')
PARSER.t2f(sorted(glob.glob(auto_text_path + '/*')),
           kytea_model=kytea_path, eda_model=eda_path)  # dependency-parse the files under text_path
print('Saving the results')
PARSER.save(tree_path)  # save the parsed trees to files

INDEX = Index(unigram=1, dep_trigram=1, bigram=1, dep_bigram=1)
# build an Index by reading unigram and bigram features out of the trees
print('Loading the trees')
INDEX.add_index(sorted(glob.glob(tree_path + '/*')))  # build the index from the files under tree_path
print('Saving the index...')
INDEX.save(index_path)  # save the index to index_path
print(index_path)

print("Loading the index...")
VECTORIZER = Vectorizer(index_path, t=1, list=whitelist)  # load the index; the threshold is 1
print('Loading the trees')
vectors = VECTORIZER.get_vector(sorted(glob.glob(tree_path + '/*')), filter=3)  # build the vectors
print(vectors)
print('Saving the vectors')
filename_list = sorted(glob.glob(tree_path + '/*'))
vector_path_list = []
for filename in filename_list:
    base_name = os.path.basename(filename)  # A.text
    root = os.path.splitext(base_name)[0]   # A
    file_name = vector_folder_path + '/' + root + '.vector'
    vector_path_list.append(file_name)
VECTORIZER.save(vectors, vector_path_list)  # save the vectors
print(vector_path_list)
class Model:
    def __init__(self, name, sents, vectorizer_words, vectorizer_forms,
                 embedding_size, tag_sents, tag_embedding_size, context_size,
                 lrs=(0.1, 0.1, 0.1), lr_decrease_factor=0.5,
                 epochs_per_decrease=10):
        ######################################################################
        # Model's parameters.
        # 'sents' is a list of sentences of tuples ((form, word, tag), rel, head)
        self.name = name
        self.sents = sents
        self.embedding_size = embedding_size
        self.context_size = context_size

        ######################################################################
        # Load or create indices.
        # Common
        self.path_base = "internal"
        self.num_words = 0
        self.root_tag = "root"

        # CUDA flag
        self.is_cuda_available = torch.cuda.is_available()

        # For POS tags:
        self.tags = set()
        self.num_tags = 0
        self.tag2index = {}
        self.index2tag = {}

        # For chunk tags:
        self.chunks = set()
        self.num_chunks = 0
        self.chunk2index = {}
        self.index2chunk = {}

        # For relation tags:
        self.rels = set()
        self.num_rels = 0
        self.rel2index = {}
        self.index2rel = {}

        # Update database
        self.create_or_load_indices()
        if self.num_words == 0:
            self.num_words = self.get_num_words(self.sents)

        ######################################################################
        # Logic.
        # Learning rate controls
        self.lrs = lrs
        self.lr_decrease_factor = lr_decrease_factor
        self.epochs_per_decrease = epochs_per_decrease

        # Define machines
        self.vectorizer = Vectorizer(vectorizer_words, vectorizer_forms, name,
                                     embedding_size,
                                     filler=ZeroFiller(embedding_size),
                                     ce_enabled=True)
        # self.vectorizer = FastTextVectorizer(name, embedding_size * 2, "ft_sg_syntagrus.bin")
        self.tag_vectorizer = Vectorizer(tag_sents, None, name + "_pos",
                                         tag_embedding_size,
                                         filler=ZeroFiller(tag_embedding_size),
                                         ce_enabled=False, tf_enabled=False)

        # Tags embeddings (H).
        # Chunker will get linear combination as an input:
        #   I = H^T * p
        # p - probabilities vector
        self.tag_embeddings = []
        for i in range(self.num_tags):
            tag = self.index2tag[i].lower()
            self.tag_embeddings.append(self.tag_vectorizer(tag, tag))
        self.tag_embeddings = torch.stack(self.tag_embeddings)
        if self.is_cuda_available:
            self.tag_embeddings = self.tag_embeddings.cuda()

        # Vector size is 1 (TF) + 100 (Word embedding) + 100 (Char grams embedding)
        self.vector_size = self.vectorizer.get_vector_size()
        self.tag_size = self.tag_vectorizer.get_vector_size()
        # Chunk size.
        # Benchmark is 200 (POS hidden) + 201 (embedding) + NUM_TAGS (probabilities)
        self.chunk_size = 2 * embedding_size + self.vector_size + self.tag_size
        # Parse size -- input size for parser.
        # When chunking is not available, parse size is equal to chunk size
        self.parse_size = self.chunk_size

        self.log("tagger input size: {}".format(self.vector_size))
        self.log("chunker input size: {}".format(self.chunk_size))
        self.log("parser input size: {}".format(self.parse_size))

        self.tagger = Tagger(self.vector_size, self.num_tags, "GRU", embedding_size)
        # self.chunker = Tagger(self.chunk_size, self.num_chunks, "LSTM", embedding_size)
        self.parser = SyntaxParser(0, 0, 0, 0, self.parse_size, embedding_size, self.num_rels)

        self.is_tagger_trained = False
        # self.is_chunker_trained = False
        self.is_parser_trained = False

        self.tagger_name = "pos tagging"
        # self.chunker_name = "chunking"
        self.parser_name = "parsing"

        # Try to load from file
        self.tagger_path = "{}/model_pos_{}.pt".format(self.path_base, self.name)
        # self.chunker_path = "{}/model_chunk_{}.pt".format(self.path_base, self.name)
        self.parser_path = "{}/model_parse_{}.pt".format(self.path_base, self.name)

        if os.path.exists(self.tagger_path):
            self.log("Loading POS tagger")
            self.tagger = torch.load(self.tagger_path)
            self.tagger.unit.flatten_parameters()
            self.is_tagger_trained = True
            self.log("Done")

        # if os.path.exists(self.chunker_path):
        #     self.log("Loading chunker")
        #     self.chunker = torch.load(self.chunker_path)
        #     self.chunker.unit.flatten_parameters()
        #     self.is_chunker_trained = True
        #     self.log("Done")

        if os.path.exists(self.parser_path):
            self.log("Loading parser")
            self.parser = torch.load(self.parser_path)
            self.parser.unit.flatten_parameters()
            self.is_parser_trained = True
            self.log("Done")

    ##########################################################################

    def train(self, sents, num_epochs, machines):
        ######################################################################
        # Define optimizers
        tag_optimizer = optim.SGD(self.tagger.parameters(), lr=self.lrs[0])
        # chunk_optimizer = optim.SGD(self.chunker.parameters(), lr=self.lrs[1])
        chunk_optimizer = None
        # Parameters for both machines
        params = list(self.tagger.parameters()) + list(self.parser.parameters())
        parse_optimizer = optim.SGD(params, lr=self.lrs[2])

        tag_loss_function = nn.NLLLoss()
        chunk_loss_function = nn.NLLLoss()
        parse_loss_function = nn.NLLLoss()

        ######################################################################
        # Run loop
        start_time = time.time()
        for epoch in range(num_epochs):
            print("epoch #{}: ".format(epoch), end="", flush=True)
            optimizers = [tag_optimizer, chunk_optimizer, parse_optimizer]
            self.loop(sents, optimizers,
                      [tag_loss_function, chunk_loss_function, parse_loss_function],
                      [None, None, None])
            self.decrease_lr(optimizers, epoch + 1,
                             self.lr_decrease_factor, self.epochs_per_decrease)
        print("elapsed: {} s".format(int(time.time() - start_time)))

        # Out misses
        # self.print_vectorizer_misses()

        # Save model
        torch.save(self.tagger, self.tagger_path)
        # torch.save(self.chunker, self.chunker_path)
        torch.save(self.parser, self.parser_path)
        self.log("Done")

    ##########################################################################

    def test(self, sents):
        # Collect statistics
        tag_score = TagScore(self.tags)
        # chunk_score = ChunkScore(self.chunks)
        chunk_score = None
        parse_score = ParserScore()
        num_correct_tags = 0
        num_correct_chunks = 0
        num_words = 0

        start_time = time.time()
        self.loop(sents, [None, None, None],
                  [nn.NLLLoss(), nn.NLLLoss(), nn.NLLLoss()],
                  [tag_score, chunk_score, parse_score])
        print("elapsed: {} s".format(int(time.time() - start_time)))

        # Out statistics
        print("POS Tagging:")
        f1_s = []
        has_zero = False
        for tag in sorted(self.tags):
            stat = tag_score.stats[tag]
            if stat.num_gold_predicted == 0 or stat.num_gold == 0 or stat.num_predicted == 0:
                print("\tskipped: {:>5} ({} items)".format(tag, stat.num_gold))
                continue

            num_words += stat.num_gold
            num_correct_tags += stat.num_gold_predicted

            precision = stat.num_gold_predicted / max(stat.num_predicted, 1.0)
            recall = stat.num_gold_predicted / max(stat.num_gold, 1.0)
            f1 = 0.0
            if math.isclose(precision, 0.0) or math.isclose(recall, 0.0):
                has_zero = True
            else:
                f1 = hmean([precision, recall])
                f1_s.append(f1)
            print("\t{:>5}: P = {:4.2f}%, R = {:4.2f}%, F1 = {:4.2f}% ({} items)".format(
                tag, precision * 100, recall * 100, f1 * 100, stat.num_gold))

            # ratio = 0
            # if stat[1] != 0:
            #     ratio = stat[0] / stat[1]
            # ratio *= 100
            #
            # print("\t{:>4}: {:4} / {:4} = {:4.2f}%".format(tag, stat[0], stat[1], ratio))

        # print("Chunking:")
        # for chunk in sorted(self.chunks):
        #     stat = chunk_score.stats[chunk]
        #     num_correct_chunks += stat[0]
        #
        #     ratio = 0
        #     if stat[1] != 0:
        #         ratio = stat[0] / stat[1]
        #     ratio *= 100
        #
        #     print("\t{:>4}: {:4} / {:4} = {:4.2f}%".format(chunk, stat[0], stat[1], ratio))

        # self.print_vectorizer_misses()

        # POS aggregated
        print("Total words:", num_words)
        print("Correct POS tags:", num_correct_tags, "({:4.2f}%)".format(
            num_correct_tags / num_words * 100.0))
        average_f1 = 0.0
        if not has_zero:
            average_f1 = hmean(f1_s)
        print("Average F1 = {:4.2f}%".format(average_f1 * 100))

        # Chunks aggregated
        # print("Correct chunk tags:", num_correct_chunks, "({:4.2f}%)".format(
        #     num_correct_chunks / num_words * 100.0))
        # precision = chunk_score.num_retrieved_relevant / chunk_score.num_retrieved
        # recall = chunk_score.num_retrieved_relevant / chunk_score.num_relevant
        # f1_score = hmean([precision, recall])
        # print("\tPrecision: {:f}".format(precision))
        # print("\tRecall: {:f}".format(recall))
        # print("\tF1 score: {:f}".format(f1_score))

        print("Parsing:")
        print("\tUAS: {} from {} ({:4.2f}%)".format(
            parse_score.num_unlabeled_arcs, num_words,
            parse_score.num_unlabeled_arcs / num_words * 100.0))
        print("\tLAS: {} from {} ({:4.2f}%)".format(
            parse_score.num_labeled_arcs, num_words,
            parse_score.num_labeled_arcs / num_words * 100.0))
        print("\tMUAS: {} from {} ({:4.2f}%)".format(
            parse_score.num_modified_unlabeled_arcs, num_words,
            parse_score.num_modified_unlabeled_arcs / num_words * 100.0))
        print("\tCRel: {} from {} ({:4.2f}%)".format(
            parse_score.num_labels, num_words,
            parse_score.num_labels / num_words * 100.0))
        print("\tUEM: {} from {} ({:4.2f}%)".format(
            parse_score.num_unlabeled_trees, len(sents),
            parse_score.num_unlabeled_trees / len(sents) * 100.0))
        print("\tLEM: {} from {} ({:4.2f}%)".format(
            parse_score.num_labeled_trees, len(sents),
            parse_score.num_labeled_trees / len(sents) * 100.0))
        print("\tMUEM: {} from {} ({:4.2f}%)".format(
            parse_score.num_modified_unlabeled_trees, len(sents),
            parse_score.num_modified_unlabeled_trees / len(sents) * 100.0))
        self.log("Done")

    ##########################################################################
    # Lose control and decrease the pace
    # Warped and bewitched
    # It's time to erase
    @staticmethod
    def decrease_lr(optimizers, epoch, factor, epoch_interval):
        if epoch % epoch_interval != 0:
            return
        print("lr is multiplied by {:f}".format(factor))
        for optimizer in optimizers:
            if optimizer is not None:
                for param_group in optimizer.param_groups:
                    param_group["lr"] *= factor

    ##########################################################################
    # Used both by 'train' and 'test'.
    # The only difference is that optimizers mustn't be provided
    # during testing
    def loop(self, sents, optimizers, loss_functions, scores):
        tag_optimizer = optimizers[0]
        chunk_optimizer = optimizers[1]
        parse_optimizer = optimizers[2]
        tag_loss_function = loss_functions[0]
        chunk_loss_function = loss_functions[1]
        parse_loss_function = loss_functions[2]
        tag_scores = scores[0]
        chunk_scores = scores[1]
        parse_scores = scores[2]

        # Total number of words
        num_words = self.get_num_words(sents)
        words_per_interval = num_words // 10

        # Average losses
        tag_loss = 0
        chunk_loss = 0
        parse_loss = 0

        # Reset vectorizer
        self.vectorizer.reset_counters()

        # Reset progress bar
        print("progress [", end="", flush=True)
        current_word = 0
        next_interval = words_per_interval

        # Dump
        fout = open("result.conllu", "w")

        for sent in sents:
            # Reset optimizers and machines
            # if tag_optimizer is not None:
            #     tag_optimizer.zero_grad()
            # if chunk_optimizer is not None:
            #     chunk_optimizer.zero_grad()
            if parse_optimizer is not None:
                parse_optimizer.zero_grad()
            self.tagger.reset()
            # self.chunker.reset()

            ##############################################################
            # POS Tagger.
            # Prepare input for tagger and targets for chunker
            sequence = []
            tag_targets = []
            # chunk_targets = []
            parse_head_targets = []
            parse_rel_targets = []
            for dep, rel, head in sent:
                form, word, tag = dep
                sequence.append(self.vectorizer(form, word))
                tag_targets.append(self.tag2index[tag])
                # chunk_targets.append(self.chunk2index[chunk])
                parse_head_targets.append(head)
                parse_rel_targets.append(self.rel2index[rel])

            sequence = Variable(torch.stack(sequence, dim=0))
            tag_targets = Variable(torch.LongTensor(tag_targets))
            # chunk_targets = Variable(torch.LongTensor(chunk_targets))
            parse_head_targets = Variable(torch.LongTensor(parse_head_targets))
            parse_rel_targets = Variable(torch.LongTensor(parse_rel_targets))
            if self.is_cuda_available:
                tag_targets = tag_targets.cuda()
                # chunk_targets = chunk_targets.cuda()
                parse_head_targets = parse_head_targets.cuda()
                parse_rel_targets = parse_rel_targets.cuda()

            # Optimize tagger
            tag_output = self.tagger(sequence.view((len(sent), 1, -1)))
            current_loss = tag_loss_function(tag_output, tag_targets)
            tag_loss += current_loss.data[0]
            # if tag_optimizer is not None:
            #     current_loss.backward()
            #     tag_optimizer.step()

            ##############################################################
            # Chunking.
            # Prepare input for chunker
            sequence = Variable(sequence.data)
            probabilities = torch.exp(tag_output)
            probabilities = probabilities.mm(Variable(self.tag_embeddings))
            tagger_hidden = self.tagger.last_output.view((len(sent), -1))
            sequence = torch.cat((tagger_hidden, sequence, probabilities), dim=1)
            # sequence = Variable(torch.cat((sequence, probabilities), dim=1))
            # sequence = Variable(sequence)

            # Optimize chunker
            # chunk_output = self.chunker(sequence.view((len(sent), 1, -1)))
            # current_loss = chunk_loss_function(chunk_output, chunk_targets)
            # chunk_loss += current_loss.data[0]
            # if chunk_optimizer is not None:
            #     current_loss.backward()
            #     chunk_optimizer.step()

            # Optimize parser
            parse_output_heads, parse_output_rels = self.parser(sequence.view(len(sent), 1, -1))
            current_parser_loss = parse_loss_function(parse_output_heads, parse_head_targets)
            current_parser_loss += parse_loss_function(parse_output_rels, parse_rel_targets)
            parse_loss += current_parser_loss.data[0]
            current_loss += current_parser_loss
            if parse_optimizer is not None:
                current_loss.backward()
                parse_optimizer.step()

            ##################################################################
            # Collect stats if necessary
            actual_chunks = []
            is_sent_correct = True
            is_labeled_sent_correct = True
            heads = [0 for i in range(len(sent))]
            rels = ["" for i in range(len(sent))]
            probabilities = [[] for i in range(len(sent))]
            forms = []
            words = []
            tags = []
            if tag_scores is not None or chunk_scores is not None or parse_scores is not None:
                # Output is SEQ_LEN x NUM_TAGS
                for i in range(len(sent)):
                    forms.append(sent[i][0][0])
                    words.append(sent[i][0][1])
                    if tag_scores is not None:
                        maximum, indices = tag_output[i].max(0)
                        predicted = indices.data[0]
                        expected = tag_targets.data[i]

                        tag = sent[i][0][2]
                        stat = tag_scores.stats[tag]
                        stat.num_gold += 1
                        predicted_tag = self.index2tag[predicted]
                        tags.append(predicted_tag)
                        if predicted == expected:
                            stat.num_gold_predicted += 1
                        tag_scores.stats[predicted_tag].num_predicted += 1

                    # if chunk_scores is not None:
                    #     maximum, indices = chunk_output[i].max(0)
                    #     predicted = indices.data[0]
                    #     expected = chunk_targets.data[i]
                    #
                    #     chunk = sent[i][2]
                    #     stat = chunk_scores.stats[chunk]
                    #     stat[1] += 1
                    #
                    #     if chunk[0] == "B":
                    #         chunk_scores.num_relevant += 1
                    #
                    #     actual_chunk = self.index2chunk[predicted]
                    #     actual_chunks.append(actual_chunk)
                    #     if actual_chunk[0] == "B":
                    #         chunk_scores.num_retrieved += 1
                    #
                    #     if predicted == expected:
                    #         stat[0] += 1

                    if parse_scores is not None:
                        for j in range(len(sent) + 1):
                            probabilities[i].append((j, parse_output_heads.data[i][j]))
                        probabilities[i].sort(key=lambda pair: pair[1], reverse=True)

                        maximum, indices = parse_output_heads[i].max(0)
                        predicted = indices.data[0]
                        expected = parse_head_targets.data[i]
                        head = predicted
                        heads[i] = head
                        is_head_correct = expected == predicted
                        if is_head_correct:
                            parse_scores.num_unlabeled_arcs += 1
                        else:
                            is_sent_correct = False
                            is_labeled_sent_correct = False

                        maximum, indices = parse_output_rels[i].max(0)
                        predicted = indices.data[0]
                        expected = parse_rel_targets.data[i]
                        rel = self.index2rel[predicted]
                        rels[i] = rel
                        if expected == predicted:
                            parse_scores.num_labels += 1
                            if is_head_correct:
                                parse_scores.num_labeled_arcs += 1
                        else:
                            is_labeled_sent_correct = False

            # Specially for parser
            if parse_scores is not None:
                fout.write("#text = {}\n".format(" ".join(forms)))
                if is_sent_correct:
                    parse_scores.num_unlabeled_trees += 1
                if is_labeled_sent_correct:
                    parse_scores.num_labeled_trees += 1

                parse_output_heads = parse_output_heads.data

                # Trying to turn random graph into well-formed tree.
                # Step 1. Find a root
                outliers = set()
                roots = set()
                unvisited = set()
                maximum = parse_output_heads[0][0] - 1.0
                index = -1
                for i in range(len(sent)):
                    if parse_output_heads[i][0] > maximum:
                        maximum = parse_output_heads[i][0]
                        index = i
                    if heads[i] == 0 or rels[i] == self.root_tag:
                        roots.add(i)
                    else:
                        unvisited.add(i)
                roots.add(index)

                if len(roots) > 0:
                    options = []
                    for node in roots:
                        tmp_outliers = outliers.copy()
                        tmp_outliers.update(roots)
                        tmp_outliers.remove(node)
                        options.append(try_build_tree(node, heads, unvisited.copy(), tmp_outliers))
                    options.sort(key=lambda t: len(t[2]))
                    root, visited, outliers = options[0]
                else:
                    raise RuntimeError("unreachable branch")

                # maximum = parse_output_heads[0][0] - 1.0
                # index = -1
                # for i in range(len(sent)):
                #     if parse_output_heads[i][0] > maximum:
                #         maximum = parse_output_heads[i][0]
                #         index = i
                # root = index
                # if root in outliers:
                #     outliers.remove(root)
                # if root in unvisited:
                #     unvisited.remove(root)
                # root, visited, outliers = try_build_tree(root, heads, unvisited, outliers)

                heads[root] = 0
                rels[root] = self.root_tag

                # Now 'unvisited' contains only unresolved references.
                # Use minimal algo to resolve arcs
                while len(outliers) > 0:
                    options = []
                    for node in outliers:
                        options.append(try_expand_tree(node, probabilities, heads, visited, outliers))
                    options.sort(key=lambda t: len(t[3]))
                    index, head, visited, outliers = options[0]
                    heads[index] = head

                is_modified_sent_correct = True
                for i in range(len(sent)):
                    if heads[i] == parse_head_targets.data[i]:
                        parse_scores.num_modified_unlabeled_arcs += 1
                    else:
                        is_modified_sent_correct = False
                    fout.write("{}\t{}\t{}\t{}\t_\t_\t{}\t{}\t{}:{}\t_\n".format(
                        i + 1, forms[i], words[i], tags[i], heads[i], rels[i], heads[i], rels[i]))
                fout.write("\n")
                if is_modified_sent_correct:
                    parse_scores.num_modified_unlabeled_trees += 1

            # Specially for chunker determine the quantity of retrieved relevant chunks
            # if chunk_scores is not None:
            #     gold_chunks = [chunk for word, tag, chunk in sent]
            #     num_retrieved_relevant = 0
            #     i = 0
            #     while i < len(gold_chunks):
            #         gold_chunk = gold_chunks[i]
            #         actual_chunk = actual_chunks[i]
            #
            #         if gold_chunk[0] == "B":
            #             is_correct = True
            #             while True:
            #                 if gold_chunk != actual_chunk:
            #                     is_correct = False
            #
            #                 i += 1
            #                 if i == len(gold_chunks):
            #                     break
            #
            #                 gold_chunk = gold_chunks[i]
            #                 actual_chunk = actual_chunks[i]
            #
            #                 if gold_chunk[0] != "I":
            #                     if actual_chunk[0] == "I":
            #                         is_correct = False
            #                     break
            #
            #             if is_correct:
            #                 num_retrieved_relevant += 1
            #         else:
            #             i += 1
            #     chunk_scores.num_retrieved_relevant += num_retrieved_relevant

            # Emulate progress bar
            current_word += len(sent)
            if current_word >= next_interval:
                next_interval += words_per_interval
                print('💪', end="", flush=True)

        # Debug epoch log
        print("], ATL: {:10.8f}, ACL: {:10.8f}, APL: {:10.8f}".format(
            tag_loss / len(sents), chunk_loss / len(sents), parse_loss / len(sents)))
        fout.close()

    ##########################################################################

    def print_vectorizer_misses(self):
        print("unknown words: {} from {} ({:4.2f}%)".format(
            self.vectorizer.num_word_misses, self.vectorizer.num_words,
            self.vectorizer.num_word_misses / self.vectorizer.num_words * 100.0))
        print("unknown grams: {} from {} ({:4.2f}%)".format(
            self.vectorizer.num_char_misses, self.vectorizer.num_grams,
            self.vectorizer.num_char_misses / self.vectorizer.num_grams * 100.0))

    ##########################################################################

    @staticmethod
    def get_num_words(sents):
        num_words = 0
        for sent in sents:
            num_words += len(sent)
        return num_words
    ##########################################################################

    def create_or_load_indices(self):
        tag_path = "{}/{}_tags.txt".format(self.path_base, self.name)
        chunk_path = "{}/{}_chunks.txt".format(self.path_base, self.name)
        rel_path = "{}/{}_rels.txt".format(self.path_base, self.name)

        create_tag_index = False
        create_chunk_index = False
        create_rel_index = False

        # Try load
        if os.path.exists(tag_path):
            # Load from existing data base
            self.log("Loading POS tag index from file")
            for line in open(tag_path):
                tag, index = line.split()
                index = int(index)
                self.tags.add(tag)
                self.tag2index[tag] = index
                self.index2tag[index] = tag
            self.num_tags = len(self.tags)
        else:
            # Create from scratch
            self.log("Creating POS tag index")
            create_tag_index = True

        # Try load
        if os.path.exists(chunk_path):
            # Load chunk index from file
            self.log("Loading chunk index from file")
            for line in open(chunk_path):
                chunk, index = line.split()
                index = int(index)
                self.chunks.add(chunk)
                self.chunk2index[chunk] = index
                self.index2chunk[index] = chunk
            self.num_chunks = len(self.chunks)
        else:
            # Create from scratch
            self.log("Creating chunk tag index")
            create_chunk_index = True

        # Try load
        if os.path.exists(rel_path):
            # Load rel index from file
            self.log("Loading rel index from file")
            for line in open(rel_path):
                rel, index = line.split()
                index = int(index)
                self.rels.add(rel)
                self.rel2index[rel] = index
                self.index2rel[index] = rel
            self.num_rels = len(self.rels)
        else:
            # Create from scratch
            self.log("Creating rel tag index")
            create_rel_index = True

        # Create if necessary
        if create_tag_index or create_chunk_index or create_rel_index:
            # Collect data
            for sent in self.sents:
                self.num_words += len(sent)
                for dep, rel, head in sent:
                    form, word, tag = dep
                    if create_tag_index:
                        self.tags.add(tag)
                    if create_rel_index:
                        self.rels.add(rel)
                    # if create_chunk_index:
                    #     self.chunks.add(chunk)

            # Create POS tag database
            if create_tag_index:
                file_tags = open(tag_path, "w")
                self.num_tags = len(self.tags)
                for index, tag in enumerate(self.tags):
                    self.index2tag[index] = tag
                    self.tag2index[tag] = index
                    file_tags.write("{} {}\n".format(tag, index))
                file_tags.close()

            # Create chunk tag database
            if create_chunk_index:
                file_chunks = open(chunk_path, "w")
                self.num_chunks = len(self.chunks)
                for index, chunk in enumerate(self.chunks):
                    self.index2chunk[index] = chunk
                    self.chunk2index[chunk] = index
                    file_chunks.write("{} {}\n".format(chunk, index))
                file_chunks.close()

            # Create rel tag database
            if create_rel_index:
                file_rels = open(rel_path, "w")
                self.num_rels = len(self.rels)
                for index, rel in enumerate(self.rels):
                    self.index2rel[index] = rel
                    self.rel2index[rel] = index
                    file_rels.write("{} {}\n".format(rel, index))
                file_rels.close()

    ##########################################################################

    def log(self, message):
        print("Model [{}]:".format(self.name), message)
from data_analysis import DataManager
from vectorizer import Vectorizer
import numpy as np
import pickle
from tempfile import TemporaryFile

dm = DataManager('./data/spam.csv')
dm.most_frequent_character_in_spam()
dm.most_frequent_character_in_legit()
dm.most_frequent_characters()
dm.average_text_length()

sentences, labels = dm.get_text(), dm.get_labels()
labels = list(map(lambda v: 0 if v == 'ham' else 1, labels))

vectorizer = Vectorizer(sentences)
sentences_features = []
for sentence in sentences:
    sentence_vector = vectorizer.text_to_vec(sentence, alpha=0.3)
    sentences_features.append(sentence_vector)

train_x, train_y = sentences_features[0:5000], labels[0:5000]
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
test_x, test_y = sentences_features[5000:], labels[5000:]
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)

np.savetxt('train_x.txt', train_x)
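# Hedged follow-up sketch (not from the original source): np.savetxt writes a
# plain-text matrix, so the saved features can be reloaded later with np.loadtxt.
import numpy as np

train_x = np.loadtxt('train_x.txt')
print(train_x.shape)  # (num_sentences, feature_dim)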
def test_with_nested_CV(folder='model', folds=5, plot=True, steps=['hashing', 'tfidf']):
    '''
    Evaluates the classifier by doing nested CV,
    i.e. keeping 1/folds of the data out of the training set and doing training
    (including model selection for the regularizer) on the training set and
    testing on the held-out data

    Also prints some stats and figures

    INPUT
    folder  folder with model files
    folds   number of folds
    '''
    # start timer
    import time
    t0 = time.time()
    # create bag-of-words representations
    # (BUGFIX: the original built an unused extra Vectorizer(steps=steps);
    # one instance with both arguments suffices)
    vec = Vectorizer(folder=folder, steps=steps)
    # load data
    data = get_speech_text(folder=folder)
    for key in data.keys():
        data[key] = vec.transform(data[key])
    # create numerical labels
    Y = hstack(map((lambda x: ones(data[data.keys()[x]].shape[0]) * x), range(len(data))))
    # create data matrix
    X = vstack(data.values())
    # permute data
    fsize = len(Y) / folds
    randidx = permutation(len(Y))
    Y = Y[randidx]
    X = X[randidx, :]
    idx = reshape(arange(fsize * folds), (folds, fsize))
    Y = Y[:fsize * folds]
    # allocate matrices for predictions
    predicted = zeros(fsize * folds)
    predicted_prob = zeros((fsize * folds, len(data)))
    # the regularization parameters to choose from
    parameters = {'C': (10. ** arange(-4, 4, 1.)).tolist()}
    # do nested CV
    for ifold in range(folds):
        testidx = idx[ifold, :]
        trainidx = idx[setdiff1d(arange(folds), ifold), :].flatten()
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # for nested CV, do folds-1 CV for parameter optimization
        # within the inner CV loop and use the outer test fold as held-out data
        # for model validation
        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=(folds - 1))
        gs_clf.fit(X[trainidx, :], Y[trainidx])
        predicted[testidx] = gs_clf.predict(X[testidx, :])
        predicted_prob[testidx, :] = gs_clf.predict_proba(X[testidx, :])
        print '************ Fold %d *************' % (ifold + 1)
        print metrics.classification_report(Y[testidx], predicted[testidx],
                                            target_names=data.keys())

    t1 = time.time()
    total_time = t1 - t0
    timestr = 'Wallclock time: %f sec\n' % total_time
    dimstr = 'Vocabulary size: %d\n' % X.shape[-1]
    report = timestr + dimstr
    # extract some metrics
    print '********************************'
    print '************ Total *************'
    print '********************************'
    report += metrics.classification_report(Y, predicted, target_names=data.keys())
    # dump metrics to file
    open(folder + '/report_%s.txt' % '_'.join(sorted(steps)), 'wb').write(report)
    print(report)
    conf_mat = metrics.confusion_matrix(Y, predicted)
    open(folder + '/conf_mat_%s.txt' % '_'.join(sorted(steps)), 'wb').write(json.dumps(conf_mat.tolist()))
    print(conf_mat)

    if plot:
        # plot the confusion matrix
        import pylab
        pylab.figure(figsize=(16, 16))
        pylab.imshow(metrics.confusion_matrix(Y, predicted), interpolation='nearest')
        pylab.colorbar()
        pylab.xticks(arange(4), [x.decode('utf-8') for x in data.keys()])
        pylab.yticks(arange(4), [x.decode('utf-8') for x in data.keys()])
        pylab.xlabel('Predicted')
        pylab.ylabel('True')
        font = {'family': 'normal', 'size': 30}
        pylab.rc('font', **font)
        pylab.savefig(folder + '/conf_mat.pdf', bbox_inches='tight')
word_embeddings_file_path = args.word2vec
pretrained_weights_file_path = args.save
epochs = args.epochs
df = read_SEMEVAL_data(args.data)

# initialize objects
print('Initializing objects ...')
print('Initializing word embeddings ...')
t1 = time.time()
word_embeddings = WordEmbeddings(word_embeddings_file_path)
t2 = time.time()
print('\tTook %f seconds' % (t2 - t1))
print('Initializing tokenizer ...')
tokenizer = Tokenizer()
print('Initializing vectorizer ...')
vectorizer = Vectorizer(word_embeddings, tokenizer)

#### training dataset ####
# vectorizing
ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(df)
train_max_a_length = len(max(train_a_vectors, key=len))
train_max_b_length = len(max(train_b_vectors, key=len))
print('maximum number of tokens per sentence A in training set is %d' % train_max_a_length)
print('maximum number of tokens per sentence B in training set is %d' % train_max_b_length)
max_len = max([train_max_a_length, train_max_b_length])

# padding
train_a_vectors = pad_tensor(train_a_vectors, max_len)
train_b_vectors = pad_tensor(train_b_vectors, max_len)
        if pred == vec.end_tag:
            break
        else:
            res += pred
        # next_hidden = sess.run(tensors['next_hidden'], feed_dict=feed_dict)
        # initial_state = np.vstack((initial_state, next_hidden))[1:]
    return res


if __name__ == '__main__':
    print 'Loading data...'
    with open('../../data/smalldata.txt', 'r') as f:
        data = [line.strip() for line in f]

    vectorizer = Vectorizer(seq_length=25)
    print 'Fitting Vectorizer...'
    X_data, y_data = vectorizer.fit_transform(data)
    with open('vectorizer.pkl', 'w') as f:
        pickle.dump(vectorizer, f)

    N, seq_length, input_dim = X_data.shape
    hidden_dim = 128
    output_dim = input_dim

    X = tf.placeholder(tf.float32, [None, seq_length, input_dim], 'X')
    y = tf.placeholder(tf.float32, [None, output_dim], 'y')
    initial_state = tf.placeholder(tf.float32, [None, 2 * hidden_dim], 'initial_state')
    lstm, next_hidden = lstm_layer(X, input_dim, seq_length, hidden_dim,
class Trainer(object):
    """Trains the classifier with training data and does the cross validation.
    """

    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()
        # A list of already hand classified tweets to train our classifier.
        self.data = None
        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None
        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Initializes all types of training data we have.
        """
        corpus_file = open(os.path.join(datasettings.DATA_DIRECTORY, 'full-corpus.csv'))
        classification, tweets = parse_training_corpus(corpus_file)

        reviews_positive = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'positive'))
        num_postive_reviews = len(reviews_positive)
        class_positive = ['positive'] * num_postive_reviews

        reviews_negative = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'negative'))
        num_negative_reviews = len(reviews_negative)
        class_negative = ['negative'] * num_negative_reviews

        self.data = tweets
        self.classification = classification
        #self.date_time = date_time
        #self.retweet = retweets
        #self.favorited = favorited

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        classification_vector = numpy.array(map(
            lambda s: SENTIMENT_MAP.get(s.lower(), 0), self.classification))
        feature_vector = self.vectorizer.fit_transform(self.data)
        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Build sentiment dictionary and build vector of weights for tweets.
        """
        fileIn = open(os.path.join(datasettings.DATA_DIRECTORY, 'AFINN-96.txt'))
        wordDict = {}
        line = fileIn.readline()
        while line != '':
            temp = string.split(line, '\t')
            wordDict[temp[0]] = int(temp[1])
            line = fileIn.readline()
        fileIn.close()

        fileIn = open(os.path.join(datasettings.DATA_DIRECTORY, 'AFINN-111.txt'))
        line = fileIn.readline()
        while line != '':
            temp = string.split(line, '\t')
            wordDict[temp[0]] = int(temp[1])
            line = fileIn.readline()
        fileIn.close()

        word_dict_vector = []
        for tweet in self.data:
            word_list = tweet.split()
            sum = 0
            for word in word_list:
                if word in wordDict.keys():
                    sum += wordDict[word]
            word_dict_vector.append(sum)
        return word_dict_vector

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.
        """
        feature_vector = self.vectorizer.transform(test_data)
        # BUGFIX: the original computed the feature vector but never returned it
        return feature_vector

    def score_func(self, true, predicted):
        """Score function for the validation.
        """
        return metrics.precision_recall_fscore_support(
            true, predicted,
            pos_label=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
            ],
            average='macro')

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []
        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector, classifier=True)
        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                                 self.classification_vector[train])
            classification1 = self.classifier1.predict(self.feature_vector[test])
            classification2 = self.classifier2.predict(self.feature_vector[test])
            classification3 = self.classifier3.predict(self.feature_vector[test])

            classification = []
            for predictions in zip(classification1, classification2, classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count and
                        negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count and
                        neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count and
                        positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count and
                        negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self, cross_validate=False, mean=False, serialize=False):
        """Trains the SVC with the training data and validates with the test data.

        We do a K-Fold cross validation with K = 10.
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1', C=1000,
                                         dual=False, tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            self.classifier1.fit(self.feature_vector, self.classification_vector)
            self.classifier2.fit(self.feature_vector, self.classification_vector)
            self.classifier3.fit(self.feature_vector, self.classification_vector)

        if serialize:
            classifiers_file = open(os.path.join(
                datasettings.DATA_DIRECTORY, 'classifiers.pickle'), 'wb')
            cPickle.dump([self.classifier1, self.classifier2, self.classifier3],
                         classifiers_file)
            vectorizer_file = open(os.path.join(
                datasettings.DATA_DIRECTORY, 'vectorizer.pickle'), 'wb')
            cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.
        """
        for i, score in enumerate(self.scores):
            print "Cross Validation: %d" % (i + 1)
            print "*" * 40
            if mean:
                print "Mean Accuracy: %f" % (score)
            else:
                print "Precision\tRecall\t\tF-Score"
                print "~~~~~~~~~\t~~~~~~\t\t~~~~~~~"
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print "%f\t%f\t%f" % (precision, recall, f_score)
            print
class Classifier:
    def __init__(self, folder='model', train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model is stored
        train   set True if you want to train
        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(self.folder + '/classifier.pickle')) or train:
            print 'Training classifier'
            self.train()
        print 'Loading classifier'
        clfdict = cPickle.load(open(self.folder + '/classifier.pickle'))
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']

    def predict(self, text):
        '''
        Loads scikit-learn Bag-of-Word extractor and classifier and
        applies it to some text.

        INPUT
        text    a string to assign to a party
        folder  the folder containing the classifier and bag-of-words transformer pickles
        '''
        # transform string into sparse matrix
        x = self.bow(text)
        # predict probabilities of each party
        probabilities = self.clf.predict_proba(x)
        # transform the predictions into json output
        result = {'text': text, 'prediction': []}
        # the classifier returns parties in alphabetical order, so we reorder
        for pidx in range(len(self.parties)):
            result['prediction'].append({
                'party': self.parties[pidx],
                'probability': probabilities.flatten()[pidx]
            })
        return result

    def bow(self, text):
        if type(text) is not list:
            text = [text]
        return self.bow_vectorizer.transform(text)

    def train(self, folds=2):
        '''
        trains a classifier on the bag of word vectors
        extracted with extract_bundestag_speeches.py

        INPUT
        folder  the folder to store the model file and load the bag-of-words-vectorizer file
        folds   number of cross-validation folds for optimizing the regularizer of the classifier
        '''
        try:
            # load the data
            data = get_speech_text(folder=self.folder)
            for key in data:
                data[key] = self.bow(data[key])
        except:
            print('Could not load text data file in\n' +
                  'Try executing [python downloader.py --download --parse]')
            raise
        # create numerical labels for each party
        Y = hstack(map((lambda x: ones(data[data.keys()[x]].shape[0]) * x), range(len(data))))
        # create the data matrix
        X = vstack(data.values())
        # estimate fold size (if not a divisor of total samples)
        fsize = len(Y) / folds
        # permute data indices for training
        randidx = permutation(len(Y))
        Y = Y[randidx]
        X = X[randidx, :]
        # the classifier, accounting for unbalanced classes
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # the regularizer
        parameters = {'C': (10. ** arange(-5, 5, 1.)).tolist()}
        # perform gridsearch to get the best regularizer
        gs_clf = GridSearchCV(text_clf, parameters, cv=folds, n_jobs=-1, verbose=2)
        gs_clf.fit(X, Y)
        print "Classifier reached mean %0.2f accuracy with regularizer: %f" % (
            gs_clf.best_score_, gs_clf.best_params_['C'])
        # dump classifier to pickle
        cPickle.dump({'classifier': gs_clf, 'labels': data.keys()},
                     open(self.folder + '/classifier.pickle', 'wb'), -1)
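# Hedged usage sketch (not from the original source): assumes a trained model
# and Bag-of-Words data under 'model/'; the query string is illustrative.
clf = Classifier(folder='model')
result = clf.predict('Wir fordern mehr Investitionen in Bildung')
for p in result['prediction']:
    print p['party'], p['probability']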