def save_vectors_file():
    data = load_data(FLAGS.data_path)
    vectorizer = Vectorizer()
    logging.info('getting vectors')
    img_vectors = []
    genders = []
    for img_path, gender_id in tqdm(data.items()):
        try:
            img_array = get_img(img_path)
            vector = vectorizer.get_vector(img_array)
            img_vectors.append(vector)
            genders.append(gender_id)
        except Exception as e:
            logging.warning('exception: {}'.format(e))
    vectorizer.close()
    dim_reduction_technique = get_dim_reduction_technique(
        FLAGS.dim_reduction_technique)
    reduced, model = dim_reduction_technique(img_vectors, FLAGS.n_dimensions)
    save_pkl_file(model, FLAGS.reducter_path)
    save_pkl_file((reduced, genders), FLAGS.vectors_path)
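
# A minimal sketch of what get_dim_reduction_technique could dispatch to.
# The 'pca' key and the sklearn-based pca_reduction body are assumptions;
# only the (vectors, n_dimensions) -> (reduced, model) contract is taken
# from the caller above.
from sklearn.decomposition import PCA

def pca_reduction(vectors, n_dimensions):
    # Fit a PCA model and project the vectors onto n_dimensions components
    model = PCA(n_components=n_dimensions)
    reduced = model.fit_transform(vectors)
    return reduced, model

def get_dim_reduction_technique(name):
    # Map a technique name to its implementation
    techniques = {'pca': pca_reduction}
    return techniques[name]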
def classify(self):
    # Classifies unknown forum posts
    if not self.fit:
        print("Fitting must be performed before classifying")
        return
    vectorizer = Vectorizer(self.dictionary.dictionary)
    input_file = input(
        "Enter the name of the .txt file containing the unknown posts "
        "(including file-ending): ")
    try:
        with open(input_file, "r") as file:
            vectors = vectorizer.vectorize(
                self.preprocessor.preprocess(file))
    except FileNotFoundError:
        if input("File not found. Press enter to try again or type 'm' "
                 "and press enter to return to menu.").lower() == "m":
            return
        self.classify()
        return
    with open("result.txt", "w") as result_file:
        for line in self.classifier.classify(vectors):
            result_file.write(label_list[line] + "\n")
    print("Result saved in result.txt. "
          "The predicted label of each post is printed on the "
          "corresponding line of the document.")
def create_tf_idf(file_path):
    reader = TrainingTextReader(file_path)
    keywords = KeywordExtractor(reader.articles[10], 'useless.txt')
    vector_index = Vectorizer(keywords.article_sents_tokened)
    freq_mat = vector_index.frequencyMatrix
    normalized_vector = VectorNormalizer(freq_mat)
    norm_mat = normalized_vector.l2_norm_matrice
    tf_idf = InverseDocumentFrequency(norm_mat)
    return tf_idf.tf_idf_matrice
def setUp(self):
    self.vec = Vectorizer(layer=-1, backend='gpu', cores=32)
    # Generate a list of images
    base_image = os.path.expanduser(
        '~') + '/SaturnServer/test_resources/map_image'
    self.imagenames = []
    for i in range(1, self.vec.cores + 1):
        self.imagenames.append("{}{}.jpg".format(base_image, i))
def __init__(self):
    # vectorizer class, used via composition instead of inheritance
    self.vectorizer = Vectorizer()
    # weights learned and used by the model
    self.weights = np.array([])
    self.tag_enums = []
    self.tag_dict = {}
def __init__(self, folder='model', modeltype='kpca', topics=10):
    # the classifier, which also contains the trained BoW transformer
    self.bow = Vectorizer(folder=folder, steps=['hashing', 'tfidf'])
    self.folder = folder
    self.modeltype = modeltype
    self.topics = topics
    if self.modeltype == 'kpca':
        from sklearn.decomposition import KernelPCA
        self.model = KernelPCA(kernel='rbf', gamma=1., n_components=topics)
    if self.modeltype == 'nmf':
        from sklearn.decomposition import NMF
        self.model = NMF(n_components=topics)
def start(self):
    bag_of_words, words = TermFrequency(self.trained).create_vocabulary()
    v = Vectorizer(self.trained, self.classify, words, bag_of_words)
    tfidf_trained = v.tfidf_for_tweets_trained
    evaluations = v.evaluations
    tfidf_to_classify = v.tfidf_for_tweets_to_classify
    models = Models(tfidf_trained, evaluations, tfidf_to_classify)
    prediction = models.svm_linear()
    return prediction
def main(filename, category_filename, answer_col, predictor_col,
         hidden_nodes):
    df = pd.read_csv(filename, usecols=[answer_col, predictor_col])
    categories = pd.read_csv(
        category_filename, usecols=[predictor_col])[predictor_col].values
    vectorizer = Vectorizer(df, categories, predictor_col, answer_col)
    vectorizer.format(0.6, 0.2)
    batch_size = 1000
    epochs = 50
    learning_rate = 1e-3
    model = build_and_train(vectorizer, batch_size, epochs, learning_rate,
                            hidden_nodes)
    validate(model, vectorizer)
    joblib.dump(model, filename + '.joblib')
def main():
    img = get_img(FLAGS.img_path)
    vectorizer = Vectorizer()
    vector = vectorizer.get_vector(img)
    vectorizer.close()
    reducter = load_pkl_file(FLAGS.reducter_path)
    reduced = reducter.transform([vector])
    model = load_pkl_file(FLAGS.model_path)
    output = model.predict(reduced)[0]
    print('result: {}'.format(output))
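
# A minimal sketch of the save_pkl_file/load_pkl_file helpers shared by
# the two scripts above, assuming they are thin pickle wrappers; the
# actual implementations may differ.
import pickle

def save_pkl_file(obj, path):
    # Serialize any picklable object (model, reducer, vectors) to disk
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl_file(path):
    # Restore the object saved by save_pkl_file
    with open(path, 'rb') as f:
        return pickle.load(f)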
def __init__(self):
    """Initializes the datastructures required."""
    # The actual text extraction object (does text to vector mapping).
    self.vectorizer = Vectorizer()
    # A list of already hand classified tweets to train our classifier.
    self.data = None
    # A list containing the classification of each individual tweet
    # in the tweets list.
    self.classification = None
    self.classifier = None
    self.scores = None
def startAnalysis(folder, S1_path, S2_path):
    fetcher = PageFetcher()
    S1 = fetcher.fetchPages(folder, S1_path)
    S2 = fetcher.fetchPages(folder, S2_path)
    # We use a document representation based on the TF-IDF model
    TF_IDF = Vectorizer()
    S1_HTML = TF_IDF.fit_transform(S1)
    S2_HTML = TF_IDF.fit_transform(S2)
    pageAllignament = PageAllignament()
    S1S2_Pairs = pageAllignament.allignSources(S1_HTML, S2_HTML)
    print("Stats of: " + str(S1_path) + " and " + str(S2_path))
    evaluation_pipeline(S1S2_Pairs)
def test_regression__vectorizer_layer_minus_one_behaves_same(self):
    # GIVEN a layer to test
    layer_under_test = -1
    # AND a vectorizer that uses that layer
    vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path,
                     backend='cpu')
    # AND an expected output
    expected_output = [0.0016, 0.9883, 0.0099, 0.00]
    #
    # WHEN extracting the attributes from an image
    print('This test has not stalled; it takes 20-40 seconds on a '
          'fast-ish computer (%s)' % strftime("%H:%M:%S", gmtime()))
    actual_output = roundArray(vec.get_attribute_vector(image_loc))
    #
    # THEN the output is as expected
    self.assertEqual(
        expected_output, actual_output,
        'The output %s does not match the expected output of %s'
        % (str(actual_output), str(expected_output)))
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    # initializing tfidf vectorizer
    if debug:
        print('[Job Vectorization 2/5] Initializing Vectorizer \n')
    vectorizer = Vectorizer()
    if debug:
        print('[Job Vectorization 3/5] Transforming/Vectorizing data \n')
    # fitting and transforming the vector
    tfidf_jobs = vectorizer.fit_transform(df_jobs['text'])
    if debug:
        print('[Job Vectorization 4/5] Saving vectorizer to {path} \n'.format(
            path=vectorizer_path))
    vectorizer.save_vectorizer(vectorizer_path)
    if debug:
        print('[Job Vectorization 5/5] Saving tfidf to {path} \n'.format(
            path=tfidfs_path))
    vectorizer.save_tfidfs(tfidf_jobs, tfidfs_path)
def __init__(self, folder='model', train=False):
    '''
    Creates a classifier object.
    If no model is found, or train is set True, a new classifier is learned.

    INPUT
    folder  the root folder with the Bag-of-Word data, where the model is stored
    train   set True if you want to train
    '''
    self.folder = folder
    # load Bag-of-Word extractor
    self.bow_vectorizer = Vectorizer(self.folder)
    # if there is no classifier file or training is invoked
    if (not os.path.isfile(self.folder + '/classifier.pickle')) or train:
        print('Training classifier')
        self.train()
    print('Loading classifier')
    clfdict = cPickle.load(open(self.folder + '/classifier.pickle'))
    self.clf = clfdict['classifier']
    self.parties = clfdict['labels']
def main():
    with timer("model loading"):
        # load the model and the pipeline
        model = ModelMLP()
        model.load_model()
        vectorizer = Vectorizer()
        vectorizer.load_vectorizer()
    with timer("data loading"):
        # load the data to predict on
        df = load_data_from_gcs()
    with timer("preprocess"):
        df = preprocess(df)
    with timer("predict"):
        X = df.drop(columns="price")
        X = vectorizer.transform(X)
        pred = model.predict(X)
    print(pred[:10])
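
# A minimal sketch of the timer context manager used in these scripts,
# assuming it simply reports elapsed wall-clock time; the repo's actual
# implementation may log differently.
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # Measure the time spent inside the with-block and print it
    start = time.time()
    yield
    print('[{}] done in {:.1f} s'.format(name, time.time() - start))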
def _load_data(self, data_dir, word_tokens, pristine_input, pristine_output,
               batch_size, seq_length, seq_step):
    try:
        with open(os.path.join(data_dir, 'input.txt'),
                  encoding='utf-8') as input_file:
            text = input_file.read()
    except FileNotFoundError:
        print_red("No input.txt in data_dir")
        sys.exit(1)
    skip_validate = True
    try:
        with open(os.path.join(data_dir, 'validate.txt')) as validate_file:
            text_val = validate_file.read()
        skip_validate = False
    except FileNotFoundError:
        pass  # Validation text optional

    # Find some good default seed string in our source text.
    self.seeds = find_random_seeds(text)
    # Include our validation texts with our vectorizer
    all_text = text if skip_validate else '\n'.join([text, text_val])
    self.vectorizer = Vectorizer(all_text, word_tokens, pristine_input,
                                 pristine_output)
    data = self.vectorizer.vectorize(text)
    x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
    print('x.shape:', x.shape)
    print('y.shape:', y.shape)
    if skip_validate:
        return x, y, None, None
    data_val = self.vectorizer.vectorize(text_val)
    x_val, y_val = shape_for_stateful_rnn(data_val, batch_size, seq_length,
                                          seq_step)
    print('x_val.shape:', x_val.shape)
    print('y_val.shape:', y_val.shape)
    return x, y, x_val, y_val
def main():
    # load the training data
    with timer("train data load"):
        df = load_data_from_gcs()
    # preprocessing
    with timer("preprocess"):
        df = preprocess(df)
    vectorizer = Vectorizer()
    X_train = df.drop(columns="price")
    y_train = df["price"]
    with timer("training"):
        X_train = vectorizer.fit_transform(X_train)
        # train the model
        base_params = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': {'type': 'adam', 'lr': 5e-5},
            'batch_size': 64,
        }
        model = ModelMLP(base_params)
        model.fit(X_train, y_train)
    with timer("save model"):
        # save the model and the pipeline
        vectorizer.save_vectorizer()
        model.save_model()
def preprocess_and_fit(self):
    # Preprocesses the data, indexes all words, vectorizes the posts, and
    # finally trains and tests the classifier
    processed = []
    processed_test = []
    for category in self.categories:
        processed.append(
            self.preprocessor.preprocess('training' + str(category) + ".txt"))
        processed_test.append(
            self.preprocessor.preprocess('testing' + str(category) + ".txt"))

    # Word indexing: index all words into the dictionary
    for category in processed:
        self.dictionary.index_words(category)
    print("Words indexed. Dictionary size: ",
          len(self.dictionary.dictionary), " words")

    # Vectorization: initialize the vectorizer object with the dictionary
    vectorizer = Vectorizer(self.dictionary.dictionary)
    vector_start = time.time()
    print("Vectorizing...")
    training_vectors = []
    testing_vectors = []
    for category in processed:
        training_vectors.append(vectorizer.vectorize(category))
    for category in processed_test:
        testing_vectors.append(vectorizer.vectorize(category))
    vector_time = time.time() - vector_start
    print("Vectorization completed in ", ("%.2f" % vector_time), "seconds")

    # Training and evaluation
    self.classifier.train(training_vectors)
    self.fit = True
    self.classifier.evaluate(testing_vectors)
def test_regression__vectorizer_layer_minus_four_behaves_same(self):
    # GIVEN a layer to test
    layer_under_test = -4
    # AND a vectorizer that uses that layer
    vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path,
                     backend='cpu')
    # AND an expected output stored in a file
    expected_output_file_path = os.path.expanduser(
        '~') + '/SaturnServer/test_resources/layer4results.txt'
    #
    # WHEN extracting the attributes from an image
    print('This test has not stalled; it takes 20-40 seconds on a '
          'fast-ish computer (%s)' % strftime("%H:%M:%S", gmtime()))
    actual_output = roundArray(vec.get_attribute_vector(image_loc))
    #
    # THEN each element of the actual output must match the expected results
    with open(expected_output_file_path, 'r') as expected_output_file:
        element_no = 0
        for expected_element in expected_output_file:
            self.assertEqual(
                float(expected_element), actual_output[element_no],
                'The output (element %d) %s does not match the expected '
                'output of %s' % (element_no, str(actual_output[element_no]),
                                  str(expected_element)))
            element_no += 1
    ]

def tokenize(self):
    for desc in self.texts():
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(desc)
        ]

def describe(self):
    started = time.time()
    counts = FreqDist()
    tokens = FreqDist()
    for word in self.words():
        counts['words'] += 1
        tokens[word] += 1
    return {
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'secs': time.time() - started,
    }

if __name__ == '__main__':
    from vectorizer import Vectorizer
    vec = Vectorizer()
    for vector in vec.tf_idf():
        print(vector)
word_embeddings_file_path = args.word2vec
pretrained_weights_file_path = args.save
epochs = args.epochs
df = read_SEMEVAL_data(args.data)

# initialize objects
print('Initializing objects ...')
print('Initializing word embeddings ...')
t1 = time.time()
word_embeddings = WordEmbeddings(word_embeddings_file_path)
t2 = time.time()
print('\tTook %f seconds' % (t2 - t1))
print('Initializing tokenizer ...')
tokenizer = Tokenizer()
print('Initializing vectorizer ...')
vectorizer = Vectorizer(word_embeddings, tokenizer)

#### training dataset ####
# vectorizing
ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(df)
train_max_a_length = len(max(train_a_vectors, key=len))
train_max_b_length = len(max(train_b_vectors, key=len))
print('maximum number of tokens per sentence A in training set is %d'
      % train_max_a_length)
print('maximum number of tokens per sentence B in training set is %d'
      % train_max_b_length)
max_len = max([train_max_a_length, train_max_b_length])
# padding
train_a_vectors = pad_tensor(train_a_vectors, max_len)
train_b_vectors = pad_tensor(train_b_vectors, max_len)
ix_to_rel = {i: r for i, r in enumerate(rel_set)}

num_words = len(word_set)
num_tags = len(tag_set)
num_rels = len(rel_set)

ROOT_TAG = "root"
WORD_SIZE = 100
TAG_SIZE = 30
HIDDEN_SIZE = 100
NUM_EPOCHS = 3

word_vectorizer = Vectorizer(WordExtractor(sents), None, "parser_word",
                             WORD_SIZE, filler=ZeroFiller(WORD_SIZE),
                             ce_enabled=False, tf_enabled=False)
tag_vectorizer = Vectorizer(TagExtractor(sents), None, "parser_pos",
                            TAG_SIZE, filler=ZeroFiller(TAG_SIZE),
                            ce_enabled=False, tf_enabled=False)

parser = SyntaxParser(num_words, WORD_SIZE, num_tags, TAG_SIZE,
                      WORD_SIZE + TAG_SIZE, HIDDEN_SIZE, num_rels)
optimizer = optim.SGD(parser.parameters(), lr=0.1)
loss_function = nn.NLLLoss()
# If no thesaurus path is passed, no replacement is performed.
PREPROCESSOR = Preprocessor(thesaurus_path)
print('Running preprocessing')
PREPROCESSOR.load_text([text_path])
whitelist = PREPROCESSOR.investigate_whitelist(thesaurus_path)
print('Saving')
PREPROCESSOR.save(auto_text_path)

PARSER = Parser()
print('Running dependency parsing...')
PARSER.t2f([auto_text_path + '/' + root + '.text'],
           kytea_model=kytea_path, eda_model=eda_path)
print('Saving the results')
PARSER.save(tree_path)  # save the dependency parse to a file

print("Loading the index...")
VECTORIZER = Vectorizer(index_path, t=1, list=whitelist)  # load the index
print('Loading the tree')
vectors = VECTORIZER.get_vector([tree_path + '/' + root + '.eda'],
                                filter=3)  # generate the vectors
print(vectors)
print('Saving the vectors')
VECTORIZER.save(vectors, [vector_path])  # save the vectors

# -----
# Compare the TF-IDF corpus vectors we already have with the query
# vector tfidf_vectors
# -----
print('Loading the TF-IDF corpus vectors')
tfidf_corpus_vectors = VECTORIZER.load(
    sorted(glob.glob(tfidf_DB_path + '/*.vector')))
print(tfidf_corpus_vectors)
def __init__(self, name, sents, vectorizer_words, vectorizer_forms,
             embedding_size, tag_sents, tag_embedding_size, context_size,
             lrs=(0.1, 0.1, 0.1), lr_decrease_factor=0.5,
             epochs_per_decrease=10):
    ######################################################################
    # Model's parameters.
    # 'sents' is a list of sentences of tuples ((form, word, tag), rel, head)
    self.name = name
    self.sents = sents
    self.embedding_size = embedding_size
    self.context_size = context_size

    ######################################################################
    # Load or create indices.
    # Common
    self.path_base = "internal"
    self.num_words = 0
    self.root_tag = "root"
    # CUDA flag
    self.is_cuda_available = torch.cuda.is_available()
    # For POS tags:
    self.tags = set()
    self.num_tags = 0
    self.tag2index = {}
    self.index2tag = {}
    # For chunk tags:
    self.chunks = set()
    self.num_chunks = 0
    self.chunk2index = {}
    self.index2chunk = {}
    # For relation tags:
    self.rels = set()
    self.num_rels = 0
    self.rel2index = {}
    self.index2rel = {}
    # Update database
    self.create_or_load_indices()
    if self.num_words == 0:
        self.num_words = self.get_num_words(self.sents)

    ######################################################################
    # Logic.
    # Learning rate controls
    self.lrs = lrs
    self.lr_decrease_factor = lr_decrease_factor
    self.epochs_per_decrease = epochs_per_decrease
    # Define machines
    self.vectorizer = Vectorizer(vectorizer_words, vectorizer_forms, name,
                                 embedding_size,
                                 filler=ZeroFiller(embedding_size),
                                 ce_enabled=True)
    # self.vectorizer = FastTextVectorizer(name, embedding_size * 2,
    #                                      "ft_sg_syntagrus.bin")
    self.tag_vectorizer = Vectorizer(tag_sents, None, name + "_pos",
                                     tag_embedding_size,
                                     filler=ZeroFiller(tag_embedding_size),
                                     ce_enabled=False, tf_enabled=False)
    # Tags embeddings (H).
    # Chunker will get a linear combination as an input:
    #   I = H^T * p
    # where p is a probabilities vector
    self.tag_embeddings = []
    for i in range(self.num_tags):
        tag = self.index2tag[i].lower()
        self.tag_embeddings.append(self.tag_vectorizer(tag, tag))
    self.tag_embeddings = torch.stack(self.tag_embeddings)
    if self.is_cuda_available:
        self.tag_embeddings = self.tag_embeddings.cuda()
    # Vector size is 1 (TF) + 100 (word embedding) + 100 (char-gram embedding)
    self.vector_size = self.vectorizer.get_vector_size()
    self.tag_size = self.tag_vectorizer.get_vector_size()
    # Chunk size.
    # Benchmark is 200 (POS hidden) + 201 (embedding) + NUM_TAGS (probabilities)
    self.chunk_size = 2 * embedding_size + self.vector_size + self.tag_size
    # Parse size -- input size for the parser.
    # When chunking is not available, the parse size is equal to the chunk size
    self.parse_size = self.chunk_size
    self.log("tagger input size: {}".format(self.vector_size))
    self.log("chunker input size: {}".format(self.chunk_size))
    self.log("parser input size: {}".format(self.parse_size))
    self.tagger = Tagger(self.vector_size, self.num_tags, "GRU",
                         embedding_size)
    # self.chunker = Tagger(self.chunk_size, self.num_chunks, "LSTM",
    #                       embedding_size)
    self.parser = SyntaxParser(0, 0, 0, 0, self.parse_size, embedding_size,
                               self.num_rels)
    self.is_tagger_trained = False
    # self.is_chunker_trained = False
    self.is_parser_trained = False
    self.tagger_name = "pos tagging"
    # self.chunker_name = "chunking"
    self.parser_name = "parsing"
    # Try to load from file
    self.tagger_path = "{}/model_pos_{}.pt".format(self.path_base, self.name)
    # self.chunker_path = "{}/model_chunk_{}.pt".format(self.path_base,
    #                                                   self.name)
    self.parser_path = "{}/model_parse_{}.pt".format(self.path_base,
                                                     self.name)
    if os.path.exists(self.tagger_path):
        self.log("Loading POS tagger")
        self.tagger = torch.load(self.tagger_path)
        self.tagger.unit.flatten_parameters()
        self.is_tagger_trained = True
        self.log("Done")
    # if os.path.exists(self.chunker_path):
    #     self.log("Loading chunker")
    #     self.chunker = torch.load(self.chunker_path)
    #     self.chunker.unit.flatten_parameters()
    #     self.is_chunker_trained = True
    #     self.log("Done")
    if os.path.exists(self.parser_path):
        self.log("Loading parser")
        self.parser = torch.load(self.parser_path)
        self.parser.unit.flatten_parameters()
        self.is_parser_trained = True
        self.log("Done")
from data_analysis import DataManager
from vectorizer import Vectorizer
import numpy as np
import pickle
from tempfile import TemporaryFile

dm = DataManager('./data/spam.csv')
dm.most_frequent_character_in_spam()
dm.most_frequent_character_in_legit()
dm.most_frequent_characters()
dm.average_text_length()

sentences, labels = dm.get_text(), dm.get_labels()
labels = list(map(lambda v: 0 if v == 'ham' else 1, labels))

vectorizer = Vectorizer(sentences)
sentences_features = []
for sentence in sentences:
    sentence_vector = vectorizer.text_to_vec(sentence, alpha=0.3)
    sentences_features.append(sentence_vector)

train_x, train_y = sentences_features[0:5000], labels[0:5000]
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
test_x, test_y = sentences_features[5000:], labels[5000:]
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)

np.savetxt('train_x.txt', train_x)
from data_loader import load_data
from data_preprocessor import message_cleaning
from vectorizer import Vectorizer
import pandas as pd

tweets_df = load_data("../data/twitter.csv")

# Let's test the newly added function
tweets_df_clean = tweets_df['tweet'].apply(message_cleaning)
print(tweets_df_clean[5])     # show the cleaned-up version
print(tweets_df['tweet'][5])  # show the original version

# Vectorize the tweets using the tokenizer
tweets_countvectorizer = Vectorizer(tweets_df)
print(tweets_countvectorizer)
print(tweets_countvectorizer.shape)

# dataframe to train on
tweets = pd.DataFrame(tweets_countvectorizer)
X = tweets
Y = tweets_df['label']
import numpy as np
import pickle
import spacy
from keras.models import Model
from spacy_data import get_train_dev_for_embedding, get_embeddings
from model_similarity2 import create_model
from spacy_features import (anygram_kernel,
                            anygram_similarity_fast as kernel,
                            words_to_indices as get_features)
from vectorizer import VectorizerGlove as Vectorizer

print "Loading spacy..."
nlp = spacy.load('en_core_web_lg')
print "Loading vectorizer..."
vectorizer = Vectorizer(dims=50)


def train(train_texts, train_labels, dev_texts, dev_labels,
          sentence_length=100, batch_size=100, nb_epoch=5):
    # embeddings, vocab = get_embeddings(nlp.vocab)
    # from spacy_features import w2v_matrix
    embeddings = vectorizer.get_word_matrix()
    vocab = None
    print embeddings.shape
    print("Parsing texts...")
    train_docs = list(nlp.pipe(train_texts))
    dev_docs = list(nlp.pipe(dev_texts))
def evaluation(filenames, dictionary_root='../../lexica',
               cruncher_type='lemmatizer', vectorizer_type='word2vec',
               metrics=['f1-score', 'accuracy-score']):
    if not isinstance(filenames, list):
        raise ValueError("'" + str(filenames) +
                         "' is not an instance of 'list'")

    beg = time.time()

    vectorizer = Vectorizer(vectorizer_type)
    try:
        labels, vectors = vectorizer.vectorize(filenames, dictionary_root)
    except:
        preprocessor = Preprocessor(filenames, Cruncher(cruncher_type))
        dictionary = Dictioanry(dictionary_root) if dictionary_root else None
        labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    test_ids, test_labels, test_vectors = [], [], []
    train_ids, train_labels, train_vectors = [], [], []
    for id, label in labels.items():
        if label == 'unknown':
            test_ids.append(id)
            test_labels.append(label)
            test_vectors.append(vectors[id])
        else:
            train_ids.append(id)
            train_labels.append(label)
            train_vectors.append(vectors[id])

    evaluator = Evaluator()
    for classifing in ['knn', 'rrb', 'svm']:
        if classifing != 'rrb':
            classifier = Classifier(train_vectors, train_labels, classifing)
            predictions = classifier.predict(test_vectors)
        else:
            classifier = RoundRobin(train_labels, train_vectors, test_vectors)
            predictions = classifier.classify()
        for metric in metrics:
            value = evaluator.evaluate(dict(zip(test_ids, predictions)),
                                       metric)
            print('<LOG>: The performance of', "'" + classifing + "'",
                  'according to the',
                  ("'" + metric + "'").ljust(max(map(len, metrics)) + 2),
                  'metric is', '{0:.6f}'.format(value))

    end = time.time()
    print('\n\nElapsed time:', '{0:.6f}'.format(end - beg), 'seconds',
          file=sys.stderr)
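
# Hypothetical invocation of evaluation(), mirroring the train.tsv/test.tsv
# filenames used elsewhere in this project; adjust the paths to your data.
if __name__ == '__main__':
    evaluation(['train.tsv', 'test.tsv'])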
    x, y, c = [], [], []
    for token in tokens:
        if token in dictioanry.valences:
            x.append(np.random.rand())
            y.append(np.random.rand())
            c.append(rgb(np.mean(dictioanry.valences[token])))
    plt.scatter(x, y, c=c, alpha=0.8)
    plt.show()


if __name__ == "__main__":
    preprocessor = Preprocessor(['train.tsv', 'test.tsv'], Cruncher())
    dictionary = Dictioanry('..\\..\\lexica')
    vectorizer = Vectorizer()
    labels, vectors = vectorizer.vectorize(preprocessor, dictionary)
    visualizer = Visualizer(preprocessor)
    for method in Visualizer.supported_methods:
        visualizer.visualize(method=method, dictionary=dictionary,
                             model=vectorizer.underlying)
import plac
import pathlib
import spacy
import numpy as np
from spacy_data import get_train_dev_test, get_embeddings
from spacy_features import (get_hv_similar_sentences as get_features,
                            anygram_similarity_fast as kernel)
from vectorizer import VectorizerGlove as Vectorizer

print "Loading spacy..."
nlp = spacy.load('en_core_web_lg')
print "Loading vectorizer..."
vectorizer = Vectorizer(dims=200)


def train(train_texts, train_labels, dev_texts, dev_labels, test_texts,
          test_labels, sentence_length=100, batch_size=100, nb_epoch=5):
    print("Creating model...")
    from sklearn import svm
    from spacy_features import anygram_kernel
    for c in [0.8]:
        print("Parsing texts...")
        train_docs = list(nlp.pipe(train_texts))
        dev_docs = list(nlp.pipe(dev_texts))
        test_docs = list(nlp.pipe(test_texts))