def cross_validation_ah():
    import random
    random.seed(1234567)

    # TF1-style session setup: let GPU memory grow instead of pre-allocating it all
    import tensorflow
    sess_config = tensorflow.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    from tensorflow.python.keras.backend import set_session
    set_session(tensorflow.Session(config=sess_config))

    vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    reader = JSONPerLineDocumentReader(
        'data/experiments/ah-classification1/exported-3621-sampled-positive-negative-ah-no-context.json',
        True)
    # e = ClassificationExperiment(reader, RandomTokenizedDocumentClassifier(), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, MajorityClassTokenizedDocumentClassifier(), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, SimpleLSTMTokenizedDocumentClassifier(vocabulary, embeddings), ClassificationEvaluator())
    e = ClassificationExperiment(
        reader, StackedLSTMTokenizedDocumentClassifier(vocabulary, embeddings),
        ClassificationEvaluator())
    # e = ClassificationExperiment(reader, CNNTokenizedDocumentClassifier(vocabulary, embeddings), ClassificationEvaluator())

    e.run()
def cross_validation_ah(model_type):
    # classification without context
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

        reader = JSONPerLineDocumentReader(
            'data/experiments/ah-classification1/exported-3621-sampled-positive-negative-ah-no-context.json',
            True)

        if model_type == 'cnn':
            e = ClassificationExperiment(
                reader, CNNTokenizedDocumentClassifier(vocabulary, embeddings),
                ClassificationEvaluator())
        else:
            e = ClassificationExperiment(
                reader, StackedLSTMTokenizedDocumentClassifier(vocabulary, embeddings),
                ClassificationEvaluator())

        e.run()
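# Hedged usage sketch (not part of the original source): assuming this module is run as a
# script and the vocabulary/embeddings pickles above are available, the no-context
# cross-validation could be launched for either architecture like this. The --model flag
# and its 'cnn'/'lstm' values are illustrative assumptions.
if __name__ == '__main__':
    import argparse
    cli = argparse.ArgumentParser(description='Cross-validation without context')
    cli.add_argument('--model', choices=['cnn', 'lstm'], default='lstm',
                     help="'cnn' selects the CNN classifier; anything else the stacked LSTM")
    cli_args = cli.parse_args()
    cross_validation_ah(cli_args.model)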
def train_test_model_with_context(train_dir, indir, outdir):
    '''Custom training and testing of the SSAE model.

    :param train_dir: Path to JSON file containing training examples
    :param indir: Path to LOG file containing examples as Comment() objects
        (already classified by BERT)
    :param outdir: Path to LOG file to be created, with this model's prediction added as well
    '''
    import pickle
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

        # train the SSAE classifier on the training examples
        reader = JSONPerLineDocumentReader(train_dir, True)
        e = ClassificationExperiment(
            reader,
            StructuredSelfAttentiveSentenceEmbedding(vocabulary, embeddings),
            ClassificationEvaluator())

        # label the external test comments
        test_comments = TokenizedDocumentReader(indir)
        result = e.label_external(test_comments)
        for k in result.keys():
            print(f'{k}: {result[k]}')

        # read back the pickled Comment() objects, one per pickle.load() call
        instances = dict()
        e = Comment(-1, 'lol', 'ah')  # placeholder; overwritten by the loop below
        f = open(indir, 'rb')
        try:
            while True:
                e = pickle.load(f)
                print(e)
                instances[str(e.id)] = e
        except EOFError:
            f.close()

        # append this model's prediction to each comment and write the new LOG file;
        # model_type is assumed to be defined at module level (a string identifying this model)
        f = open(outdir, 'wb')
        for key in result.keys():
            model_label, model_score = result[key]
            model_label = model_label.lower()
            score = model_score[1]
            if model_label == 'none':
                score = model_score[0]
            instances[key].add_model(model_type, model_label, score, None)
            e = instances[key]
            print(e)
            print(e.labels)
            print(e.scores)
            print('=' * 20)
            pickle.dump(instances[key], f)
        f.close()
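# Hedged usage sketch: how the function above might be invoked. The paths below are
# placeholders (assumptions), not files from the original repository.
#
# train_test_model_with_context(
#     'data/experiments/ah-classification1/train.json',  # assumed training JSON
#     'comments-labeled-by-bert.log',                     # assumed pickled Comment() LOG from the BERT stage
#     'comments-labeled-by-bert-and-ssae.log')            # LOG written by this function with SSAE predictions added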
def __init__(self):
    self.vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    self.embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    assert isinstance(self.vocabulary, Vocabulary)
    assert isinstance(self.embeddings, WordEmbeddings)

    # for caching computed average word vectors (it's expensive)
    # dictionary = (str, np.ndarray)
    # key = text, value = average word vector
    self._average_word_vector_cache = dict()
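# Hedged, standalone illustration of the caching pattern set up above (not part of the
# original class): averaging word vectors is expensive, so results are memoized in a dict
# keyed by the raw text. The toy embedding table below is purely an assumption for the demo.
import numpy as np

_toy_embeddings = {'hello': np.array([1.0, 0.0]), 'world': np.array([0.0, 1.0])}
_average_cache = dict()


def average_word_vector(text):
    if text in _average_cache:  # cache hit: reuse the previously computed vector
        return _average_cache[text]
    vectors = [_toy_embeddings[t] for t in text.split() if t in _toy_embeddings]
    result = np.mean(vectors, axis=0) if vectors else np.zeros(2)
    _average_cache[text] = result  # cache miss: compute once and store
    return result


print(average_word_vector('hello world'))  # computed from the embeddings
print(average_word_vector('hello world'))  # served from the cache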
def run_it_all(dat, tok, rm_s, size, window, skipgram, workers, min_count):
    """
    Return a WhiskyEmbeddings object which allows for some cool trickery such as finding
    similar whiskies, describing whiskies and finding similar wordings ('synonyms') in the
    whisky-tasting vocabulary.

    This function does it all from beginning to end:
    1) Transform the scraped whisky reviews into a well-structured object.
    2) Use all whisky reviews to build a corpus and train a whisky-specific word2vec model.
    3) Use the word embeddings to create whisky embeddings.

    The methods in WhiskyEmbeddings can then be used. All of this takes approx. 30-60 seconds.

    :param dat: input data
    :param tok: (bool) use tokenization and gensim preprocessing or not?
    :param rm_s: (bool) remove stopwords or not?
    :param size: (int) the number of word2vec dimensions
    :param window: (int) window size of the context while training word2vec
    :param skipgram: (bool) use the skip-gram model or CBOW?
    :param workers: (int) number of workers to train word2vec
    :param min_count: (int) min. number of occurrences in the corpus; words with fewer
        occurrences will be deleted
    :return: (WhiskyEmbeddings instance)
    """
    # 1) Transform whisky reviews into well-structured objects:
    all_reviews = [WC(x, tokenize=tok, rm_stopwords=rm_s) for x in dat[1:]]

    # 2) Build a corpus and train a word2vec model:
    w2v = WordEmbeddings(all_reviews)
    word_vectors = w2v.train(size, window, skipgram, workers, min_count)

    # 3) Create whisky embeddings:
    w_embedding = WhiskyEmbeddings(all_reviews, word_vectors)

    return w_embedding
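# Hedged usage sketch of the end-to-end pipeline above. The CSV loading and file name are
# assumptions; `dat` just needs to be an iterable of scraped reviews whose first element is
# skipped (see dat[1:] above). The word2vec settings shown (100 dims, window 5, skip-gram)
# are common defaults, and most_similar() is an assumed WhiskyEmbeddings method name.
#
# import csv
# with open('whisky_reviews.csv') as fh:  # assumed input file
#     dat = list(csv.reader(fh))
# whisky_emb = run_it_all(dat, tok=True, rm_s=True, size=100, window=5,
#                         skipgram=True, workers=4, min_count=5)
# print(whisky_emb.most_similar('Lagavulin 16'))  # find similar whiskies (assumed API)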
def cross_validation_thread_ah_delta_context3():
    import random
    random.seed(1234567)

    import tensorflow
    sess_config = tensorflow.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    from tensorflow.python.keras.backend import set_session
    set_session(tensorflow.Session(config=sess_config))

    vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    reader = AHVersusDeltaThreadReader('data/sampled-threads-ah-delta-context3', True)
    e = ClassificationExperiment(
        reader,
        StructuredSelfAttentiveSentenceEmbedding(vocabulary, embeddings,
                                                 '/tmp/visualization-context3'),
        ClassificationEvaluator())

    e.run()
def cross_validation_thread_ah_delta_context3():
    # classification with context
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

        reader = AHVersusDeltaThreadReader('data/sampled-threads-ah-delta-context3', True)
        e = ClassificationExperiment(
            reader,
            StructuredSelfAttentiveSentenceEmbedding(vocabulary, embeddings,
                                                     '/tmp/visualization-context3'),
            ClassificationEvaluator())

        e.run()
# (inside the per-batch training loop)
alt_loss_val, alt_accuracy_val = discriminator_1.train_fn(X, target_mat) \
    if not skip_discriminator else discriminator_1.eval_fn(X, target_mat)

# keep an exponential moving average of the per-batch metrics for logging
if batch_id == 1:
    accumulators[:] = np.array([accuracy_val, loss_val, alt_accuracy_val, alt_loss_val,
                                gen_loss_val, recon_gen_loss_val, adv_gen_loss_val,
                                cos_gen_loss_val, float(skip_generator),
                                float(skip_discriminator), preout_grad_norm_val])
else:
    accumulators[:] = ACCUMULATOR_EXPAVG * np.array([accuracy_val, loss_val, alt_accuracy_val,
                                                     alt_loss_val, gen_loss_val,
                                                     recon_gen_loss_val, adv_gen_loss_val,
                                                     cos_gen_loss_val, float(skip_generator),
                                                     float(skip_discriminator),
                                                     preout_grad_norm_val]) \
        + (1.0 - ACCUMULATOR_EXPAVG) * accumulators

if batch_id % print_every_n == 0:
    print >> sys.stderr, 'batch: %s, acc: %s, loss: %s, alt acc: %s, alt loss: %s, gloss: %s, grloss: %s, galoss: %s, gcloss: %s, gskip: %s, dskip: %s, gn: %s' % tuple(
        [batch_id] + accumulators.tolist())


def save_model():
    params_vals = lasagne.layers.get_all_param_values(
        [discriminator_0.l_out, discriminator_1.l_out, gen_l_out])
    cPickle.dump(params_vals, open(MODEL_FILENAME, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)


print >> sys.stderr, 'Loading Italian embeddings...'
we_it = WordEmbeddings()
we_it.load_from_word2vec('./it')
we_it.downsample_frequent_words()
skn_it = StandardScaler()
we_it.vectors = skn_it.fit_transform(we_it.vectors).astype(theano.config.floatX)
we_batches_it = we_it.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

print >> sys.stderr, 'Loading English embeddings...'
we_en = WordEmbeddings()
we_en.load_from_word2vec('./en')
we_en.downsample_frequent_words()
skn_en = StandardScaler()
we_en.vectors = skn_en.fit_transform(we_en.vectors).astype(theano.config.floatX)
we_batches_en = we_en.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

print >> sys.stderr, 'Ready to train.'
                                                   W=lasagne.init.Orthogonal(),
                                                   b=None,
                                                   name='gen_l_out')
        # tied decoder: reuse the transposed generator weights to map back to the input space
        self.dec_l_out = lasagne.layers.DenseLayer(self.gen_l_out,
                                                   num_units=n_input,
                                                   nonlinearity=None,
                                                   W=self.gen_l_out.W.T,
                                                   b=None,
                                                   name='dec_l_out')


dataDir = './'

rng = check_random_state(0)

we1 = WordEmbeddings()
we1.load_from_word2vec(dataDir, 'zh')
we1.downsample_frequent_words()
we1.vectors = normalize(we1.vectors)
we_batches1 = we1.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

we2 = WordEmbeddings()
we2.load_from_word2vec(dataDir, 'en')
we2.downsample_frequent_words()
we2.vectors = normalize(we2.vectors)
we_batches2 = we2.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

assert we1.embedding_dim == we2.embedding_dim
d = we1.embedding_dim

discriminator = Discriminator()
args = parser.parse_args()

DISCR_NUM_HIDDEN_LAYERS = args.Dlayers
DISCR_HIDDEN_DIM = args.Ddim
HALF_BATCH_SIZE = 128
MODEL_FILENAME = 'model.pkl'

rng = check_random_state(0)

lang1 = args.lang1
lang2 = args.lang2
dataDir = 'data/' + args.config + '/'

print >> sys.stderr, 'Loading', lang1, 'embeddings...'
we1 = WordEmbeddings()
we1.load_from_word2vec(dataDir, lang1)
we1.downsample_frequent_words()
we1.vectors = normalize(we1.vectors).astype(theano.config.floatX)
we_batches1 = we1.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

print >> sys.stderr, 'Loading', lang2, 'embeddings...'
we2 = WordEmbeddings()
we2.load_from_word2vec(dataDir, lang2)
we2.downsample_frequent_words()
we2.vectors = normalize(we2.vectors).astype(theano.config.floatX)
we_batches2 = we2.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

assert we1.embedding_dim == we2.embedding_dim
d = we1.embedding_dim