def cross_validation_ah():
    import random
    random.seed(1234567)

    # TF 1.x session setup: let GPU memory grow on demand instead of pre-allocating it all
    import tensorflow
    sess_config = tensorflow.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    from tensorflow.python.keras.backend import set_session
    set_session(tensorflow.Session(config=sess_config))

    vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    reader = JSONPerLineDocumentReader(
        'data/experiments/ah-classification1/exported-3621-sampled-positive-negative-ah-no-context.json',
        True)

    # Alternative classifiers, kept for reference:
    # e = ClassificationExperiment(reader, RandomTokenizedDocumentClassifier(), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, MajorityClassTokenizedDocumentClassifier(), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, SimpleLSTMTokenizedDocumentClassifier(vocabulary, embeddings), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, CNNTokenizedDocumentClassifier(vocabulary, embeddings), ClassificationEvaluator())
    e = ClassificationExperiment(
        reader, StackedLSTMTokenizedDocumentClassifier(vocabulary, embeddings),
        ClassificationEvaluator())

    e.run()
def cross_validation_ah(model_type):
    # Classification without context; TF 2.x variant using a distribution strategy
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

        reader = JSONPerLineDocumentReader(
            'data/experiments/ah-classification1/exported-3621-sampled-positive-negative-ah-no-context.json',
            True)

        if model_type == 'cnn':
            e = ClassificationExperiment(
                reader, CNNTokenizedDocumentClassifier(vocabulary, embeddings),
                ClassificationEvaluator())
        else:
            e = ClassificationExperiment(
                reader,
                StackedLSTMTokenizedDocumentClassifier(vocabulary, embeddings),
                ClassificationEvaluator())

        e.run()
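# For reference, a hypothetical driver showing how cross_validation_ah(model_type)
# might be invoked from the command line. This is a sketch, not part of the original
# module: the argparse wiring and the 'cnn'/'lstm' choice values are assumptions.
def _main_cross_validation_ah():
    import argparse
    parser = argparse.ArgumentParser(description='Run ad hominem cross-validation (no context)')
    parser.add_argument('--model', choices=['cnn', 'lstm'], default='lstm',
                        help="anything other than 'cnn' falls back to the stacked LSTM")
    args = parser.parse_args()
    cross_validation_ah(args.model)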
def train_test_model_with_context(train_dir, indir, outdir):
    '''Train the SSAE model and label externally pre-classified comments.

    :param train_dir: path to a JSON file containing training examples
    :param indir: path to a LOG file of pickled Comment() objects (already classified by BERT)
    :param outdir: path to the LOG file to create, with this model's predictions added
    '''
    import pickle
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    model_type = 'ssae'  # tag under which this model's prediction is stored on each Comment

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

        reader = JSONPerLineDocumentReader(train_dir, True)
        e = ClassificationExperiment(
            reader,
            StructuredSelfAttentiveSentenceEmbedding(vocabulary, embeddings),
            ClassificationEvaluator())

        test_comments = TokenizedDocumentReader(indir)
        result = e.label_external(test_comments)
        for k in result.keys():
            print(f'{k}: {result[k]}')

        # Index the pre-classified comments by id; the LOG file is a stream of
        # back-to-back pickles, read until EOFError
        instances = dict()
        with open(indir, 'rb') as f:
            try:
                while True:
                    comment = pickle.load(f)
                    print(comment)
                    instances[str(comment.id)] = comment
            except EOFError:
                pass

        # Attach this model's label/score to each comment and write the enriched stream
        with open(outdir, 'wb') as f:
            for key in result.keys():
                model_label, model_score = result[key]
                model_label = model_label.lower()
                # Pick the score matching the predicted label: index 0 for 'none', index 1 otherwise
                score = model_score[1]
                if model_label == 'none':
                    score = model_score[0]
                instances[key].add_model(model_type, model_label, score, None)

                comment = instances[key]
                print(comment)
                print(comment.labels)
                print(comment.scores)
                print('=' * 20)
                pickle.dump(instances[key], f)
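# The LOG files consumed and produced above are streams of back-to-back pickled
# Comment objects, read with repeated pickle.load() until EOFError. A small helper
# like the following makes that framing explicit; iter_comment_log is a hypothetical
# sketch, not part of the original code.
def iter_comment_log(path):
    import pickle
    with open(path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)  # one pickled Comment per call
            except EOFError:
                return  # clean end of stream

# Usage (equivalent to the index built in train_test_model_with_context):
# instances = {str(c.id): c for c in iter_comment_log(indir)}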
def __init__(self):
    self.vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    self.embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    assert isinstance(self.vocabulary, Vocabulary)
    assert isinstance(self.embeddings, WordEmbeddings)

    # Cache for computed average word vectors (computing them is expensive);
    # maps text (str) -> average word vector (np.ndarray)
    self._average_word_vector_cache = dict()
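# The cache initialized above suggests an accessor along the following lines. This is
# a sketch only: _get_average_word_vector, the whitespace tokenization, and the
# embeddings[token] / embeddings.dimension / token-in-vocabulary accesses are
# assumptions about the Vocabulary/WordEmbeddings API, not code from the original class.
def _get_average_word_vector(self, text):
    import numpy as np
    if text not in self._average_word_vector_cache:
        tokens = text.split()  # assumed tokenization
        vectors = [self.embeddings[t] for t in tokens if t in self.vocabulary]
        # Fall back to a zero vector when no token is covered by the vocabulary
        self._average_word_vector_cache[text] = (
            np.mean(vectors, axis=0) if vectors else np.zeros(self.embeddings.dimension))
    return self._average_word_vector_cache[text]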
def cross_validation_thread_ah_delta_context3():
    import random
    random.seed(1234567)

    import tensorflow
    sess_config = tensorflow.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    from tensorflow.python.keras.backend import set_session
    set_session(tensorflow.Session(config=sess_config))

    vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    reader = AHVersusDeltaThreadReader('data/sampled-threads-ah-delta-context3', True)
    e = ClassificationExperiment(
        reader,
        StructuredSelfAttentiveSentenceEmbedding(
            vocabulary, embeddings, '/tmp/visualization-context3'),
        ClassificationEvaluator())
    e.run()
def cross_validation_thread_ah_delta_context3():
    # Classification with thread context; TF 2.x variant using a distribution strategy
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

        reader = AHVersusDeltaThreadReader(
            'data/sampled-threads-ah-delta-context3', True)
        e = ClassificationExperiment(
            reader,
            StructuredSelfAttentiveSentenceEmbedding(
                vocabulary, embeddings, '/tmp/visualization-context3'),
            ClassificationEvaluator())
        e.run()
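# The TF 1.x variants above avoid grabbing all GPU memory via allow_growth; under
# TF 2.x the equivalent knob is set_memory_growth. A minimal sketch (TF 2.1+ API);
# the helper name is hypothetical and not part of the original module.
def _enable_tf2_memory_growth():
    import tensorflow as tf
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)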