def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_data()

    print(self._line_maxlen)

    # build vocabulary; words below min_freq are dropped (min_freq=1 keeps everything)
    self._vocab = dh.build_vocab(self.train, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # embedding dimension
    self.dimension_size = 256

    # solving class imbalance: weight each class by max_count / class_count
    self.ratio = self.calculate_label_ratio(Y)
    self.ratio = [max(self.ratio.values()) / value for key, value in self.ratio.items()]
    print('class ratio::', self.ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    self.X = X
    self.tX = tX
    self.Y = Y
    self.tY = tY

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)
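# Illustrative sketch (not part of the original source): how the class-imbalance
# weighting above behaves, assuming calculate_label_ratio returns a {label: count}
# dict. Each class gets weight max_count / class_count, so the rarest class is
# weighted highest.
from collections import Counter

def _sketch_class_weights(labels):
    counts = Counter(labels)                 # e.g. {0: 900, 1: 100}
    max_count = max(counts.values())
    return {label: max_count / count for label, count in counts.items()}

# _sketch_class_weights([0] * 900 + [1] * 100) -> {0: 1.0, 1: 9.0}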
def test_predict(self, verbose=False):
    start = time.time()
    self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
                            self._emoji_file_path, normalize_text=True, split_hashtag=True,
                            ignore_profiles=False)
    end = time.time()
    if verbose:
        print('test resource loading time::', (end - start))

    # rebuild the vocabulary from the test data
    self._vocab = dh.build_vocab(self.test, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    dh.write_vocab(self._vocab_file_path, self._vocab)

    tX, tY, D, C, A = dh.vectorize_word_dimension(self.test, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # GloVe weights (loaded here but not used by the prediction below)
    dimension_size = 300
    emb_weights = load_glove_model(self._vocab, n=dimension_size,
                                   glove_path='/content/SarcasmDetection/src/glove.6B.300d.txt')

    label_dict = {0: 'EXTRAVERSION',
                  1: 'NEUROTICISM',
                  2: 'AGREEABLENESS',
                  3: 'CONSCIENTIOUSNESS',
                  4: 'OPENNESS'}

    predictions = self.model.predict(tX)

    # sum the per-sample probability vectors and pick the dominant trait
    total_pred = np.array([0, 0, 0, 0, 0])
    for i in predictions:
        total_pred = np.add(total_pred, np.array(i))

    pos = np.where(total_pred == max(total_pred))
    l_pos = pos[0].tolist()
    RESULT = l_pos[0]
    print("THE RESULT IS " + str(label_dict[RESULT]))
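# Illustrative sketch (not part of the original source): the aggregation step in
# test_predict sums per-sample probability vectors and takes the index of the
# maximum; np.argmax returns the first maximal index, matching np.where(...)[0][0].
import numpy as np

_predictions = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                         [0.2, 0.5, 0.1, 0.1, 0.1]])   # hypothetical model output
_total = _predictions.sum(axis=0)
_dominant = int(np.argmax(_total))                     # 1 -> 'NEUROTICISM'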
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_data()

    print(self._line_maxlen)

    # build vocabulary; words with frequency below min_freq=2 are dropped
    self._vocab = dh.build_vocab(self.train, min_freq=2)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # embedding dimension
    dimension_size = 30

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # trainable=True if you want the embedding weights to be updated during training
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128,
                                embedding_dimension=dimension_size, trainable=True)

    open(self._model_file + 'model.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True)
    save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', save_best_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto',
                                 epsilon=0.0001, cooldown=0, min_lr=0.000001)  # defined but not passed to fit()

    # training
    model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, verbose=2,
              callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
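# Illustrative sketch (not part of the original source): np_utils.to_categorical,
# used above on Y and tY, turns integer class ids into one-hot rows (older Keras API,
# matching the np_utils import this code already relies on).
from keras.utils import np_utils
import numpy as np

print(np_utils.to_categorical(np.array([0, 2, 1])))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]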
def __init__(self, train_file, word_file_path, split_word_path, emoji_file_path, model_file, vocab_file,
             output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_data()

    batch_size = 32

    # build vocabulary; words below min_freq are dropped (min_freq=1 keeps everything)
    self._vocab = dh.build_vocab(self.train, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # drop trailing samples so every batch is full (guard against a zero remainder,
    # which would otherwise empty the list via [:-0])
    remainder = len(self.train) % batch_size
    if remainder:
        self.train = self.train[:-remainder]

    # prepares input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # embedding dimension
    dimension_size = 300
    emb_weights = load_glove_model(self._vocab, n=dimension_size,
                                   glove_path='/content/SarcasmDetection/src/glove.6B.300d.txt')

    # expand each integer label (decimal digits are 0/1 trait flags) into a
    # fixed 5-element binary vector, left-padded with zeros
    LABEL = []
    for l in Y:
        m = [int(b) for b in str(l)]
        if len(m) != 5:
            m = [0] * (5 - len(m)) + m
        LABEL.append(m)
    Y = np.asarray(LABEL)

    # embedding weights are pre-loaded from GloVe and not updated in this code
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32,
                                embedding_dimension=dimension_size, batch_size=batch_size)

    model.fit(X, Y, batch_size=batch_size, epochs=5, shuffle=True)

    # serialize architecture to JSON and weights to HDF5
    model_json = model.to_json()
    with open(self._model_file + 'model.json', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(self._model_file + 'model.json.hdf5')
    print("Saved model to disk")
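# Illustrative sketch (not part of the original source) of the label expansion
# above: each integer label's decimal digits are 0/1 trait flags, left-padded
# to a fixed width of 5.
def _sketch_expand_label(label, width=5):
    bits = [int(b) for b in str(label)]
    return [0] * (width - len(bits)) + bits

# _sketch_expand_label(10010) -> [1, 0, 0, 1, 0]
# _sketch_expand_label(101)   -> [0, 0, 1, 0, 1]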
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None, word2vec_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_test_data()

    batch_size = 32

    print(self._line_maxlen)
    self._vocab = dh.build_vocab(self.train, ignore_context=False)
    self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)

    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
    C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
    D = dh.pad_sequence_1d(D, maxlen=11)

    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
    tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
    tD = dh.pad_sequence_1d(tD, maxlen=11)

    hidden_units = 128
    dimension_size = 300

    # word and context embeddings share the same word2vec weight matrix
    W = dh.get_word2vec_weight(self._vocab, n=dimension_size, path=word2vec_path)
    cW = W
    print('Word2vec obtained....')

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('ratio', ratio)

    dimension_vocab = numpy.unique(D)
    print(len(dimension_vocab))

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_C', C.shape)
    print('train_D', D.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_C', tC.shape)
    print('validation_D', tD.shape)
    print('validation_Y', tY.shape)

    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
                                hidden_units=hidden_units, trainable=False, batch_size=batch_size)

    open(self._model_file + 'model.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
    # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    #                            save_best_only=False)
    early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
                                 epsilon=0.0001, cooldown=0, min_lr=0.000001)

    model.fit([C, X], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX], tY), shuffle=True,
              callbacks=[save_best, lr_tuner], class_weight=ratio)
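# Hedged sketch (not the repo's actual implementation) of what
# dh.get_word2vec_weight is assumed to do: build a (vocab_size + 1) x n matrix
# with one row per vocabulary index, leaving out-of-vocabulary rows at zero.
import numpy as np
from gensim.models import KeyedVectors

def _sketch_embedding_matrix(vocab, n, path):
    kv = KeyedVectors.load_word2vec_format(path, binary=True)
    weights = np.zeros((len(vocab) + 1, n))
    for word, index in vocab.items():
        if word in kv:
            weights[index] = kv[word]
    return weights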
def __init__(self, train_file, validation_file, word_file_path, model_file, vocab_file, output_file,
             word2vec_path=None, test_file=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._test_file = test_file

    self.load_train_validation_test_data()

    print(self._line_maxlen)

    # build vocabulary over all available splits; words with frequency below min_freq=2 are dropped
    if self._test_file is not None:
        self._vocab = dh.build_vocab(self.train + self.validation + self.test, min_freq=2)
    else:
        self._vocab = dh.build_vocab(self.train + self.validation, min_freq=2)
    self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # word2vec embedding weights
    W = dh.get_word2vec_weight(self._vocab, n=300, path=word2vec_path)

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # trainable=True if you want the word2vec weights to be updated during training
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False)

    open(self._model_file + 'model_wv.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model_wv.json.hdf5', save_best_only=True)
    # save_all = ModelCheckpoint(self._model_file + 'weights_wv.{epoch:02d}.hdf5', save_best_only=False)
    # early_stopping = EarlyStopping(monitor='val_loss', patience=25, verbose=1)

    # training
    model.fit(X, Y, batch_size=8, epochs=100, validation_data=(tX, tY), shuffle=True, callbacks=[save_best],
              class_weight=ratio)
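# Illustrative sketch (not part of the original source): the save/load round trip
# implied above — architecture in 'model_wv.json', best weights in
# 'model_wv.json.hdf5' — reassembled at test time with model_from_json.
from keras.models import model_from_json

def _sketch_load_model(model_path):
    with open(model_path + 'model_wv.json') as f:
        model = model_from_json(f.read())
    model.load_weights(model_path + 'model_wv.json.hdf5')
    return model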
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_data()

    print(self._line_maxlen)

    batch_size = 32

    # build vocabulary; words below min_freq are dropped (min_freq=1 keeps everything)
    self._vocab = dh.build_vocab(self.train, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # drop trailing samples so every batch is full (guard against a zero remainder,
    # which would otherwise empty the lists via [:-0])
    train_remainder = len(self.train) % batch_size
    if train_remainder:
        self.train = self.train[:-train_remainder]
    validation_remainder = len(self.validation) % batch_size
    if validation_remainder:
        self.validation = self.validation[:-validation_remainder]

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # embedding dimension
    dimension_size = 300
    emb_weights = load_glove_model(self._vocab, n=dimension_size,
                                   glove_path='/home/aghosh/backups/glove.6B.300d.txt')

    # aux inputs
    aux_train = build_auxiliary_feature(self.train)
    aux_validation = build_auxiliary_feature(self.validation)

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # embedding weights are pre-loaded from GloVe and not updated in this code
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32,
                                embedding_dimension=dimension_size, batch_size=batch_size)

    # open(self._model_file + 'model.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True)
    save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', save_best_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

    # training
    model.fit([X, aux_train], Y, batch_size=batch_size, epochs=10, validation_data=([tX, aux_validation], tY),
              shuffle=True, callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
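# Illustrative sketch (not part of the original source) of the batch-size
# truncation above: dropping len(data) % batch_size trailing items keeps every
# batch full, which fixed-batch Keras models require; the zero-remainder guard
# avoids emptying the list via data[:-0].
def _sketch_trim_to_batches(data, batch_size):
    remainder = len(data) % batch_size
    return data[:-remainder] if remainder else data

# len(_sketch_trim_to_batches(list(range(100)), 32)) -> 96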
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, model_filename=None):
    offensive_content_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._model_filename = model_filename

    # self.load_train_validation_data(lowercase=False, at_character=True)
    # self.char_train = self.train
    # self.char_validation = self.validation

    self.load_train_validation_data()

    # batch size
    batch_size = 16

    print('bb', len(self.train))
    # self.train = self.train[-len(self.train) % batch_size:]
    # self.char_train = self.char_train[-len(self.char_train) % batch_size:]
    # print('bb', len(self.char_train))

    print(self._line_maxlen)
    print(self._line_char_maxlen)

    # build vocabulary; words with frequency below min_freq=5 are dropped
    self._vocab = dh.build_vocab(self.train, min_freq=5)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1

    self._char_vocab = {}
    # self._char_vocab = dh.build_vocab(self.char_train)
    # if ('unk' not in self._char_vocab):
    #     self._char_vocab['unk'] = len(self._char_vocab.keys()) + 1

    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)
    # dh.write_vocab(self._vocab_file_path + '.char', self._char_vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # prepares character input
    # cX, cY, cD, cC, cA = dh.vectorize_word_dimension(self.char_train, self._char_vocab)
    # cX = dh.pad_sequence_1d(cX, maxlen=self._line_char_maxlen)
    # ctX, ctY, ctD, ctC, ctA = dh.vectorize_word_dimension(self.char_validation, self._char_vocab)
    # ctX = dh.pad_sequence_1d(ctX, maxlen=self._line_char_maxlen)

    print('X', X.shape)

    # hidden units
    hidden_units = 256

    # word2vec dimension (unused here; the weights loaded below are 300-dimensional)
    dimension_size = 128

    W = dh.get_word2vec_weight(self._vocab, n=300,
                               path='/home/striker/word2vec/GoogleNews-vectors-negative300.bin')
    # W = dh.get_glove_weights(self._vocab, n=200, path='/home/striker/word2vec/glove_model_200.txt.bin')
    print('Word2vec obtained....')

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # trainable=True if you want the word2vec weights to be updated during training
    model = None
    if model_filename == 'emotion.json':
        model = self._build_emotion_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W,
                                            hidden_units=hidden_units, trainable=False)
    if model_filename == 'offensive.json':
        model = self._build_network(len(self._vocab.keys()) + 1, len(self._char_vocab.keys()) + 1,
                                    emb_weights=W, hidden_units=hidden_units, trainable=False, batch_size=8)

    open(self._model_file + self._model_filename, 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + self._model_filename + '.hdf5', save_best_only=True)
    early_stopping = EarlyStopping(monitor='loss', patience=50, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=1, verbose=1, mode='auto',
                                 epsilon=0.0001, cooldown=0, min_lr=0.000001)

    # training
    model.fit([X], Y, batch_size=16, epochs=100, validation_split=0.1, shuffle=True,
              callbacks=[save_best, early_stopping, lr_tuner], class_weight=ratio, verbose=1)
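# Illustrative note (not part of the original source): newer Keras versions expect
# class_weight as a {class_index: weight} dict rather than a list; a minimal
# adapter for the ratio list computed above:
def _sketch_class_weight_dict(ratio):
    return {index: weight for index, weight in enumerate(ratio)}

# model.fit(..., class_weight=_sketch_class_weight_dict(ratio))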