def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_data()

    print(self._line_maxlen)

    # build vocabulary; words below min_freq are dropped (min_freq=1 keeps everything)
    self._vocab = dh.build_vocab(self.train, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # embedding dimension
    self.dimension_size = 256

    # solving class imbalance: weight each class by max_count / class_count
    self.ratio = self.calculate_label_ratio(Y)
    self.ratio = [max(self.ratio.values()) / value for key, value in self.ratio.items()]
    print('class ratio::', self.ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    self.X = X
    self.tX = tX
    self.Y = Y
    self.tY = tY

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)
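# Illustrative sketch (not part of the original source): how the class-imbalance
# weighting above behaves, assuming calculate_label_ratio returns a {label: count}
# dict. Each class gets weight max_count / class_count, so the rarest class is
# weighted highest.
from collections import Counter

def _sketch_class_weights(labels):
    counts = Counter(labels)                 # e.g. {0: 900, 1: 100}
    max_count = max(counts.values())
    return {label: max_count / count for label, count in counts.items()}

# _sketch_class_weights([0] * 900 + [1] * 100) -> {0: 1.0, 1: 9.0}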
def test_predict(self, verbose=False):
    start = time.time()
    self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
                            self._emoji_file_path, normalize_text=True, split_hashtag=True,
                            ignore_profiles=False)
    end = time.time()
    if verbose:
        print('test resource loading time::', (end - start))

    # rebuild the vocabulary from the test data
    self._vocab = dh.build_vocab(self.test, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    dh.write_vocab(self._vocab_file_path, self._vocab)

    tX, tY, D, C, A = dh.vectorize_word_dimension(self.test, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # GloVe weights (loaded here but not used by the prediction below)
    dimension_size = 300
    emb_weights = load_glove_model(self._vocab, n=dimension_size,
                                   glove_path='/content/SarcasmDetection/src/glove.6B.300d.txt')

    label_dict = {0: 'EXTRAVERSION',
                  1: 'NEUROTICISM',
                  2: 'AGREEABLENESS',
                  3: 'CONSCIENTIOUSNESS',
                  4: 'OPENNESS'}

    predictions = self.model.predict(tX)

    # sum the per-sample probability vectors and pick the dominant trait
    total_pred = np.array([0, 0, 0, 0, 0])
    for i in predictions:
        total_pred = np.add(total_pred, np.array(i))

    pos = np.where(total_pred == max(total_pred))
    l_pos = pos[0].tolist()
    RESULT = l_pos[0]
    print("THE RESULT IS " + str(label_dict[RESULT]))
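# Illustrative sketch (not part of the original source): the aggregation step in
# test_predict sums per-sample probability vectors and takes the index of the
# maximum; np.argmax returns the first maximal index, matching np.where(...)[0][0].
import numpy as np

_predictions = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                         [0.2, 0.5, 0.1, 0.1, 0.1]])   # hypothetical model output
_total = _predictions.sum(axis=0)
_dominant = int(np.argmax(_total))                     # 1 -> 'NEUROTICISM'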
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_data()

    print(self._line_maxlen)

    # build vocabulary; words with frequency below min_freq=2 are dropped
    self._vocab = dh.build_vocab(self.train, min_freq=2)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # embedding dimension
    dimension_size = 30

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # trainable=True if you want the embedding weights to be updated during training
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128,
                                embedding_dimension=dimension_size, trainable=True)

    open(self._model_file + 'model.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True)
    save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', save_best_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto',
                                 epsilon=0.0001, cooldown=0, min_lr=0.000001)  # defined but not passed to fit()

    # training
    model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, verbose=2,
              callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
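# Illustrative sketch (not part of the original source): np_utils.to_categorical,
# used above on Y and tY, turns integer class ids into one-hot rows (older Keras API,
# matching the np_utils import this code already relies on).
from keras.utils import np_utils
import numpy as np

print(np_utils.to_categorical(np.array([0, 2, 1])))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]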
def __init__(self, train_file, word_file_path, split_word_path, emoji_file_path, model_file, vocab_file,
             output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_data()

    batch_size = 32

    # build vocabulary; words below min_freq are dropped (min_freq=1 keeps everything)
    self._vocab = dh.build_vocab(self.train, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # drop trailing samples so every batch is full (guard against a zero remainder,
    # which would otherwise empty the list via [:-0])
    remainder = len(self.train) % batch_size
    if remainder:
        self.train = self.train[:-remainder]

    # prepares input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # embedding dimension
    dimension_size = 300
    emb_weights = load_glove_model(self._vocab, n=dimension_size,
                                   glove_path='/content/SarcasmDetection/src/glove.6B.300d.txt')

    # expand each integer label (decimal digits are 0/1 trait flags) into a
    # fixed 5-element binary vector, left-padded with zeros
    LABEL = []
    for l in Y:
        m = [int(b) for b in str(l)]
        if len(m) != 5:
            m = [0] * (5 - len(m)) + m
        LABEL.append(m)
    Y = np.asarray(LABEL)

    # embedding weights are pre-loaded from GloVe and not updated in this code
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32,
                                embedding_dimension=dimension_size, batch_size=batch_size)

    model.fit(X, Y, batch_size=batch_size, epochs=5, shuffle=True)

    # serialize architecture to JSON and weights to HDF5
    model_json = model.to_json()
    with open(self._model_file + 'model.json', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(self._model_file + 'model.json.hdf5')
    print("Saved model to disk")
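# Illustrative sketch (not part of the original source) of the label expansion
# above: each integer label's decimal digits are 0/1 trait flags, left-padded
# to a fixed width of 5.
def _sketch_expand_label(label, width=5):
    bits = [int(b) for b in str(label)]
    return [0] * (width - len(bits)) + bits

# _sketch_expand_label(10010) -> [1, 0, 0, 1, 0]
# _sketch_expand_label(101)   -> [0, 0, 1, 0, 1]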
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None, word2vec_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_test_data()

    batch_size = 32

    print(self._line_maxlen)
    self._vocab = dh.build_vocab(self.train, ignore_context=False)
    self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)

    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
    C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
    D = dh.pad_sequence_1d(D, maxlen=11)

    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
    tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
    tD = dh.pad_sequence_1d(tD, maxlen=11)

    hidden_units = 128
    dimension_size = 300

    # word and context embeddings share the same word2vec weight matrix
    W = dh.get_word2vec_weight(self._vocab, n=dimension_size, path=word2vec_path)
    cW = W
    print('Word2vec obtained....')

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('ratio', ratio)

    dimension_vocab = numpy.unique(D)
    print(len(dimension_vocab))

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_C', C.shape)
    print('train_D', D.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_C', tC.shape)
    print('validation_D', tD.shape)
    print('validation_Y', tY.shape)

    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
                                hidden_units=hidden_units, trainable=False, batch_size=batch_size)

    open(self._model_file + 'model.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
    # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    #                            save_best_only=False)
    early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
                                 epsilon=0.0001, cooldown=0, min_lr=0.000001)

    model.fit([C, X], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX], tY), shuffle=True,
              callbacks=[save_best, lr_tuner], class_weight=ratio)
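# Hedged sketch (not the repo's actual implementation) of what
# dh.get_word2vec_weight is assumed to do: build a (vocab_size + 1) x n matrix
# with one row per vocabulary index, leaving out-of-vocabulary rows at zero.
import numpy as np
from gensim.models import KeyedVectors

def _sketch_embedding_matrix(vocab, n, path):
    kv = KeyedVectors.load_word2vec_format(path, binary=True)
    weights = np.zeros((len(vocab) + 1, n))
    for word, index in vocab.items():
        if word in kv:
            weights[index] = kv[word]
    return weights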
def __init__(self, train_file, validation_file, word_file_path, model_file, vocab_file, output_file,
             word2vec_path=None, test_file=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._test_file = test_file

    self.load_train_validation_test_data()

    print(self._line_maxlen)

    # build vocabulary over all available splits; words with frequency below min_freq=2 are dropped
    if self._test_file is not None:
        self._vocab = dh.build_vocab(self.train + self.validation + self.test, min_freq=2)
    else:
        self._vocab = dh.build_vocab(self.train + self.validation, min_freq=2)
    self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # word2vec embedding weights
    W = dh.get_word2vec_weight(self._vocab, n=300, path=word2vec_path)

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # trainable=True if you want the word2vec weights to be updated during training
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False)

    open(self._model_file + 'model_wv.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model_wv.json.hdf5', save_best_only=True)
    # save_all = ModelCheckpoint(self._model_file + 'weights_wv.{epoch:02d}.hdf5', save_best_only=False)
    # early_stopping = EarlyStopping(monitor='val_loss', patience=25, verbose=1)

    # training
    model.fit(X, Y, batch_size=8, epochs=100, validation_data=(tX, tY), shuffle=True, callbacks=[save_best],
              class_weight=ratio)
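# Illustrative sketch (not part of the original source): the save/load round trip
# implied above — architecture in 'model_wv.json', best weights in
# 'model_wv.json.hdf5' — reassembled at test time with model_from_json.
from keras.models import model_from_json

def _sketch_load_model(model_path):
    with open(model_path + 'model_wv.json') as f:
        model = model_from_json(f.read())
    model.load_weights(model_path + 'model_wv.json.hdf5')
    return model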
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, input_weight_file_path=None):
    sarcasm_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._input_weight_file_path = input_weight_file_path

    self.load_train_validation_data()

    print(self._line_maxlen)

    batch_size = 32

    # build vocabulary; words below min_freq are dropped (min_freq=1 keeps everything)
    self._vocab = dh.build_vocab(self.train, min_freq=1)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1
    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)

    # drop trailing samples so every batch is full (guard against a zero remainder,
    # which would otherwise empty the lists via [:-0])
    train_remainder = len(self.train) % batch_size
    if train_remainder:
        self.train = self.train[:-train_remainder]
    validation_remainder = len(self.validation) % batch_size
    if validation_remainder:
        self.validation = self.validation[:-validation_remainder]

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # embedding dimension
    dimension_size = 300
    emb_weights = load_glove_model(self._vocab, n=dimension_size,
                                   glove_path='/home/aghosh/backups/glove.6B.300d.txt')

    # aux inputs
    aux_train = build_auxiliary_feature(self.train)
    aux_validation = build_auxiliary_feature(self.validation)

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # embedding weights are pre-loaded from GloVe and not updated in this code
    model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32,
                                embedding_dimension=dimension_size, batch_size=batch_size)

    # open(self._model_file + 'model.json', 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True)
    save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', save_best_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

    # training
    model.fit([X, aux_train], Y, batch_size=batch_size, epochs=10, validation_data=([tX, aux_validation], tY),
              shuffle=True, callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
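# Illustrative sketch (not part of the original source) of the batch-size
# truncation above: dropping len(data) % batch_size trailing items keeps every
# batch full, which fixed-batch Keras models require; the zero-remainder guard
# avoids emptying the list via data[:-0].
def _sketch_trim_to_batches(data, batch_size):
    remainder = len(data) % batch_size
    return data[:-remainder] if remainder else data

# len(_sketch_trim_to_batches(list(range(100)), 32)) -> 96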
def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
             vocab_file, output_file, model_filename=None):
    offensive_content_model.__init__(self)

    self._train_file = train_file
    self._validation_file = validation_file
    self._word_file_path = word_file_path
    self._split_word_file_path = split_word_path
    self._emoji_file_path = emoji_file_path
    self._model_file = model_file
    self._vocab_file_path = vocab_file
    self._output_file = output_file
    self._model_filename = model_filename

    # self.load_train_validation_data(lowercase=False, at_character=True)
    # self.char_train = self.train
    # self.char_validation = self.validation

    self.load_train_validation_data()

    # batch size
    batch_size = 16

    print('bb', len(self.train))
    # self.train = self.train[-len(self.train) % batch_size:]
    # self.char_train = self.char_train[-len(self.char_train) % batch_size:]
    # print('bb', len(self.char_train))

    print(self._line_maxlen)
    print(self._line_char_maxlen)

    # build vocabulary; words with frequency below min_freq=5 are dropped
    self._vocab = dh.build_vocab(self.train, min_freq=5)
    if 'unk' not in self._vocab:
        self._vocab['unk'] = len(self._vocab.keys()) + 1

    self._char_vocab = {}
    # self._char_vocab = dh.build_vocab(self.char_train)
    # if ('unk' not in self._char_vocab):
    #     self._char_vocab['unk'] = len(self._char_vocab.keys()) + 1

    print(len(self._vocab.keys()) + 1)
    print('unk::', self._vocab['unk'])

    dh.write_vocab(self._vocab_file_path, self._vocab)
    # dh.write_vocab(self._vocab_file_path + '.char', self._char_vocab)

    # prepares training input
    X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
    X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)

    # prepares validation input
    tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
    tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)

    # prepares character input
    # cX, cY, cD, cC, cA = dh.vectorize_word_dimension(self.char_train, self._char_vocab)
    # cX = dh.pad_sequence_1d(cX, maxlen=self._line_char_maxlen)
    # ctX, ctY, ctD, ctC, ctA = dh.vectorize_word_dimension(self.char_validation, self._char_vocab)
    # ctX = dh.pad_sequence_1d(ctX, maxlen=self._line_char_maxlen)

    print('X', X.shape)

    # hidden units
    hidden_units = 256

    # word2vec dimension (unused here; the weights loaded below are 300-dimensional)
    dimension_size = 128

    W = dh.get_word2vec_weight(self._vocab, n=300,
                               path='/home/striker/word2vec/GoogleNews-vectors-negative300.bin')
    # W = dh.get_glove_weights(self._vocab, n=200, path='/home/striker/word2vec/glove_model_200.txt.bin')
    print('Word2vec obtained....')

    # solving class imbalance: weight each class by max_count / class_count
    ratio = self.calculate_label_ratio(Y)
    ratio = [max(ratio.values()) / value for key, value in ratio.items()]
    print('class ratio::', ratio)

    Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]

    print('train_X', X.shape)
    print('train_Y', Y.shape)
    print('validation_X', tX.shape)
    print('validation_Y', tY.shape)

    # trainable=True if you want the word2vec weights to be updated during training
    model = None
    if model_filename == 'emotion.json':
        model = self._build_emotion_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W,
                                            hidden_units=hidden_units, trainable=False)
    if model_filename == 'offensive.json':
        model = self._build_network(len(self._vocab.keys()) + 1, len(self._char_vocab.keys()) + 1,
                                    emb_weights=W, hidden_units=hidden_units, trainable=False, batch_size=8)

    open(self._model_file + self._model_filename, 'w').write(model.to_json())
    save_best = ModelCheckpoint(self._model_file + self._model_filename + '.hdf5', save_best_only=True)
    early_stopping = EarlyStopping(monitor='loss', patience=50, verbose=1)
    lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=1, verbose=1, mode='auto',
                                 epsilon=0.0001, cooldown=0, min_lr=0.000001)

    # training
    model.fit([X], Y, batch_size=16, epochs=100, validation_split=0.1, shuffle=True,
              callbacks=[save_best, early_stopping, lr_tuner], class_weight=ratio, verbose=1)
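# Illustrative note (not part of the original source): newer Keras versions expect
# class_weight as a {class_index: weight} dict rather than a list; a minimal
# adapter for the ratio list computed above:
def _sketch_class_weight_dict(ratio):
    return {index: weight for index, weight in enumerate(ratio)}

# model.fit(..., class_weight=_sketch_class_weight_dict(ratio))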