Example No. 1
def predict(data, maxlen, model_dir, output_file_name):

    cleaned_predict= preprocess(data['document'].values)

    if selected_layer=='bert':
        FullTokenizer = bert.bert_tokenization.FullTokenizer
        tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)

        predict_tokens = [["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"] for sentence in cleaned_predict]                
        predict_data = [tokenizer.convert_tokens_to_ids(token) for token in predict_tokens]                                                                                                                                                   
        predict_data = pad_sequences(predict_data, maxlen=maxlen, dtype="long", truncating="post", padding="post")

    else:
        with open('tokenizer.json') as f:
            json_data = json.load(f)
            tokenizer=tokenizer_from_json(json_data)
            
        predict_data = tokenizer.texts_to_sequences(cleaned_predict)
        predict_data = pad_sequences(predict_data, padding='post', maxlen=maxlen)

    model = load_model(model_dir)
    # Predict the probability and label for each row of data and write them to the output file
    result = model.predict(predict_data)
    label = np.around(result)
    with open(output_file_name, 'w', encoding='utf-8') as fw:
        for i in range(len(data)):
            fw.write('{}\t{}\t{}\n'.format(data['document'].iloc[i], result[i],label[i]))
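
For context, the tokenizer.json consumed by the non-BERT branch above is produced at training time with Tokenizer.to_json(); tokenizer_from_json reverses that serialization. A minimal sketch of the saving side, assuming a TensorFlow/Keras setup (the corpus, num_words, and file name are illustrative assumptions):

from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a tokenizer on the training corpus and persist it so that predict()
# can rebuild the exact same word index later via tokenizer_from_json().
train_texts = ["an example training sentence", "another training sentence"]  # assumed corpus
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")                    # assumed settings
tokenizer.fit_on_texts(train_texts)

with open("tokenizer.json", "w", encoding="utf-8") as f:
    # to_json() returns the JSON string that tokenizer_from_json() expects back
    f.write(tokenizer.to_json())
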
Example No. 2
def convert_text_to_index_array(text):
    # cnn_lstm_model()
    # we're still going to use a Tokenizer here, but we don't need to fit it
    # for human-friendly printing
    # labels = ['positive', 'neutral', 'negative']

    # read in our saved dictionary
    '''
    with open('D:\\app\\DL_models\\Aspect\\vocab.json', 'r') as dictionary_file:
        dictionary = json.load(dictionary_file)
    '''
    with open(
            'D:\\Final IT\\GRADUATE THESIS\\Projects\\CNN\\word2vec\\vocab.json'
    ) as f:
        data = json.load(f)
    dictionary = tokenizer_from_json(data)

    words = kpt.text_to_word_sequence(
        text, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n\'')
    wordIndices = []
    for word in words:
        if word in dictionary.word_index:
            wordIndices.append(dictionary.word_index[word])
        else:
            print("'%s' not in training corpus; ignoring." % (word))
    return wordIndices
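
A possible downstream use of the index list returned above, assuming a model has been loaded elsewhere (the input text, maxlen, and the model variable are assumptions):

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hypothetical caller: pad the word indices to a fixed length before prediction.
indices = convert_text_to_index_array("the staff was friendly but the food was bland")
batch = pad_sequences([indices], maxlen=100)  # assumed sequence length
# prediction = model.predict(batch)           # 'model' is assumed to be loaded elsewhere
print(batch.shape)                            # (1, 100)
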
Example No. 3
    def setUp(self):
        self.model = load_model(filename)

        with open(vocabualry_file, "r") as json_file:
            json_content = json_file.read()

        self.tokenizer = tokenizer_from_json(json_content)
Example No. 4
def evaluation(text):

    clean_text = treatment(text)

    # Change texts into sequence of indexes
    with open('tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    X = tokenizer.texts_to_sequences(
        [clean_text]
    )  #Change the texts into numeric identifiers that represent the index of each word in the dictionary

    # Pad the sequences
    X = pad_sequences(X, 40)  #Sequences have the same size

    #Change input form
    X_ = X.reshape(X.shape[0], X.shape[1], 1)

    #Model loaded
    model = keras.models.load_model('best_model.hdf5')

    predictions = model.predict(X_).reshape(1, -1)[0]
    predictions = ["Positivo" if x < 0.5 else "Negativo" for x in predictions]

    return (clean_text, predictions[0])
Example No. 5
def test_tokenizer_serde_fitting():
    sample_texts = [
        'There was a time that the pieces fit, but I watched them fall away',
        'Mildewed and smoldering, strangled by our coveting',
        'I\'ve done the math enough to know the dangers of our second guessing'
    ]
    tokenizer = text.Tokenizer(num_words=100)
    tokenizer.fit_on_texts(sample_texts)

    seq_generator = tokenizer.texts_to_sequences_generator(sample_texts)
    sequences = [seq for seq in seq_generator]
    tokenizer.fit_on_sequences(sequences)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.char_level == recovered.char_level
    assert tokenizer.document_count == recovered.document_count
    assert tokenizer.filters == recovered.filters
    assert tokenizer.lower == recovered.lower
    assert tokenizer.num_words == recovered.num_words
    assert tokenizer.oov_token == recovered.oov_token

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs
Example No. 6
 def get_tokenizer(self):
     '''
     :param embedded_matrix_size: size of the embedding matrix
     :return: tokenizer
     '''
     if not os.path.exists(self.tokenizer_path):
         self.remove_stop_word_list = self.get_remove_stop_word()
         tokenizer = text.Tokenizer(num_words=self.embedded_matrix_size,
                                    lower=False,
                                    char_level=False)
         tokenizer.fit_on_texts(self.remove_stop_word_list)
         tokenizer_json = tokenizer.to_json()
         with open(self.tokenizer_path, "w") as f:
             f.write(tokenizer_json)
             print("save tokenizer_json success as '{}'".format(
                 self.tokenizer_path))
         return tokenizer
     else:
         print("更换数据集需手动删除{}此文件,并重新运行代码后会自动生成tokenizer.".format(
             self.tokenizer_path))
         with open(self.tokenizer_path, "r") as f:
             tokenizer_json = f.read()
         tokenizer = text.tokenizer_from_json(tokenizer_json)
         print("load tokenizer_json success as '{}'".format(
             self.tokenizer_path))
         return tokenizer
Example No. 7
def eval(ctx, evaluation_data, model_dir, batch_size):
    MODEL = ctx.obj['model']
    _, x_test, y_test = load_data(evaluation_data,
                                  sample_ratio=1,
                                  shuffle=False)
    with open(join(model_dir, 'tokenizer.json')) as f:
        json_string = f.read()
        tokenizer = tokenizer_from_json(json_string)
    with open(join(model_dir, 'model.config'), 'rb') as f:
        config = pickle.load(f)
    makedirs(model_dir, exist_ok=True)
    x_test, _, tokenizer = \
        data_preprocessing(x_test, max_len=config['max_len'], tokenizer=tokenizer)

    config['batch_size'] = batch_size

    config = Bunch(config)
    sess = tf.compat.v1.Session()
    model = MODEL(config)
    logger = Logger(sess, config)
    trainer = BaseTrain(sess,
                        model,
                        None,
                        config,
                        logger,
                        val_data=(x_test, y_test),
                        restore=True)

    summaries_dict, _, _ = trainer.eval()
    print(summaries_dict)
    with open(
            join(model_dir,
                 f'evaluation-on-"{basename(evaluation_data)}".json'),
            'w') as fp:
        json.dump(summaries_dict, fp, cls=CustomJSONEncoder)
Example No. 8
def _load_tokenizer(config):
    with open(
            os.path.join(config.get('PATHS', 'model_path'), 'saved_models',
                         config.get('INFERENCE', 'inference_model'),
                         'tokenizer.json'), 'rb') as f:
        tok = json.load(f)
    return tokenizer_from_json(tok)
Example No. 9
def test(dataset, style, test_file):
    obj = json.load(open(test_file, 'r'))
    # obj = json.load(open('/media/wentian/sdb2/work/caption_ma/save/2019-10-04_21-09-37_2agent_neg/annotation.json', 'r'))['annotations']
    sents = [i['caption'] for i in obj]

    d = pickle.load(
        open(r'../data/clf_nn/model_rnn_info_{}_{}.pkl'.format(dataset, style),
             'rb'))
    w, e, tokenizer_config = d['word_index'], d['embeddings_index'], d[
        'tokenizer_config']
    MAX_SEQUENCE_LENGTH, nclasses, EMBEDDING_DIM = d['MAX_SEQUENCE_LENGTH'], d[
        'nclasses'], d['EMBEDDING_DIM']
    tokenizer = tokenizer_from_json(tokenizer_config)
    model_RNN = Build_Model_RNN_Text(w,
                                     e,
                                     nclasses=nclasses,
                                     MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH,
                                     EMBEDDING_DIM=EMBEDDING_DIM)
    model_RNN.load_weights('../data/clf_nn/model_rnn_{}_{}.h5'.format(
        dataset, style))

    X_train_Glove = test_tokenize(tokenizer, sents, MAX_SEQUENCE_LENGTH)
    predicted = model_RNN.predict_classes(X_train_Glove, verbose=0)

    for i in range(len(predicted)):
        if predicted[i] == 0:
            print(sents[i], predicted[i])
    print(sum(predicted) / len(predicted))
Example No. 10
 def __init__(self):
     self.__assessment_model = tf.keras.models.load_model(
         'release/assessment_model.h5',
         custom_objects={'soft_acc': soft_acc})
     self.__simplification_model = tf.keras.models.load_model(
         'release/simplification_model.h5')
     with open('release/assessment_tokenizer.json') as f:
         data = json.load(f)
         self.__assessment_tokenizer = tokenizer_from_json(data)
     with open('release/simplification_tokenizer.json') as f:
         data = json.load(f)
         self.__simplification_tokenizer = tokenizer_from_json(data)
     self.levels = ['Beginner', 'Intermediate', 'Advanced']
     self.__id_to_word = {
         i: word
         for word, i in self.__simplification_tokenizer.word_index.items()
     }
Example No. 11
def get_tokenizer(folder_path):

    folder_path = os.path.join('tmp/', folder_path)
    tokenizer_path = os.path.join(folder_path, 'tokenizer.json')
    with open(tokenizer_path) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer
Example No. 12
def loadModel(name='qasystem'):
    with open(name + '.tok', 'rb') as fin:
        tokJSON = pickle.load(fin)

    return load_model(name + '.model',
                      custom_objects={'SeqSelfAttention': SeqSelfAttention
                                      }), tokenizer_from_json(tokJSON)
Example No. 13
 def __init__(self):
     self.logger = logging.getLogger('predictor.Predictor')
     self.nn = NeuralNetwork()
     with open(self.BRANDS, 'r') as f:
         self.brands = np.array(json.load(f))
     with open(self.CATEGORIES, 'r') as f:
         self.categories = np.array(json.load(f))
     with open(self.TOKENIZER, 'r') as f:
         self.tokenizer = tokenizer_from_json(f.read())
     self.nn.load([self.MODEL_ARCH, self.MODEL_WEIGHTS])
Example No. 14
 def __init__(self):
     self.preprocessor = textPreprocessor()
     self.model = load_model(
         os.path.join(CURRENT_PATH, "../../artifacts/c_lstm_reduced.h5"))
     self.model._make_predict_function()
     with open(
             os.path.join(CURRENT_PATH,
                          "../../artifacts/keras_tokenizer.json")) as f:
         data = json.load(f)
         self.tokenizer = tokenizer_from_json(data)
Example No. 15
 def get_tokenizer(self):
     self.tokenizer_path = config.data_preprocessing_config().tokenizer_path
     if not os.path.exists(self.tokenizer_path):
         print("please run data_preprocessing generate '{}'".format(
             self.tokenizer_path))
         exit()
     else:
         with open(self.tokenizer_path, "r") as f:
             tokenizer_json = f.read()
         tokenizer = text.tokenizer_from_json(tokenizer_json)
         return tokenizer
Example No. 16
def preload_model():
    ''' Returns nothing. Creates two global variables.
            model: the Keras model to perform sentiment analysis.
            tokenizer: the Keras tokenizer to tokenize predictions.
        '''
    global tokenizer
    global keras_model
    with open(os.path.join(dirname, 'tokenizer.json')) as f:
        json_data = json.load(f)
        tokenizer = tokenizer_from_json(json_data)
    keras_model = tf.keras.models.load_model(os.path.join(dirname, 'model.h5'))
Example No. 17
    def load_tokenizer(self, file_name=None):
        if file_name is None:
            file_name = tokenizer_path + "/" + self.source + '_tokenizer.json'

        if os.path.isfile(file_name):
            with open(file_name) as f:
                data = json.load(f)
                tokenizer = tokenizer_from_json(data)
                return tokenizer
        else:
            return Tokenizer(num_words=10000)
Example No. 18
 def inferEmotion(self, input_string):
      # tokenizer_from_json expects the raw JSON string, so read the file as text
      with open('tokenizer.json') as f:
        data = f.read()
      tokenizer = text.tokenizer_from_json(data)
     tokenized_inputs = tokenizer.texts_to_sequences([input_string])
     padded_inputs = sequence.pad_sequences(tokenized_inputs, maxlen=1000)
     formatted_input = padded_inputs.tolist()
     data = json.dumps({"instances": formatted_input})
     json_response = requests.post("http://diadist.herokuapp.com/v1/models/diarydistiller/versions/1:predict", data=data)
     response = json.loads(json_response.text)
     return response["predictions"][0]
Example No. 19
def test_tokenizer_serde_no_fitting():
    tokenizer = text.Tokenizer(num_words=100)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.get_config() == recovered.get_config()

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs
Example No. 20
def preprocess_data(data,
                    use_loaded=True,
                    file_emb="./data/glove.840B.300d.txt",
                    max_num_words=50000,
                    max_len_seq=35,
                    emb_dim=300):
    # preprocess data
    file_processed_data = dir_processed + "data_processed.pkl"
    file_tokenizer = dir_processed + "tokenizer.pkl"
    file_label_index = dir_processed + "label_index.npy"
    if use_loaded:
        X, y, emb = pickle.load(open(file_processed_data, "rb"))
        tokenizer = tokenizer_from_json(
            open(file_tokenizer, "r", encoding="utf-8").read())
        label_encoder = LabelEncoder()
        label_encoder.classes_ = np.load(file_label_index)
        return X, y, emb, tokenizer, label_encoder

    cleaned_text = data["text"].apply(clean_text).values
    tokenizer = Tokenizer(num_words=max_num_words,
                          oov_token='oov_token_placeholder')
    tokenizer.fit_on_texts(list(cleaned_text))
    tokenizer_json = tokenizer.to_json(ensure_ascii=False)
    with open(file_tokenizer, 'w', encoding='utf-8') as fout:
        fout.write(tokenizer_json)

    sequences = tokenizer.texts_to_sequences(cleaned_text)
    X = pad_sequences(sequences, maxlen=max_len_seq)
    word_index = tokenizer.word_index
    num_words = len(word_index)
    print('Found %s Words' % num_words)

    print(set(data["label"].values))
    label_encoder = LabelEncoder().fit(data["label"].values)
    np.save(file_label_index, label_encoder.classes_)
    print('Found %s Classes' % len(label_encoder.classes_))
    y = label_encoder.transform(data["label"].values)

    print('Loading Word Embeddings...')
    emb = (np.random.rand(min(num_words + 1, max_num_words), emb_dim) -
           0.5) * 0.1  # +1 because idx 0 is not used
    with open(file_emb, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens[0] in word_index.keys(
            ) and word_index[tokens[0]] < max_num_words:
                emb[word_index[tokens[0]]] = np.asarray(tokens[1:],
                                                        dtype='float32')

    pickle.dump((X, y, emb), open(file_processed_data, "wb"))
    return X, y, emb, tokenizer, label_encoder
Example No. 21
 def __fit_tokenizer(self):
     if os.path.isfile(config.vocab_path):
         with open(config.vocab_path, 'r') as f:
             json_content = f.read()
             self.tokenizer = tokenizer_from_json(json_content)
     else:
         tmp_doc = (self.beg_token + ' ' + self.end_token + ' ') * 100
         docs = [tmp_doc, self.__read_raw_formulas('train')]
          num_tokens = config.vocab_size - 3  # for beg, end, and unk tokens
         self.tokenizer = Tokenizer(num_words=num_tokens,
                                    filters='\t\n',
                                    lower=False,
                                    oov_token=self.unk_token)
         self.tokenizer.fit_on_texts(docs)
         with open(config.vocab_path, 'w+') as f:
             f.write(self.tokenizer.to_json())
Example No. 22
    def from_numpy(
        self,
        train_data_file: str,
        test_data_file: str = None,
        val_data_file: str = None,
        ds_type="TensorDataset",
    ):

        logging.info("Starting Data Preparation...")
        start_time = time.time()

        # NOTE: tokenizer_from_json() requires the JSON string produced by Tokenizer.to_json();
        # as written this call is missing that argument (see the sketch after this example).
        self.tokenizer = text.tokenizer_from_json()

        train_npz = np.load(train_data_file, allow_pickle=True)
        self.X_train = train_npz["X"].item()
        self.y_train = train_npz["y"]

        self.num_classes = len(np.unique(self.y_train))
        self.vocab_size = np.shape(self.X_train)[1]

        train_ds = CSRDataset(self.X_train, self.y_train)

        if test_data_file is not None:
            test_npz = np.load(test_data_file, allow_pickle=True)
            self.X_test = test_npz["X"].item()
            self.y_test = test_npz["y"]

            test_ds = CSRDataset(self.X_test, self.y_test)

        if val_data_file is not None:
            val_npz = np.load(val_data_file, allow_pickle=True)
            self.X_val = val_npz["X"].item()
            self.y_val = val_npz["y"]

            val_ds = CSRDataset(self.X_val, self.y_val)

        logging.info("Data Import Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if val_data_file is not None:
            if test_data_file is not None:
                return train_ds, val_ds, test_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
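
Since the text.tokenizer_from_json() call above is missing the JSON string it requires, a minimal corrected loading sketch is shown below; the tokenizer file path is an assumption:

from tensorflow.keras.preprocessing import text

TOKENIZER_PATH = "tokenizer.json"  # assumed location of the tokenizer saved during preprocessing

with open(TOKENIZER_PATH, "r", encoding="utf-8") as f:
    tokenizer_json = f.read()

# tokenizer_from_json expects the JSON string produced by Tokenizer.to_json()
tokenizer = text.tokenizer_from_json(tokenizer_json)
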
Example No. 23
def load_vocabulary_file_from(directory_path_or_file, split_name=None, flat=True):
    """
        @param split_name: when given looks for the sub-directory or file in the flat directory
        @param flat: when True looks for a file in the given directory, otherwise looks into the sub-directory 
    """
    lookup_filename = DEFAULT_VOCABULARY_FILE_NAME
    
    if split_name and not flat:
        directory_path_or_file = "/".join([directory_path_or_file, split_name])
        
    if split_name and flat:
        lookup_filename = "mscoco_vocabulary_{}.json".format(split_name) 
        # print("No support for split specific vocabulary loading. Please just name the file to use to " + lookup_filename)
        
    tokenizer_config = load_json_from(directory_path_or_file, lookup_filename)
    tokenizer = tokenizer_from_json(json.dumps(tokenizer_config))
    return tokenizer
Example No. 24
def review_process(request):
    if request.method == 'POST':
        review_stmt = request.POST.get('review')
        english_stops = set(stopwords.words('english'))
        max_length = 130  #From train data
        #Loading stored model
        loaded_model = load_model(
            '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/sentiment.h5'
        )
        # Pre-process input
        regex = re.compile(r'[^a-zA-Z\s]')
        review_stmt = regex.sub('', review_stmt)
        print('Cleaned: ', review_stmt)

        words = review_stmt.split(' ')
        filtered = [w for w in words if w not in english_stops]
        filtered = ' '.join(filtered)
        filtered = [filtered.lower()]

        print('Filtered: ', filtered)
        with open(
                '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/data.txt'
        ) as json_file:
            token = json.load(json_file)
        token = tokenizer_from_json(token)
        print("TOKEN:", token)
        tokenize_words = token.texts_to_sequences(filtered)
        tokenize_words = pad_sequences(tokenize_words,
                                       maxlen=max_length,
                                       padding='post',
                                       truncating='post')
        print('tokenize words', tokenize_words)
        result = loaded_model.predict(tokenize_words)
        #result=1
        print(result)
        if result >= 0.5:
            print('positive')
            return HttpResponse('Positive review')
        else:
            print('negative')
            return HttpResponse('Negative review')
    else:
        return render(request, 'form.html')
Example No. 25
def input_Text_proc(text_Data):
	text_Data = text_Data.strip()
	text_Data = re.split('; |, |\*|\n| ', text_Data)
	print(text_Data)
	global tokenizer
	if tokenizer is None:
		print('Loading tokenizer')
		with open('./Tokenizer/tokenizer2.json') as f:
			token_json = json.load(f)
			tokenizer = text.tokenizer_from_json( token_json )
	
	res = [[]]
	for w in text_Data:
		w = w.lower()
		if w not in tokenizer.word_index:
			res[0].append(0)
		else:
			res[0].append(tokenizer.word_index[w])
	print(res)
	return res
Example No. 26
def predict(ctx, prediction_data, model_dir):
    MODEL = ctx.obj['model']
    titles_test, x_test, y_test = load_data(prediction_data, shuffle=False)
    labels_available = True
    with open(join(model_dir, 'tokenizer.json')) as f:
        json_string = f.read()
        tokenizer = tokenizer_from_json(json_string)
    with open(join(model_dir, 'model.config'), 'rb') as f:
        config = pickle.load(f)
    makedirs(model_dir, exist_ok=True)
    x_test, _, tokenizer = \
        data_preprocessing(x_test, max_len=config['max_len'], tokenizer=tokenizer)
    config['batch_size'] = 1
    config = Bunch(config)
    sess = tf.compat.v1.Session()
    model = MODEL(config)
    if y_test is None:
        y_test = np.zeros((x_test.shape[0], model.y.shape[1]))
        labels_available = False
    logger = Logger(sess, config)
    trainer = BaseTrain(sess,
                        model,
                        None,
                        config,
                        logger,
                        val_data=(x_test, y_test),
                        restore=True,
                        no_labels=True)

    _, predictions, probabilities = trainer.eval()
    frame_dict = {'title': titles_test, 'prediction': predictions}
    for i in range(probabilities.shape[1]):
        frame_dict[f'probability_{i}'] = probabilities[:, i]
    if labels_available:
        frame_dict[f'true'] = np.argmax(y_test, axis=1)
    df = pd.DataFrame.from_dict(frame_dict)
    print(df)
    df.to_csv(join(model_dir,
                   f'predictions-for-"{basename(prediction_data)}".csv'),
              index=False)
Example No. 27
def get_predict(text):
    tokenizer = Tokenizer(num_words=50000)
    labels = ['chưa xác định', 'bệnh hạ huyết áp', 'bệnh viêm đường ruột']

    with open('C:\\Users\\HONGANH\\OneDrive\\Resources\\vocab.json') as f:
        data = json.load(f)
    dictionary = tokenizer_from_json(data)

    json_file = open(
        'C:\\Users\\HONGANH\\OneDrive\\Resources\\CNN_train_3c_relu.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()

    model = model_from_json(loaded_model_json)
    model.load_weights(
        'C:\\Users\\HONGANH\\OneDrive\\Resources\\CNN_train_3c-035-0.0476-0.9940.h5'
    )

    tok_sam, seq_sam, sample = load_full_data()

    sentence = []

    evalSentence = text
    if evalSentence:
        evalSentence = evalSentence.lower()

    sentence.append(evalSentence)
    eval_text = tok_sam.texts_to_sequences(sentence)
    text_test = pad_sequences(eval_text,
                              maxlen=sample.shape[1],
                              padding=pad[0])
    # pred = model.predict(testArr)
    pred = model.predict(text_test)
    print("%s; độ tin cậy %f%%" %
          (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
    del evalSentence
    sentence = []

    return labels[np.argmax(pred)]
Example No. 28
def classify_review(review_stmt):  # returns True for positive, False for negative
    english_stops = set(stopwords.words('english'))
    max_length = 130  #From train data
    #Loading stored model
    loaded_model = load_model(
        '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/sentiment.h5'
    )
    # Pre-process input
    regex = re.compile(r'[^a-zA-Z\s]')
    review_stmt = regex.sub('', review_stmt)
    print('Cleaned: ', review_stmt)

    words = review_stmt.split(' ')
    filtered = [w for w in words if w not in english_stops]
    filtered = ' '.join(filtered)
    filtered = [filtered.lower()]

    # print('Filtered: ', filtered)
    #token = Tokenizer(lower=False)
    with open(
            '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/data.txt'
    ) as json_file:
        token = json.load(json_file)
    token = tokenizer_from_json(token)
    tokenize_words = token.texts_to_sequences(filtered)
    tokenize_words = pad_sequences(tokenize_words,
                                   maxlen=max_length,
                                   padding='post',
                                   truncating='post')
    print(tokenize_words)
    result = loaded_model.predict(tokenize_words)
    #print(result)
    if result >= 0.5:
        #print('positive')
        return True  #1
    else:
        #print('negative')
        return False  #0
Example No. 29
def eval(data, batch, maxlen, selected_layer, model_dir):

    cleaned_test = preprocess(data['document'].values)

    if selected_layer=='bert':
        FullTokenizer = bert.bert_tokenization.FullTokenizer
        tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)

        eval_tokens = [["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"] for sentence in cleaned_test]
        eval_data = [tokenizer.convert_tokens_to_ids(token) for token in eval_tokens]
        eval_data = pad_sequences(eval_data, maxlen=maxlen, dtype="long", truncating="post", padding="post")

    else:
        with open('tokenizer.json') as f:
            json_data = json.load(f)
            tokenizer=tokenizer_from_json(json_data)
        eval_data = tokenizer.texts_to_sequences(cleaned_test)
        eval_data = pad_sequences(eval_data, padding='post', maxlen=maxlen)

    labels = np.array(data['label'])
    model= load_model(model_dir)

    test_loss, test_acc = model.evaluate(x=eval_data, y=labels, verbose=1, batch_size=batch)
    print("Test Loss: {}\nTest Accuracy:{}".format(test_loss,test_acc))
Example No. 30
def load_dictionary(dict_path, encoding="utf-8"):
    with open(dict_path, mode="r", encoding=encoding) as file:
        return tokenizer_from_json(json.load(file))
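
Assuming load_dictionary returns a restored Tokenizer, a short usage sketch; the dictionary path and the encoded sentence are illustrative:

# Hypothetical usage: restore the tokenizer and reuse its vocabulary.
tokenizer = load_dictionary("tokenizer.json")  # assumed path
print(len(tokenizer.word_index), "words in vocabulary")
sequences = tokenizer.texts_to_sequences(["an example sentence to encode"])
print(sequences)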