Example #1
    def update_tokenizer(self, init=False, **kwargs):
        if init:
            tokenizer = Tokenizer(num_words=self.max_features)
        else:
            with open(self.tokenizer_path, encoding="utf-8") as handle:
                tokenizer = tokenizer_from_json(json.load(handle))

        questions = []
        answers = []
        answer_categories = []
        for element in self.question:
            questions.append(mecab.morphs(element.question))
            if element.answer is not None:
                answers.append(mecab.morphs(element.answer))
                answer_categories.append(mecab.morphs(element.answer_category))
            else:
                answers.append("없음")  # "없음" ("none") is the placeholder for a missing answer
                answer_categories.append("없음")

        tokenizer.fit_on_texts(questions)
        tokenizer.fit_on_texts(answers)
        tokenizer.fit_on_texts(answer_categories)

        with open(self.tokenizer_path, "w", encoding="utf-8") as handle:
            json.dump(tokenizer.to_json(), handle)

        return tokenizer
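For reference, the pattern shared by the examples on this page is a simple round trip: Tokenizer.to_json() produces a JSON string, and tokenizer_from_json() restores an equivalent tokenizer from it. A minimal, self-contained sketch of that round trip (the file name here is only illustrative):

import json
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(["an example sentence", "another example"])

# Save: to_json() returns a JSON string, which json.dump writes as a JSON string literal.
with open("tokenizer.json", "w", encoding="utf-8") as handle:
    json.dump(tokenizer.to_json(), handle)

# Load: json.load returns that string, which tokenizer_from_json() accepts.
with open("tokenizer.json", encoding="utf-8") as handle:
    restored = tokenizer_from_json(json.load(handle))

assert restored.texts_to_sequences(["an example"]) == tokenizer.texts_to_sequences(["an example"])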
Example #2
def loader():
    model = load_model('../assets/lstm1/rnn')

    with open('../assets/lstm1/rnn_tokenizer.json') as f:
        data = json.load(f)
        tokenizer = text.tokenizer_from_json(data)
    return model, tokenizer
Example #3
def prepare_ans_ques_ref(ans, ques, ref):

    max_length = 40
    trunc_type = 'post'

    with open(os.path.join('data', 'tokenizer.json')) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)

    ans_sequences = tokenizer.texts_to_sequences([ans])
    ques_sequences = tokenizer.texts_to_sequences([ques])
    new_list = ' '.join(ref)
    ref_sequences = tokenizer.texts_to_sequences([new_list])

    ans_padded = pad_sequences(ans_sequences,
                               maxlen=max_length,
                               truncating=trunc_type)
    ques_padded = pad_sequences(ques_sequences,
                                maxlen=max_length,
                                truncating=trunc_type)
    ref_padded = pad_sequences(ref_sequences,
                               maxlen=2 * max_length,
                               truncating=trunc_type)

    return [
        np.expand_dims(np.concatenate((ans_padded[0], ques_padded[0]), axis=0),
                       axis=0),
        np.expand_dims(ref_padded[0], axis=0)
    ]
Example #4
def generate_output(next_words, token_file, model_used):
    max_sequence_len = 32
    usr_input = input("Your input: ")
    print("\n")
    sys.stdout.write("Generating Psalmist word...\n")
    print("\n")
    with open(token_file) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    # Load the model once, outside the generation loop.
    model = load_model(model_used)
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([usr_input])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)[0]
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        usr_input += " " + output_word
    sys.stdout.write("Genesys-Machine: " + '"' + usr_input + '"')
Example #5
    def __init__(self):
        self.tokenizer_path = "./tokenizer/tokenizer.json"
        self.checkpoint_path = "./model/cp.ckpt"
        self.model_path = "./model/model.h5"
        self.max_features = 1000
        self.maxlen = 100  # Defines the number of characters per sentence referenced during training.

        stmt = text("select question, answer, answer_category from question")
        stmt = stmt.columns(Question.question, Question.answer,
                            Question.answer_category)
        self.question = db_session.query(
            Question.question, Question.answer,
            Question.answer_category).from_statement(stmt).all()

        if path.exists(self.tokenizer_path):
            with open(self.tokenizer_path, encoding="utf-8") as handle:
                self.tokenizer = tokenizer_from_json(json.load(handle))
        else:
            # update_tokenizer (see Example #1) is a method on this class, so call it via self.
            self.tokenizer = self.update_tokenizer(init=True)

        if path.exists(self.model_path):
            self.model = load_model(self.model_path)
        else:
            self.model = self.create_model()
            self.learn()
Example #6
def generate_sentences(model_path, seed_path, num_words):
    model = getmodel(31, 10000, 32, model_path)
    tokenizer_path = model_path + '.tokenizer.json'

    with open(tokenizer_path, 'r') as f:
        tokenizer = tokenizer_from_json(f.read())

    seed = create_indexes_tape('seed3', tokenizer)
    seed_seq = list(seed)
    pred = None
    out_seq = []

    for _ in range(num_words):
        seed_seq.extend(out_seq)
        seq_input = seed_seq[-31:]
        seq_input = np.expand_dims(seq_input, 0)
        res = model.predict([seq_input], 1)
        pred = res.squeeze().argmax()
        out_seq.append(pred)

    words = [tokenizer.index_word[w] for w in out_seq if w not in [0]]
    sentence = ' '.join(words).replace('<eom>', '\n')
    print(sentence)
Example #7
    def load_tokenizer(self, filename='tokens.json'):
        # read as <str> from JSON file
        with open(f'{self.source_path}/{filename}', 'r') as tokenfile:
            tokens_info = tokenfile.read()

        from tensorflow.keras.preprocessing.text import tokenizer_from_json
        self.tokenizer = tokenizer_from_json(tokens_info)
        return self.tokenizer
Example #8
def loadTokenizer(path_to_file):

    with open(path_to_file) as f:
        tk_json = json.load(f)

    tokenizer = tokenizer_from_json(tk_json)

    return tokenizer
Example #9
def loadModelAndTokenizer():
    # test 2
    document_path = os.getcwd() + "/app/models/"
    model = keras.models.load_model(document_path)
    with open(os.getcwd() + '/app/models/tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return model, tokenizer
Example #10
    def load_tokenizer(self):
        try:
            with io.open(tokenizer_path) as f:
                json_string = json.load(f)
                self._tokenizer = tokenizer_from_json(json_string)
        except Exception as e:
            print("Model.load_tokenizer() failed. Error: {}".format(e))
        return self
Example #11
    def __init__(self, cache_path=None, stop_words=None, **extra):
        if cache_path and os.path.exists(cache_path):
            with open(cache_path, 'r') as f:
                self._tk = tokenizer_from_json(f.read())
        else:
            self._tk = Tokenizer(lower=True, **extra)

        self._cache_path = cache_path
Example #12
File: ml.py Project: shindohikaru87/adl
    def __init__(self, model_dir):
        model_path = os.path.join(model_dir, 'gru.h5')
        artifacts_path = os.path.join(model_dir, 'model_artifacts.pkl')

        self.model = load_model(model_path)
        self.artifacts = pickle.load(open(artifacts_path, 'rb'))
        tokenizer_config = self.artifacts['tokenizer_config']
        self.tokenizer = tokenizer_from_json(json.dumps(tokenizer_config))
        self.sequence_len = self.artifacts['sequence_len']
Example #13
def train(dataset_path,
        run_hash, 
        seq_len=32, 
        vocab_size=10000,
        emb_dim=32,
        batch_size=128,
        epochs=20,
        train_split = 0.8,
        val_split = 0.2):
    
    logs_path, ckp_path, tok_path = check_dirs(run_hash)

    ckp_cb = tf.keras.callbacks.ModelCheckpoint(
        ckp_path,
        'val_accuracy', 
        save_best_only=False,
        save_weights_only=True)

    lr_cb = tf.keras.callbacks.LearningRateScheduler(
        create_lr_sched(epochs/2, epochs), True)

    tb_cb = tf.keras.callbacks.TensorBoard(
            logs_path, 10, True, True, 
            embeddings_freq=10,  
            embeddings_metadata=logs_path+'/meta.tsv')

    with open(tok_path, 'r') as f:
        tokenizer = tokenizer_from_json(f.read())

    indexes_tape = create_indexes_tape(dataset_path, tokenizer)
    train_nbatches = int((len(indexes_tape)-seq_len) * train_split / batch_size)
    val_nbatches = int((len(indexes_tape)-seq_len) * val_split / batch_size)
    
    train_ds, val_ds = create_datasets(
        indexes_tape,
        train_nbatches, 
        val_nbatches,
        batch_size, 
        seq_len, 
        vocab_size)

    model = getmodel(seq_len-1, vocab_size, emb_dim, ckp_path)

    embeddings = model.layers[0].weights[0].numpy() 
    export_vocabulary(vocab_size, tokenizer.word_index, logs_path)
    export_embeddings(embeddings, logs_path)

    hist = model.fit(
        train_ds, 
        batch_size=batch_size, 
        epochs=epochs,
        steps_per_epoch=train_nbatches,
        validation_data=val_ds, 
        callbacks=[ckp_cb, lr_cb, tb_cb])
Example #14
File: model.py Project: ramonsaraiva/olie
def load_from_cache(model_cache, model_weights, tokenizer_cache, tags_cache):
    with open(model_cache, 'r') as model_f:
        model = model_from_json(json.load(model_f))
    model.load_weights(str(model_weights))

    with open(tokenizer_cache, 'r') as tokenizer_cache_f:
        tokenizer = tokenizer_from_json(json.load(tokenizer_cache_f))

    with open(tags_cache, 'r') as tags_cache_f:
        tags = json.load(tags_cache_f)

    return model, tokenizer, tags
Example #15
def prepare_embeddings(gcp_bucket, num_words, w2v_model_path, embedding_dim,
                       json_tokenizer_path, output_emb_matrix_path,
                       vocabulary_size_path):
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting embeddings preparation step...')
    logging.info('Input data:')
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('num_words:{}'.format(num_words))
    logging.info('w2v_model_path:{}'.format(w2v_model_path))
    logging.info('embedding_dim:{}'.format(embedding_dim))
    logging.info('json_tokenizer_path:{}'.format(json_tokenizer_path))
    logging.info('output_emb_matrix_path:{}'.format(output_emb_matrix_path))
    logging.info('vocabulary_size_path:{}'.format(vocabulary_size_path))
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcp_bucket)
    model = Word2Vec()
    blob_w2v = bucket.get_blob(w2v_model_path)
    destination_uri = "/02_w2v_model.bin"
    blob_w2v.download_to_filename(destination_uri)
    w2v_model = model.wv.load(destination_uri)
    word_vectors = w2v_model.wv
    logging.info('STEP: PREP EMB (1/3) Word2Vec model loaded.')
    # Load JSON tokenizer
    with open(json_tokenizer_path) as f:
        json_token = json.load(f)
    tokenizer = tokenizer_from_json(json_token)
    word_index = tokenizer.word_index
    vocabulary_size = min(len(word_index) + 1, num_words)
    logging.info('STEP: PREP EMB (2/3) Tokenizer loaded.')
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim),
                                dtype=np.float32)  # float32: word vectors are real-valued
    for word, i in word_index.items():
        if i >= num_words:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25),
                                                   embedding_dim)
    del word_vectors
    logging.info('STEP: PREP EMB (3/3) Embedding matrix generated.')
    # Save the matrix at output_emb_matrix_path
    logging.info('Writing output data.')
    if not os.path.exists(os.path.dirname(output_emb_matrix_path)):
        os.makedirs(os.path.dirname(output_emb_matrix_path))
    if not os.path.exists(os.path.dirname(vocabulary_size_path)):
        os.makedirs(os.path.dirname(vocabulary_size_path))
    # tofile() opens the path itself, so no separate open() is needed.
    embedding_matrix.tofile(output_emb_matrix_path)
    with open(vocabulary_size_path, 'w') as f:
        f.write(str(vocabulary_size))
    logging.info('Prepare embeddings step finished.')
Example #16
def tokenize(text):
	# load tokenizer
	max_len = 128

	with open(BASE_DIR+"tokenizer.json") as json_file:
		tok_json = json.load(json_file)

	tok = tokenizer_from_json(tok_json)

	# tokenize 
	sequences = tok.texts_to_sequences([text])
	return sequence.pad_sequences(sequences,maxlen=max_len)
Example #17
def prepare_embeddings(
    gcp_bucket: str, num_words: int, w2v_model_path: str, embedding_dim: int,
    json_tokenizer_path: InputPath(str), num_classes: int,
    output_emb_matrix_path: OutputPath(str)
) -> NamedTuple('PrepareEmbOutput', [('vocabulary_size', int)]):
    from gensim.models import Word2Vec
    from google.cloud import storage
    from tensorflow.keras.preprocessing.text import tokenizer_from_json
    import os
    import json
    import numpy as np
    from collections import namedtuple
    # Storage client for loading the w2v model
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcp_bucket)
    # Load w2v model
    model = Word2Vec()
    blob_w2v = bucket.get_blob(w2v_model_path)
    destination_uri = '{}/{}'.format(".", blob_w2v.name)
    if not os.path.exists(destination_uri):
        os.mkdir("/model")
    blob_w2v.download_to_filename(destination_uri)
    w2v_model = model.wv.load(destination_uri)
    word_vectors = w2v_model.wv

    # Load Json tokenizer

    # blob_tok = bucket.get_blob(json_tokenizer_path)
    with open(json_tokenizer_path) as f:
        json_token = json.load(f)

    tokenizer = tokenizer_from_json(json_token)
    word_index = tokenizer.word_index

    vocabulary_size = min(len(word_index) + 1, num_words)
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim),
                                dtype=np.float32)  # float32: word vectors are real-valued
    for word, i in word_index.items():
        if i >= num_words:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25),
                                                   embedding_dim)
    del word_vectors
    # Save the matrix @ output_embd_matrix_path
    embedding_matrix.tofile(output_emb_matrix_path)

    PrepareEmbOutput = namedtuple('PrepareEmbOutput', ['vocabulary_size'])
    return PrepareEmbOutput(vocabulary_size)
Example #18
def tokenize(corpus, ngrams=1, grams_join=" "):
    lst_corpus = ngrams_preprocess(corpus,
                                   ngrams=ngrams,
                                   grams_join=grams_join)
    with open('\\Users\\Zeden\\Desktop\\tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)
    X = preprocessing.sequence.pad_sequences(lst_text2seq,
                                             maxlen=15,
                                             padding="post",
                                             truncating="post")
    return X
Example #19
def predictor(sentence):
    sentence = [sentence]
    with open('tokenizer_2.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    sequence = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(sequence,
                           padding='post',
                           maxlen=100,
                           truncating='post')
    model = load_model('fake_news_predictor_2.h5')
    prediction = model.predict_classes(padded)
    return prediction[0][0]
Example #20
def load_tokenizer(dict_path: AnyStr) -> Tokenizer:
    """ 加载分词器工具

    :param dict_path: 字典路径
    :return: 分词器
    """
    if not os.path.exists(dict_path):
        raise FileNotFoundError("字典不存在,请检查后重试!")

    with open(dict_path, "r", encoding="utf-8") as dict_file:
        json_string = dict_file.read().strip().strip("\n")
        tokenizer = tokenizer_from_json(json_string=json_string)

    return tokenizer
Example #21
def clean_split_pad_data(sentences, config_obj):
    """ Clean data, split into train/test groups, and pad sentences for training """
    print("Cleaning data (This may take a few minutes)...")
    if config_obj.language == "english":
        stopwords_list = set(stopwords.words('english'))
        important_words_english = [
            'above', 'below', 'off', 'over', 'under', 'more', 'most', 'such',
            'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just',
            'but'
        ]
        stopwords_list = stopwords_list - set(important_words_english)
        sentences_clean = [
            " ".join(clean_words_eng(sentence, stopwords_list))
            for sentence in sentences
        ]
    elif config_obj.language == "spanish":
        nlp = spacy.load('es_core_news_sm', disable=['ner', 'parser'])
        stopwords_list = set(stopwords.words('spanish'))
        important_words_spanish = [
            'encima', 'debajo', 'menos', 'abajo', 'más', 'tal', 'no', 'ni',
            'solamente', 'entonces', 'que', 'demasiado', 'muy', 'también',
            'sólo', 'pero'
        ]
        stopwords_list = stopwords_list - set(important_words_spanish)
        sentences_clean = [
            " ".join(clean_words_esp(sentence, nlp, stopwords_list))
            for sentence in sentences
        ]

    # Load tokenizer from json file
    tokenizer_name = "tokenizer_" + config_obj.language[:2] + ".json"
    with open(tokenizer_name) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)

    # Creating padded sequences from train and test data
    sequences_clean = tokenizer.texts_to_sequences(sentences_clean)
    padded = pad_sequences(sequences_clean,
                           maxlen=config_obj.max_len,
                           padding=config_obj.pad_type,
                           truncating=config_obj.trunc_type)

    # Converting the list to numpy array
    sentences_np = np.array(padded)

    return sentences_np
Example #22
def get_run_components(run_dir):
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers
    with open(os.path.join(run_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
        y_tokenizer = LabelEncoder()
        y_tokenizer.classes_ = np.load(os.path.join(run_dir,
                                                    'y_tokenizer.npy'),
                                       allow_pickle=True)

        # Load model
        model = models.TextCNN(embedding_dim=args.embedding_dim,
                               vocab_size=len(X_tokenizer.word_index) + 1,
                               num_filters=args.num_filters,
                               filter_sizes=args.filter_sizes,
                               hidden_dim=args.hidden_dim,
                               dropout_p=args.dropout_p,
                               num_classes=len(y_tokenizer.classes_))

        model.summary(input_shape=(10, ))  # build it
        model_path = os.path.join(run_dir, 'model/cp.ckpt')
        model.load_weights(model_path)

        # Conv output model
        conv_outputs_model = models.ConvOutputsModel(
            vocab_size=len(X_tokenizer.word_index) + 1,
            embedding_dim=args.embedding_dim,
            filter_sizes=args.filter_sizes,
            num_filters=args.num_filters)
        conv_outputs_model.summary(input_shape=(10, ))  # build it

        # Set weights
        conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
        conv_layer_start_num = 1

        for layer_num in range(conv_layer_start_num,
                               conv_layer_start_num + len(args.filter_sizes)):
            conv_outputs_model.layers[layer_num].set_weights(
                model.layers[layer_num].get_weights())

        return args, model, conv_outputs_model, X_tokenizer, y_tokenizer
Example #23
def train_embedding():
    MAX_WORDS = 10000
    EMBEDDING_DIM = 300
    embeddings_index = load_vectors(VECTORS)

    with open(TOKENIZER) as f:
        json_obj = json.load(f)
        tokenizer = tokenizer_from_json(json_obj)

    word_index = tokenizer.word_index
    num_words = min(MAX_WORDS, len(word_index) + 1)

    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = list(embedding_vector)
    pickle.dump(embedding_matrix, open(EMBEDDING_MATRIX, "wb"))
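Examples #15, #17, and #23 all build an embedding matrix aligned with tokenizer.word_index. Such a matrix is typically used to initialize a frozen Keras Embedding layer; a minimal sketch under that assumption (the layer placement and the embedding_matrix variable refer to the code above; nothing here is taken from these projects' later code):

import tensorflow as tf

embedding_layer = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],   # vocabulary size
    output_dim=embedding_matrix.shape[1],  # embedding dimension
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)  # keep the pretrained vectors fixed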
Example #24
def fit_tokenizer(load_existing=False):
    '''
    Returns the tokenizer, creating it if it doesn't exist.

    load_existing (Type: boolean): indicates whether to load the existing tokenizer
                                    or create a new one; useful when the training data is updated.
    '''
    if load_existing:
        if subject != "":
            with open(subject + 'tokenizer.json') as f:
                data = json.load(f)
                tokenizer = tokenizer_from_json(data)
            return tokenizer
    else:
        if subject != "":
            tokenizer = Tokenizer(num_words=num_words, oov_token='UNK')
            tokenizer.fit_on_texts(patterns)
            tokenizer_json = tokenizer.to_json()
            with io.open(subject + 'tokenizer.json', 'w',
                         encoding='utf-8') as f:
                f.write(json.dumps(tokenizer_json, ensure_ascii=False))
            return tokenizer
Example #25
    def post(self, request):
        try:
            max_len = 150
            trunc_type = 'post'
            padding = 'post'
            model = load_model('api/res/model/sentiment_version_2.h5')
            print('Model loaded successfully.')
            with open('api/res/assets/tokenizer_version_2.json') as t:
                datas = json.load(t)
                tokenizer = tokenizer_from_json(datas)
                print('Tokenizer loaded successfully.')
            cleaned_review = CleanTokenize([request.data.get('review')])
            test_sequences = tokenizer.texts_to_sequences(cleaned_review)
            test_padded = pad_sequences(test_sequences,
                                        maxlen=max_len,
                                        padding=padding)
            prob = model.predict(test_padded)
            if prob[0][0] <= 0.2:
                result = 1
            elif prob[0][0] <= 0.4:
                result = 2
            elif prob[0][0] <= 0.6:
                result = 3
            elif prob[0][0] <= 0.8:
                result = 4
            else:
                result = 5
            # Saving data to db Model
            result_data = {
                'review': request.data.get('review'),
                'rating': result
            }
            serializer = SentimentSerializer(data=result_data)
            if serializer.is_valid():
                serializer.save()
            return Response(data=serializer.data,
                            status=status.HTTP_201_CREATED)
        except ValueError as V:
            return Response(data=str(V), status=status.HTTP_400_BAD_REQUEST)
Example #26
    def __init__(self,
                 json_tokenizer,
                 VOCAB_SIZE,
                 max_seq_lenght=None,
                 red_seq_from_common=True):

        self.tokenizer = tokenizer_from_json(json_tokenizer)
        self.VOCAB_SIZE = VOCAB_SIZE
        self.max_seq_lenght = max_seq_lenght
        self.red_seq_from_common = red_seq_from_common

        dictionary = self.tokenizer.word_index
        self.word2idx = {}
        self.idx2word = {}
        for k, v in dictionary.items():
            if v < VOCAB_SIZE:
                self.word2idx[k] = v
                self.idx2word[v] = k

        self.idx_2_keep = [
            self.word2idx[BOS.strip()], self.word2idx[EOS.strip()]
        ]
Example #27
def sentimentclassifier(request):
    try:
        max_length = 100
        trunc_type = 'post'
        mydata = request.data
        model = load_model('./app/model/sentiment.h5')

        with open('./app/model/tokenizer.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)

        test_sequence = tokenizer.texts_to_sequences(list(mydata.values()))
        test_padded = pad_sequences(test_sequence,
                                    maxlen=max_length,
                                    padding='post',
                                    truncating=trunc_type)
        result = model.predict(test_padded)
        result = (result >= 0.5)
        new_df = pd.DataFrame(result, columns=['Status'])
        new_df = new_df.replace({True: 'Positive', False: 'Negative'})

        return Response(new_df)
    except ValueError as e:
        return Response(e.args[0], status.HTTP_400_BAD_REQUEST)
Example #28
from tensorflow.keras.models import load_model
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import pandas as pd
import app.algorithms.process_predictions as pp

MAXWORDS = 3000

model = load_model("./data/models/nb_stream_fasttext_10k.h5")
with open("./data/tokenizer_stream_10k.json") as f:
    json_obj = json.load(f)
    tokenizer = tokenizer_from_json(json_obj)

vid = "cnpUNEWP1i8"
channel = "MentourPilot"  #For file naming only

transcript, full_text, captionCount = pp.processVideo(vid)
predictions = pp.getPredictions(model, tokenizer, full_text)

df = pd.DataFrame(predictions)
words = full_text.split(" ")
df["text"] = words + ["N/A"] * (len(predictions) - len(words))
df.to_csv(f"./examples/{channel}_{vid}.csv", index=False)

sponsorTimestamps = pp.getTimestamps(transcript, captionCount, predictions,
                                     words)
print(sponsorTimestamps)

with open(f"./examples/{channel}_{vid}.txt", 'w') as file:
    file.write("Timestamps:\n")
    for ts in sponsorTimestamps:
        file.write(f"{ts}\n")
Example #29
    def loadTokenizer(self):
        with open(self.MODEL_DIR + 'tokenizer.json', 'r') as f:
            data = json.load(f)
            self.tokenizer = tokenizer_from_json(data)
Example #30
import tensorflow as tf
from tensorflow.keras.models import load_model
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 100
trunc_type = 'post'

model = load_model("sentiment.h5")

with open('tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

sentence = input(str("Give your reviews: "))


def classify(sentence):
    test_sequence = tokenizer.texts_to_sequences([sentence])
    test_padded = pad_sequences(test_sequence,
                                maxlen=max_length,
                                padding='post',
                                truncating=trunc_type)
    result = model.predict(test_padded)
    if result[0][0] >= 0.5:
        return 1
    else:
        return 0