Example #1
def get_tokens(questions, answers):
    """
    Computes the tokens of a lists of questions and answers
    
    Returns:
    -- token_ques_input: tokens of questions to input to encoder
    -- token_ans_input: tokens of answers to input to decoder
    -- token_ans_target: tokens of answers as target to decoder
    -- vocab_size: size of the vocabulary (number of words)
    -- t: Tokenizer() object instance

    parameters:
    -- questions: list of strings (questions)
    -- answers: list of strings (answers)
    """

    t = Tokenizer()

    t.fit_on_texts(answers + questions)

    vocab_size = len(t.word_index) + 1

    # one_hot hashes each word into [1, n); vocab_size serves as an extra
    # start-of-sequence id prepended to the decoder input
    token_ans_target = [one_hot(sentence, n=len(t.word_index)) for sentence in answers]
    token_ans_input = [[vocab_size] + one_hot(sentence, n=len(t.word_index)) for sentence in answers]
    token_ques_input = [one_hot(sentence, n=len(t.word_index)) for sentence in questions]

    return token_ques_input, token_ans_input, token_ans_target, vocab_size, t
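A minimal usage sketch for get_tokens; the question/answer pairs below are made up for illustration, and Tokenizer and one_hot are assumed to come from tensorflow.keras.preprocessing.text:

# Usage sketch, assuming the Keras preprocessing imports used by get_tokens.
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot

questions = ["how are you", "what is your name"]
answers = ["i am fine", "my name is bot"]

ques_in, ans_in, ans_target, vocab_size, t = get_tokens(questions, answers)
print(vocab_size)   # number of distinct words + 1
print(ans_in[0])    # starts with the special start-of-sequence id (vocab_size)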
Example #2
def text_preparation(data):

    # Input vars
    nltk.download('stopwords')
    embedding_vector_feature_title = 10
    embedding_vector_feature_text = 100
    sent_length_title = 20
    sent_length_text = 1000
    vo_size = 500
    ps_title = PorterStemmer()
    ps_text = PorterStemmer()
    corpus_title = []
    corpus_text = []

    #Copy df with title and description columns
    X_test = data[['title', 'description']]
    messages = X_test.copy()
    messages.reset_index(inplace=True)

    #Preproc text
    for i in range(0, len(messages)):
        print("Status: %s / %s" % (i, len(messages)), end="\r")

        #preproc title
        review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
        review = review.lower()
        review = review.split()

        review = [
            ps_title.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus_title.append(review)

        #preproc text
        review = re.sub('[^a-zA-Z]', ' ', messages['description'][i])
        review = review.lower()
        review = review.split()

        review = [
            ps_text.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus_text.append(review)

    #Data frame representation for NN
    onehot_rep_title = [one_hot(words, vo_size) for words in corpus_title]
    onehot_rep_text = [one_hot(words, vo_size) for words in corpus_text]
    embedded_doc_title = pad_sequences(onehot_rep_title,
                                       padding='pre',
                                       maxlen=sent_length_title)
    embedded_doc_text = pad_sequences(onehot_rep_text,
                                      padding='pre',
                                      maxlen=sent_length_text)
    X_final_title = np.array(embedded_doc_title)
    X_final_text = np.array(embedded_doc_text)

    return X_final_title, X_final_text
Example #3
def pred_method(z):
    nltk.download('stopwords')
    ps = PorterStemmer()

    def process_data(test_sen):
        test_sen = test_sen.lower()
        test_sen = test_sen.split()
        test_sen = [
            ps.stem(word) for word in test_sen
            if word not in stopwords.words('english')
        ]
        test_sen = ' '.join(test_sen)
        return test_sen

    test_sen = z
    processed = process_data(test_sen)
    oh = one_hot(processed, 1000)
    sent_len = 20
    embedded_docs = pad_sequences([oh], padding='pre', maxlen=sent_len)
    X = np.array(embedded_docs)

    model = load_model('spam_classification.h5')
    pred = model.predict_classes(X)[0][0]

    return pred
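Note: Sequential.predict_classes was removed in TensorFlow 2.6. On newer versions, the same binary decision for a sigmoid output can be sketched as:

# Drop-in replacement for predict_classes on TF >= 2.6 (a sketch).
pred = int((model.predict(X) > 0.5).astype("int32")[0][0])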
Example #4
def pred():
    x = [text for text in request.form.values()]
    LEMMATIZER = WordNetLemmatizer()
    unknown = np.array(x)
    corpus1 = []
    sentence_length = 80

    lines1 = re.sub('[^a-zA-Z]', ' ', str(unknown))
    #lines=line.sub('https?:\/\/.*[\r\n]*','',lines)
    lines1 = lines1.lower()
    lines1 = lines1.split()
    lines1 = [
        LEMMATIZER.lemmatize(j) for j in lines1
        if j not in stopwords.words('english')
    ]
    lines1 = ' '.join(lines1)
    corpus1.append(lines1)
    OHR = [one_hot(k, 4000) for k in corpus1]
    embeddings = pad_sequences(OHR, sentence_length)
    embeddings = embeddings.reshape(1, -1)
    predicted = model.predict(embeddings)
    #predicted=np.argmax(predicted,axis=-1)

    return render_template(
        'index.html',
        prediction_text='Mood = {} Softmax Distr between Negative|Neutral|Positive'.format(predicted))
Example #5
def preprocessing_and_all(dataframe_dir):
  df_test = pd.read_csv(dataframe_dir)
  df_temp = df_test.copy()
  df_temp['text'] = df_temp['text'].str[:10]
  df_temp['title'] = df_temp['title'].fillna(df_temp['text'])
  df_temp.reset_index(inplace=True)

  corpus = []
  ps = PorterStemmer()

  for i in range(0, len(df_temp)):
    cleaning = re.sub('[^a-zA-Z]', ' ', df_temp['title'][i])
    cleaning = cleaning.lower()
    cleaning = cleaning.split()

    cleaning = [ps.stem(word) for word in cleaning if word not in stopwords.words('english')]
    cleaning = " ".join(cleaning)
    corpus.append(cleaning)
  
  voc_size = 5000 #Number of words for the one hot encoding
  sent_length = 20 #Max length for padding
  embedding_vector_features = 40 #Number of vector features for embedding

  #One hot encoding
  onehot_repr = [one_hot(sentence, voc_size) for sentence in corpus]

  #Padding
  embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
  
  X = np.array(embedded_docs)
  y_pred_real = model.predict_classes(X)  
  df_temp['label'] = y_pred_real
  df_to_submit = df_temp[['id', 'label']]
  return df_to_submit
Example #6
def income_message(message):
    voc_size = 5000
    nltk.download('stopwords')
    model = tf.keras.models.load_model('main_model.h5')
    Test = []
    Test.append(message)
    ps = PorterStemmer()
    corpus_test = []
    for i in range(0, len(Test)):
        #     review = re.sub('[^a-zA-Z]', ' ',Test['title'][i])
        review = re.sub('[^a-zA-Z]', ' ', Test[i])
        review = review.lower()
        review = review.split()

        review = [
            ps.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus_test.append(review)

    one_hot_rep_Test = [one_hot(words, voc_size) for words in corpus_test]
    sentence_length = 25
    embedded_docs_test = pad_sequences(one_hot_rep_Test,
                                       padding='pre',
                                       maxlen=sentence_length)
    x_test = np.array(embedded_docs_test)
    check = model.predict_classes(x_test)
    final_ans = int(check[0][0])
    return final_ans
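income_message assumes main_model.h5 (saved by the training routine in Example #17 below) is on disk; an illustrative call, with a made-up message:

label = income_message("Congratulations, you won a free prize! Click here.")
print("positive class" if label == 1 else "negative class")  # label semantics depend on the training data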
Example #7
def predict():
    if request.method == 'POST':
        sen = str(request.form['comment'])

        review1 = re.sub('[^a-zA-Z]', ' ', sen)
        review1 = review1.lower()
        review1 = review1.split()
        review1 = [
            lt.lemmatize(word) for word in review1
            if word not in stopwords.words('english')
        ]
        review1 = ' '.join(review1)

        onehot_repx = [one_hot(review1, voc_size)]

        emd_x = pad_sequences(onehot_repx, padding='pre', maxlen=100)

        x_input = np.array(emd_x)

        y_out = model.predict_classes(x_input)

        label = int(y_out[0][0])
        for i, j in kv.items():
            if label == j:
                output = i

        return render_template(
            'result.html',
            prediction_text='your customer is {}'.format(output))
Example #8
def predict():
    if request.method == "POST":
        news = (request.form["News"])
        corpus = []
        review = re.sub('[^a-zA-Z]', ' ', news)
        review = review.lower()
        review = review.split()
        print(review)

        review = [
            ps.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus.append(review)
        print(corpus)

        onehot_repr = [one_hot(words, 10000) for words in corpus]
        print(onehot_repr)
        embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=20)
        print(embedded_docs)
        prediction = model.predict(embedded_docs)
        print(prediction)

        if prediction > 0.1:
            output = "Real"
        else:
            output = "Fake"
        print(output)

        return render_template(
            'index.html', prediction_text=f'This News Headline is {output}!')

    return render_template("index.html")
Example #9
def get_text_to_vect(text):
    onehot_repr = [one_hot(words, voc_size) for words in text]

    embedded_docs = pad_sequences(onehot_repr,
                                  padding='pre',
                                  maxlen=sent_length)
    return embedded_docs
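get_text_to_vect relies on module-level voc_size and sent_length (plus the Keras one_hot and pad_sequences utilities); a minimal sketch with assumed values, which must match whatever the model was trained with:

voc_size = 5000    # assumed
sent_length = 20   # assumed

vectors = get_text_to_vect(["breaking news headline", "another headline"])
print(vectors.shape)  # (2, 20)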
Example #10
def detect():

    model = keras.models.load_model("pickles/news_classifier_model.h5")
    vectorizer = joblib.load("pickles/data_transformer.joblib")

    if request.method == "POST":
        url = request.form["textNews"]
        urls = [url]
        print(urls)
        data = pd.DataFrame(data=[urls], columns=["title"])
        # print(data.url)
        preprocessed_title = vectorizer(data.title)
        one_hot_repr_title = [
            one_hot(words, 5000) for words in preprocessed_title
        ]
        embedded_title = pad_sequences(one_hot_repr_title,
                                       padding='pre',
                                       maxlen=20)
        print("Embeded", embedded_title)
        embedded_title = np.array(embedded_title)
        prediction_proba = model.predict_proba(embedded_title)[0]
        print("Prediction", model.predict(embedded_title))

        return render_template("result.html", prediction=prediction_proba)
    return render_template("index.html")
Example #11
def preprocessing(sentence):
    # divide sentences as each words using(text to word sequences)
    words = set(text_to_word_sequence(sentence))  # remove duplicate words
    vocab_size = len(words)
    # words to numeric value(vector)
    results = one_hot(sentence, round(vocab_size * 1.3))
    return results
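Despite its name, one_hot is a hashing trick: each word is hashed into [1, n), so unrelated words can collide. A quick check of preprocessing, with a made-up sentence, assuming the Keras text utilities used above are imported:

ids = preprocessing("the quick brown fox jumps over the lazy dog")
print(ids)  # hashed ids; repeated words share an id, and distinct words may collide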
Example #12
def fnm_text(inputnewstext):
    manual_variable_initialization(True)

    stemnews = re.sub('[^a-zA-Z]', ' ', inputnewstext)
    stemnews = stemnews.lower()
    stemnews = stemnews.split()
    ps = PorterStemmer()
    stemnews = [
        ps.stem(word) for word in stemnews
        if word not in stopwords.words('english')
    ]
    stemnews = ' '.join(stemnews)[0:1000]
    vocabulary_size = 10000
    onehot_repr = [one_hot(stemnews, vocabulary_size)]
    embedded_text = pad_sequences(onehot_repr, padding='pre', maxlen=1000)
    embedding_vector_features = 50
    tmodel = Sequential()
    tmodel.add(
        Embedding(vocabulary_size,
                  embedding_vector_features,
                  input_length=1000))
    tmodel.add(Bidirectional(LSTM(100)))
    tmodel.add(Dense(1, activation='sigmoid'))
    tmodel.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
    tmodel.load_weights(os.path.join(settings.MODEL_ROOT, 'textweights.h5'))

    x = embedded_text.reshape(1, 1000)
    result = tmodel.predict_classes(x)
    return result[0][0]
Example #13
def reliable(file):
    with open(file, 'r') as file:
        text = file.read().replace('\n', ' ')

    text = nltk.tokenize.sent_tokenize(text)

    text = [token.lower() for token in text]

    onehot_enc = [one_hot(words, 10000) for words in text]
    sen_len = 300
    embedded_doc = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)
    text_final = np.array(embedded_doc)

    pred = model.predict(text_final)

    pred_df = pd.DataFrame(pred)
    text_df = pd.DataFrame(text)
    result_df = pd.concat([pred_df, text_df], axis=1)

    pred_df.columns = ["predictions"]

    result_df.columns = ["predictions", "Sentence"]

    print(result_df.loc[result_df['predictions'] <= 0.7])

    fakeres_df = result_df.loc[result_df['predictions'] <= 0.7]
    fake_df = fakeres_df["Sentence"]

    return fake_df
Example #14
 def encode(self, message):
     encoded = one_hot(message, max_features)
     encoded = pad_sequences([encoded],
                             dtype='int32',
                             padding='post',
                             value=0,
                             maxlen=maxlen)
     return encoded
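A sketch of calling encode on an instance of the enclosing class; encoder is a hypothetical instance, and max_features and maxlen are module-level constants whose values are assumed here and must match the trained model:

max_features = 20000  # assumed
maxlen = 100          # assumed

vec = encoder.encode("free entry in a weekly competition")  # encoder is hypothetical
print(vec.shape)  # (1, 100): post-padded with zeros up to maxlen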
Example #15
def clean(corpus):
    onehot_repr = [one_hot(words, voc_size) for words in corpus]
    sent_length = 20
    embedded_docs = pad_sequences(onehot_repr,
                                  padding='pre',
                                  maxlen=sent_length)
    finalnew = np.array(embedded_docs)
    return finalnew
Example #16
def predict_function():
    if request.method == "POST":
        message = request.form["message"]
        input_data = ["message"]
        one_hot_data = [one_hot(words, 5000) for words in input_data]
        sequenced_data = pad_sequences(one_hot_data, maxlen=20, padding='pre')
        final_data = np.array(sequenced_data)
        my_prediction = classifier.predict(final_data)
        return render_template('result.html', prediction=my_prediction)
Example #17
def train_model():
    Train = pd.read_csv("train.csv")
    Train = Train.dropna()
    Train.reset_index(inplace=True)
    y = Train['label']
    ps = PorterStemmer()
    corpus = []
    for i in range(0, len(Train)):
        review = re.sub('[^a-zA-Z]', ' ', Train['title'][i])
        review = review.lower()
        review = review.split()

        review = [
            ps.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus.append(review)

    voc_size = 5000
    one_hot_rep = [one_hot(words, voc_size) for words in corpus]
    sentence_length = 25
    embedded_docs = pad_sequences(one_hot_rep,
                                  padding='pre',
                                  maxlen=sentence_length)
    embedding_vector_features = 40
    model = Sequential()
    model.add(
        Embedding(voc_size,
                  embedding_vector_features,
                  input_length=sentence_length))
    model.add(Dropout(0.3))
    model.add(LSTM(200))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    z = np.array(embedded_docs)
    y = np.array(y)
    x_train, x_test, y_train, y_test = train_test_split(z,
                                                        y,
                                                        test_size=0.10,
                                                        random_state=42)
    model.fit(x_train,
              y_train,
              validation_data=(x_test, y_test),
              epochs=20,
              batch_size=64)
    model.save("main_model.h5")
Example #18
def load_data(data_dir):
    # NOTE: reads a fixed file; the data_dir argument is currently unused
    with open('as4/data/sample_sentences.txt') as f:
        lines = f.readlines()
    voc_size = 1000
    onehot1 = [one_hot(words, voc_size) for words in lines]
    sent_length = 494
    embedded_docs = pad_sequences(onehot1, padding='pre', maxlen=sent_length)
    x_pred = np.array(embedded_docs)

    return x_pred
Example #19
def testing_model(test_str):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
    from tensorflow.keras.layers import SpatialDropout1D, BatchNormalization, Input
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.preprocessing.text import one_hot
    from tensorflow.keras.preprocessing.text import Tokenizer

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, accuracy_score

    # Stopwords are words that do not add much meaning to a sentence;
    # they can safely be ignored without sacrificing the meaning.
    import nltk
    import re
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.stem.wordnet import WordNetLemmatizer

    d = {'Total': test_str}  # test_str is expected to be a list of document strings
    X_test = pd.DataFrame(data=d)
    ps = PorterStemmer()
    corpus_test = []
    for i in range(len(X_test)):
        # keep only letters; substitute everything else with ' '
        input1 = re.sub('[^a-zA-Z]', ' ', str(X_test['Total'].iloc[i]))
        input1 = input1.lower()  # Lower case
        input1 = input1.split()  # tokenize the text
        input1 = [
            ps.stem(word) for word in input1
            if word not in stopwords.words('english')
        ]
        text = ' '.join(input1)  # join the stemmed words back into one string
        corpus_test.append(text)  # append the cleaned text to the corpus

    #Choosing vocabulary size to be 5000 and copying data to msg for further cleaning
    voc_size = 5000
    onehot_rep_test = [one_hot(words, voc_size) for words in corpus_test]

    #Padding Sentences to make them of same size
    embedded_docs_test = pad_sequences(onehot_rep_test,
                                       padding='pre',
                                       maxlen=25)

    test_final = np.array(embedded_docs_test)

    return test_final
Example #20
def analyze(headline):
    headline = headline.lower()
    headline = headline.split()
    headline = [ps.stem(word) for word in headline if word not in stopwords.words('english')]
    headline = ' '.join(headline)
    onehot_repr = [one_hot(headline, voc_size)]  # headline is a single string, not a corpus
    sent_length = 20
    embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
    x = np.array(embedded_docs)
    pred = model1.predict_classes(x)
    return pred
Example #21
def news():
    if request.method == 'POST':
        data = request.form['news']

        onehotrepr = [one_hot(data, voc_size)]
        embedded_docs = pad_sequences(onehotrepr,
                                      padding='pre',
                                      maxlen=sent_length)

        news = np.argmax(model.predict(embedded_docs), axis=1)[0]

    # 'dict' is assumed to be a module-level index-to-label mapping
    # (it shadows the builtin and would be better renamed)
    return render_template("home.html", news=dict[news])
Example #22
def get_array_from_directory(path):
    array = os.listdir(path)
    m = []
    for n in range(len(array)):

        with open(os.path.join(path, array[n]), encoding='utf8') as f:
            data = f.read()
            words = set(text_to_word_sequence(data))
            result = one_hot(data, round(len(words) * 1.3))
            m.append(result)

    m = pad_sequences(m, maxlen=2000)
    return m
Example #23
def data_preprocessing1(text_message):
    corpus = []
    review = re.sub('[^a-zA-Z]', ' ', text_message)
    review = review.lower()
    review = review.split()
    review = [
        ps.stem(word) for word in review
        if word not in stopwords.words('english')
    ]
    review = ' '.join(review)
    corpus.append(review)

    return [one_hot(words, voc_size) for words in corpus]
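data_preprocessing1 depends on module-level ps and voc_size; a minimal sketch under those assumptions:

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()  # assumed global used inside the function
voc_size = 5000       # assumed; must match the downstream model

tokens = data_preprocessing1("Congratulations! You have won a lottery.")
print(tokens)  # a one-element list of hashed word ids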
Example #24
def predict(content, neural_net_data_path, stemmer, stopwords):
    corpus = []
    review = re.sub('[^a-zA-Z]', ' ', content)
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords]
    review = ' '.join(review)
    corpus.append(review)

    bag_of_words = [one_hot(words, vocabulary_size) for words in corpus]
    X = np.array(pad_sequences(bag_of_words, padding='pre', maxlen=neural_input_length))

    model.load_weights(neural_net_data_path)
    prediction = model.predict(X).tolist()[0][0]
    return prediction
Example #25
def preprocess(msg):
    ps = PorterStemmer()
    corpus = []
    headline = re.sub('[^a-zA-Z]', ' ', str(msg))
    headline = headline.lower()
    headline = headline.split()
    # stemming each word of the title
    headline = [ps.stem(word) for word in headline if word not in stopwords.words('english')]
    headline = ' '.join(headline)
    corpus.append(headline)

    vocab_size = 5000
    one_hot_representation = [one_hot(words, vocab_size) for words in corpus]
    sent_length = 20
    embedded_rep = pad_sequences(one_hot_representation, maxlen=sent_length, padding='pre')
    return embedded_rep
Example #26
def input_preprocessing(x):
    corpus = []

    review = re.sub('[^a-zA-Z]', ' ', str(x))  # keep only ASCII letters
    review = review.lower()
    review = review.split()
    review = [wnl.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

    onehot_repr = [one_hot(words, voc_size) for words in corpus]

    embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
    X = np.array(embedded_docs)
    return X
Example #27
def newsPredict(news):
    ps = PorterStemmer()
    latestNews = news
    corpus_latest = []
    
    review = re.sub('[^a-zA-Z]', ' ', latestNews)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus_latest.append(review)
    
    encoded_docs_latest = [one_hot(d, vocab_size) for d in corpus_latest]
    padded_docs_latest = pad_sequences(encoded_docs_latest, maxlen=max_length, padding='post')
    
    return padded_docs_latest
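newsPredict depends on module-level vocab_size and max_length; one way to exercise it, with both values assumed:

vocab_size = 5000  # assumed
max_length = 20    # assumed

padded = newsPredict("Scientists discover water on the moon")
print(padded.shape)  # (1, 20), ready to feed into model.predict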
Example #28
def fnm(inputnewstext):
    stemnews = re.sub('[^a-zA-Z]', ' ', inputnewstext)
    stemnews = stemnews.lower()
    stemnews = stemnews.split()
    ps = PorterStemmer()
    stemnews = [
        ps.stem(word) for word in stemnews
        if word not in stopwords.words('english')
    ]
    stemnews = ' '.join(stemnews)

    vocabulary_size = 10000

    onehot_repr = [one_hot(stemnews, vocabulary_size)]
    embedded_text = pad_sequences(onehot_repr, padding='pre', maxlen=50)
    x = embedded_text.reshape(1, 50)
    fnm_model = keras.models.load_model('my_model.h5', compile=False)
    result = fnm_model.predict_classes(x)
    return result[0][0]
Example #29
def fnm(title, text):
    stemnews1 = re.sub('[^a-zA-Z]', ' ', title)
    stemnews1 = stemnews1.lower()
    stemnews1 = stemnews1.split()
    ps = PorterStemmer()

    stemnews1 = [
        ps.stem(word) for word in stemnews1
        if word not in stopwords.words('english')
    ]
    stemnews1 = ' '.join(stemnews1)[0:50]
    stemnews2 = re.sub('[^a-zA-Z]', ' ', text)
    stemnews2 = stemnews2.lower()
    stemnews2 = stemnews2.split()
    stemnews2 = [
        ps.stem(word) for word in stemnews2
        if word not in stopwords.words('english')
    ]
    stemnews2 = ' '.join(stemnews2)[0:450]
    stemnews = stemnews1 + " text: " + stemnews2

    vocabulary_size = 10000
    onehot_repr = [one_hot(stemnews, vocabulary_size)]
    embedded_text = pad_sequences(onehot_repr, padding='pre', maxlen=500)

    embedding_vector_features = 50
    model = Sequential()
    model.add(
        Embedding(vocabulary_size, embedding_vector_features,
                  input_length=500))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.load_weights(os.path.join(settings.MODEL_ROOT,
                                    'titletextweights.h5'))

    x = embedded_text.reshape(1, 500)
    result = model.predict_classes(x)

    return result[0][0]
Example #30
def preprocess_transform(message):
    ps = PorterStemmer()
    corpus = []
    vocab_size = 10000
    sent_len = 20

    msg = re.sub('[^A-Za-z]', ' ', message)
    msg = msg.lower()
    msg = msg.split()
    msg = [
        ps.stem(word) for word in msg if word not in stopwords.words('english')
    ]
    msg = ' '.join(msg)
    corpus.append(msg)

    one_hot_rep = [one_hot(word, vocab_size) for word in corpus]

    embedded = pad_sequences(one_hot_rep, maxlen=sent_len)

    return np.array(embedded)
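A closing sketch showing preprocess_transform feeding a loaded classifier; the model file name is hypothetical:

from tensorflow.keras.models import load_model

x = preprocess_transform("Breaking: markets tumble on rate fears")
clf = load_model("spam_model.h5")   # hypothetical file name
print(float(clf.predict(x)[0][0]))  # sigmoid score in [0, 1]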