Example #1
def get_tokens(questions, answers):
    """
    Computes the tokens of a lists of questions and answers
    
    Returns:
    -- token_ques_input: tokens of questions to input to encoder
    -- token_ans_input: tokens of answers to input to decoder
    -- token_ans_target: tokens of answers as target to decoder
    -- vocab_size: size of the vocabulary (number of words)
    -- t: Tokenizer() object instance

    parameters:
    -- questions: list of strings (questions)
    -- answers: list of strings (answers)
    """

    t = Tokenizer()

    t.fit_on_texts(answers + questions)

    vocab_size = len(t.word_index) + 1

    # one_hot hashes words into [1, len(t.word_index)); prepending vocab_size,
    # which lies outside that range, marks the start of each decoder input
    token_ans_target = [one_hot(sentence, n=len(t.word_index)) for sentence in answers]
    token_ans_input = [[vocab_size] + one_hot(sentence, n=len(t.word_index)) for sentence in answers]
    token_ques_input = [one_hot(sentence, n=len(t.word_index)) for sentence in questions]

    return token_ques_input, token_ans_input, token_ans_target, vocab_size, t
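A note on Example #1: one_hot does not reuse the vocabulary fitted by the Tokenizer above; it hashes each word independently, so the resulting ids bear no relation to t.word_index. A minimal sketch of the collision-free alternative, texts_to_sequences, assuming the same TensorFlow/Keras import:

from tensorflow.keras.preprocessing.text import Tokenizer

t = Tokenizer()
t.fit_on_texts(["how are you", "fine thanks"])
print(t.word_index)                           # e.g. {'how': 1, 'are': 2, ...}
print(t.texts_to_sequences(["how are you"]))  # ids consistent with word_index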
Example #2
def text_preparation(data):

    # Input vars (the embedding_vector_feature_* values are unused in this function)
    nltk.download('stopwords')
    embedding_vector_feature_title = 10
    embedding_vector_feature_text = 100
    sent_length_title = 20
    sent_length_text = 1000
    vo_size = 500
    ps_title = PorterStemmer()
    ps_text = PorterStemmer()
    corpus_title = []
    corpus_text = []

    #Copy df with title and description columns
    X_test = data[['title', 'description']]
    messages = X_test.copy()
    messages.reset_index(inplace=True)

    #Preproc text
    for i in range(0, len(messages)):
        print("Status: %s / %s" % (i, len(messages)), end="\r")

        #preproc title
        review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
        review = review.lower()
        review = review.split()

        review = [
            ps_title.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus_title.append(review)

        #preproc text
        review = re.sub('[^a-zA-Z]', ' ', messages['description'][i])
        review = review.lower()
        review = review.split()

        review = [
            ps_text.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus_text.append(review)

    #Data frame representation for NN
    onehot_rep_title = [one_hot(words, vo_size) for words in corpus_title]
    onehot_rep_text = [one_hot(words, vo_size) for words in corpus_text]
    embedded_doc_title = pad_sequences(onehot_rep_title,
                                       padding='pre',
                                       maxlen=sent_length_title)
    embedded_doc_text = pad_sequences(onehot_rep_text,
                                      padding='pre',
                                      maxlen=sent_length_text)
    X_final_title = np.array(embedded_doc_title)
    X_final_text = np.array(embedded_doc_text)

    return X_final_title, X_final_text
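A note on the loops above: stopwords.words('english') rebuilds the stopword list on every call, once per word being filtered. A minimal sketch of caching it as a set instead, assuming nltk.download('stopwords') has already run:

from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))  # build once, O(1) lookups
review = [w for w in "this is a test".split() if w not in STOP_WORDS]
print(review)  # ['test']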
Example #3
def pred_method(z):
    nltk.download('stopwords')
    ps = PorterStemmer()

    def process_data(test_sen):
        test_sen = test_sen.lower()
        test_sen = test_sen.split()
        test_sen = [
            ps.stem(word) for word in test_sen
            if word not in stopwords.words('english')
        ]
        test_sen = ' '.join(test_sen)
        return test_sen

    test_sen = z
    processed = process_data(test_sen)
    oh = one_hot(processed, 1000)
    sent_len = 20
    embedded_docs = pad_sequences([oh], padding='pre', maxlen=sent_len)
    X = np.array(embedded_docs)

    model = load_model('spam_classification.h5')
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    pred = int(model.predict(X)[0][0] > 0.5)

    return pred
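A caveat that applies to pred_method and most snippets on this page: one_hot defaults to Python's built-in hash(), which is salted per process (PYTHONHASHSEED), so ids computed at serving time may not match the ones the model was trained on. A minimal sketch of the stable alternative, hashing_trick with md5:

from tensorflow.keras.preprocessing.text import hashing_trick

# deterministic across runs, unlike one_hot(processed, 1000)
tokens = hashing_trick("free prize click now", 1000, hash_function='md5')
print(tokens)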
Example #4
def pred():
    x = [text for text in request.form.values()]
    LEMMATIZER = WordNetLemmatizer()
    unknown = np.array(x)
    corpus1 = []
    sentence_length = 80

    lines1 = re.sub('[^a-zA-Z]', ' ', str(unknown))
    #lines=line.sub('https?:\/\/.*[\r\n]*','',lines)
    lines1 = lines1.lower()
    lines1 = lines1.split()
    lines1 = [
        LEMMATIZER.lemmatize(j) for j in lines1
        if j not in stopwords.words('english')
    ]
    lines1 = ' '.join(lines1)
    corpus1.append(lines1)
    OHR = [one_hot(k, 4000) for k in corpus1]
    embeddings = pad_sequences(OHR, sentence_length)
    embeddings = embeddings.reshape(-1, 1)
    embeddings = np.transpose(embeddings)
    predicted = model.predict(embeddings)
    #predicted=np.argmax(predicted,axis=-1)

    return render_template(
        'index.html',
        prediction_text='Mood = {} (softmax distribution over Negative | Neutral | Positive)'.format(predicted))
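A note on the shape gymnastics above: pad_sequences on a one-element list already returns shape (1, sentence_length), so the reshape(-1, 1) followed by transpose is a round trip. A minimal sketch:

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

emb = pad_sequences([[5, 9, 2]], 80)
print(emb.shape)                               # (1, 80)
print(np.transpose(emb.reshape(-1, 1)).shape)  # (1, 80) again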
Example #5
def preprocessing_and_all(dataframe_dir):
  df_test = pd.read_csv(dataframe_dir)
  df_temp = df_test.copy()
  df_temp['text'] = df_temp['text'].str[:10]  # first 10 characters of the body text, used as a fallback title below
  df_temp['title'] = df_temp['title'].fillna(df_temp['text'])
  df_temp.reset_index(inplace=True)

  corpus = []
  ps = PorterStemmer()

  for i in range(0, len(df_temp)):
    cleaning = re.sub('[^a-zA-Z]', ' ', df_temp['title'][i])
    cleaning = cleaning.lower()
    cleaning = cleaning.split()

    cleaning = [ps.stem(word) for word in cleaning if word not in stopwords.words('english')]
    cleaning = " ".join(cleaning)
    corpus.append(cleaning)
  
  voc_size = 5000 #Number of words for the one hot encoding
  sent_length = 20 #Max length for padding
  embedding_vector_features = 40 #Number of vector features for embedding (unused in this function)

  #One hot encoding
  onehot_repr = [one_hot(sentence, voc_size) for sentence in corpus]

  #Padding
  embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
  
  X = np.array(embedded_docs)
  # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
  y_pred_real = (model.predict(X) > 0.5).astype(int)
  df_temp['label'] = y_pred_real
  df_to_submit = df_temp[['id', 'label']]
  return df_to_submit
Example #6
def income_message(message):
    voc_size = 5000
    nltk.download('stopwords')
    model = tf.keras.models.load_model('main_model.h5')
    Test = []
    Test.append(message)
    ps = PorterStemmer()
    corpus_test = []
    for i in range(0, len(Test)):
        #     review = re.sub('[^a-zA-Z]', ' ',Test['title'][i])
        review = re.sub('[^a-zA-Z]', ' ', Test[i])
        review = review.lower()
        review = review.split()

        review = [
            ps.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus_test.append(review)

    one_hot_rep_Test = [one_hot(words, voc_size) for words in corpus_test]
    sentence_length = 25
    embedded_docs_test = pad_sequences(one_hot_rep_Test,
                                       padding='pre',
                                       maxlen=sentence_length)
    x_test = np.array(embedded_docs_test)
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    check = (model.predict(x_test) > 0.5).astype(int)
    final_ans = int(check[0][0])
    return final_ans
Example #7
def predict():
    if request.method == 'POST':
        sen = str(request.form['comment'])

        review1 = re.sub('[^a-zA-Z]', ' ', sen)
        review1 = review1.lower()
        review1 = review1.split()
        review1 = [
            lt.lemmatize(word) for word in review1
            if word not in stopwords.words('english')
        ]
        review1 = ' '.join(review1)

        onehot_repx = [one_hot(review1, voc_size)]

        emd_x = pad_sequences(onehot_repx, padding='pre', maxlen=100)

        x_input = np.array(emd_x)

        # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
        y_out = int(model.predict(x_input)[0][0] > 0.5)

        for i, j in kv.items():
            if y_out == j:
                output = i

        return render_template(
            'result.html',
            prediction_text='your customer is {}'.format(output))
Example #8
def predict():
    if request.method == "POST":
        news = request.form["News"]
        corpus = []
        review = re.sub('[^a-zA-Z]', ' ', news)
        review = review.lower()
        review = review.split()
        print(review)

        review = [
            ps.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus.append(review)
        print(corpus)

        onehot_repr = [one_hot(words, 10000) for words in corpus]
        print(onehot_repr)
        embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=20)
        print(embedded_docs)
        prediction = model.predict(embedded_docs)
        print(prediction)

        if prediction > 0.1:
            output = "Real"
        else:
            output = "Fake"
        print(output)

        return render_template(
            'index.html', prediction_text=f'This News Headline is {output}!')

    return render_template("index.html")
Example #9
def get_text_to_vect(text):
    onehot_repr = [one_hot(words, voc_size) for words in text]

    embedded_docs = pad_sequences(onehot_repr,
                                  padding='pre',
                                  maxlen=sent_length)
    return embedded_docs
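A minimal usage sketch for get_text_to_vect, assuming voc_size and sent_length are defined at module level alongside the Keras imports used above; note the argument must be a list of strings, since a bare string would be iterated character by character:

voc_size, sent_length = 5000, 20
print(get_text_to_vect(["breaking news headline"]).shape)  # (1, 20)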
Example #10
def detect():

    model = keras.models.load_model("pickles/news_classifier_model.h5")
    vectorizer = joblib.load("pickles/data_transformer.joblib")

    if request.method == "POST":
        url = request.form["textNews"]
        urls = [url]
        print(urls)
        data = pd.DataFrame(data=[urls], columns=["title"])
        # print(data.url)
        preprocessed_title = vectorizer(data.title)
        one_hot_repr_title = [
            one_hot(words, 5000) for words in preprocessed_title
        ]
        embedded_title = pad_sequences(one_hot_repr_title,
                                       padding='pre',
                                       maxlen=20)
        print("Embeded", embedded_title)
        embedded_title = np.array(embedded_title)
        # predict_proba was removed in TF 2.6; predict already returns probabilities
        prediction_proba = model.predict(embedded_title)[0]
        print("Prediction", model.predict(embedded_title))

        return render_template("result.html", prediction=prediction_proba)
    return render_template("index.html")
Example #11
def preprocessing(sentence):
    # divide sentences as each words using(text to word sequences)
    words = set(text_to_word_sequence(sentence))  # remove duplicate words
    vocab_size = len(words)
    # words to numeric value(vector)
    results = one_hot(sentence, round(vocab_size * 1.3))
    return results
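The round(vocab_size * 1.3) above hashes into a space about 30% larger than the distinct-word count to leave headroom, but distinct words can still collide onto the same id. A minimal sketch of checking for collisions:

from tensorflow.keras.preprocessing.text import one_hot, text_to_word_sequence

sentence = "the quick brown fox jumps over the lazy dog"
unique_words = set(text_to_word_sequence(sentence))
ids = one_hot(sentence, round(len(unique_words) * 1.3))
print(len(set(ids)), "unique ids for", len(unique_words), "unique words")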
Example #12
def fnm_text(inputnewstext):
    manual_variable_initialization(True)

    stemnews = re.sub('[^a-zA-Z]', ' ', inputnewstext)
    stemnews = stemnews.lower()
    stemnews = stemnews.split()
    ps = PorterStemmer()
    stemnews = [
        ps.stem(word) for word in stemnews
        if word not in stopwords.words('english')
    ]
    stemnews = ' '.join(stemnews)[0:1000]
    vocabulary_size = 10000
    onehot_repr = [one_hot(stemnews, vocabulary_size)]
    embedded_text = pad_sequences(onehot_repr, padding='pre', maxlen=1000)
    embedding_vector_features = 50
    tmodel = Sequential()
    tmodel.add(
        Embedding(vocabulary_size,
                  embedding_vector_features,
                  input_length=1000))
    tmodel.add(Bidirectional(LSTM(100)))
    tmodel.add(Dense(1, activation='sigmoid'))
    tmodel.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
    tmodel.load_weights(os.path.join(settings.MODEL_ROOT, 'textweights.h5'))

    x = embedded_text.reshape(1, 1000)
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    result = (tmodel.predict(x) > 0.5).astype(int)
    return result[0][0]
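fnm_text rebuilds the exact architecture before calling load_weights, which is required: a weights file alone carries no graph. A minimal sketch of the alternative, assuming the full model had been saved with save() (the file name here is illustrative):

from tensorflow import keras

# tmodel.save('text_model.h5')                       # at training time
# tmodel = keras.models.load_model('text_model.h5')  # no rebuild needed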
Example #13
def reliable(file):
    with open(file, 'r') as f:  # use a distinct name so the `file` argument isn't shadowed
        text = f.read().replace('\n', ' ')

    text = nltk.tokenize.sent_tokenize(text)

    text = [token.lower() for token in text]

    onehot_enc = [one_hot(words, 10000) for words in text]
    sen_len = 300
    embedded_doc = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)
    text_final = np.array(embedded_doc)

    pred = model.predict(text_final)

    pred_df = pd.DataFrame(pred)
    pred_df.columns = ["predictions"]  # rename before concat so the name carries over
    text_df = pd.DataFrame(text)
    result_df = pd.concat([pred_df, text_df], axis=1)

    result_df.columns = ["predictions", "Sentence"]

    print(result_df.loc[result_df['predictions'] <= 0.7])

    fakeres_df = result_df.loc[result_df['predictions'] <= 0.7]
    fake_df = fakeres_df["Sentence"]

    return fake_df
Example #14
    def encode(self, message):
        encoded = one_hot(message, max_features)
        encoded = pad_sequences([encoded],
                                dtype='int32',
                                padding='post',
                                value=0,
                                maxlen=maxlen)
        return encoded
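Unlike most snippets on this page, this one pads after the tokens (padding='post'); padding before tends to suit a plain LSTM, since the real tokens then sit next to the final timestep. A minimal sketch of the difference:

from tensorflow.keras.preprocessing.sequence import pad_sequences

print(pad_sequences([[7, 8]], maxlen=5, padding='pre'))   # [[0 0 0 7 8]]
print(pad_sequences([[7, 8]], maxlen=5, padding='post'))  # [[7 8 0 0 0]]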
Example #15
def clean(corpus):
    onehot_repr = [one_hot(words, voc_size) for words in corpus]
    sent_length = 20
    embedded_docs = pad_sequences(onehot_repr,
                                  padding='pre',
                                  maxlen=sent_length)
    finalnew = np.array(embedded_docs)
    return finalnew
Example #16
def predict_function():
    if request.method == "POST":
        message = request.form["message"]
        input_data = ["message"]
        one_hot_data = [one_hot(words, 5000) for words in input_data]
        sequenced_data = pad_sequences(one_hot_data, maxlen=20, padding='pre')
        final_data = np.array(sequenced_data)
        my_prediction = classifier.predict(final_data)
        return render_template('result.html', prediction=my_prediction)
Example #17
def train_model():
    Train = pd.read_csv("train.csv")
    Train = Train.dropna()  # drop rows with missing values
    Train.reset_index(inplace=True)
    y = Train['label']
    ps = PorterStemmer()
    corpus = []
    for i in range(0, len(Train)):
        review = re.sub('[^a-zA-Z]', ' ', Train['title'][i])
        review = review.lower()
        review = review.split()

        review = [
            ps.stem(word) for word in review
            if word not in stopwords.words('english')
        ]
        review = ' '.join(review)
        corpus.append(review)

    voc_size = 5000
    one_hot_rep = [one_hot(words, voc_size) for words in corpus]
    sentence_length = 25
    embedded_docs = pad_sequences(one_hot_rep,
                                  padding='pre',
                                  maxlen=sentence_length)
    embedding_vector_features = 40
    model = Sequential()
    model.add(
        Embedding(voc_size,
                  embedding_vector_features,
                  input_length=sentence_length))
    model.add(Dropout(0.3))
    model.add(LSTM(200))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    z = np.array(embedded_docs)
    y = np.array(y)
    x_train, x_test, y_train, y_test = train_test_split(z,
                                                        y,
                                                        test_size=0.10,
                                                        random_state=42)
    model.fit(x_train,
              y_train,
              validation_data=(x_test, y_test),
              epochs=20,
              batch_size=64)
    model.save("main_model.h5")
Example #18
def load_data(data_dir):
    # read from the data_dir argument rather than a hard-coded path
    with open(os.path.join(data_dir, 'sample_sentences.txt')) as f:
        lines = f.readlines()
    voc_size = 1000
    onehot1 = [one_hot(words, voc_size) for words in lines]
    sent_length = 494
    embedded_docs = pad_sequences(onehot1, padding='pre', maxlen=sent_length)
    x_pred = np.array(embedded_docs)

    return x_pred
Example #19
def testing_model(test_str):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
    from tensorflow.keras.layers import SpatialDropout1D, BatchNormalization, Input
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.preprocessing.text import one_hot
    from tensorflow.keras.preprocessing.text import Tokenizer

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, accuracy_score

    #Downloading stopwords
    #Stopwords are the words in any language which do not add much meaning to a sentence.
    #They can safely be ignored without sacrificing the meaning of the sentence.
    import nltk
    import re
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.stem.wordnet import WordNetLemmatizer

    ps = PorterStemmer()
    d = {'Total': test_str}
    X_test = pd.DataFrame(data=d)
    corpus_test = []
    for i in range(len(X_test)):
        #     if i == 20800: continue
        input1 = re.sub(
            '[^a-zA-Z]', ' ', str(X_test.iloc[i]['Total'])
        )  # except a-z and A-Z, substitute all other characters with ' '
        input1 = input1.lower()  # Lower case
        input1 = input1.split()  # tokenize the text
        input1 = [
            ps.stem(word) for word in input1
            if word not in stopwords.words('english')
        ]
        text = ' '.join(
            input1
        )  # concatenating all words into a single text (list is created)#
        corpus_test.append(text)  # appending text into a single corpus #

    #Choosing vocabulary size to be 5000
    voc_size = 5000
    onehot_rep_test = [one_hot(words, voc_size) for words in corpus_test]

    #Padding Sentences to make them of same size
    embedded_docs_test = pad_sequences(onehot_rep_test,
                                       padding='pre',
                                       maxlen=25)

    test_final = np.array(embedded_docs_test)

    return test_final
Example #20
def analyze(headline):
    headline = headline.lower()
    headline = headline.split()
    headline = [ps.stem(word) for word in headline if word not in stopwords.words('english')]
    headline = ' '.join(headline)
    # hash the whole headline once; iterating over the joined string would
    # feed single characters to one_hot
    onehot_repr = [one_hot(headline, voc_size)]
    sent_length = 20
    embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
    x = np.array(embedded_docs)
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    pred = (model1.predict(x) > 0.5).astype(int)
    return pred
Example #21
def news():
    if request.method == 'POST':
        data = request.form['news']

        onehotrepr = [one_hot(data, voc_size)]
        embedded_docs = pad_sequences(onehotrepr,
                                      padding='pre',
                                      maxlen=sent_length)

        news = np.argmax(model.predict(embedded_docs), axis=1)[0]

    return render_template("home.html", news=dict[news])
Example #22
def get_array_from_directory(path):
    array = os.listdir(path)
    m = []
    for name in array:
        with open(os.path.join(path, name), encoding='utf8') as f:
            data = f.read()
            words = set(text_to_word_sequence(data))
            result = one_hot(data, round(len(words) * 1.3))
            m.append(result)

    m = pad_sequences(m, maxlen=2000)
    return m
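A note on the maxlen=2000 above: pad_sequences also truncates, and its default truncating='pre' drops the beginning of any document longer than maxlen hashed tokens. A minimal sketch:

from tensorflow.keras.preprocessing.sequence import pad_sequences

print(pad_sequences([[1, 2, 3, 4]], maxlen=2))                     # [[3 4]]
print(pad_sequences([[1, 2, 3, 4]], maxlen=2, truncating='post'))  # [[1 2]]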
Example #23
def data_preprocessing1(text_message):
    corpus = []
    review = re.sub('[^a-zA-Z]', ' ', text_message)
    review = review.lower()
    review = review.split()
    review = [
        ps.stem(word) for word in review
        if word not in stopwords.words('english')
    ]
    review = ' '.join(review)
    corpus.append(review)

    return [one_hot(words, voc_size) for words in corpus]
Example #24
def predict(content, neural_net_data_path, stemmer, stopwords):
    corpus = []
    review = re.sub('[^a-zA-Z]', ' ', content)
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords]
    review = ' '.join(review)
    corpus.append(review)

    bag_of_words = [one_hot(words, vocabulary_size) for words in corpus]
    X = np.array(pad_sequences(bag_of_words, padding='pre', maxlen=neural_input_length))

    model.load_weights(neural_net_data_path)
    prediction = model.predict(X).tolist()[0][0]
    return prediction
Example #25
def preprocess(msg):
    ps = PorterStemmer()
    corpus = []
    headline = re.sub('[^a-zA-Z]', ' ', str(msg))
    headline = headline.lower()
    headline = headline.split()
    # stemming each word of the title
    headline = [ps.stem(word) for word in headline if word not in stopwords.words('english')]
    headline = ' '.join(headline)
    corpus.append(headline)

    vocab_size = 5000
    one_hot_representation = [one_hot(words, vocab_size) for words in corpus]
    sent_length = 20
    embedded_rep = pad_sequences(one_hot_representation, maxlen=sent_length, padding='pre')
    return embedded_rep
Example #26
def input_preprocessing(x):
    corpus = []

    review = re.sub('[^a-zA-Z]', ' ', str(x))
    review = review.lower()
    review = review.split()
    review = [wnl.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

    onehot_repr = [one_hot(words, voc_size) for words in corpus]

    embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
    X = np.array(embedded_docs)
    return X
Example #27
def newsPredict(news):
    ps = PorterStemmer()
    latestNews = news
    corpus_latest = []
    
    review = re.sub('[^a-zA-Z]', ' ', latestNews)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus_latest.append(review)
    
    encoded_docs_latest = [one_hot(d, vocab_size) for d in corpus_latest]
    padded_docs_latest = pad_sequences(encoded_docs_latest, maxlen=max_length, padding='post')
    
    return padded_docs_latest
Example #28
def fnm(inputnewstext):
    stemnews = re.sub('[^a-zA-Z]', ' ', inputnewstext)
    stemnews = stemnews.lower()
    stemnews = stemnews.split()
    ps = PorterStemmer()
    stemnews = [
        ps.stem(word) for word in stemnews
        if word not in stopwords.words('english')
    ]
    stemnews = ' '.join(stemnews)

    vocabulary_size = 10000

    onehot_repr = [one_hot(stemnews, vocabulary_size)]
    embedded_text = pad_sequences(onehot_repr, padding='pre', maxlen=50)
    x = embedded_text.reshape(1, 50)
    fnm_model = keras.models.load_model('my_model.h5', compile=False)
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    result = (fnm_model.predict(x) > 0.5).astype(int)
    print(result[0][0])
Example #29
def fnm(title, text):
    stemnews1 = re.sub('[^a-zA-Z]', ' ', title)
    stemnews1 = stemnews1.lower()
    stemnews1 = stemnews1.split()
    ps = PorterStemmer()

    stemnews1 = [
        ps.stem(word) for word in stemnews1
        if word not in stopwords.words('english')
    ]
    stemnews1 = ' '.join(stemnews1)[0:50]
    stemnews2 = re.sub('[^a-zA-Z]', ' ', text)
    stemnews2 = stemnews2.lower()
    stemnews2 = stemnews2.split()
    stemnews2 = [
        ps.stem(word) for word in stemnews2
        if word not in stopwords.words('english')
    ]
    stemnews2 = ' '.join(stemnews2)[0:450]
    stemnews = stemnews1 + " text: " + stemnews2

    vocabulary_size = 10000
    onehot_repr = [one_hot(stemnews, vocabulary_size)]
    embedded_text = pad_sequences(onehot_repr, padding='pre', maxlen=500)

    embedding_vector_features = 50
    model = Sequential()
    model.add(
        Embedding(vocabulary_size, embedding_vector_features,
                  input_length=500))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.load_weights(os.path.join(settings.MODEL_ROOT,
                                    'titletextweights.h5'))

    x = embedded_text.reshape(1, 500)
    # predict_classes was removed in TF 2.6; threshold the sigmoid output instead
    result = (model.predict(x) > 0.5).astype(int)

    return result[0][0]
Example #30
def preprocess_transform(message):
    ps = PorterStemmer()
    corpus = []
    vocab_size = 10000
    sent_len = 20

    msg = re.sub('[^A-Za-z]', ' ', message)
    msg = msg.lower()
    msg = msg.split()
    msg = [
        ps.stem(word) for word in msg if word not in stopwords.words('english')
    ]
    msg = ' '.join(msg)
    corpus.append(msg)

    one_hot_rep = [one_hot(doc, vocab_size) for doc in corpus]  # each element of corpus is a full document

    embedded = pad_sequences(one_hot_rep, maxlen=sent_len)

    return np.array(embedded)
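A minimal usage sketch for preprocess_transform, assuming the re, numpy, NLTK, and Keras imports used throughout this page; any downstream model must have been trained with the same vocab_size=10000 and sent_len=20:

x = preprocess_transform("Congratulations, you won a free prize!")
print(x.shape)  # (1, 20)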