def analysis():
    """
    Taking search query into the variable 'key'.
    """
    key = request.form['InputText']
    """
    Performing authentication to access twitter's data.
    (Use twitter developer credentials below and uncomment the following piece commented code).
    """
    """
    consumer_key = ''
    consumer_secret = ''
    access_token = ''
    access_token_secret = ''
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 
    auth.set_access_token(access_token, access_token_secret)
    """
    """
    Creating an api object using tweepy.
    """
    api = tweepy.API(auth)
    """
    Fetching tweets and storing them in results array. 'num' variable denotes the number of tweets to be fetched.
    """
    results = []
    num = 50
    for tweet in tweepy.Cursor(api.search, q=key, lang="en").items(num):
        results.append(tweet)
    """
    Creating a pandas dataframe to capture tweet information.
    """
    dataset = pd.DataFrame()
    dataset["tweet_id"] = pd.Series([tweet.id for tweet in results])
    dataset["username"] = pd.Series(
        [tweet.author.screen_name for tweet in results])
    dataset["text"] = pd.Series([tweet.text for tweet in results])
    dataset["followers"] = pd.Series(
        [tweet.author.followers_count for tweet in results])
    dataset["hashtags"] = pd.Series(
        [tweet.entities.get('hashtags') for tweet in results])
    dataset["emojis"] = pd.Series([
        ','.join(c for c in tweet.text if c in emoji.UNICODE_EMOJI)
        for tweet in results
    ])
    """
    Following piece of code is used to generate wordcloud of the hashtags used in fetched tweets
    """
    Hashtag_df = pd.DataFrame(columns=["Hashtag"])
    j = 0
    for tweet in range(0, len(results)):
        hashtag = results[tweet].entities.get('hashtags')
        for i in range(0, len(hashtag)):
            Htag = hashtag[i]['text']
            Hashtag_df.at[j, 'Hashtag'] = Htag
            j = j + 1
    Hashtag_Combined = " ".join(Hashtag_df['Hashtag'].values.astype(str))
    text = " ".join(dataset['text'].values.astype(str))
    cleaned_text = " ".join([
        word for word in text.split()
        if word != "https" and word != "RT" and word != "co"
    ])
    wc = WordCloud(width=500,
                   height=500,
                   background_color="white",
                   stopwords=STOPWORDS).generate(Hashtag_Combined)
    plt.imshow(wc)
    plt.axis("off")
    r = random.randint(1, 101)
    st = 'static/hashtag' + str(r) + '.png'
    plt.savefig(st, dpi=300)
    """
    Following piece of code is used to get a list of top 5 hashtags
    """
    hashtag = Hashtag_Combined.split(" ")
    df = pd.DataFrame()
    df['hashtags'] = pd.Series([i for i in hashtag])
    data = df['hashtags'].value_counts()
    tag_count_list = data.values[:5]
    tag_list = data.keys()[:5]
    """
    Following piece of code generates tokens using training set.
    """
    x = np.load('../Classification network/X_train.npy', allow_pickle=True)
    tk = Tokenizer(num_words=80000)
    tk.fit_on_texts(x)
    """
    Following piece of code is used to preprocess the fetched tweets. Preprocessing steps : 
    -> Remove links, hashtag symbols and twitter handles.
    -> Convert text to lowercase.
    -> Remove punctuations.
    -> Remove 'rt' from the text. 
    """
    sentDataFrame = dataset.copy(deep=True)
    sentDataFrame['text'] = sentDataFrame['text'].apply(lambda x: ' '.join(
        re.sub(
            r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
            " ", x).split()))
    sentDataFrame["text"] = sentDataFrame["text"].apply(lambda x: x.lower())
    sentDataFrame["text"] = sentDataFrame["text"].apply(
        lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    sentDataFrame["text"] = sentDataFrame["text"].apply(
        lambda x: x.replace('rt', ''))
    """
    Following poece of code is used to generate vector of tokens for each tweet and padding the vectors such that every vector will have a length of 35.
    """
    tweet_tokens = tk.texts_to_sequences(dataset['text'].values)
    tweet_tokens_pad = pad_sequences(tweet_tokens, maxlen=35, padding='post')
    """
    Following poece of code is used to load the model for sentiment classification.
    """
    json_file = open("../Classification network/model.json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    model.load_weights("../Classification network/model.h5")
    """
    Performing predictions using the model
    """
    senModelList = model.predict(x=tweet_tokens_pad)
    em = dataset["emojis"].values
    """
    Following piece of code stores sentiment score for each emoji in a dictionary 'dict'.
    """
    df_emojis = pd.read_csv('emoji_sentiment.csv')
    x = df_emojis['emoji'].values
    y = df_emojis['sentiment_score'].values
    dict = {}
    for i in range(len(x)):
        dict[x[i]] = y[i]
    """
    Following piece of code performs averaging between the sentiment score obtained from the model and the sentiment score of the emojis present in the model. 
    """
    for q in range(len(em)):
        if (em[q] != 'nan'):
            em_sent_score = 0
            sc_list = em[q].split(",")
            emj_count = 0
            for emj in sc_list:
                if emj in dict.keys():
                    em_sent_score = em_sent_score + dict[emj]
                    emj_count += 1
            if (emj_count > 0):
                senModelList[q] = ((
                    (em_sent_score / emj_count) + 1) / 2 + senModelList[q]) / 2
    """
    Following piece of code classifies the sentiment of the tweet and stores it in the datafrane 'dataset'.
    """
    senList = []
    for i in range(num):
        if (senModelList[i] <= 0.5):
            senList.append('n')
        else:
            senList.append('p')
    dataset['sentiment'] = pd.Series(senList)
    """
    Following piece of code stores the sum of positively classified tweets in 'posSentPer' and negatively classified tweets in 'negSentPe' which are than stored in the list 'opList'.
    """
    posSentPer = len(dataset[dataset['sentiment'] == 'p'].sentiment)
    negSentPer = len(dataset[dataset['sentiment'] == 'n'].sentiment)
    opList = [posSentPer, negSentPer]
    """
    Following piece of code stores the positive visibility score in 'posVis' and negative visibility score in 'negVis' which are than combinely stored in the list 'vbList'.
    """
    pos_dataset_for_visibility = dataset[dataset['sentiment'] == 'p']
    posVis = pos_dataset_for_visibility['followers'].sum(axis=0, skipna=True)
    neg_dataset_for_visibility = dataset[dataset['sentiment'] == 'n']
    negVis = neg_dataset_for_visibility['followers'].sum(axis=0, skipna=True)
    vbList = [posVis, negVis]
    """
    Following piece of code stores the tweets in 'tw_text', username of tweets in 'tw_uname' and number of followers of the authors of tweets in 'tw_foll'.
    """
    tw_uname = dataset['username'].values.tolist()
    tw_text = dataset['text'].values.tolist()
    tw_foll = dataset['followers'].values.tolist()

    return render_template('analysis.html',
                           title='analysis',
                           vbList=vbList,
                           key=key,
                           r=r,
                           tag_list=tag_list,
                           opList=opList,
                           tag_count_list=tag_count_list,
                           tw_uname=tw_uname,
                           tw_text=tw_text,
                           tw_foll=tw_foll)
for i in train_sentences:
    tokens = word_tokenize(i) # word tokenization
    words.append(tokens) # append the resulting words to the empty list created at the beginning
print('Word2Vec...') 

from gensim.models import Word2Vec
from keras.layers import Embedding

model = Word2Vec(words, min_count = 1) # word to vector, implementation from the gensim library
vocabulary = model.wv.vocab # vocabulary
name = 'w2v.txt'
model.wv.save_word2vec_format(name, binary = False)

EMBEDDING_FILE = 'w2v.txt' # load embeddings

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_sentences))

tokenized_train_sentences = tokenizer.texts_to_sequences(train_sentences)
tokenized_test_sentences = tokenizer.texts_to_sequences(test_sentences)

train_padding = pad_sequences(tokenized_train_sentences, maxlen)
test_padding = pad_sequences(tokenized_test_sentences, maxlen)

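# Parse the saved word2vec text file into a dict mapping each word to its vector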
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding='utf8'))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
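    # (assumed completion: the loop body was truncated in the source; this is the
    # standard pattern, also used in later examples, for copying pretrained
    # vectors into the embedding matrix)
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
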
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews)

# pad sequences
max_length = max([len(s.split()) for s in total_reviews])
#max_length=500

# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='pre')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='pre')
Example #4
text13 = "eeek"
text14 = "already got tix to watch it again"
text15 = "really awful"
text16 = "not bad"
x_train_text = [text1, text2, text3, text4, text5, text6, text7, text8]
y_train = [1, 1, 0, 0, 0, 0, 0, 0]
x_test_text = [
    text8, text9, text10, text11, text12, text13, text14, text15, text16
]
y_test = [0, 0, 1, 1, 0, 0, 1, 0, 1]
texts = x_train_text + x_test_text
num_words = 100  # use only 100 most popular words from the dataset
max_tokens = 10  # truncate individual texts to only 10 words

# We first convert these texts to arrays of integer-tokens because that is needed by the model
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train_text)
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)
tokenizer.word_index  # inspect vocabulary
np.array(x_train_tokens[1])  # how text 2 has been tokenized

# To input texts with different lengths into the model, we also need to pad and truncate them
num_tokens = np.array([len(i) for i in x_train_tokens])
np.sum(num_tokens < max_tokens) / len(
    num_tokens)  # check text covered after truncating
pad = 'pre'  # better to add 0 at beginning of sequence
x_train_pad = pad_sequences(x_train_tokens,
                            maxlen=max_tokens,
                            padding=pad,
                            truncating='pre')
        j += 1
        if max < len(words):
            max = len(words)
reviews = new_review
world_net = reviews
rating = new_rating
print("The total number of reviews after modifying",len(reviews))
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
print("The total number of different words from dataset's dictionary",X.toarray().shape[1])
# using token to modify text into array index

# finding adjectives from tokenizer based on world_net dictionary.
find_sentiment = False
if find_sentiment:
    t = Tokenizer()
    t.fit_on_texts(world_net)
    dic_tokenizer = t.word_counts
    sentiment = dict()
    for word in dic_tokenizer:
        wn_output = os.popen("C:\\Users\\billpham\\Desktop\\WordNet\\bin\\wn "+word+" -n# -searchtype -over").read()
        if "adj" in wn_output:
            sentiment[word] = 1
    new_data = open("new_data.txt","w")
    for x in sentiment:
        new_data.write(str(x))
    exit()
# create the adjective data file
generate_data_file = False
if generate_data_file:
    new_data = open("new_data.txt","r").readlines()
Example #6
num_validation_samples = int(Validation_split *
                             np.asarray(processed_features).shape[0])

x_train = processed_features[:-num_validation_samples]
x_test = processed_features[-num_validation_samples:]
Y_train = lb_make.fit_transform(labels[:-num_validation_samples])
Y_test = lb_make.fit_transform(labels[-num_validation_samples:])

# In[9]:

#Learning Embeddings
# Tokenizing sentences using Keras
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(processed_feature)

#Determining number of words in largest sentence and vocabulary size of the corpus
max_length = max([len(sentence.split()) for sentence in processed_feature])

vocab_size = len(tokenizer_obj.word_index) + 1

# In[10]:

#Converting sentences in train and test sets to sequences
x_train_tokens = tokenizer_obj.texts_to_sequences(x_train)
x_test_tokens = tokenizer_obj.texts_to_sequences(x_test)

#Padding to make sure all the vectors are the same length
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_length, padding='post')
Example #7
print('Found %s word vectors.' % len(embeddings_index))

#PARAMETERS
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 100
output_size = n_labels
batch_size = 64
nb_epochs = 1000

## Creating the data for a keras neural network and starting the embedding.

comments = df_train.text.tolist() + df_test.text.tolist(
) + df_valid.text.tolist()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(comments)
word_index = tokenizer.word_index

sequences_train = tokenizer.texts_to_sequences(df_train.text)
sequences_test = tokenizer.texts_to_sequences(df_test.text)
sequences_valid = tokenizer.texts_to_sequences(df_valid.text)

x_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
x_valid = pad_sequences(sequences_valid, maxlen=MAX_SEQUENCE_LENGTH)

y_train = df_train.drop(df_train.columns[[0]], axis=1).values
y_test = df_test.drop(df_test.columns[[0]], axis=1).values
y_valid = df_valid.drop(df_valid.columns[[0]], axis=1).values
Example #8
def main():
    if len(sys.argv) != 4:
        sys.exit("Usage: python3 train_hw4.py $1 $2 $3")

    # read data
    x_train = pd.read_csv(sys.argv[1]).values
    y_train = pd.read_csv(sys.argv[2]).values
    x_test = pd.read_csv(sys.argv[3]).values

    nlp = en_core_web_sm.load()
    sentences = x_train[:,1]
    for i in range(len(sentences)):
        doc = nlp(sentences[i])
        sentences[i] = [t.text for t in doc]
    max_length = max([len(sentence) for sentence in sentences])
    vocab_size = len(sentences)
    EMBEDDING_DIM = 400

    # Train & Save Word2Vec model
    model = Word2Vec(min_count=1, size = EMBEDDING_DIM, workers=4)
    model.build_vocab(sentences)  # prepare the model vocabulary
    model.train(sentences, total_examples=model.corpus_count, epochs=20)  # train word vectors

    # save model
    filename = 'word2vec_{}.txt'.format(TIME)
    model.wv.save_word2vec_format(filename, binary=False)

    # read the model
    embeddings_index = {}
    f = open(os.path.join('', filename), encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    x_train = list(x_train[:,1])

    # Train & Save Tokenizer
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(x_train)

    # save model
    pickle_name = 'tokenizer_{}.txt'.format(TIME)
    with open(pickle_name, 'wb') as handle:
        pickle.dump(tokenizer_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # loading
    with open(pickle_name, 'rb') as handle:
        tokenizer_obj = pickle.load(handle)

    sequences = tokenizer_obj.texts_to_sequences(x_train)
    word_index = tokenizer_obj.word_index
    review_pad = pad_sequences(sequences, maxlen=max_length)
    target = y_train[:,1]

    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
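    # Fill the embedding matrix: row i holds the word2vec vector for the word with
    # index i; words without a pretrained vector keep an all-zero row.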

    for word, i in word_index.items():
        if i > num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # split to training and testing data
    VALIDATION_SPLIT = 0.01

    indices = np.arange(review_pad.shape[0])
    np.random.shuffle(indices)
    review_pad = review_pad[indices]
    target = target[indices]
    num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

    x_train_pad = review_pad[:-num_validation_samples]
    y_train = target[:-num_validation_samples]

    x_test_pad = review_pad[-num_validation_samples:]
    y_test = target[-num_validation_samples:]

    # model1 - GRU
    model= Sequential()
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)

    model.add(embedding_layer)
    model.add(GRU(units=50, dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
    model.add(GRU(units=50, dropout=0.5, recurrent_dropout=0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train_pad, y_train, batch_size=128, epochs=1, validation_data=(x_test_pad, y_test), verbose=2, shuffle=True)
    model.save('model_{}.h5'.format(TIME))

    # model2: Bi-GRU
    model= Sequential()
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)

    model.add(embedding_layer)
    model.add(Bidirectional(GRU(units=40, dropout=0.4, recurrent_dropout=0.4,return_sequences=True)))
    model.add(Bidirectional(GRU(units=40, dropout=0.4, recurrent_dropout=0.4)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train_pad, y_train, batch_size=128, epochs=1, validation_data=(x_test_pad, y_test), verbose=2, shuffle=True)
    model.save('model_{}.h5'.format(TIME))

    # model3: Bi-GRU
    model= Sequential()
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)

    model.add(embedding_layer)
    model.add(Bidirectional(GRU(units=30, dropout=0.5, recurrent_dropout=0.5,return_sequences=True)))
    model.add(Bidirectional(GRU(units=30, dropout=0.5, recurrent_dropout=0.5)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train_pad, y_train, batch_size=128, epochs=1, validation_data=(x_test_pad, y_test), verbose=2, shuffle=True)
    model.save('model_{}.h5'.format(TIME))

    print("TRAINING COMPLETED")
Example #9

# In[15]:


X,mid=text_clense_frame(X)


# Tokenization of words

# In[16]:


num_words=mid
#Tokenize the text
tokenize=Tokenizer(num_words=num_words)
tokenize.fit_on_texts(X)
idx=tokenize.word_index
x_train_token=tokenize.texts_to_sequences(X)
#x_test_token=tokenize.texts_to_sequences(X_test)


# In[18]:


num_tokens=[len(token) for token in x_train_token]
num_tokens=np.array(num_tokens)
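# Truncate/pad at mean + 2*std of the token counts; assuming lengths are roughly
# normally distributed, this covers the large majority (~95%+) of the texts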
max_tokens=np.mean(num_tokens)+2*np.std(num_tokens)
max_tokens=int(max_tokens)
print("Max Tokens")
print(max_tokens)
url = "final_english_data_tagged.txt"
names = ['data', 'class']
df = pd.read_csv(url, names=names, delimiter='\t')

train_text, test_text, train_y, test_y = train_test_split(df['data'],
                                                          df['class'],
                                                          test_size=0.2)

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 1300
# This is fixed.
EMBEDDING_DIM = 300

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(df['data'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(df['data'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(df['class']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)
Example #11
embed_size = 50
# max number of unique words
max_features = 2000
# max number of words from review to use
maxlen = 50
embedding_file = "Models/glove.6B.50d.txt"


def coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(
    coefs(*f.strip().split())
    for f in open(embedding_file, mode="r", encoding="utf-8"))
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train['Reviews'].values))

X_train = tokenizer.texts_to_sequences(train['Reviews'].values)
X_test = tokenizer.texts_to_sequences(test['Reviews'].values)

x_train = pad_sequences(X_train, maxlen=maxlen)
x_test = pad_sequences(X_test, maxlen=maxlen)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        break
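    # (assumed completion: the remainder of this loop was truncated in the source;
    # it copies each word's pretrained GloVe vector into the embedding matrix)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector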
Example #12
def main():
    MAX_LENGTH = 50
    MAX_NB_WORDS = 250000
    embed_size = 300
    # german
    Data = read_data('DeTrainingData_3')
    # english
    # Data = read_data('EnTrainingData_3')

    X_train, X_val, y_train, y_val = train_test_split(
        Data['Title'], Data['Class'], test_size=0.1, random_state=42)

    all_embs, emb_mean, emb_std, embeddings_index = load_Embeddings(
        MAX_LENGTH=50, MAX_NB_WORDS=250000)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(Data['Title'])

    # german
    joblib.dump(tokenizer, './Models_test/tok_DE_3.pickle')
    # english
    # joblib.dump(tokenizer, './Models_test/tok_EN_3.pickle')

    # to numerical vectors
    padded_train_sequences = Text_to_Sequence(X_train, tokenizer, MAX_LENGTH)
    padded_val_sequences = Text_to_Sequence(X_val, tokenizer, MAX_LENGTH)
    # --------------------------------------------------------------------
    """
    - The data is first fed into the first-level model and predictions are acquired.
    - Then the same data is concatenated with the acquired predictions (as metadata) 
      and are fed into the second level model. 
    - Finally the original data is again concatenated with both predictions from both
      first and second models and fed into the network
    """
    # Predictions by first-level model
    # german
    model_lev_1 = load_model("./Models_test/DE_1_0.67.hdf5")
    # english
    # model_lev_1 = load_model("./Models_test/EN_1_0.77.hdf5")
    level_1_num_train = np.argmax(model_lev_1.predict(
        padded_train_sequences, batch_size=256), axis=1)
    level_1_num_val = np.argmax(model_lev_1.predict(
        padded_val_sequences, batch_size=256), axis=1)

    # Predictions by second-level model with first level predictions as metadata
    # german
    model_lev_2 = load_model("./Models/DE_2_0.53.hdf5")
    # english
    # model_lev_2 = load_model("./Models/EN_2_0.65.hdf5")
    level_2_num_train = np.argmax(model_lev_2.predict(
        [padded_train_sequences, np.float32(level_1_num_train)], batch_size=256), axis=1)
    level_2_num_val = np.argmax(model_lev_2.predict(
        [padded_val_sequences, np.float32(level_1_num_val)], batch_size=256), axis=1)
    # -----------------------------------------------------------------------------
    # get embedding matrix (is fed into the first layer => embedding layer)
    word_index = tokenizer.word_index
    embedding_matrix_2 = Create_EmbeddingMatrix(
        emb_mean, emb_std, embed_size, MAX_NB_WORDS, word_index, embeddings_index)
    # -----------------------------------------------------------------------------
    # compile the network
    rnn_model = get_rnn_model(MAX_NB_WORDS, embedding_matrix_2)
    print(rnn_model.summary())
    # ----------------------------------------------------------------------------------
    # take care of class imbalance
    class_weights = class_weight.compute_class_weight(
        'balanced', np.unique(y_train), y_train)
    # -------------------------------------------------------------------------------------------
    #### Converting Classes into label Encoding ######
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(y_train)
    joblib.dump(encoder, './Models/encoder_DE_3.pickle')
    # convert integers to one-hot encoding
    dummy_y_train = Encoder(y_train, encoder)
    dummy_y_val = Encoder(y_val, encoder)
    # ----------------------------------------------------------------------------------------
    # callbacks
    lr_scheduler = ReduceLROnPlateau(
        monitor='val_loss', factor=0.1, patience=3, verbose=1, mode='auto', cooldown=0, min_lr=0.0000001)
    es = EarlyStopping(monitor='val_loss', min_delta=0.00005,
                       patience=5, verbose=0, mode='auto')

    # save checkpoint
    filepath = "./Models_test/weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    # -----------------------------------------------------------------------------------------------
    # run the network
    """
    train and validation sets = [data, predictions from the first model, predictions from the second model]
    """
    history = rnn_model.fit([padded_train_sequences, level_1_num_train, level_2_num_train], dummy_y_train, validation_data=(
        [padded_val_sequences, level_1_num_val, level_2_num_val], dummy_y_val), batch_size=256, callbacks=[lr_scheduler, es, checkpoint], class_weight=class_weights, epochs=50)

    # summarize history for accuracy
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    # Saving Accuracy Figure
    plt.savefig('DE_3_Accuracy.png')
    plt.show()

    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    # Saving Loss Figure
    plt.savefig('DE_3_Loss.png')
    plt.show()
Example #13
def load_data(target):

    all_article = []
    train_artic = []
    train_bound = []
    train_abstr = []
    train_extra = []

    # Load all data
    with open('./data/train.jsonl') as j:
        for each in j:
            each = json.loads(each)
            if each['text'] == "\n":
                # print(each['id'])
                continue
            train_artic += [each['text']]
            train_bound += [each['sent_bounds']]
            train_abstr += [each['summary']]
            train_extra += [each['extractive_summary']]
        j.close()

    valid_artic = []
    valid_bound = []
    valid_abstr = []
    valid_extra = []

    with open('./data/valid.jsonl') as j:
        for each in j:
            each = json.loads(each)
            if each['text'] == "\n":
                print(each['id'])
            valid_artic += [each['text']]
            valid_bound += [each['sent_bounds']]
            valid_abstr += [each['summary']]
            valid_extra += [each['extractive_summary']]
        j.close()

    test_artic = []
    test_bound = []

    with open('./data/test.jsonl') as j:
        for each in j:
            each = json.loads(each)
            if each['text'] == "\n":
                print('null article in test data:', each['id'])
            test_artic += [each['text']]
            test_bound += [each['sent_bounds']]
        j.close()

    if target == 'extractive':

        # Parsing article
        # Split each article into lists
        X_train = text_parsing(train_artic, train_bound)
        X_valid = text_parsing(valid_artic, valid_bound)
        X_test = text_parsing(test_artic, test_bound)

        # Data cleaning

        print('Data cleaning.')

        X_all_text = []
        X_train_seq = text_cleaning(X_train, X_all_text)
        X_valid_seq = text_cleaning(X_valid, [])
        X_test_seq = text_cleaning(X_test, [])

        # Load pretrained

        MAX_LEN = 100
        MAX_NUM_WORDS = 20000
        EMBEDDING_DIM = 300

        # finally, vectorize the text samples into a 2D integer tensor
        print('Tokenization')
        tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        # Fit the tokenizer on our text
        tokenizer.fit_on_texts(X_all_text)

        # Get all words that the tokenizer knows
        word_index = tokenizer.word_index
        num_words = len(word_index) + 1
        print('Found %s unique tokens' % len(word_index))

        print('text to sequence vector')
        train_sentence_length = []
        valid_sentence_length = []
        test_sentence_length = []

        X_train_seq = text_to_sequence(X_train, train_sentence_length,
                                       tokenizer)
        X_valid_seq = text_to_sequence(X_valid, valid_sentence_length,
                                       tokenizer)
        X_test_seq = text_to_sequence(X_test, test_sentence_length, tokenizer)

        print('Making Label...')

        def make_label(X, extractive):
            label = []
            for i in range(len(X)):
                temp = []
                extrac_index = extractive[i]
                for j in range(len(X[i])):
                    if j == extrac_index:
                        temp += [1] * len(X[i][j])
                    else:
                        temp += [0] * len(X[i][j])

                label.append(temp)
            return label

        Y_train = make_label(X_train, train_extra)
        Y_valid = make_label(X_valid, valid_extra)

        X_train = text_to_one_list(X_train)
        X_valid = text_to_one_list(X_valid)
        X_test = text_to_one_list(X_test)

        X_train = pad_sequences(
            X_train, maxlen=MAX_LEN, truncating='post', padding='post'
        )  # Each sentence is padding to a size=max_len vector
        X_valid = pad_sequences(X_valid,
                                maxlen=MAX_LEN,
                                truncating='post',
                                padding='post')
        X_test = pad_sequences(X_test,
                               maxlen=MAX_LEN,
                               truncating='post',
                               padding='post')
        Y_train = pad_sequences(Y_train,
                                maxlen=MAX_LEN,
                                truncating='post',
                                padding='post')
        Y_valid = pad_sequences(Y_valid,
                                maxlen=MAX_LEN,
                                truncating='post',
                                padding='post')

        # Preparing embedding matrix
        num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

        np.save('./data/extractive_label.npy', Y_train)
        return

    if target == 'abstractive':

        # Parsing article into pieces of sentence
        X_train = text_parsing(train_artic, train_bound)
        X_valid = text_parsing(valid_artic, valid_bound)
        X_test = text_parsing(test_artic, test_bound)

        # Data cleaning
        # remove punctuation, ...
        print('Data cleaning.')

        X_all_text = []
        X_train = text_cleaning(X_train, X_all_text)
        X_valid = text_cleaning(X_valid, [])
        X_test = text_cleaning(X_test, [])

        # prepare label text
        print('Preparing label text')
        for i in tqdm(range(len(train_abstr))):
            train_abstr[i] = text_cleaner(train_abstr[i])

        for j in tqdm(range(len(valid_abstr))):
            valid_abstr[j] = text_cleaner(valid_abstr[j])

        print('Add start and end tagger to the labels')

        def add_tagger(abstractive, all_text):
            for i in tqdm(range(len(abstractive))):
                abstractive[i] = '_BOS_ ' + abstractive[i] + ' _EOS_'
                all_text.append(abstractive[i])
            return abstractive

        Y_all_text = []

        train_abstr = add_tagger(train_abstr, Y_all_text)
        valid_abstr = add_tagger(valid_abstr, [])

        MAX_LEN = 100
        MAX_SUMMARY_LEN = 30
        MAX_NUM_WORDS = 20000
        EMBEDDING_DIM = 300

        # finally, vectorize the text samples into a 2D integer tensor
        print('Tokenization')
        X_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        # Fit the tokenizer on training text
        X_tokenizer.fit_on_texts(X_all_text)

        # Get all words that the tokenizer knows
        X_word_index = X_tokenizer.word_index
        num_words = min(len(X_word_index) + 1, MAX_NUM_WORDS)
        print('Found %s unique tokens' % len(X_word_index))

        print('text to sequence vector')

        X_train = text_to_sequence(X_train, [], X_tokenizer)
        X_valid = text_to_sequence(X_valid, [], X_tokenizer)
        X_test = text_to_sequence(X_test, [], X_tokenizer)

        X_train = text_to_one_list(X_train)
        X_valid = text_to_one_list(X_valid)
        X_test = text_to_one_list(X_test)

        Y_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        Y_tokenizer.fit_on_texts(train_abstr)
        Y_word_index = Y_tokenizer.word_index

        # Y tokenization only trained on train data label
        Y_train = Y_tokenizer.texts_to_sequences(train_abstr)
        Y_valid = Y_tokenizer.texts_to_sequences(valid_abstr)

        X_train = pad_sequences(
            X_train, maxlen=MAX_LEN, truncating='post', padding='post'
        )  # Each sentence is padding to a size=max_len vector
        X_valid = pad_sequences(X_valid,
                                maxlen=MAX_LEN,
                                truncating='post',
                                padding='post')
        X_test = pad_sequences(X_test,
                               maxlen=MAX_LEN,
                               truncating='post',
                               padding='post')
        Y_train = pad_sequences(Y_train,
                                maxlen=MAX_SUMMARY_LEN,
                                truncating='post',
                                padding='post')
        Y_valid = pad_sequences(Y_valid,
                                maxlen=MAX_SUMMARY_LEN,
                                truncating='post',
                                padding='post')

        # Load pretrained
        # Prepare pre-trained embedding matrix
        np.save('./data/train.npy', X_train)
        np.save('./data/abstractive_label.npy', Y_train)
        with open('./data/X_tokenizer.pkl', 'wb') as file:
            pickle.dump(X_tokenizer, file)
            file.close()
        with open('./data/Y_tokenizer.pkl', 'wb') as file:
            pickle.dump(Y_tokenizer, file)

        return
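
# Usage: load_data('extractive') or load_data('abstractive'); each branch writes its
# processed arrays (and, for 'abstractive', the fitted tokenizers) to ./data/ and returns None.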
Example #14
    model.compile(optimizer=adam,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


def get_layer2(embedding):
    return 0


if __name__ == '__main__':
    x_data, y_data = load_data()
    corpus = [sentence_seg(x) for x in x_data]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    sequences = tokenizer.texts_to_sequences(corpus)

    max_sequence_len = max([len(s) for s in sequences])
    print("max sequence lenght:", max_sequence_len)

    data = pad_sequences(sequences, maxlen=max_sequence_len)
    labels = to_categorical(np.asarray(y_data))

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.3)

    ##########################################################
Example #15
def build_model(use_gpu: bool = False,
                num_units: int = 64,
                num_layers: int = 1,
                dropout_rate: float = 0.0,
                batch_size: int = 1000,
                window_size: int = 10,
                num_params: int = 0):
    """
    Builds the RNN-Model for character prediction.

    :param window_size: Sequence size
    :param batch_size: {int} Size of batch
    :param dropout_rate: {float} Regulating Dropout rate between layers
    :param num_layers: {int} Number of layers to build
    :param num_units: {int} Number of LSTM-Units to use in network
    :param use_gpu: {bool} Uses Tensorflow GPU support if True, otherwise trains on CPU
    :param num_params: {int} Number of control parameters
    :return: Keras model
    """

    # Load max 5000 entries from the dataset to build the Tokenizer / vocabulary
    loader = Loader(min(batch_size, 5000), 0)
    tokenizer = Tokenizer(filters='', split='°', lower=False)

    for dataframe in loader:

        chars = set()

        for name in dataframe['name']:
            chars.update(set(str(name)))

        tokenizer.fit_on_texts(list(chars))

    tokenizer.fit_on_texts(['pre', '<end>', 'pad'])

    # Build Keras Model
    model = Sequential()
    for r in range(0, max(num_layers - 1, 0)):
        model.add(layer=(CuDNNLSTM if use_gpu else LSTM
                         )(num_units,
                           input_shape=(window_size,
                                        len(tokenizer.index_word) + 1 +
                                        num_params),
                           return_sequences=True))
        model.add(Dropout(dropout_rate))

    model.add(
        layer=(CuDNNLSTM if use_gpu else LSTM)(num_units,
                                               input_shape=(
                                                   window_size,
                                                   len(tokenizer.index_word) +
                                                   1 + num_params)))
    model.add(Dense(len(tokenizer.index_word) + 1, activation='softmax'))

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Show summary
    print(model.summary())

    return model, tokenizer
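
# A minimal usage sketch of the builder above (illustrative argument values; the
# Loader-based training data pipeline is assumed and not shown here):
#   model, tokenizer = build_model(use_gpu=False, num_units=64, num_layers=2,
#                                  dropout_rate=0.2, window_size=10)
#   model.fit(x_windows, y_next_char, batch_size=1000, epochs=10)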
Example #16
class TokenizerWrapper:
    def __init__(self,
                 dataset_csv_file,
                 class_name,
                 max_caption_length,
                 tokenizer_num_words=None):
        dataset_df = pd.read_csv(dataset_csv_file)
        sentences = dataset_df[class_name].tolist()
        self.max_caption_length = max_caption_length
        self.tokenizer_num_words = tokenizer_num_words
        self.init_tokenizer(sentences)

    def clean_sentence(self, sentence):
        return text_to_word_sequence(
            sentence,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' ')

    def init_tokenizer(self, sentences):

        for i in range(len(sentences)):
            if pd.isna(sentences[i]):
                sentences[i] = ""
            sentences[i] = self.clean_sentence(sentences[i])

        # Tokenize the reviews
        print("Tokenizing dataset..")
        self.tokenizer = Tokenizer(oov_token='UNK',
                                   num_words=self.tokenizer_num_words)
        self.tokenizer.fit_on_texts(sentences)  # give each word a unique id
        print("number of tokens: {}".format(self.tokenizer.word_index))
        print("Tokenizing is complete.")

    def get_tokenizer_num_words(self):
        return self.tokenizer_num_words

    def get_token_of_word(self, word):
        return self.tokenizer.word_index[word]

    def get_word_from_token(self, token):
        try:
            return self.tokenizer.index_word[token]
        except:
            return ""

    def get_sentence_from_tokens(self, tokens):
        sentence = []
        for token in tokens[0]:
            word = self.get_word_from_token(token)
            if word == 'endseq':
                return sentence
            if word != 'startseq':
                sentence.append(word)

        return sentence

    def get_string_from_word_list(self, word_list):

        return " ".join(word_list)

    def get_word_tokens_list(self):
        return self.tokenizer.word_index

    def tokenize_sentences(self, sentences):
        index = 0
        tokenized_sentences = np.zeros(
            (sentences.shape[0], self.max_caption_length), dtype=int)
        for caption in sentences:
            tokenized_caption = self.tokenizer.texts_to_sequences(
                [self.clean_sentence(caption[0])])
            tokenized_sentences[index] = pad_sequences(
                tokenized_caption,
                maxlen=self.max_caption_length,
                padding='post')  # padded with max length
            index = index + 1
        return tokenized_sentences
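
# A minimal usage sketch of the wrapper above (hypothetical CSV path and column name):
#   tw = TokenizerWrapper('reports.csv', 'findings', max_caption_length=40, tokenizer_num_words=5000)
#   seqs = tw.tokenize_sentences(np.array([['startseq normal heart size endseq']]))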
class TokenizerWrapper:
    def __init__(self, dataset_csv_file, class_name, max_caption_length, tokenizer_num_words=None):
        dataset_df = pd.read_csv(dataset_csv_file)
        sentences = dataset_df[class_name].tolist()
        self.max_caption_length = max_caption_length
        self.tokenizer_num_words = tokenizer_num_words
        self.init_tokenizer(sentences)
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True)
        self.gpt2_tokenizer.pad_token = "<"

    def clean_sentence(self, sentence):
        return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')

    def GPT2_pad_token_id(self):
        return self.gpt2_tokenizer.pad_token_id

    def GPT2_eos_token_id(self):
        return self.gpt2_tokenizer.eos_token_id

    def GPT2_encode(self, sentences, pad=True, max_length=None):
        if max_length is None:
            max_length = self.max_caption_length
        if isinstance(sentences, str):
            return self.gpt2_tokenizer.encode(sentences, add_special_tokens=True, max_length=max_length,
                                              pad_to_max_length=pad)
        tokens = np.zeros((sentences.shape[0], max_length), dtype=int)

        for i in range(len(sentences)):
            if pd.isna(sentences[i]):
                sentences[i][0] = ""
            sentence = sentences[i][0].lower()
            sentence = sentence.replace('"', '')
            sentence = sentence.replace('xxxx', '')
            sentence = sentence.replace('endseq', '<|endoftext|>')
            tokens[i] = self.gpt2_tokenizer.encode(sentence, add_special_tokens=True,
                                                   max_length=max_length, pad_to_max_length=pad)
        return tokens

    def GPT2_decode(self, tokens):
        return self.gpt2_tokenizer.decode(tokens, skip_special_tokens=True)

    def GPT2_format_output(self, sentence):
        sentence = self.clean_sentence(sentence)
        return sentence

    def filter_special_words(self, sentence):
        sentence = sentence.replace('startseq', '')
        sentence = sentence.replace('endseq', '')
        sentence = sentence.replace('<|endoftext|>', '')
        sentence = sentence.replace('<', '')
        sentence = sentence.strip()
        return sentence

    def init_tokenizer(self, sentences):

        for i in range(len(sentences)):
            if pd.isna(sentences[i]):
                sentences[i] = ""
            sentences[i] = self.clean_sentence(sentences[i])

        # Tokenize the reviews
        print("Tokenizing dataset..")
        self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words)
        self.tokenizer.fit_on_texts(sentences)  # give each word a unique id
        print("number of tokens: {}".format(self.tokenizer.word_index))
        print("Tokenizing is complete.")

    def get_tokenizer_num_words(self):
        return self.tokenizer_num_words

    def get_token_of_word(self, word):
        return self.tokenizer.word_index[word]

    def get_word_from_token(self, token):
        try:
            return self.tokenizer.index_word[token]
        except:
            return ""

    def get_sentence_from_tokens(self, tokens):
        sentence = []
        for token in tokens[0]:
            word = self.get_word_from_token(token)
            if word == 'endseq':
                return sentence
            if word != 'startseq':
                sentence.append(word)

        return sentence

    def get_string_from_word_list(self, word_list):

        return " ".join(word_list)

    def get_word_tokens_list(self):
        return self.tokenizer.word_index

    def tokenize_sentences(self, sentences):
        index = 0
        tokenized_sentences = np.zeros((sentences.shape[0], self.max_caption_length), dtype=int)
        for caption in sentences:
            tokenized_caption = self.tokenizer.texts_to_sequences([self.clean_sentence(caption[0])])
            tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length,
                                                       padding='post')  # padded with max length
            index = index + 1
        return tokenized_sentences
# In[ ]:

#

# In[22]:

X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

total_reviews = X_train + X_test
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(total_reviews)

max_length = max([len(s.split()) for s in df['review'].values.tolist()])

vocab_size = len(tokenizer_obj.word_index) + 1
X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')

X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

# In[49]:

# In[55]:
Example #19
def make_model():
    # load all training reviews
    positive_docs = process_docs('data/pos_train', vocab, True)
    negative_docs = process_docs('data/neg_train', vocab, True)
    train_docs = negative_docs + positive_docs
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(train_docs)

    # pad sequences
    max_length = max([len(s.split()) for s in train_docs])
    print("\n\n maxlenght="+str(max_length))

    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # define training labels
    y = np.array([0 for _ in range(270)] + [1 for _ in range(270)])

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=42)
    '''
    # load all test reviews
    positive_docs = process_docs('data/pos_test', vocab, False)
    negative_docs = process_docs('data/neg_test', vocab, False)
    test_docs = negative_docs + positive_docs
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(test_docs)
    # pad sequences
    Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define test labels
    ytest = np.array([0 for _ in range(len(listdir("data/neg_test")))] + [1 for _ in range(len(listdir("data/pos_test")))])
    '''
    print("\n pad_sequences : ",Xtest)
    print("\n ytest : ",ytest)

    # define vocabulary size (largest integer value)
    vocab_size = len(tokenizer.word_index) + 1

    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(Xtrain, ytrain, epochs=20, verbose=1)
    # evaluate
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy: %f' % (acc*100))

    model.save("relevancy_model_v2.0.1.h5")
    print("Done!")
Example #20
	if wc > 2:
		input_x.append(text)
		input_y.append( (float(rating) - 1.0)/4.0 )
print("%d texts %d ratings" % ( len(input_x), len(input_y) ) )
max_length = max([len(s.split()) for s in input_x])
print("max_length = %d" % max_length)
#print(input_x.shape)


#quit()

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tokenizer = Tokenizer()
input_x = np.array(input_x)
tokenizer.fit_on_texts(input_x)
sequences = tokenizer.texts_to_sequences(input_x)
word_index = tokenizer.word_index

review_pad = pad_sequences(sequences, maxlen=max_length)
#ratings = np.array(input_y)/5.0
ratings = np.array(input_y)

inv_index = {}
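# invert the tokenizer's word_index so token ids can be mapped back to words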
for w, i in word_index.items():
	inv_index[i]=w
#ratings = to_categorical(ratings)

print("%d tokens, reviews shape: %s, ratings shape: %s" % (len(word_index), review_pad.shape, ratings.shape))
Example #21
training_x = []
for i, question in enumerate(questions):
    # tokenize
    tokens = nltk.word_tokenize(question, language='german')
    # remove punctution
    table = str.maketrans("", "", string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove non-alphabetic/non-numeric
    real_tokens = [word for word in stripped if word.isalpha() or word.isnumeric()]
    # stemming
    sequence = [snowball.stem(token) for token in real_tokens]

    training_x.append(sequence)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_x)

def index(request):
    msg = ""
    if request.method == 'POST':
        user_message = request.POST.get('nachricht', False)
        print(user_message)
        with open("chat.txt", "a+") as f:
            f.write("("+str(timezone.now())+")"+" Sie: ")
            f.write(user_message + "\n")
            f.write("(" + str(timezone.now()) + ")" + " Chatbot: ")
            msg = answer(user_message)
            f.write(msg+"\n")

    f = open("chat.txt", "r")
Example #22
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
for sentence in train_data['document']:
    temp_x=[]
    temp_x = okt.morphs(sentence, stem=True, norm=True)
    # norm normalizes colloquial spellings, e.g. 그래욬ㅋㅋㅋ -> 그래요
    # stem converts words to their base (dictionary) form, e.g. 그래요 -> 그렇다
    temp_x = [word for word in temp_x if not word in stopwords]
    train_X.append(temp_x)

#
train_y=['송금', '잔액']
print(train_X)
train_seq=[]
#test_seq=[]
# Drop words that appear 2 times or fewer from the vocabulary; +1 accounts for the index-0 padding token
tokenizer = Tokenizer(19416)
tokenizer.fit_on_texts(train_X)
train_seq = tokenizer.texts_to_sequences(train_X)
train_lab = tokenizer.texts_to_sequences(train_y)
print(train_seq)
print(train_lab)
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 8

train_inputs=[]
test_inputs=[]
train_inputs = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
lab_inputs = pad_sequences(train_lab, maxlen=1, padding='post')
print(lab_inputs)
print(train_inputs.shape)
max_headline_length = 22  #max length of headline is 22. Since the data follows an approximately normal distribution, we set that as the maximum
max_text_length = 1500  #a majority of texts are below 1500 with several outliers, so we set max text length to 1500

# In[13]:

x_train, x_test, y_train, y_test = train_test_split(df['cleaned text'],
                                                    df['cleaned headline'],
                                                    random_state=0,
                                                    shuffle=True)

# In[14]:

#text tokenization. Credits go to https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/
#word2vec, bag of words, glove or word embeddings?
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))

x_train = x_tokenizer.texts_to_sequences(x_train)
x_test = x_tokenizer.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=max_text_length, padding='post')
x_test = pad_sequences(x_test, maxlen=max_text_length, padding='post')

x_voc_size = len(x_tokenizer.word_index) + 1

# In[15]:

y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))
Example #24
#creating an embedding dictionary
print("creating embedding_index\n")
embeddings_index = {}
'''
f = open("all_in_one_20/cbow_model.txt")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
    
f.close()
'''
#vetorize the text samples
print("vetorizing the text descriptions\n")
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(X)
sequences = tokenizer_obj.texts_to_sequences(X)

#padding sequences
word_index = tokenizer_obj.word_index
padded_pre_text = pad_sequences(sequences,
                                maxlen=max_length,
                                padding='post',
                                truncating='post')

#######NO NEED TO SHUFFLE AGAIN SINCE ITS ALREADY SHUFFLED######
'''
#shuffling features and labels (ie shuffling both in same order)
indices = np.arange(padded_pre_text.shape[0])
np.random.shuffle(indices)
Example #25
def read_wordembedding(file_name):
    embedding_index = {}
    f = open(os.path.join('',file_name), encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word]=coefs
    f.close()
    return embedding_index

docs = preprocessing_dataset(X)
del X

tokenizer_object = Tokenizer()
tokenizer_object.fit_on_texts(docs)
max_length = max([len(doc) for doc in docs])
max_length_index = np.argmax([len(doc) for doc in docs])
sequences = tokenizer_object.texts_to_sequences(docs)

#pad sequence
word_index = tokenizer_object.word_index
doc_pad = pad_sequences(sequences, maxlen=max_length)

word_embedding_file_name = 'word2vector2.txt'
embedding_index = read_wordembedding(word_embedding_file_name)
num_words = len(word_index)+1


def average_embedding(word_index,embedding_index):
Example #26
File: PoolGRU.py  Project: ifuding/Kaggle
def load_data():
    train = pd.read_csv(FLAGS.input_training_data_path +
                        '/train.csv')  #.iloc[:200]
    test = pd.read_csv(FLAGS.input_training_data_path +
                       '/test.csv')  #.iloc[:200]
    # sub1 = pd.read_csv(data_dir + '/submission_ensemble.csv')
    nrow = train.shape[0]
    print("Train Size: {0}".format(nrow))
    print("Test Size: {0}".format(test.shape[0]))

    coly = [c for c in train.columns if c not in ['id', 'comment_text']]
    print("Label columns: {0}".format(coly))
    y = train[coly]
    tid = test['id'].values

    if FLAGS.load_stacking_data:
        data_dir = "../../Data/2fold/"
        svd_features = np.load(data_dir + 'svd.npy')
        svd_train = svd_features[:nrow]
        svd_test = svd_features[nrow:]
        kf = KFold(n_splits=2, shuffle=False)
        for train_index, test_index in kf.split(svd_train):
            svd_train_part = svd_train[test_index]
            break
        train_data = np.load(data_dir + 'stacking_train_data.npy')
        print(train_data.shape, svd_train_part.shape)
        train_data = np.c_[train_data, svd_train_part]
        train_label = np.load(data_dir + 'stacking_train_label.npy')
        # train_data = train_data[:100]
        # train_label = train_label[:100]
        test_data = np.load(data_dir + 'stacking_test_data.npy')
        emb_weight = None
    else:
        df = pd.concat([train['comment_text'], test['comment_text']], axis=0)
        df = df.fillna("unknown")

        data = df.values
        # Text to sequence
        @contextmanager
        def timer(name):
            """
            Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
            in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
            https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
            """
            t0 = time.time()
            yield
            print('[{0}] done in {1} s'.format(name, time.time() - t0))

        with timer("Performing stemming"):
            if FLAGS.stem:
                # stem_sentence = lambda s: " ".join(ps.stem(word) for word in s.strip().split())
                data = [gensim.parsing.stem_text(comment) for comment in data]
        print('Tokenizer...')
        if not FLAGS.char_split:
            tokenizer = Tokenizer(num_words=FLAGS.vocab_size)
            tokenizer.fit_on_texts(data)
            data = tokenizer.texts_to_sequences(data)
            data = pad_sequences(data, maxlen=FLAGS.max_seq_len)
            if FLAGS.load_wv_model:
                emb_weight = get_word2vec_embedding(location = FLAGS.input_training_data_path + FLAGS.wv_model_file, \
                        tokenizer = tokenizer, nb_words = FLAGS.vocab_size, embed_size = FLAGS.emb_dim, \
                        model_type = FLAGS.wv_model_type, uniform_init_emb = FLAGS.uniform_init_emb)
            else:
                if FLAGS.uniform_init_emb:
                    emb_weight = np.random.uniform(
                        0, 1, (FLAGS.vocab_size, FLAGS.emb_dim))
                else:
                    emb_weight = np.zeros((FLAGS.vocab_size, FLAGS.emb_dim))
        else:
            tokenizer = None
            data_helper = data_helper(sequence_max_length = FLAGS.max_seq_len, \
                    wv_model_path = FLAGS.input_training_data_path + FLAGS.wv_model_file, \
                    letter_num = FLAGS.letter_num, emb_dim = FLAGS.emb_dim, load_wv_model = FLAGS.load_wv_model)
            data, emb_weight, FLAGS.vocab_size = data_helper.text_to_triletter_sequence(
                data)

        train_data, train_label = data[:nrow], y.values[:nrow]
        test_data = data[nrow:]

    return train_data, train_label, test_data, coly, tid, emb_weight
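
# Typical call (after the FLAGS used above have been configured):
#   train_data, train_label, test_data, coly, tid, emb_weight = load_data()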
Example #27
#neu 0
#neg 1
#pos 2

text = []
label = []
for i in df.Text:
    text.append(i)
for i in df.Label:
    label.append(i)

########## text  preprocessing #########################################

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(text)

#pad sequences
max_length = max([len(s.split()) for s in text])

#define vocablury size
vocab_size = len(tokenizer_obj.word_index) + 1

print(max_length)
print(vocab_size)

##generating tokens
text_token = tokenizer_obj.texts_to_sequences(text)

##adding padding
Example #28
for line in lines:
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('','',string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    triple_lines.append(words)

print(colored(len(triple_lines),'green'))

EMBEDDING_DIM = 200

#Vectorize the text samples into a S2 integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(triple_lines)
sequences = tokenizer_obj.texts_to_sequences(triple_lines)

#pad sequences : add padding to make all the vectors of same length

#define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

print(colored(sequences,'green'))

#pad sequences
word_index = tokenizer_obj.word_index
max_length = 9

triple_pad = pad_sequences(sequences, maxlen=max_length)
Example #29
    output = np.floor(output)
    output = output.astype(np.int16)
    return output


def quanti_convert_int16_to_float(data, fix_pos):
    amp = 2**fix_pos
    output = data.astype(np.float32)
    output = output / amp
    return output


for idx, row in complain_data.iterrows():
    row[0] = row[0].replace('rt', ' ')

tokenizer = Tokenizer(num_words=vocab_size, split=' ')
tokenizer.fit_on_texts(complain_data['Customer_Service'].values)
X = tokenizer.texts_to_sequences(complain_data['Customer_Service'].values)

max_sentence_len = 50
X = pad_sequences(X, maxlen=max_sentence_len)

Y = complain_data['Satisfaction'].values
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=42)

embedding_vector_length = 32

model = Sequential()
Example #30
NUMBER_OF_UNIQUE_WORDS_CONSIDERED = 25000
LENGTH_OF_SENTENCES = 250

DROPOUT_FACTOR = 0.1

BATCH_SIZE = 32
EPOCHS = 2

############################################################
########## Creating and training the Neural Network
############################################################

train_set = pd.read_csv('train.csv')
sentences_train_set = train_set["comment_text"]

tokenizer = Tokenizer(num_words=NUMBER_OF_UNIQUE_WORDS_CONSIDERED)
tokenizer.fit_on_texts(list(sentences_train_set))
tokenized_train_set = tokenizer.texts_to_sequences(sentences_train_set)

columns = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y = train_set[columns].values
X_t = pad_sequences(tokenized_train_set, maxlen=LENGTH_OF_SENTENCES)

inputs = Input(shape=(LENGTH_OF_SENTENCES, ))
x = Embedding(NUMBER_OF_UNIQUE_WORDS_CONSIDERED, 128)(inputs)
x = LSTM(80, return_sequences=True, name='lstm')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(DROPOUT_FACTOR)(x)
x = Dense(50, activation="relu")(x)