Example #1
def get_normalized_data(dataset_name):
    reviews = []
    review_sentences = []
    review_tokens = []

    # keep the Datasets helper and the selected dataframe in separate variables
    datasets = get_data.Datasets()
    if dataset_name == "train":
        data = datasets.get_train_data()
    elif dataset_name == "test":
        data = datasets.get_test_data()
    elif dataset_name == "unlabeled":
        data = datasets.get_unlabeled_data()

    # clean the selected dataset
    for review in data["review"]:
        cleaned_review = datasets.clean_text_to_text(review)
        reviews.append(cleaned_review)
        # sentences of the current review
        sentences = tokenize.sent_tokenize(cleaned_review)
        review_sentences.append(sentences)

        # keep only alphabetic word tokens for each non-empty sentence
        for s in sentences:
            if len(s) > 0:
                tokens = text_to_word_sequence(s)
                tokens = [token for token in tokens if token.isalpha()]
                review_tokens.append(tokens)

    return reviews, review_sentences, review_tokens
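A minimal usage sketch (added here; not part of the original snippet), assuming the module-level imports shown in Example #5 (`get_data`, `tokenize`, `text_to_word_sequence`) are in scope:

# hypothetical call: one cleaned string per review, one sentence list per review,
# and one token list per non-empty sentence across all reviews
reviews, review_sentences, review_tokens = get_normalized_data("train")
print(len(reviews), len(review_sentences))   # equal: one entry per review
print(len(review_tokens))                    # larger: one entry per sentence
print(review_tokens[0][:10])                 # first ten tokens of the first sentence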
Example #2
    def call(self, x, mask=None):
        # attention score for each timestep: (batch, timesteps)
        eij = K.tanh(K.dot(x, self.W))

        # normalise the scores over the timestep axis (softmax)
        ai = K.exp(eij)
        weights = ai / K.expand_dims(K.sum(ai, axis=1), axis=-1)

        # weight each timestep's features and sum over time -> (batch, features)
        weighted_input = x * K.expand_dims(weights, axis=-1)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
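    # NOTE (added): build() is not shown in this snippet; call() above assumes a
    # weight self.W created there. A compatible sketch follows, with assumptions
    # flagged: the weight name, the initializer, and the 1-D shape (so that
    # K.dot(x, self.W) yields one score per timestep) are guesses, and the class
    # name AttLayer is taken from Example #3. Some backends may instead need W
    # shaped (features, 1) with a squeeze after the dot product.
    def build(self, input_shape):
        self.W = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1],),
                                 initializer='glorot_uniform',
                                 trainable=True)
        super(AttLayer, self).build(input_shape)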


#get the train_data
data = get_data.Datasets()
train_data = data.get_train_data()

# declare the lists
reviews = []
sentences = []
labels = []

# clean the dataset and separate it into reviews and sentences
for review in train_data["review"]:
    # append each cleaned review to the list `reviews`
    cleaned_review = data.clean_text_to_text(review)
    reviews.append(cleaned_review)
    # append the review's sentence list to `sentences`
    review_sentences = tokenize.sent_tokenize(cleaned_review)
    sentences.append(review_sentences)
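The list `labels` declared above is never filled in this excerpt. A plausible continuation, assuming (this is not confirmed by the snippet) that the labeled training set exposes a binary `sentiment` column:

# assumed continuation: collect the binary sentiment label of each review
for sentiment in train_data["sentiment"]:
    labels.append(int(sentiment))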
Example #3
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        config = {}
        base_config = super(AttLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask):
        return None


#----------------get the dataset--------------------------------
data_main = get_data.Datasets()

#----------------get the vocab------------------------------------
print("1: get the vocab!")
train_reviews, train_sentences, train_tokens = data_main.get_normalized_data(
    "train")
unlabeled_reviews, unlabeled_sentences, unlabeled_tokens = data_main.get_normalized_data(
    "unlabeled")
test_reviews, test_sentences, test_tokens = data_main.get_normalized_data(
    "test")

#------add extra dataset------------------------

extra_train = pd.read_csv(
    "/Users/xiaoyiwen/Desktop/MasterProject/MasterProject/data_Preprocessing/Datasets/kaggle_data/training_set.csv"
)
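The excerpt stops after loading the extra CSV. Given how `train_vocab` is consumed in Example #4 (token to integer index, capped at MAX_NB_WORDS), a plausible next step is to fit a Keras Tokenizer on the cleaned reviews and keep its word index; the constant value, the use of the unlabeled reviews, and the `Tokenizer` import (as in Example #5) are assumptions:

MAX_NB_WORDS = 20000  # "top 20000 words" limit mentioned in Example #4

# assumed continuation: build the shared word index used as the training vocab
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_reviews + unlabeled_reviews)
train_vocab = tokenizer.word_index  # token -> integer rank, 1 = most frequent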
Example #4
def get_nomalized_test_data(train_vocab):
    # get the test data
    data = get_data.Datasets()
    test_data = data.get_test_data()
    test_reviews = []
    test_sentences = []
    test_labels = []

    #test data preprocessing ----------

    #clean the test dataset
    for test in test_data["review"]:
        cleaned_test = data.clean_text_to_text(test)
        test_reviews.append(cleaned_test)
        sentences = tokenize.sent_tokenize(cleaned_test)
        test_sentences.append(sentences)

    # derive the label from the rating encoded in the review id ("reviewid_rating"):
    # ratings below 5 are labelled negative (0), ratings of 7 or above positive (1)
    for id in test_data["id"]:
        id = id.strip('"')
        _, score = id.split('_')
        score = int(score)
        if score < 5:
            test_labels.append(0)
        if score >= 7:
            test_labels.append(1)

    # create a tokenizer limited to the top MAX_NB_WORDS (20000) words
    # (fitted on the test reviews, but not used below; indices come from train_vocab)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(test_reviews)

    print("train_vocab")
    print(train_vocab)
    print(len(train_vocab))

    # define the test matrix: (number of test reviews, SEN_NUM, WORDS_NUM)
    test_matrix = np.zeros((len(test_reviews), SEN_NUM, WORDS_NUM),
                           dtype='int32')

    #print(test_matrix.shape)
    # fill test_matrix with the train-vocab index of each token
    non_exist = 0
    for review_index, review in enumerate(test_sentences):
        for sentence_index, sentence in enumerate(review):
            if sentence_index < SEN_NUM:
                tokens = text_to_word_sequence(sentence)
                num = 0
                for token in tokens:
                    # skip tokens that never appear in the training vocab
                    if token not in train_vocab:
                        print(token)
                        non_exist += 1
                        continue
                    if num < WORDS_NUM and train_vocab[token] < MAX_NB_WORDS:
                        test_matrix[review_index, sentence_index,
                                    num] = train_vocab[token]
                        num += 1

    # number of test tokens that were missing from the training vocab
    print(non_exist)
    # convert test_labels into one-hot (categorical) vectors
    predicted_labels = to_categorical(np.asarray(test_labels))

    return test_matrix, predicted_labels
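A short usage sketch (added; not from the source). `train_vocab` is assumed to be the training word index, for example the one sketched after Example #3:

test_matrix, test_label_matrix = get_nomalized_test_data(train_vocab)
print(test_matrix.shape)        # (number of test reviews, SEN_NUM, WORDS_NUM)
print(test_label_matrix.shape)  # (number of test reviews, 2) after to_categorical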
Example #5
import logging
from gensim.models import word2vec
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import nltk
import re
from nltk import tokenize
from MasterProject.data_Preprocessing.Datasets import get_data
import pickle
# dataset helper (provides the train, test and unlabeled data)
data_manipulation = get_data.Datasets()

def clean_text_to_text(review):
    # remove backslashes, single quotes and double quotes
    review = re.sub(r"\\", "", review)
    review = re.sub(r"\'", "", review)
    review = re.sub(r"\"", "", review)
    # strip surrounding whitespace and convert to lower case
    text = review.strip().lower()

    return text
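A quick worked example of the cleaner (added; not part of the original module):

print(clean_text_to_text('He said \\"GREAT movie\\"!'))  # -> he said great movie!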


def get_normalized_data(data):
    reviews = []
    review_sentences = []
    review_tokens = []