Example #1
    def preprocess_input_sequences(self, data):
        """
        preprocess,pad to fixed length.
        """
        documents, questions, answer, candidates = data

        questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32", padding="post", truncating="post")
        documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32", padding="post", truncating="post")
        candidates_ok = pad_sequences(candidates, maxlen=self.A_len, dtype="int32", padding="post", truncating="post")
        y_true = np.zeros_like(candidates_ok)
        y_true[:, 0] = 1
        return questions_ok, documents_ok, candidates_ok, y_true
Example #2
 def preprocess_input_sequences(self, data):
     documents, questions, answer_spans = data
     documents_ok = pad_sequences(documents,
                                  maxlen=self.d_len,
                                  dtype="int32",
                                  padding="post",
                                  truncating="post")
     questions_ok = pad_sequences(questions,
                                  maxlen=self.q_len,
                                  dtype="int32",
                                  padding="post",
                                  truncating="post")
     return documents_ok, questions_ok, answer_spans
Example #3
def predict(_t_w, _id, _word_index, _model, _model_shape1):
    # map each word to its index; unknown words fall back to 0
    sec = []
    sequences = []
    for w in _t_w:
        sec.append(_word_index.item().get(w, 0))
    sequences.append(sec)

    data = pad_sequences([sec], maxlen=_model_shape1)
    # `diction` is expected to be defined in the enclosing scope
    data_gen = pad_sequences([diction[_id]], maxlen=2)

    prediction = _model.predict([data, data_gen], batch_size=1)

    return prediction[0]
Example #4
def load_data():
    '''Load the IMDB data, pad sequences to maxlen, and return [x_train, y_train, x_test, y_test].'''
    print('Loading data...')
    (x_train, y_train), (x_test,
                         y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return [x_train, y_train, x_test, y_test]
Example #5
  def input_fn():
    # calculates the length of the sequences, where
    # length = min(actual_length, max_length)
    x_len = np.minimum(np.array([len(seq) for seq in x_in]),
                       max_length).astype('int32')

    x_post_pad = sequence.pad_sequences(x_in, maxlen=max_length, padding='post')

    # creates the dataset from in memory data
    ds = tf.contrib.data.Dataset.from_tensor_slices((x_post_pad, x_len, y_in))
   
    # repeats the dataset `epochs` times.
    ds = ds.repeat(epochs)
    
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    ds = ds.batch(batch_size)
    
    # creates iterator
    x, x_len, y = ds.make_one_shot_iterator().get_next()

    dict_x = {'x': x, rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY: x_len}
    return dict_x, y
Example #6
File: utils.py  Project: hangyav/biadapt
def to_sequence(texts, window=5, maxlen=None):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    nb_words = len(tokenizer.word_index.items()) + 1

    if maxlen is None:
        maxlen = max([len(seq) for seq in tokenizer.texts_to_sequences(texts)])

    logger.info('Maximum sentence length: {}'.format(maxlen))
    logger.info('Padded sentence length: {}'.format(maxlen + 2 * (window - 1)))
    logger.info('Number of words: {}'.format(nb_words))

    maxlen += window - 1

    seqs = tokenizer.texts_to_sequences(texts)
    # post-pad to maxlen (which already includes window - 1 extra positions),
    # then pre-pad so every sentence also gets window - 1 zeros at the front
    seqs = sequence.pad_sequences(seqs, padding='post', maxlen=maxlen)
    seqs = sequence.pad_sequences(seqs,
                                  padding='pre',
                                  maxlen=maxlen + window - 1)

    return seqs, tokenizer, nb_words, maxlen + (window - 1)
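As a quick check of the double padding above, here is a minimal, self-contained sketch (the toy corpus and window size are made up for illustration, assuming the standalone keras package): the longest sentence ends up with window - 1 zeros on each side.

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

texts = ["the cat sat", "the cat sat on the mat"]
window = 3

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts)

maxlen = max(len(s) for s in seqs) + window - 1                  # 6 + 2 = 8
seqs = sequence.pad_sequences(seqs, padding='post', maxlen=maxlen)
seqs = sequence.pad_sequences(seqs, padding='pre', maxlen=maxlen + window - 1)
print(seqs.shape)  # (2, 10): the longest sentence gets 2 zeros on each side;
                   # shorter sentences get extra trailing zeros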
Example #7
 def preprocess_input_sequences(self, data):
     documents, questions, answer_spans = data
     documents_ok = pad_sequences(documents,
                                  maxlen=self.d_len,
                                  dtype="int32",
                                  padding="post",
                                  truncating="post")
     questions_ok = pad_sequences(questions,
                                  maxlen=self.q_len,
                                  dtype="int32",
                                  padding="post",
                                  truncating="post")
     answer_start = [
         np.array([int(i == answer_span[0]) for i in range(self.d_len)])
         for answer_span in answer_spans
     ]
     answer_end = [
         np.array([int(i == answer_span[1]) for i in range(self.d_len)])
         for answer_span in answer_spans
     ]
     return documents_ok, questions_ok, np.asarray(
         answer_start), np.asarray(answer_end)
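For a single answer span, the one-hot construction above reduces to the following toy sketch (d_len and the span values are made up for illustration):

import numpy as np

d_len = 6
answer_span = (1, 3)  # the answer starts at token 1 and ends at token 3
answer_start = np.array([int(i == answer_span[0]) for i in range(d_len)])  # [0 1 0 0 0 0]
answer_end = np.array([int(i == answer_span[1]) for i in range(d_len)])    # [0 0 0 1 0 0]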
Example #8
    def input_fn():
        # calculates the length of the sequences, where
        # length = min(actual_length, max_length)
        x_len = np.minimum(np.array([len(seq) for seq in x_in]),
                           max_length).astype('int32')

        # DynamicRNNEstimator uses `rnn_common.select_last_activations`:
        # https://goo.gl/L8jtfh
        # so we need add padding at the end of the sequence,
        # the default is the beginning of the sequence:
        # https://goo.gl/NVjJgT
        x_post_pad = sequence.pad_sequences(x_in,
                                            maxlen=max_length,
                                            padding='post')

        # creates the dataset from in memory data
        ds = tf.contrib.data.Dataset.from_tensor_slices(
            (x_post_pad, x_len, y_in))

        # repeats the dataset `epochs` times.
        ds = ds.repeat(epochs)

        if shuffle:
            ds = ds.shuffle(buffer_size=10000)

        if batch_by_seq_len:
            # manually implement bucket by sequence length
            # the idea is to make batches with sequences of similar length
            # https://goo.gl/y67FQm
            ds = ds.group_by_window(
                key_func=lambda x, x_len, y: _length_bin(x_len, max_length),
                reduce_func=_make_batch,
                window_size=batch_size)
        else:
            ds = ds.batch(batch_size)

        # creates iterator
        x, x_len, y = ds.make_one_shot_iterator().get_next()

        dict_x = {'x': x, rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY: x_len}
        return dict_x, y
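_length_bin and _make_batch are not shown in this snippet; the sketch below is only a guess at what such helpers could look like under the old tf.contrib.data API (the bucket width, num_bins, and the use of window.batch are assumptions, not taken from the source project):

import tensorflow as tf

def _length_bin(x_len, max_length, num_bins=10):
    # map a sequence length to one of `num_bins` equally sized length buckets
    bin_width = max_length // num_bins
    return tf.cast(x_len // bin_width, tf.int64)

def _make_batch(key, window):
    # batch together the elements that fell into the same length bucket;
    # `batch_size` is assumed to be visible in the enclosing scope, as in input_fn
    return window.batch(batch_size)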
Example #9
File: utils.py  Project: zedom1/nlp
def multi_sequences_padding(all_sequences, config):
    max_num_utterance = config.max_num_utterance
    max_sentence_len = config.max_length_q
    PAD_SEQUENCE = [0] * max_sentence_len
    padded_sequences = []
    sequences_length = []
    for sequences in all_sequences:
        sequences_len = len(sequences)
        sequences_length.append(
            get_sequences_length(sequences, maxlen=max_sentence_len))
        if sequences_len < max_num_utterance:
            sequences += [PAD_SEQUENCE] * (max_num_utterance - sequences_len)
            sequences_length[-1] += [0] * (max_num_utterance - sequences_len)
        else:
            sequences = sequences[-max_num_utterance:]
            sequences_length[-1] = sequences_length[-1][-max_num_utterance:]
        sequences = pad_sequences(sequences,
                                  padding='post',
                                  maxlen=max_sentence_len)
        padded_sequences.append(sequences)
    return padded_sequences, sequences_length
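get_sequences_length is not defined in this snippet; a plausible sketch (an assumption, consistent with how its result is padded with zeros above) simply caps each utterance length at maxlen:

def get_sequences_length(sequences, maxlen):
    # true length of each utterance, capped at the padding/truncation target
    return [min(len(seq), maxlen) for seq in sequences]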
Example #10
# finally, vectorize the text samples into a 2D integer tensor

# Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n',
#           lower=True, split=' ', char_level=False)
# num_words caps how many of the most frequent words are used by the model;
# the embedding file downloaded for this example offers 400,000 vocabulary entries.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# fit_on_texts(texts) takes a list (or generator) of strings, updates the internal
# vocabulary, and creates several useful attributes:
#   word_index: dict {word: index}; index 1 is the most frequent word and indices
#     run up to the number of unique words (174,047 across all samples here)
#   index_docs: dict {word_index: document_count}, i.e. in how many samples the word appears
#   document_count: number of samples processed so far
#   word_counts: dict {word: count} summed over all samples;
#     len(word_counts) is the number of unique words and
#     word_counts.get("the") is how often "the" occurs in total
#   word_docs: dict {word: count}, counting each word at most once per sample,
#     so word_docs.get("the") is how many samples contain "the"
# Internally the vocabulary is sorted by count, highest first:
#   wcounts = list(word_counts.items()); wcounts.sort(key=lambda x: x[1], reverse=True)
#   sorted_voc = [wc[0] for wc in wcounts]
tokenizer.fit_on_texts(texts)

# texts_to_sequences(texts) transforms each text into a sequence of integers, one
# per word. Only the top num_words most frequent words known to the tokenizer are
# kept (here the top 20,000 of 174,047 unique words), so every index stays below
# num_words (check with `for sq in sequences: np.array(sq).max()`).
# `sequences` is a list of 19,997 integer lists.
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index  # one entry per unique word across all samples
print('Found %s unique tokens.' % len(word_index))

# pad_sequences pads every sequence to the same length (by default the length of
# the longest sequence). If maxlen is given, longer sequences are truncated; both
# padding and truncation default to the beginning ('pre') and support 'post'.
# With MAX_SEQUENCE_LENGTH = 1000, data.shape == (19997, 1000): each sample text
# becomes a row of 1000 word indices.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# each sample text has a category from 0 to 19; to_categorical converts these
# integer labels to one-hot vectors
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])  # indices of all samples
np.random.shuffle(indices)  # shuffle the indices
data = data[indices]  # shuffle data samples
labels = labels[indices]  # shuffle labels (one hot encoded)
num_test_samples = int(TEST_SPLIT * data.shape[0])
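To make the pre/post padding and truncation behaviour described in the comments above concrete, here is a tiny self-contained sketch (toy sequences, assuming the standalone keras package):

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5, 6, 7, 8]]

# default: pad and truncate at the beginning ('pre')
print(pad_sequences(seqs, maxlen=4))
# [[0 1 2 3]
#  [5 6 7 8]]

# pad and truncate at the end instead
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 0]
#  [4 5 6 7]]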
Example #11
texts, labels, labels_index = read_data(filename, filename_v)

print(max([len(t) for t in texts]))
print(min([len(t) for t in texts]))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print(max([len(t) for t in sequences]))
print(min([len(t) for t in sequences]))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
MAX_SEQUENCE_LENGTH = data.shape[1]
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding="utf-8")
Example #12
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))

# Data preprocessing
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
Example #13
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer_gen = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
tokenizer_gen.fit_on_texts(texts_gen)
sequences = tokenizer.texts_to_sequences(texts)
sequences_gen = tokenizer_gen.texts_to_sequences(texts_gen)
print(max([len(t)for t in sequences]))
print(min([len(t)for t in sequences]))

word_index = tokenizer.word_index
word_index_gen = tokenizer_gen.word_index

print('Found %s unique tokens.' % len(word_index))
print('Found %s unique tokens.' % len(word_index_gen))

data = pad_sequences(sequences)
data_gen = pad_sequences(sequences_gen)


labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
MAX_SEQUENCE_LENGTH = data.shape[1]
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
data_gen = data_gen[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
Example #14
TEST_SPLIT = 0.2
INIT_SEED = 2017
GLOBAL_SEED = 2018
MAXLEN = 80
BATCH_SIZE = 128
TEST_BATCH_SIZE = 512


# In[2]:


(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=NB_WORDS)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=MAXLEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAXLEN)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)


# In[3]:


class Model(nn.Module):
    def __init__(self, nb_words, hidden_size=128, embedding_size=128, n_layers=1,
                 wdrop=0.25, odrop=0.25, edrop=0.1, idrop=0.25, variational=False,
                 standard_dropout=False, batch_first=True):
        super(Model, self).__init__()
        self.standard_dropout = standard_dropout
        self.lockdrop = LockedDropout(batch_first=batch_first)
Example #15
    def __init__(self, init_seed, maxlen, nb_words, skip_top, test_split):
        self.start_char = 1
        self.oov_char = 2
        self.index_from = 3

        files = [
            "Dennis+Schwartz", "James+Berardinelli", "Scott+Renshaw",
            "Steve+Rhodes"
        ]
        texts, ratings = [], []
        for file in files:
            with open("data/scaledata/" + file + "/subj." + file, "r") as f:
                texts += list(f)
            with open("data/scaledata/" + file + "/rating." + file, "r") as f:
                ratings += list(f)
        tokenizer = text.Tokenizer(filters='')
        tokenizer.fit_on_texts(texts)
        X = tokenizer.texts_to_sequences(texts)
        Y = [float(rating) for rating in ratings]

        # Shuffle data:
        np.random.seed(init_seed)
        np.random.shuffle(X)
        np.random.seed(init_seed)
        np.random.shuffle(Y)

        # Parse data
        X = [[self.start_char] + [w + self.index_from for w in x] for x in X]

        new_X = []
        new_Y = []
        for x, y in zip(X, Y):
            for i in range(0, len(x), maxlen):
                new_X.append(x[i:i + maxlen])
                new_Y.append(y)
        X = np.array(new_X)
        Y = np.array(new_Y)
        # by convention, use 2 as OOV word
        # reserve 'index_from' (=3 by default) characters: 0 (padding), 1 (start), 2 (OOV)
        X = [[
            self.oov_char if (w >= nb_words or w < skip_top) else w for w in x
        ] for x in X]

        self.X_train = X[:int(len(X) * (1 - test_split))]
        self.Y_train = Y[:int(len(X) * (1 - test_split))]
        self.mean_y_train = np.mean(self.Y_train)
        self.std_y_train = np.std(self.Y_train)
        self.Y_train = (self.Y_train - self.mean_y_train) / self.std_y_train

        self.X_test = X[int(len(X) * (1 - test_split)):]
        self.Y_test = Y[int(len(X) * (1 - test_split)):]
        self.Y_test = (self.Y_test - self.mean_y_train) / self.std_y_train

        print(len(self.X_train), 'train sequences')
        print(len(self.X_test), 'test sequences')

        print("Pad sequences (samples x time)")
        self.X_train = sequence.pad_sequences(self.X_train, maxlen=maxlen)
        self.X_test = sequence.pad_sequences(self.X_test, maxlen=maxlen)
        print('X_train shape:', self.X_train.shape)
        print('X_test shape:', self.X_test.shape)
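As a side note on the index convention used above (0 = padding, 1 = start, 2 = OOV, real word indices shifted by index_from), a decoding helper could look like the hedged sketch below (decode_sequence is an illustration, not part of the source):

def decode_sequence(seq, word_index, index_from=3):
    # invert the tokenizer's {word: index} map and undo the index_from shift;
    # indices 0 (padding), 1 (start) and 2 (OOV) have no word of their own
    index_word = {idx + index_from: word for word, idx in word_index.items()}
    return ' '.join(index_word.get(i, '<unk>') for i in seq if i >= index_from)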
Example #16
with open(filename, 'r', encoding="utf-8") as f:
    with open(os.path.join(SAVE_DIR, 'submissionFile'), 'a') as sf:
        sf.write('ID,class1,class2,class3,class4,class5,class6,class7,class8,class9\n')
        for line in f:
            if i > 0:
                text = line[line.find('||') + 2:]
                id = int(line[:line.find('||')])

                t_w = text_to_word_sequence(text)
                sec = []
                sequences = []
                for w in t_w:
                    sec.append(word_index.item().get(w, 0))
                sequences.append(sec)

                data = pad_sequences([sec], maxlen=model_shape1)

                prediction = model.predict(data, batch_size=1)
                outputstr = str(id)
                j = 0
                for p_i in prediction[0]:
                    if j > 0:
                        outputstr += "," + "%.2f" % p_i
                    j += 1
                print(outputstr)
                sf.write(outputstr + '\n')
            i += 1
            if i >= NUM_ROWS_FROM_TEXT:
                break

print("saved in " + os.path.join(SAVE_DIR, 'submissionFile'))
Example #17
from tensorflow.contrib.keras.python.keras.datasets import imdb
from tensorflow.contrib.keras.python.keras.layers import Embedding, SimpleRNN, Dropout, Dense, Activation, LSTM, GRU
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.preprocessing import sequence

max_features = 20000
maxlen = 100
batch_size = 32


(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)


model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))

#model.add(SimpleRNN(128))
#model.add(GRU(128))
model.add(LSTM(128))

model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam')
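The snippet stops after compile; a minimal continuation that trains and evaluates the model might look like the sketch below (the epoch count is an assumption, not taken from the source):

# assumed continuation: train on the padded IMDB data and evaluate on the test split
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(X_test, y_test))

score = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test loss:', score)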