def get_sequence(rangeOfSequence, mode):
    """Build a toy "count from a to b" sequence-to-sequence dataset.

    Every pair ``(a, b)`` with ``1 <= a <= b <= rangeOfSequence`` is used
    twice: once counting up from ``a`` to ``b`` and once counting down from
    ``b`` to ``a`` (pairs with ``a == b`` therefore appear in both halves).

    For each pair the decoder input is ``0, <run>, -1`` (teacher forcing)
    and the decoder target is ``<run>, -1, 0``; ``-1`` acts as the
    end-of-sentence marker.  Both are post-padded with ``pad``.

    Args:
        rangeOfSequence: Largest number that may appear in a sequence.
        mode: Selects the encoding of the returned arrays:
            1 - categorical input, categorical output
            2 - categorical input, continuous output
            3 - continuous input, categorical output
            4 - continuous input, continuous output
            Any value other than 2, 3 or 4 behaves like mode 1.

    Returns:
        A ``(data, target_in, target_out)`` triple.  "Categorical" members
        are produced by ``to_categorical`` with ``rangeOfSequence + 2``
        classes; "continuous" members are float arrays of the raw numbers.
    """
    upper = rangeOfSequence
    n_classes = upper + 2

    # (start, end) pairs: first all ascending ones, then the same pairs
    # swapped so the run goes downwards.
    ascending = [(lo, hi)
                 for lo in range(1, upper + 1)
                 for hi in range(lo, upper + 1)]
    descending = [(hi, lo) for lo, hi in ascending]

    # The actual number runs, in the same order as the pairs.
    runs = [list(range(start, end + 1)) for start, end in ascending]
    runs += [list(range(start, end - 1, -1)) for start, end in descending]

    # The -1 serves as an 'end of sentence' indicator.
    target_in = [[[0]] + [[value] for value in run] + [[-1]]
                 for run in runs]  # teacher forcing
    target_out = [[[value] for value in run] + [[-1]] + [[0]]
                  for run in runs]  # expected output

    target_in = pad(target_in, padding='post')
    target_out = pad(target_out, padding='post')

    data = np.array([[[start], [end]] for start, end in ascending + descending],
                    dtype=float)
    target_out = np.array(target_out, dtype=float)
    target_in = np.array(target_in, dtype=float)
    if mode == 4:
        return data, target_in, target_out

    n_samples, n_steps = target_in.shape[0], target_in.shape[1]
    data_cat = to_categorical([data], num_classes=n_classes)
    data_cat = data_cat.reshape(n_samples, 2, n_classes)
    if mode == 2:
        return data_cat, target_in, target_out

    target_in_cat = to_categorical([target_in], num_classes=n_classes)
    target_in_cat = target_in_cat.reshape(n_samples, n_steps, n_classes)

    target_out_cat = to_categorical([target_out], num_classes=n_classes)
    target_out_cat = target_out_cat.reshape(n_samples, n_steps, n_classes)
    if mode == 3:
        return data, target_in_cat, target_out_cat

    return data_cat, target_in_cat, target_out_cat  # mode 1
示例#2
0
def pad_sequences(train_X, train_y):
    """Pad ``train_X`` and ``train_y`` to the longest sentence in ``train_X``.

    The target length is taken from ``train_X`` only and is applied to both
    inputs; it is ``-1`` when ``train_X`` is empty.

    Both collections are passed to ``pad`` with the positional arguments
    ``(length, 'int32', 'post', 'pre', 0.0)``.
    NOTE(review): if ``pad`` is Keras' ``pad_sequences`` these map to
    maxlen, dtype, padding, truncating and value - confirm against the
    import.

    Returns:
        ``(padded_X, padded_y, MAX_LENGTH)``.
    """
    # Length of the longest sentence; -1 mirrors the "nothing seen" start
    # value when there are no sentences at all.
    longest = max((len(sentence) for sentence in train_X), default=-1)

    padded_X, padded_y = (
        pad(sequences, longest, 'int32', 'post', 'pre', 0.0)
        for sequences in (train_X, train_y)
    )

    return padded_X, padded_y, longest
示例#3
0
def process_text(text,
                 to_pad=False,
                 max_len=None,
                 tok=None,
                 save_name=None,
                 num_word=None):
    """Tokenize ``NAME<TAB>caption`` lines and group the sequences by name.

    Args:
        text: A single string with one record per line; every record is
            ``name`` and ``caption`` separated by a tab character.  Blank
            lines (for example a trailing newline) are ignored.
        to_pad: When true, the integer sequences are post-padded with 0.
        max_len: Padding length.  When ``None`` and ``to_pad`` is true, the
            word count of the longest cleaned caption is used.
        tok: An already fitted tokenizer.  Pass ``None`` while processing
            training data; a new ``Tokenizer`` is then created and fitted
            on the captions.
        save_name: Optional file path.  When given, the resulting dict is
            pickled to ``save_name`` and the tokenizer to
            ``save_name + 'tokenizer'``.
        num_word: ``num_words`` argument for a newly created tokenizer.

    Returns:
        ``(tok, text_dict)`` where ``text_dict`` maps each name to the list
        of its caption sequences, in input order.

    Raises:
        IndexError: If a non-blank line does not contain a tab.
    """
    # Split every record once; skipping blank lines keeps a trailing
    # newline from raising IndexError on the missing caption column.
    records = [line.split('\t') for line in text.split('\n') if line.strip()]
    names = [fields[0] for fields in records]
    descs = [fields[1] for fields in records]

    clean_descs = [desc.split(' ') for desc in clean_text(descs)]

    if tok is None:
        tok = Tokenizer(num_words=num_word)
        tok.fit_on_texts(clean_descs)
    desc_seqs = tok.texts_to_sequences(clean_descs)

    if to_pad:
        if max_len is None:
            max_len = max(len(words) for words in clean_descs)
        # Pad in both cases: previously padding sat in the ``else`` branch,
        # so it was silently skipped whenever max_len had to be inferred.
        desc_seqs = pad(desc_seqs, maxlen=max_len, padding='post', value=0)

    text_dict = {}
    for name, seq in zip(names, desc_seqs):
        text_dict.setdefault(name, []).append(seq)

    if save_name is not None:
        with open(save_name, 'wb') as fil:
            pickle.dump(obj=text_dict, file=fil)
        with open(save_name + 'tokenizer', 'wb') as fil:
            pickle.dump(obj=tok, file=fil)

    return tok, text_dict
    def __getitem__(self, i):
        """Return sample ``i`` as ``(target, token_ids, attention_mask)`` tensors.

        The text is encoded with ``self.tokenizer``, post-padded/truncated
        to ``MAXLEN`` with pad value 1, and paired with a ``(1, MAXLEN)``
        mask that is 1 on positions ``1 .. len(ids) - 2``.

        NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style
        start / pad / end tokens - confirm against the tokenizer.
        """
        pg, tg = 'post', 'post'
        target = [self.target[i]]
        question = str(self.text[i])
        quest_ids = self.tokenizer.encode(question.strip())

        attention_mask_idx = len(quest_ids) - 1
        # NOTE(review): `0 + ids` raises TypeError for a plain list and is a
        # no-op for a tensor/array; a start-token prepend was probably meant.
        # Left unchanged because the return type of `encode` is not visible.
        if 0 not in quest_ids: quest_ids = 0 + quest_ids
        quest_ids = pad([quest_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

        attention_mask = np.zeros(MAXLEN)
        attention_mask[1:attention_mask_idx] = 1
        attention_mask = attention_mask.reshape((1, -1))
        if 2 not in quest_ids:
            # Both arrays hold a single row, so indexing with [-1] alone
            # replaced the whole sequence with 2 and zeroed the whole mask.
            # Only the final position is meant to change.
            # Assumes `pad` returns a 2-D array (one row per sequence).
            quest_ids[0, -1], attention_mask[0, -1] = 2, 0
        return FloatTensor(target), LongTensor(quest_ids), LongTensor(attention_mask)
示例#5
0
def predict_sentiment(tweet):
    """Classify ``tweet`` as ``'positive'``, ``'neutral'`` or ``'negative'``.

    The tweet is encoded with the module-level ``tokenizer``,
    post-padded/truncated to ``MAXLEN`` with pad value 1, and fed together
    with a ``(1, MAXLEN)`` attention mask to ``network``; the label of the
    highest-scoring output index is returned.

    NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style
    start / pad / end tokens - confirm against the tokenizer.
    """
    pg, tg = 'post', 'post'
    tweet_ids = tokenizer.encode(tweet.strip())
    sent = {0: 'positive', 1: 'neutral', 2: 'negative'}

    att_mask_idx = len(tweet_ids) - 1
    # NOTE(review): `0 + ids` raises TypeError for a plain list and is a
    # no-op for a tensor/array; a start-token prepend was probably meant.
    # Left unchanged because the return type of `encode` is not visible.
    if 0 not in tweet_ids: tweet_ids = 0 + tweet_ids
    tweet_ids = pad([tweet_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

    att_mask = np.zeros(MAXLEN)
    att_mask[1:att_mask_idx] = 1
    att_mask = att_mask.reshape((1, -1))
    if 2 not in tweet_ids:
        # Both arrays hold a single row, so indexing with [-1] alone replaced
        # the whole sequence with 2 and zeroed the whole mask.  Only the
        # final position is meant to change.
        # Assumes `pad` returns a 2-D array (one row per sequence).
        tweet_ids[0, -1], att_mask[0, -1] = 2, 0
    tweet_ids, att_mask = torch.LongTensor(tweet_ids), torch.LongTensor(att_mask)

    scores = network.forward(tweet_ids.to(device), att_mask.to(device))
    return sent[np.argmax(scores.detach().cpu().numpy())]
def predict_insincerity(question):
    """Classify ``question`` as ``'sincere'`` or ``'insincere'``.

    The question is encoded with the module-level ``tokenizer``,
    post-padded/truncated to ``MAXLEN`` with pad value 1, and fed together
    with a ``(1, MAXLEN)`` attention mask to ``network``.  The sigmoid of
    the single output value is rounded to pick the label.

    NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style
    start / pad / end tokens - confirm against the tokenizer.
    """
    pg, tg = 'post', 'post'
    ins = {0: 'sincere', 1: 'insincere'}
    quest_ids = tokenizer.encode(question.strip())

    att_mask_idx = len(quest_ids) - 1
    # NOTE(review): `0 + ids` raises TypeError for a plain list and is a
    # no-op for a tensor/array; a start-token prepend was probably meant.
    # Left unchanged because the return type of `encode` is not visible.
    if 0 not in quest_ids: quest_ids = 0 + quest_ids
    quest_ids = pad([quest_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

    att_mask = np.zeros(MAXLEN)
    att_mask[1:att_mask_idx] = 1
    att_mask = att_mask.reshape((1, -1))
    if 2 not in quest_ids:
        # Two fixes here:
        #  * the mask is called `att_mask` in this function; the former
        #    `attention_mask[-1]` raised NameError;
        #  * both arrays hold a single row, so indexing with [-1] alone
        #    replaced the whole sequence / mask instead of the last position.
        # Assumes `pad` returns a 2-D array (one row per sequence).
        quest_ids[0, -1], att_mask[0, -1] = 2, 0
    quest_ids, att_mask = torch.LongTensor(quest_ids), torch.LongTensor(att_mask)

    output = network.forward(quest_ids.to(device), att_mask.to(device))
    return ins[int(np.round(nn.Sigmoid()(output.detach().cpu()).item()))]
示例#7
0
    def __getitem__(self, i):
        """Return sample ``i`` as ``(sentiment, token_ids, attention_mask)`` tensors.

        The tweet is encoded with ``self.tokenizer``, post-padded/truncated
        to ``MAXLEN`` with pad value 1, and paired with a ``(1, MAXLEN)``
        mask that is 1 on positions ``1 .. len(ids) - 2``.  The label is
        looked up in ``self.sentiment_dict`` and one-hot encoded over three
        classes.

        NOTE(review): ids 0 / 1 / 2 look like RoBERTa-style
        start / pad / end tokens - confirm against the tokenizer.
        """
        pg, tg = 'post', 'post'
        tweet = str(self.text[i]).strip()
        tweet_ids = self.tokenizer.encode(tweet)

        attention_mask_idx = len(tweet_ids) - 1
        # NOTE(review): `0 + ids` raises TypeError for a plain list and is a
        # no-op for a tensor/array; a start-token prepend was probably meant.
        # Left unchanged because the return type of `encode` is not visible.
        if 0 not in tweet_ids: tweet_ids = 0 + tweet_ids
        tweet_ids = pad([tweet_ids], maxlen=MAXLEN, value=1, padding=pg, truncating=tg)

        attention_mask = np.zeros(MAXLEN)
        attention_mask[1:attention_mask_idx] = 1
        attention_mask = attention_mask.reshape((1, -1))
        if 2 not in tweet_ids:
            # Both arrays hold a single row, so indexing with [-1] alone
            # replaced the whole sequence with 2 and zeroed the whole mask.
            # Only the final position is meant to change.
            # Assumes `pad` returns a 2-D array (one row per sequence).
            tweet_ids[0, -1], attention_mask[0, -1] = 2, 0

        sentiment = [self.sentiment_dict[self.sentiment[i]]]
        sentiment = torch.FloatTensor(to_categorical(sentiment, num_classes=3))
        return sentiment, torch.LongTensor(tweet_ids), torch.LongTensor(attention_mask)
示例#8
0
文件: memory.py 项目: ssamot/distnet
def get_supporting_facts_training(X, Xq, word_idx, train_supporting_facts, trained_attention, max_hops=2):
    """Collect training data over up to ``max_hops`` calls of ``supporting_facts_inc``.

    Each hop calls ``supporting_facts_inc`` with the current questions and
    remaining supporting facts, appends the stories, questions, answers and
    leftovers it returns to running totals, and feeds the combined
    questions and leftover facts it returns into the next hop.  The loop
    stops early when every entry of ``enough_memories`` is set or a hop
    reports no supporting fact.

    Returns:
        ``(X, Xq, Y, supporting_sentences, leftoversX)``, each padded by
        ``pad`` to the length of its own longest entry.
    """
    n_stories = len(X)
    all_stories = X[:]

    # Per-story state that is handed to supporting_facts_inc on every hop.
    supporting_sentences = [[0] for _ in range(n_stories)]
    enough_memories = [0] * n_stories
    selected = [[] for _ in range(n_stories)]

    # Results accumulated across hops.
    stories_acc, questions_acc, answers_acc, leftovers_acc = [], [], [], []

    questions = Xq
    remaining_facts = train_supporting_facts

    for hop in range(max_hops):
        print(bcolors.BOLD + "Entering hop " + str(hop) + " ..." + bcolors.ENDC)

        hop_result = supporting_facts_inc(
            None, questions, word_idx, remaining_facts, supporting_sentences,
            trained_attention, enough_memories, all_stories, selected)
        (_, combined_questions, leftover_facts, supporting_sentences,
         hop_stories, hop_questions, hop_answers, found_supporting,
         hop_leftover) = hop_result

        stories_acc.extend(hop_stories)
        answers_acc.extend(hop_answers)
        questions_acc.extend(hop_questions)
        leftovers_acc.extend(hop_leftover)

        # Next hop works on the combined questions and the unused facts.
        questions = combined_questions
        remaining_facts = leftover_facts

        print(bcolors.BOLD + "Found supporting facts " + str(len(selected)) + " ..." + bcolors.ENDC)
        n_done = np.sum(enough_memories)
        print(n_done, "break")

        if found_supporting == 0 or n_done == len(enough_memories):
            print(bcolors.BOLD + "breaking at hop " + str(hop) + " ...")
            break

    def pad_to_longest(sequences):
        # Pad every sequence to the length of the longest one.
        return pad(sequences, maxlen=max(map(len, sequences)))

    X = pad_to_longest(stories_acc)
    Xq = pad_to_longest(questions_acc)
    Y = pad_to_longest(answers_acc)
    leftoversX = pad_to_longest(leftovers_acc)
    supporting_sentences = pad_to_longest(supporting_sentences)

    print('X.shape = {}'.format(X.shape))
    print('Xq.shape = {}'.format(Xq.shape))
    print('Y.shape = {}'.format(Y.shape))
    print('leftover.shape = {}'.format(leftoversX.shape))
    print('supporting_sentences.shape = {}'.format(supporting_sentences.shape))
    print(bcolors.ENDC)
    return X, Xq, Y, supporting_sentences, leftoversX
# Script section: tokenize the "comment_text" columns and start building a
# 1-D convolutional network on top of an embedding layer.
# X_train, X_test, t_data and t_label are defined outside this section.

# Raw comment text of the training / test frames.
list_sent_train = X_train["comment_text"]
list_sent_test = X_test["comment_text"]
sent_text = t_data['comment_text']
# One label column per category.
l_toxic = t_label['toxic']
l_sever_t = t_label['severe_toxic']
l_obscene = t_label['obscene']
l_idh = t_label['identity_hate']
l_in = t_label['insult']
l_th = t_label['threat']
max_features = 20000
# The tokenizer is configured with char_level=True and is fitted on the
# training comments only.
tokenizer = Tokenizer(num_words=max_features, char_level=True)
tokenizer.fit_on_texts(list(list_sent_train))
list_token_train = tokenizer.texts_to_sequences(list_sent_train)
# Note: list_sent_test is rebound here from raw text to token sequences.
list_sent_test = tokenizer.texts_to_sequences(list_sent_test)
maxlength = 500
# NOTE(review): assuming `pad` is Keras' pad_sequences, both sets become
# arrays with `maxlength` columns - confirm against the import.
X_t = pad(list_token_train, maxlen=maxlength)
X_te = pad(list_sent_test, maxlen=maxlength)
inp = Input(shape=(maxlength, ))

embedding_size = 240
# Embedding input size is the fitted vocabulary plus one.
x = Embedding(len(tokenizer.word_index) + 1, embedding_size)(inp)

x = Conv1D(filters=100, kernel_size=4, padding='same', activation='relu')(x)

x = MaxPooling1D(pool_size=4)(x)

# Disabled recurrent layer, kept for reference:
#x = Bidirectional(GRU(60, return_sequences=True,name='lstm_layer',dropout=0.2,recurrent_dropout=0.2))(x)

x = GlobalMaxPool1D()(x)

x = Dense(50, activation="relu")(x)
示例#10
0
def coco_generator(
        mappings,
        captions,
        dict_size,  # vocabulary size, used for one-hot encoding the targets
        max_len,
        image_batch_szie=1,
        path_to_pkl_files=".",
        pkl_file_extension='.pkl',
        epochs=1):
    """Yield ``(image features, caption prefix) -> next word`` training batches.

    For every image, each caption line ``w0 w1 ... wn`` contributes one
    sample per prefix ``w0..w(i-1)`` (post-padded with 0 to ``max_len``)
    whose target is the one-hot encoding of ``wi``.  The image feature
    vector is repeated once per sample.

    Args:
        mappings: Dict mapping a pickle file name (without directory and
            extension) to the iterable of image names stored in it.  Only
            the part of each image name after the last ``/`` is used.
        captions: Dict mapping image name to its list of caption token
            sequences.
        dict_size: Passed to ``one_hot``; targets are reshaped to
            ``dict_size + 1`` columns.
        max_len: Length every caption prefix is padded to.
        image_batch_szie: Number of images per yielded batch.  For values
            of 1 or less every image is yielded on its own.
        path_to_pkl_files: Directory containing the feature pickles.
        pkl_file_extension: Extension appended to each pickle file name.
        epochs: Number of passes over ``mappings``.

    Yields:
        With ``image_batch_szie <= 1``: the list
        ``[[features, caption], target]`` per image.
        Otherwise: the tuple ``([features, caption], target)`` covering
        ``image_batch_szie`` images, plus one final smaller batch per epoch
        for any remaining images.  ``features`` has shape ``(N, 1, 4096)``,
        ``caption`` ``(N, max_len)`` and ``target``
        ``(N, 1, dict_size + 1)``; batched arrays keep the dtype of the
        per-image arrays they are concatenated from.
    """
    for _ in range(epochs):
        # Per-image arrays waiting to be merged into the next batch.
        # Collecting them in lists and concatenating once replaces the
        # former np.append accumulator, which always raised because it tried
        # to join a 3-D placeholder with the 2-D caption array.
        pending_features, pending_captions, pending_targets = [], [], []

        for pkl_file, image_subset in mappings.items():
            # NOTE: pickle.load can execute arbitrary code - only point this
            # generator at feature files from a trusted source.
            with open(path_to_pkl_files + '/' + pkl_file + pkl_file_extension,
                      'rb') as file:
                feature_dict = pickle.load(file)

            for image_name in image_subset:
                image_name = image_name.split('/')[-1]
                lines = captions[image_name]

                # Every proper, non-empty prefix of every caption line.
                prefixes = [line[:end]
                            for line in lines
                            for end in range(1, len(line))]
                caption = np.array(
                    pad(prefixes, maxlen=max_len, padding='post', value=0))

                # The word following each prefix, in the same order.
                next_words = np.asarray(
                    [word for line in lines for word in line[1:]])
                target = np.array(
                    [np.array(one_hot(word, dict_size)) for word in next_words])

                features = feature_dict[image_name]
                features = features.repeat(len(caption),
                                           axis=0).reshape(-1, 1, 4096)
                target = target.reshape(-1, 1, dict_size + 1)

                if image_batch_szie > 1:
                    pending_features.append(features)
                    pending_captions.append(caption)
                    pending_targets.append(target)

                    if len(pending_features) == image_batch_szie:
                        yield [np.concatenate(pending_features, axis=0),
                               np.concatenate(pending_captions, axis=0)
                               ], np.concatenate(pending_targets, axis=0)
                        pending_features, pending_captions, pending_targets = [], [], []
                else:
                    yield [[features, caption], target]

        # Flush the images left over at the end of the epoch, if they carry
        # at least one sample.
        if sum(len(caption) for caption in pending_captions) > 0:
            yield [np.concatenate(pending_features, axis=0),
                   np.concatenate(pending_captions, axis=0)
                   ], np.concatenate(pending_targets, axis=0)