Example #1
import os

import numpy as np
import psutil
import torch

# config, dataset, Tokenizer, load_vectors and create_embedding_matrix come
# from the surrounding project and are not shown in this snippet.


def run(df, fold):

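    # split the dataframe into training and validation parts for this fold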
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print(len(train_df))
    print(len(valid_df))

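    # fit the project's Tokenizer on all reviews so that train and validation
    # share a single vocabulary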
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist(), info=True)

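    # convert the reviews to sequences of token ids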
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

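    # pad (or truncate) every sequence to a fixed length of 200 tokens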
    xtrain = tokenizer.pad_sequences(xtrain, max_len=200)
    xtest = tokenizer.pad_sequences(xtest, max_len=200)
    print(xtrain.shape)
    print(xtest.shape)

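    # wrap the padded sequences and sentiment targets in the project's Dataset class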
    train_dataset = dataset.Dataset(reviews=xtrain,
                                    targets=train_df.sentiment.values)

    valid_dataset = dataset.Dataset(reviews=xtest,
                                    targets=valid_df.sentiment.values)

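    # batch the datasets with standard torch DataLoaders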
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.BATCH_SIZE, num_workers=2)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.BATCH_SIZE, num_workers=2)
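    # log current system memory before loading the large embedding file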
    print(psutil.virtual_memory())

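    # build the embedding matrix from the fastText crawl vectors once and
    # cache it to disk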
    if not os.path.exists("../data/embedding_matrix.npy"):
        embedding_dict = load_vectors('../models/crawl-300d-2M.vec')
        print("Loaded Vectors")
        embedding_matrix = create_embedding_matrix(tokenizer.word_index,
                                                   embedding_dict)
        print("Created Matrix")

        np.save("../data/embedding_matrix.npy", embedding_matrix)
Example #2
import csv
import random
import re
import sys
from math import log10

import numpy as np

# Tokenizer here is the project's own tokenizer class; its import is not
# shown in this snippet.


class prepareData(object):
    """Reads claim TSV files and turns them into model-ready features."""

    def __init__(self, EMBEDDING_DIM, MAXLEN, TRIM_NUM):
        self.EMBEDDING_DIM = EMBEDDING_DIM
        self.MAXLEN = MAXLEN
        self.TRIM_NUM = TRIM_NUM
        self.tokenizer = Tokenizer()

    def cleanText(self, text):
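        # lowercase the text and collapse repeated whitespace; the
        # encode/decode pass drops characters that are not valid UTF-8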
        text = text.encode('utf-8', 'ignore')
        text = text.lower().decode('utf-8')
        #text = re.sub(r"[^a-z ]", r' ', text.decode('utf-8'))
        #print (text)
        text = re.sub(' +', ' ', text)
        return text

    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines

    def tokenize_claims(self, articles):

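        # the project's tokenizer accepts MAXLEN here, presumably to cap the
        # length of each sequence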
        seqs_articles = []
        for article in articles:
            seqs_articles.append(
                self.tokenizer.texts_to_sequences(article, self.MAXLEN))

        return seqs_articles

    def get_guided_masks(self, claim, atten):
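        # mask[i] = 1 when the i-th claim token also appears in the attention string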
        seqs_masks = [0 for i in range(self.MAXLEN)]
        atten_set = set(atten.split(' '))
        claim_seqs = claim.split(' ')
        for (i, word) in enumerate(claim_seqs):
            if i >= self.MAXLEN:
                break
            if word in atten_set:
                seqs_masks[i] = 1
        return seqs_masks

    def tokenize_claims_tfidf(self, claims_list):
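        """Return a TF-IDF weighted bag-of-words vector over the tokenizer
        vocabulary for each claim."""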
        docNum = len(claims_list)
        term_df = dict()
        for claim in claims_list:
            for term in set(claim.split(' ')):
                if term not in term_df:
                    term_df[term] = 1.0
                else:
                    term_df[term] += 1.0

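        # convert raw document frequencies into IDF weights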
        for term in term_df:
            term_df[term] = log10(docNum / term_df[term])

        seqs_claims = []
        for claim in claims_list:
            seqs_claim = [0 for j in range(self.tokenizer.num_words)]
            term_tf = dict()
            terms = claim.split(' ')
            for term in terms:
                if term not in term_tf:
                    term_tf[term] = 1.0
                else:
                    term_tf[term] += 1.0
            docLen = len(terms)
            for term in set(terms):
                if term in self.tokenizer.word2index:
                    tfidf = term_tf[term] / docLen * term_df[term]
                    word_id = self.tokenizer.word2index[term]
                    seqs_claim[word_id] = tfidf

            seqs_claims.append(seqs_claim)

        return seqs_claims

    def get_embeddings_index(self):
        embeddings_index = {}
        path = r'../data/glove.6B.100d.txt'
        # each line of the GloVe file is a word followed by its vector values
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        return embeddings_index

    def get_embedding_matrix(self):
        embeddings_index = self.get_embeddings_index()
        word_index = self.tokenizer.word2index
        embedding_matrix = np.zeros((len(word_index) + 1, self.EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
            else:
                # words missing from the embedding index get a random vector
                embedding_matrix[i] = np.random.rand(self.EMBEDDING_DIM)
        #embedding_matrix = np.array(embedding_matrix, dtype=np.float32)
        return embedding_matrix

    def data_process(self, path_train, path_dev):
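        """Read the train/dev TSVs, fit the tokenizer on all claims and
        return shuffled training sequences plus dev sequences with labels."""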
        claims_train = []
        labels_train = []
        #lines=self._read_tsv(path+'train-all.tsv')
        lines = self._read_tsv(path_train)
        for (i, line) in enumerate(lines):
            claims_train.append(self.cleanText(line[2]))
            labels_train.append(int(line[3]))

        claims_dev = []
        labels_dev = []
        #lines=self._read_tsv(path+'leaderboard-dev.tsv')
        lines = self._read_tsv(path_dev)
        for (i, line) in enumerate(lines):
            claims_dev.append(self.cleanText(line[2]))
            labels_dev.append(int(line[3]))

        self.tokenizer.fit_on_texts(claims_train + claims_dev)
        seqs_claims_train = self.tokenize_claims(claims_train)
        seqs_claims_dev = self.tokenize_claims(claims_dev)

        #shuffle
        np.random.seed(0)
        idx = np.arange(0, len(claims_train), 1)
        np.random.shuffle(idx)
        seqs_claims_train = [seqs_claims_train[t] for t in idx]
        labels_train = [labels_train[t] for t in idx]
        print('training set length %s' % (len(seqs_claims_train)))
        print('dev set length %s' % (len(seqs_claims_dev)))

        return [[seqs_claims_train, labels_train],
                [seqs_claims_dev, labels_dev]]

    def data_process_pair(self, xtrain, train_y, xvalid, valid_y, separate,
                          sampleNum):
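        """Pair each positive training claim with randomly permuted negatives
        over sampleNum[0] rounds and tokenize the validation data."""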
        train_pos, train_neg, valid_text = [], [], []
        if separate:
            self.tokenizer.fit_on_texts(xtrain)
        else:
            self.tokenizer.fit_on_texts(xtrain + xvalid)
        seqs_train = self.tokenize_claims(xtrain)
        seqs_valid = self.tokenize_claims(xvalid)

        pos_claims, neg_claims = [], []
        for i, label in enumerate(train_y):
            if label == 1:
                pos_claims.append(seqs_train[i])
            else:
                neg_claims.append(seqs_train[i])
        pos_len = len(pos_claims)
        neg_len = len(neg_claims)
        pos_claims = np.asarray(pos_claims)
        neg_claims = np.asarray(neg_claims)
        if pos_len < neg_len:
            neg_claims = neg_claims[0:pos_len]
        else:
            pos_claims = pos_claims[0:neg_len]
        assert (pos_claims.shape[0] == neg_claims.shape[0])
        for i in range(sampleNum[0]):
            indxs = np.random.permutation(pos_claims.shape[0])
            train_pos += list(pos_claims)
            train_neg += list(neg_claims[indxs])

        print('training set length %s' % (len(train_pos)))
        print('dev set length %s' % (len(seqs_valid)))

        np.random.seed(0)
        idx = np.arange(0, len(train_pos), 1)
        np.random.shuffle(idx)
        train_pos = [train_pos[t] for t in idx]
        train_neg = [train_neg[t] for t in idx]
        print(len(train_pos))

        return [[train_pos, train_neg], [seqs_valid, valid_y]]

    def data_process_bert_pair(self, path_train, path_dev, sampleNum=5):
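        """Wrap claims in BERT [CLS]/[SEP] markers and pair each positive
        claim with sampleNum randomly drawn negative claims."""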
        claims_train = []
        labels_train = []
        #lines=self._read_tsv(path+'train-all.tsv')
        lines = self._read_tsv(path_train)
        for (i, line) in enumerate(lines):
            claims_train.append('[CLS] ' + self.cleanText(line[2]) + ' [SEP]')
            labels_train.append(int(line[3]))

        claims_dev = []
        labels_dev = []
        #lines=self._read_tsv(path+'leaderboard-dev.tsv')
        lines = self._read_tsv(path_dev)
        for (i, line) in enumerate(lines):
            claims_dev.append('[CLS] ' + self.cleanText(line[2]) + ' [SEP]')
            labels_dev.append(int(line[3]))

        #shuffle
        #np.random.seed(0)
        #idx = np.arange(0, len(claims_train), 1)
        #np.random.shuffle(idx)
        #seqs_claims_train = [seqs_claims_train[t] for t in idx]
        #labels_train = [labels_train[t] for t in idx]
        pos_claims = []
        neg_claims = []
        for i, label in enumerate(labels_train):
            if label == 1:
                pos_claims.append(claims_train[i])
            else:
                neg_claims.append(claims_train[i])
        pos_claims = np.asarray(pos_claims)
        neg_claims = np.asarray(neg_claims)
        pos_inds = []
        neg_inds = []
        neg_len = neg_claims.shape[0]
        for i in range(pos_claims.shape[0]):
            count_id = 0
            while (count_id < sampleNum):
                j = random.randint(0, neg_len - 1)
                pos_inds.append(i)
                neg_inds.append(j)
                count_id += 1

        print('training set length %s' % (len(pos_inds)))
        print('dev set length %s' % (len(claims_dev)))

        return [[pos_claims[pos_inds], neg_claims[neg_inds]],
                [claims_dev, labels_dev]]

    def data_process_bow(self, path):
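        """Build TF-IDF bag-of-words features from train-all.tsv and
        leaderboard-dev.tsv under the given path."""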
        claims_train = []
        labels_train = []
        lines = self._read_tsv(path + 'train-all.tsv')
        #lines=self._read_tsv(path+'train.tsv')
        for (i, line) in enumerate(lines):
            claims_train.append(self.cleanText(line[2]))
            labels_train.append(int(line[3]))

        claims_dev = []
        labels_dev = []
        lines = self._read_tsv(path + 'leaderboard-dev.tsv')
        #lines=self._read_tsv(path+'dev.tsv')
        for (i, line) in enumerate(lines):
            claims_dev.append(self.cleanText(line[2]))
            labels_dev.append(int(line[3]))
        #initialize the voc
        all_claims = claims_train + claims_dev
        self.tokenizer.fit_on_texts(all_claims)
        self.tokenizer.trim(self.TRIM_NUM)

        ##get the bow feature
        seqs_all = self.tokenize_claims_tfidf(all_claims)

        seqs_claims_train = seqs_all[0:len(claims_train)]
        seqs_claims_dev = seqs_all[len(claims_train):]

        #shuffle
        np.random.seed(0)
        idx = np.arange(0, len(claims_train), 1)
        np.random.shuffle(idx)
        seqs_claims_train = [seqs_claims_train[t] for t in idx]
        labels_train = [labels_train[t] for t in idx]
        print('training set length %s' % (len(seqs_claims_train)))
        print('dev set length %s' % (len(seqs_claims_dev)))

        return [[seqs_claims_train, labels_train],
                [seqs_claims_dev, labels_dev]]

    def data_process_guided(self, path):
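        """Read the guided-attention TSVs and return sequences, labels and
        word-level attention masks (empty masks for the dev set)."""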
        claims_train = []
        labels_train = []
        guided_train = []
        #lines=self._read_tsv(path+'train_guided_all.tsv')
        lines = self._read_tsv(path + 'train_guided.tsv')
        for (i, line) in enumerate(lines):
            claim = self.cleanText(line[2])
            claims_train.append(claim)
            labels_train.append(int(line[3]))
            attens = self.cleanText(line[4])
            guided_train.append(self.get_guided_masks(claim, attens))

        claims_dev = []
        labels_dev = []
        guided_dev = []
        #lines=self._read_tsv(path+'leaderboard-dev.tsv')
        lines = self._read_tsv(path + 'dev_guided.tsv')
        for (i, line) in enumerate(lines):
            claim = self.cleanText(line[2])
            claims_dev.append(claim)
            labels_dev.append(int(line[3]))
            # no guided attention masks are provided for the dev set
            guided_dev.append([])

        self.tokenizer.fit_on_texts(claims_train + claims_dev)
        seqs_claims_train = self.tokenize_claims(claims_train)
        seqs_claims_dev = self.tokenize_claims(claims_dev)

        #shuffle
        np.random.seed(0)
        idx = np.arange(0, len(claims_train), 1)
        np.random.shuffle(idx)
        seqs_claims_train = [seqs_claims_train[t] for t in idx]
        labels_train = [labels_train[t] for t in idx]
        guided_train = [guided_train[t] for t in idx]
        print('training set length %s' % (len(seqs_claims_train)))
        print('dev set length %s' % (len(seqs_claims_dev)))

        return [[seqs_claims_train, labels_train, guided_train],
                [seqs_claims_dev, labels_dev, guided_dev]]

    def data_process_partly_guided(self, path):
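        """Truncate claims to their first five words; for positive examples
        substitute the attention words when they are available."""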
        claims_train = []
        labels_train = []
        lines = self._read_tsv(path + 'train_guided.tsv')
        for (i, line) in enumerate(lines):
            claim = self.cleanText(line[2])
            claim_words = claim.split(' ')
            claim = ' '.join(claim_words[0:5])
            label = int(line[3])
            labels_train.append(label)
            if label == 1:
                atten = self.cleanText(line[4])
                if atten != '':
                    claim = atten
            claims_train.append(claim)

        claims_dev = []
        labels_dev = []
        lines = self._read_tsv(path + 'dev_guided.tsv')
        for (i, line) in enumerate(lines):
            claim = self.cleanText(line[2])
            claim_words = claim.split(' ')
            claim = ' '.join(claim_words[0:5])
            label = int(line[3])
            labels_dev.append(label)
            if label == 1:
                atten = self.cleanText(line[4])
                if atten != '':
                    claim = atten
            claims_dev.append(claim)

        self.tokenizer.fit_on_texts(claims_train + claims_dev)
        seqs_claims_train = self.tokenize_claims(claims_train)
        seqs_claims_dev = self.tokenize_claims(claims_dev)

        #shuffle
        np.random.seed(0)
        idx = np.arange(0, len(claims_train), 1)
        np.random.shuffle(idx)
        seqs_claims_train = [seqs_claims_train[t] for t in idx]
        labels_train = [labels_train[t] for t in idx]
        print('training set length %s' % (len(seqs_claims_train)))
        print('dev set length %s' % (len(seqs_claims_dev)))

        return [[seqs_claims_train, labels_train],
                [seqs_claims_dev, labels_dev]]
Example #3
import numpy as np
import pandas as pd
import tensorflow as tf  # TensorFlow 1.x graph-mode API

# args, emb_utils, Tokenizer and NUM_CLASSES come from the surrounding
# project and are not shown in this snippet.


def model_inputs():
    # placeholders for token ids, labels, learning rate and dropout keep prob
    inp = tf.placeholder(tf.int32, [None, None], name='input')
    # Should be [batch_size x num_classes]
    target = tf.placeholder(tf.int32, [None, None], name='labels')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    keep_probability = tf.placeholder(tf.float32, name='keep_prob')
    return inp, target, learning_rate, keep_probability


# Load main dataframe
df_balanced = pd.read_csv(args.file)
# Load tokenizer class
tokenizer = Tokenizer()
# Load Embeddings matrix
embeddings_index = emb_utils.load_embeddings(args.embedding_path)
# Have tokenizer fit to our data
tokenizer.fit_on_texts(df_balanced.text, embeddings_index)

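# Build the word embedding matrix and convert the text column to integer sequences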
word_embedding_matrix = emb_utils.create_embedding_matrix(
    tokenizer.word2int, embeddings_index, args.embedding_dim)
seq = tokenizer.text_to_sequence(df_balanced['text'])

# Creating graph for TensorFlow
tf.reset_default_graph()
train_graph = tf.Graph()
with train_graph.as_default():
    with tf.name_scope("inputs"):
        input_data, labels, lr, keep_prob = model_inputs()
        weight = tf.Variable(
            tf.truncated_normal(
                [args.hidden_units, NUM_CLASSES],
                stddev=(1 / np.sqrt(args.hidden_units * NUM_CLASSES))))