def get_sents_encoded(sentence_1, sentence_2, dt=quora):
    # Tokenize both sentences with the English tokenizer.
    data = [
        datasets.tokenize(sentence_1, lang='en'),
        datasets.tokenize(sentence_2, lang='en')
    ]
    # Map tokens to ids using the dataset's word-to-id vocabulary.
    vocab_w2i = dt.w2i
    lst_sent_ids = seq2id(data, vocab_w2i, seq_begin=False, seq_end=False)
    s1_ids = lst_sent_ids[0]
    s2_ids = lst_sent_ids[1]
    return s1_ids, s2_ids
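A minimal usage sketch for the helper above, assuming the `quora` dataset object named in the default argument is already loaded and exposes its word-to-id vocabulary as `w2i`, as the function body implies:

# Hedged usage sketch: the sentence strings are illustrative only.
s1_ids, s2_ids = get_sents_encoded('How do I learn Python?',
                                   'What is the best way to learn Python?')
# s1_ids and s2_ids are lists of vocabulary ids, one per tokenized sentence.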
Example #2
    def next_batch(self,
                   batch_size=64,
                   seq_begin=False,
                   seq_end=False,
                   pad=0,
                   raw=False,
                   mark_entities=False,
                   tokenizer='spacy',
                   one_hot=False):

        if not self.datafile:
            raise Exception('The dataset needs to be open before being used. '
                            'Please call dataset.open() before calling '
                            'dataset.next_batch()')
        text, emotion = [], []

        while len(text) < batch_size:
            row = self.datafile.readline()
            if row == '':
                self._epochs_completed += 1
                self.datafile.seek(0)
                continue
            cols = row.strip().split('\t')
            try:
                tweet, emo = cols[0], int(cols[1])
            except (IndexError, ValueError):
                print('Invalid data instance. Skipping line.')
                continue
            text.append(datasets.tokenize(tweet, tokenizer))
            emotion.append(emo)

        if one_hot:
            emotion = to_categorical(emotion, nb_classes=self.n_classes)

        if mark_entities:
            text = datasets.mark_entities(text, lang='en')

        if not raw:
            text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                                   seq_begin, seq_end)
        else:
            text = datasets.append_seq_markers(text[:batch_size], seq_begin,
                                               seq_end)

        if pad != 0:
            text = datasets.padseq(text[:batch_size], pad, raw)

        batch = self.Batch(text=text, emotion=emotion)
        return batch
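A minimal usage sketch, assuming `ds` is an instance of the (unnamed here) emotion dataset class that defines next_batch() above; it must be opened first, as the method's own error message states:

# Hedged usage sketch: `ds` is a hypothetical, already-constructed dataset instance.
ds.open()
batch = ds.next_batch(batch_size=32, pad=40, one_hot=True)
# batch.text: 32 id sequences padded to length 40
# batch.emotion: 32 one-hot label vectors over self.n_classes classes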
Example #3
    def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
                   rescale=None, pad=0, raw=False, mark_entities=False,
                   tokenizer='spacy', sentence_pad=0, one_hot=False):
        if not self.datafile:
            raise Exception('The dataset needs to be open before being used. '
                            'Please call dataset.open() before calling '
                            'dataset.next_batch()')
        text, sentences, ratings, titles, lengths = [], [], [], [], []

        while len(text) < batch_size:
            row = self.datafile.readline()
            if row == '':
                self._epochs_completed += 1
                self.close()
                self.datafile = open(self.path_list[self.epochs_completed % len(self.path_list)])
                continue
            json_obj = json.loads(row.strip())
            text.append(datasets.tokenize(json_obj["review_text"], tokenizer))
            lengths.append(len(text[-1]))
            sentences.append(datasets.sentence_tokenizer(json_obj["review_text"]))
            ratings.append(int(json_obj["review_rating"]))
            titles.append(datasets.tokenize(json_obj["review_header"]))


        # Mutually exclusive label transforms: either rescale the 1-5 ratings
        # or one-hot encode them after shifting to 0-based class indices.
        if rescale is not None and not one_hot:
            ratings = datasets.rescale(ratings, rescale, [1.0, 5.0])
        elif rescale is None and one_hot:
            ratings = [x - 1 for x in ratings]
            ratings = to_categorical(ratings, nb_classes=5)
        elif rescale is None and not one_hot:
            pass
        else:
            raise ValueError('rescale and one_hot cannot be set together')
        if mark_entities:
            text = datasets.mark_entities(text, lang='de')
            titles = datasets.mark_entities(titles, lang='de')
            sentences = [datasets.mark_entities(sentence, lang='de')
                         for sentence in sentences]

        if not raw:
            text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                                   seq_begin, seq_end)
            titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                     seq_begin, seq_end)
            sentences = [datasets.seq2id(sentence, self.vocab_w2i,
                                         seq_begin, seq_end)
                         for sentence in sentences[:batch_size]]
        else:
            text = datasets.append_seq_markers(text[:batch_size],
                                               seq_begin, seq_end)
            titles = datasets.append_seq_markers(titles[:batch_size],
                                                 seq_begin, seq_end)
            sentences = [datasets.append_seq_markers(sentence, seq_begin,
                                                     seq_end)
                         for sentence in sentences[:batch_size]]

        if pad != 0:
            text = datasets.padseq(text[:batch_size], pad, raw)
            titles = datasets.padseq(titles[:batch_size], pad, raw)
            sentences = [datasets.padseq(sentence, pad, raw) for sentence in
                         sentences[:batch_size]]
        if sentence_pad != 0:
            sentences = [datasets.pad_sentences(sentence, sentence_pad, raw) for
                         sentence in sentences[:batch_size]]

        batch = self.Batch(text=text, sentences=sentences,
                           ratings=ratings, titles=titles, lengths=lengths)
        return batch
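A minimal usage sketch, assuming `reviews` is an instance of the (unnamed here) review dataset class that defines next_batch() above:

# Hedged usage sketch: `reviews` is a hypothetical, already-constructed dataset instance.
reviews.open()
batch = reviews.next_batch(batch_size=16, pad=100, sentence_pad=10, one_hot=True)
# batch.text: 16 id sequences padded to length 100
# batch.sentences: per-review sentence id sequences, presumably padded to 10 sentences each
# batch.ratings: 16 one-hot vectors over the 5 rating classes
# batch.titles, batch.lengths: tokenized titles and raw review lengths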
Example #4
    def next_batch(self,
                   batch_size=64,
                   seq_begin=False,
                   seq_end=False,
                   rescale=None,
                   pad=0,
                   raw=False,
                   mark_entities=False,
                   tokenizer='spacy',
                   sentence_pad=0,
                   one_hot=False):
        if not self.datafile:
            raise Exception('The dataset needs to be open before being used. '
                            'Please call dataset.open() before calling '
                            'dataset.next_batch()')
        text, sentences, ratings_service, ratings_cleanliness, \
        ratings_overall, ratings_value, ratings_sleep_quality, ratings_rooms, \
        titles, helpful_votes, lengths = [], [], [], [], [], [], [], [], [], [], []

        while len(text) < batch_size:
            row = self.datafile.readline()
            if row == '':
                self._epochs_completed += 1
                self.close()
                self.datafile = open(self.path_list[self.epochs_completed %
                                                    len(self.path_list)])
                continue
            json_obj = json.loads(row.strip())
            text.append(datasets.tokenize(json_obj["text"], tokenizer))
            lengths.append(len(text[-1]))
            sentences.append(datasets.sentence_tokenizer(json_obj["text"]))
            ratings_service.append(
                int(json_obj["ratings"]["service"]) if 'service' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_cleanliness.append(
                int(json_obj["ratings"]["cleanliness"]) if 'cleanliness' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_overall.append(int(json_obj["ratings"]["overall"]))
            ratings_value.append(
                int(json_obj["ratings"]["value"]) if 'value' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_sleep_quality.append(
                int(json_obj["ratings"]["sleep_quality"]) if 'sleep_quality' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            ratings_rooms.append(
                int(json_obj["ratings"]["rooms"]) if 'rooms' in
                json_obj['ratings'] else int(json_obj['ratings']['overall']))
            helpful_votes.append(json_obj["num_helpful_votes"])
            titles.append(datasets.tokenize(json_obj["title"]))

        if rescale is not None and not one_hot:
            ratings_service = datasets.rescale(ratings_service, rescale,
                                               [1.0, 5.0])
            ratings_cleanliness = datasets.rescale(ratings_cleanliness,
                                                   rescale, [1.0, 5.0])
            ratings_overall = datasets.rescale(ratings_overall, rescale,
                                               [1.0, 5.0])
            ratings_value = datasets.rescale(ratings_value, rescale,
                                             [1.0, 5.0])
            ratings_sleep_quality = datasets.rescale(ratings_sleep_quality,
                                                     rescale, [1.0, 5.0])
            ratings_rooms = datasets.rescale(ratings_rooms, rescale,
                                             [1.0, 5.0])
        elif rescale is None and one_hot:
            ratings_service = to_categorical([x - 1 for x in ratings_service],
                                             nb_classes=5)
            ratings_cleanliness = to_categorical(
                [x - 1 for x in ratings_cleanliness], nb_classes=5)
            ratings_overall = to_categorical([x - 1 for x in ratings_overall],
                                             nb_classes=5)
            ratings_value = to_categorical([x - 1 for x in ratings_value],
                                           nb_classes=5)
            ratings_sleep_quality = to_categorical(
                [x - 1 for x in ratings_sleep_quality], nb_classes=5)
            ratings_rooms = to_categorical([x - 1 for x in ratings_rooms],
                                           nb_classes=5)
        elif rescale is None and not one_hot:
            pass
        else:
            raise ValueError('rescale and one_hot cannot be set together')

        if mark_entities:
            text = datasets.mark_entities(text)
            titles = datasets.mark_entities(titles)
            sentences = [
                datasets.mark_entities(sentence) for sentence in sentences
            ]

        if not raw:
            text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                                   seq_begin, seq_end)
            titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                     seq_begin, seq_end)
            sentences = [
                datasets.seq2id(sentence, self.vocab_w2i, seq_begin, seq_end)
                for sentence in sentences[:batch_size]
            ]
        else:
            text = datasets.append_seq_markers(text[:batch_size], seq_begin,
                                               seq_end)
            titles = datasets.append_seq_markers(titles[:batch_size],
                                                 seq_begin, seq_end)
            sentences = [
                datasets.append_seq_markers(sentence, seq_begin, seq_end)
                for sentence in sentences[:batch_size]
            ]

        if pad != 0:
            text = datasets.padseq(text[:batch_size], pad, raw)
            titles = datasets.padseq(titles[:batch_size], pad, raw)
            sentences = [
                datasets.padseq(sentence, pad, raw)
                for sentence in sentences[:batch_size]
            ]
        if sentence_pad != 0:
            sentences = [
                datasets.pad_sentences(sentence, sentence_pad, raw)
                for sentence in sentences[:batch_size]
            ]

        batch = self.Batch(text=text,
                           sentences=sentences,
                           ratings_service=ratings_service,
                           ratings_cleanliness=ratings_cleanliness,
                           ratings=ratings_overall,
                           ratings_value=ratings_value,
                           ratings_sleep_quality=ratings_sleep_quality,
                           ratings_rooms=ratings_rooms,
                           titles=titles,
                           helpful_votes=helpful_votes,
                           lengths=lengths)
        return batch
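A minimal usage sketch, assuming `hotels` is an instance of the (unnamed here) hotel-review dataset class that defines next_batch() above:

# Hedged usage sketch: `hotels` is a hypothetical, already-constructed dataset instance.
hotels.open()
batch = hotels.next_batch(batch_size=8, pad=200, one_hot=True)
# With one_hot=True every aspect rating comes back one-hot encoded over 5 classes:
# batch.ratings (overall), batch.ratings_service, batch.ratings_cleanliness,
# batch.ratings_value, batch.ratings_sleep_quality, batch.ratings_rooms.
# batch.helpful_votes and batch.lengths remain plain integer lists.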
Example #5
    def generate_sequences(self, x, tokenizer):
        # Tokenize each raw string in x into a list of tokens.
        new_x = []
        for instance in x:
            tokens = datasets.tokenize(instance, tokenizer)
            new_x.append(tokens)
        return new_x
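A minimal usage sketch, assuming `obj` is whatever object defines generate_sequences() above and that 'spacy' is an accepted tokenizer name, mirroring the default used in the other examples:

# Hedged usage sketch: `obj` and the input strings are illustrative only.
raw_texts = ['The room was clean.', 'Service was slow at check-in.']
token_lists = obj.generate_sequences(raw_texts, tokenizer='spacy')
# token_lists is a list of token lists, one per input string.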