def get_sents_encoded(sentence_1, sentence_2, dt=quora):
    """Tokenize two sentences and encode them as id sequences.

    Both sentences are tokenized for English and mapped through the
    word-to-id vocabulary of *dt* (defaults to the ``quora`` dataset),
    without begin/end sequence markers.

    Returns a ``(ids_1, ids_2)`` tuple, one id list per input sentence.
    """
    tokenized_pair = [
        datasets.tokenize(sentence_1, lang='en'),
        datasets.tokenize(sentence_2, lang='en'),
    ]
    encoded_pair = seq2id(tokenized_pair, dt.w2i,
                          seq_begin=False, seq_end=False)
    return encoded_pair[0], encoded_pair[1]
def next_batch(self, batch_size=64, seq_begin=False, seq_end=False, pad=0,
               raw=False, mark_entities=False, tokenizer='spacy',
               one_hot=False):
    """Read the next batch of tweet/emotion pairs from the open data file.

    Reads tab-separated ``tweet<TAB>label`` rows, skipping malformed
    lines, and wraps around to the start of the file (incrementing the
    epoch counter) when EOF is reached.

    Args:
        batch_size: number of instances to collect.
        seq_begin / seq_end: add sequence start/end markers.
        pad: if non-zero, pad each token sequence to this length.
        raw: return token strings instead of vocabulary ids.
        mark_entities: run entity marking over the tokenized text.
        tokenizer: tokenizer name forwarded to ``datasets.tokenize``.
        one_hot: return labels as one-hot vectors.

    Returns:
        ``self.Batch(text=..., emotion=...)``.

    Raises:
        Exception: if the dataset file has not been opened.
    """
    if not self.datafile:
        raise Exception('The dataset needs to be open before being used. '
                        'Please call dataset.open() before calling '
                        'dataset.next_batch()')

    text, emotion = [], []
    while len(text) < batch_size:
        row = self.datafile.readline()
        if row == '':
            # EOF: count a completed epoch and restart from the top.
            self._epochs_completed += 1
            self.datafile.seek(0)
            continue
        cols = row.strip().split('\t')
        try:
            tweet, emo = cols[0], int(cols[1])
        except (IndexError, ValueError):
            # Narrowed from a bare `except Exception`: only a missing
            # column (IndexError) or a non-integer label (ValueError)
            # counts as a malformed row worth skipping.
            print('Invalid data instance. Skipping line.')
            continue
        text.append(datasets.tokenize(tweet, tokenizer))
        emotion.append(emo)

    if one_hot:
        emotion = to_categorical(emotion, nb_classes=self.n_classes)
    if mark_entities:
        text = datasets.mark_entities(text, lang='en')

    if not raw:
        text = datasets.seq2id(text[:batch_size], self.vocab_w2i, seq_begin,
                               seq_end)
    else:
        text = datasets.append_seq_markers(text[:batch_size], seq_begin,
                                           seq_end)
    if pad != 0:
        text = datasets.padseq(text[:batch_size], pad, raw)

    batch = self.Batch(text=text, emotion=emotion)
    return batch
def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
               rescale=None, pad=0, raw=False, mark_entities=False,
               tokenizer='spacy', sentence_pad=0, one_hot=False):
    """Read the next batch of JSON review records.

    Collects ``batch_size`` records, rolling over to the next file in
    ``self.path_list`` (and incrementing the epoch counter) whenever the
    current file is exhausted.  ``rescale`` and ``one_hot`` are mutually
    exclusive ways of transforming the 1-5 star ratings.

    Returns:
        ``self.Batch(text=..., sentences=..., ratings=..., titles=...,
        lengths=...)``.

    Raises:
        Exception: if the dataset file has not been opened.
        ValueError: if both ``rescale`` and ``one_hot`` are set.
    """
    if not self.datafile:
        raise Exception('The dataset needs to be open before being used. '
                        'Please call dataset.open() before calling '
                        'dataset.next_batch()')

    text, sentences, ratings, titles, lengths = [], [], [], [], []
    while len(text) < batch_size:
        line = self.datafile.readline()
        if line == '':
            # Current file exhausted: bump the epoch and rotate to the
            # next file in the list.
            self._epochs_completed += 1
            self.close()
            next_path = self.path_list[self.epochs_completed %
                                       len(self.path_list)]
            self.datafile = open(next_path)
            continue
        record = json.loads(line.strip())
        tokens = datasets.tokenize(record["review_text"], tokenizer)
        text.append(tokens)
        lengths.append(len(tokens))
        sentences.append(datasets.sentence_tokenizer(record["review_text"]))
        ratings.append(int(record["review_rating"]))
        titles.append(datasets.tokenize(record["review_header"]))

    if rescale is not None and one_hot == False:
        ratings = datasets.rescale(ratings, rescale, [1.0, 5.0])
    elif rescale is None and one_hot == True:
        # Shift 1-5 stars to 0-4 class indices before one-hot encoding.
        ratings = to_categorical([x - 1 for x in ratings], nb_classes=5)
    elif rescale is None and one_hot == False:
        pass
    else:
        raise ValueError('rescale and one_hot cannot be set together')

    if mark_entities:
        text = datasets.mark_entities(text, lang='de')
        titles = datasets.mark_entities(titles, lang='de')
        sentences = [datasets.mark_entities(sent_group, lang='de')
                     for sent_group in sentences]

    if raw:
        text = datasets.append_seq_markers(text[:batch_size], seq_begin,
                                           seq_end)
        titles = datasets.append_seq_markers(titles[:batch_size], seq_begin,
                                             seq_end)
        sentences = [datasets.append_seq_markers(sent_group, seq_begin,
                                                 seq_end)
                     for sent_group in sentences[:batch_size]]
    else:
        text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                               seq_begin, seq_end)
        titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                 seq_begin, seq_end)
        sentences = [datasets.seq2id(sent_group, self.vocab_w2i,
                                     seq_begin, seq_end)
                     for sent_group in sentences[:batch_size]]

    if pad != 0:
        text = datasets.padseq(text[:batch_size], pad, raw)
        titles = datasets.padseq(titles[:batch_size], pad, raw)
        sentences = [datasets.padseq(sent_group, pad, raw)
                     for sent_group in sentences[:batch_size]]
    if sentence_pad != 0:
        sentences = [datasets.pad_sentences(sent_group, sentence_pad, raw)
                     for sent_group in sentences[:batch_size]]

    return self.Batch(text=text, sentences=sentences, ratings=ratings,
                      titles=titles, lengths=lengths)
def next_batch(self, batch_size=64, seq_begin=False, seq_end=False,
               rescale=None, pad=0, raw=False, mark_entities=False,
               tokenizer='spacy', sentence_pad=0, one_hot=False):
    """Read the next batch of hotel-review JSON records.

    Collects ``batch_size`` records, rolling over to the next file in
    ``self.path_list`` (and incrementing the epoch counter) whenever the
    current file is exhausted.  Each record contributes the review text,
    its sentences, the title, six aspect ratings, and helpful-vote count.
    ``rescale`` and ``one_hot`` are mutually exclusive transforms of the
    1-5 star ratings.

    Returns:
        ``self.Batch`` with text, sentences, the six rating lists,
        titles, helpful_votes and lengths.

    Raises:
        Exception: if the dataset file has not been opened.
        ValueError: if both ``rescale`` and ``one_hot`` are set.
    """
    if not self.datafile:
        raise Exception('The dataset needs to be open before being used. '
                        'Please call dataset.open() before calling '
                        'dataset.next_batch()')

    def aspect_rating(all_ratings, key):
        # Per-aspect rating with fallback to 'overall' when the aspect is
        # absent ('overall' is read unconditionally below, so it is
        # assumed present in every record).
        if key in all_ratings:
            return int(all_ratings[key])
        return int(all_ratings['overall'])

    text, sentences, titles, helpful_votes, lengths = [], [], [], [], []
    ratings_service, ratings_cleanliness, ratings_overall = [], [], []
    ratings_value, ratings_sleep_quality, ratings_rooms = [], [], []
    while len(text) < batch_size:
        row = self.datafile.readline()
        if row == '':
            # Current file exhausted: bump the epoch and rotate to the
            # next file in the list.
            self._epochs_completed += 1
            self.close()
            self.datafile = open(self.path_list[self.epochs_completed %
                                                len(self.path_list)])
            continue
        json_obj = json.loads(row.strip())
        text.append(datasets.tokenize(json_obj["text"], tokenizer))
        lengths.append(len(text[-1]))
        sentences.append(datasets.sentence_tokenizer(json_obj["text"]))
        record_ratings = json_obj["ratings"]
        ratings_service.append(aspect_rating(record_ratings, 'service'))
        ratings_cleanliness.append(aspect_rating(record_ratings,
                                                 'cleanliness'))
        ratings_overall.append(int(record_ratings["overall"]))
        ratings_value.append(aspect_rating(record_ratings, 'value'))
        ratings_sleep_quality.append(aspect_rating(record_ratings,
                                                   'sleep_quality'))
        ratings_rooms.append(aspect_rating(record_ratings, 'rooms'))
        helpful_votes.append(json_obj["num_helpful_votes"])
        titles.append(datasets.tokenize(json_obj["title"]))

    if rescale is not None and one_hot == False:
        ratings_service = datasets.rescale(ratings_service, rescale,
                                           [1.0, 5.0])
        ratings_cleanliness = datasets.rescale(ratings_cleanliness, rescale,
                                               [1.0, 5.0])
        ratings_overall = datasets.rescale(ratings_overall, rescale,
                                           [1.0, 5.0])
        ratings_value = datasets.rescale(ratings_value, rescale, [1.0, 5.0])
        ratings_sleep_quality = datasets.rescale(ratings_sleep_quality,
                                                 rescale, [1.0, 5.0])
        ratings_rooms = datasets.rescale(ratings_rooms, rescale, [1.0, 5.0])
    elif rescale is None and one_hot == True:
        # Shift 1-5 stars to 0-4 class indices before one-hot encoding.
        ratings_service = to_categorical([x - 1 for x in ratings_service],
                                         nb_classes=5)
        ratings_cleanliness = to_categorical(
            [x - 1 for x in ratings_cleanliness], nb_classes=5)
        ratings_overall = to_categorical([x - 1 for x in ratings_overall],
                                         nb_classes=5)
        ratings_value = to_categorical([x - 1 for x in ratings_value],
                                       nb_classes=5)
        ratings_sleep_quality = to_categorical(
            [x - 1 for x in ratings_sleep_quality], nb_classes=5)
        ratings_rooms = to_categorical([x - 1 for x in ratings_rooms],
                                       nb_classes=5)
    elif rescale is None and one_hot == False:
        pass
    else:
        raise ValueError('rescale and one_hot cannot be set together')

    if mark_entities:
        text = datasets.mark_entities(text)
        titles = datasets.mark_entities(titles)
        sentences = [datasets.mark_entities(sentence)
                     for sentence in sentences]

    if not raw:
        text = datasets.seq2id(text[:batch_size], self.vocab_w2i,
                               seq_begin, seq_end)
        titles = datasets.seq2id(titles[:batch_size], self.vocab_w2i,
                                 seq_begin, seq_end)
        sentences = [datasets.seq2id(sentence, self.vocab_w2i, seq_begin,
                                     seq_end)
                     for sentence in sentences[:batch_size]]
    else:
        text = datasets.append_seq_markers(text[:batch_size], seq_begin,
                                           seq_end)
        titles = datasets.append_seq_markers(titles[:batch_size], seq_begin,
                                             seq_end)
        sentences = [datasets.append_seq_markers(sentence, seq_begin,
                                                 seq_end)
                     for sentence in sentences[:batch_size]]

    if pad != 0:
        text = datasets.padseq(text[:batch_size], pad, raw)
        titles = datasets.padseq(titles[:batch_size], pad, raw)
        sentences = [datasets.padseq(sentence, pad, raw)
                     for sentence in sentences[:batch_size]]
    if sentence_pad != 0:
        # BUG FIX: this previously passed `pad` instead of `sentence_pad`,
        # so the sentence_pad argument was silently ignored (the sibling
        # review next_batch correctly uses sentence_pad here).
        sentences = [datasets.pad_sentences(sentence, sentence_pad, raw)
                     for sentence in sentences[:batch_size]]

    batch = self.Batch(text=text, sentences=sentences,
                       ratings_service=ratings_service,
                       ratings_cleanliness=ratings_cleanliness,
                       ratings=ratings_overall,
                       ratings_value=ratings_value,
                       ratings_sleep_quality=ratings_sleep_quality,
                       ratings_rooms=ratings_rooms, titles=titles,
                       helpful_votes=helpful_votes, lengths=lengths)
    return batch
def generate_sequences(self, x, tokenizer):
    """Tokenize every instance in *x* with the given tokenizer.

    Returns a new list containing one token sequence per input instance.
    """
    return [datasets.tokenize(instance, tokenizer) for instance in x]