def create_data(self):
    if self.split == 'train':
        self._create_vocab()
    else:
        self._load_vocab()

    print(f'Creating data for {self.split} split...')

    tokenizer = TweetTokenizer(preserve_case=False)
    sent_tokenizer = PunktSentenceTokenizer()
    DetectorFactory.seed = 0

    data = defaultdict(dict)
    df = pd.read_csv(self.raw_data_path)

    for _, row in df.iterrows():
        # Only keep English plot samples
        if detect(row['plot']) != 'en':
            continue

        tokens = tokenizer.tokenize(row['plot'])
        # Split the plot into separate sentences
        sentences = sent_tokenizer.sentences_from_tokens(tokens)

        # Generate a sample from each sentence
        for words in sentences:
            randn = np.random.uniform()
            # Keep only ~30 percent of the sentences (due to performance
            # limitations) and skip sentences longer than the maximum
            # sequence length
            if randn > 0.3 or len(words) > self.max_sequence_length - 1:
                continue

            # Input starts with <sos>; target ends with <eos>
            input = ['<sos>'] + words
            input = input[:self.max_sequence_length]

            target = words[:self.max_sequence_length - 1]
            target = target + ['<eos>']

            assert len(input) == len(target), "%i, %i" % (len(input), len(target))
            length = len(input)

            # Pad both sequences up to the maximum sequence length
            input.extend(['<pad>'] * (self.max_sequence_length - length))
            target.extend(['<pad>'] * (self.max_sequence_length - length))

            # Map tokens to vocabulary indices, falling back to <unk>
            input = [self.w2i.get(w, self.w2i['<unk>']) for w in input]
            target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]

            id = len(data)
            data[id]['input'] = input
            data[id]['target'] = target
            data[id]['length'] = length

    # Serialize the samples as JSON and write them to disk
    with io.open(os.path.join(self.data_dir, self.data_file), 'wb') as data_file:
        data = json.dumps(data, ensure_ascii=False)
        data_file.write(data.encode('utf8', 'replace'))

    self._load_data(vocab=False)
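
# A sketch of the module-level imports this method relies on; the package
# choices (nltk for the tokenizers, langdetect for language detection,
# pandas/numpy for the data handling) are assumptions inferred from the
# calls above, not confirmed by this excerpt.
import io
import os
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer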