def create_data(self):
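        """Tokenize the raw plots and build padded input/target pairs for this split.

        Relies on module-level imports: io, os, json, defaultdict, pandas (pd),
        numpy (np), nltk's TweetTokenizer/PunktSentenceTokenizer and langdetect.
        """
        # The vocabulary is built from the training split and simply loaded
        # for the other splits.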
        if self.split == 'train':
            self._create_vocab()
        else:
            self._load_vocab()

        print(f'Creating data for {self.split} split...')
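        # Word-level tokenizer (lowercasing) plus a Punkt model to split plots into sentences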
        tokenizer = TweetTokenizer(preserve_case=False)
        sent_tokenizer = PunktSentenceTokenizer()

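        # Fix langdetect's random seed so language detection is deterministic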
        DetectorFactory.seed = 0

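        # Turn every kept sentence of every English plot into one example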
        data = defaultdict(dict)
        df = pd.read_csv(self.raw_data_path)
        for _, row in df.iterrows():
            # Only keep English plot samples
            if detect(row['plot']) != 'en':
                continue
            tokens = tokenizer.tokenize(row['plot'])
            # Split the plot into separate sentences
            sentences = sent_tokenizer.sentences_from_tokens(tokens)
            # Generate a sample from each sentence
            for words in sentences:
                randn = np.random.uniform()
                # Keep only about 30 percent of the sentences (performance
                # limitations) and skip sentences that no longer fit within
                # max_sequence_length once <sos>/<eos> are added.
                if randn > 0.3 or len(words) > self.max_sequence_length - 1:
                    continue

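                # Decoder input: <sos> followed by the sentence, truncated to the maximum length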
                input = ['<sos>'] + words
                input = input[:self.max_sequence_length]

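                # Target: the same sentence shifted by one position, ending with <eos>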
                target = words[:self.max_sequence_length - 1]
                target = target + ['<eos>']

                assert len(input) == len(target), f"{len(input)}, {len(target)}"
                length = len(input)

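                # Pad both sequences up to max_sequence_length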
                input.extend(['<pad>'] * (self.max_sequence_length - length))
                target.extend(['<pad>'] * (self.max_sequence_length - length))

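                # Map tokens to vocabulary indices, falling back to <unk>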
                input = [self.w2i.get(w, self.w2i['<unk>']) for w in input]
                target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]

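                # Store the example under a running integer id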
                id = len(data)
                data[id]['input'] = input
                data[id]['target'] = target
                data[id]['length'] = length

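        # Dump all examples to a JSON file in the data directory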
        with io.open(os.path.join(self.data_dir, self.data_file),
                     'wb') as data_file:
            data = json.dumps(data, ensure_ascii=False)
            data_file.write(data.encode('utf8', 'replace'))

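        # Reload the freshly written data; the vocabulary is already in memory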
        self._load_data(vocab=False)