Example #1
File: test_utils.py Project: tu-artem/text
    def test_get_tokenizer(self):
        # Test the default case with str.split
        assert data.get_tokenizer(str.split) == str.split
        test_str = "A string, particularly one with slightly complex punctuation."
        assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)

        # Test SpaCy option, and verify it properly handles punctuation.
        assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."]

        # Test Moses option.
        # Note that internally, MosesTokenizer converts to unicode if applicable
        moses_tokenizer = data.get_tokenizer("moses")
        assert moses_tokenizer(test_str) == [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."]

        # Nonbreaking prefixes should tokenize the final period.
        assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]

        # Test Toktok option. Test strings taken from NLTK doctests.
        toktok_tokenizer = data.get_tokenizer("toktok")
        assert toktok_tokenizer(test_str) == [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."]

        # Test that errors are raised for invalid input arguments.
        with self.assertRaises(ValueError):
            data.get_tokenizer(1)
        with self.assertRaises(ValueError):
            data.get_tokenizer("some other string")
Example #2
File: test_utils.py Project: mhossny/text
    def test_get_tokenizer(self):
        # Test the default case with str.split
        assert data.get_tokenizer(str.split) == str.split
        test_str = "A string, particularly one with slightly complex punctuation."
        assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)

        # Test SpaCy option, and verify it properly handles punctuation.
        assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."
        ]

        # Test Moses option. Test strings taken from NLTK doctests.
        # Note that internally, MosesTokenizer converts to unicode if applicable
        moses_tokenizer = data.get_tokenizer("moses")
        assert moses_tokenizer(test_str) == [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."
        ]

        # Nonbreaking prefixes should tokenize the final period.
        assert moses_tokenizer(
            six.text_type("abc def.")) == ["abc", "def", "."]

        # Test that errors are raised for invalid input arguments.
        with self.assertRaises(ValueError):
            data.get_tokenizer(1)
        with self.assertRaises(ValueError):
            data.get_tokenizer("some other string")
Example #3
    def __init__(self):
        LOGGER.info('Loading field for source (en), target (fr)')
        self.src = data.Field(tokenize=data.get_tokenizer('spacy'),
                              init_token='<start>',
                              eos_token='<eos>',
                              include_lengths=True,
                              batch_first=True,
                              lower=True)
        self.trg = data.Field(tokenize=data.get_tokenizer('spacy'),
                              init_token='<start>',
                              eos_token='<eos>',
                              include_lengths=True,
                              batch_first=True,
                              lower=True)
Example #4
def save_data_yahoo_answers():
    train_val_data, test_data = torchtext.datasets.YahooAnswers()
    TRAIN_LEN = 25000
    TEST_LEN = 25000
    tokenize = get_tokenizer("basic_english")

    tokens = []
    labels = []
    for label, line in random.sample(list(train_val_data), TRAIN_LEN):
        tokens.append(tokenize(line))
        labels.append(label)

    train_val_text_dataset = TextDataset(tokens, labels)
    with open('train_val_data_yahoo.pt', 'wb') as f:
        torch.save(train_val_text_dataset, f)

    tokens = []
    labels = []
    for label, line in random.sample(list(test_data), TEST_LEN):
        tokens.append(tokenize(line))
        labels.append(label)

    test_text_dataset = TextDataset(tokens, labels)

    with open('test_data_yahoo.pt', 'wb') as f:
        torch.save(test_text_dataset, f)
Example #5
    def __init__(self,
                 path_to_json,
                 num_samples=10000,
                 p2nr=0.5,
                 mixing_strategy="random"):

        with open(path_to_json) as in_:
            train_df, dev_df, test_df = json.load(in_)
            print(f'training on {len(train_df)} authors...')

        self.tokenizer = get_tokenizer("spacy")
        self.embedding = fasttext.load_model("wiki.simple/wiki.simple.bin")
        self.p2n_ratio = p2nr
        self.num_samples = num_samples
        self.SEP = self.embedding.get_word_vector("<SEP>")

        df = {}
        # merge train and dev into one & do this split using torch built-in
        for author in dev_df:
            df[author] = train_df[author] + dev_df[author]

        tokenized_df = self._preprocess(df)
        positives, negatives = self._example_mixer(
            df=tokenized_df, mixing_strategy=mixing_strategy)

        X = np.array(positives + negatives)
        y = np.array([1] * len(positives) + [0] * len(negatives))

        indexes = list(range(self.num_samples))
        random.shuffle(indexes)

        self.X = X[indexes]
        self.y = y[indexes]
Example #6
    def __init__(self, data_path, data_split, lang, vocab):
        self.vocab = vocab
        self.lang = lang
        self.tokenizer = get_tokenizer('spacy', language=self.lang)
        self.data_path = data_path
        loc = data_path + '/'
        self.image_path = os.path.join(self.data_path.removesuffix('_precomp'),
                                       'images')

        # Captions
        self.captions = []
        with open(loc + '%s_caps_%s.txt' % (data_split, lang), 'r') as f:
            for line in f:
                self.captions.append(line.strip())

        # Image filenames
        self.image_filenames = []
        with open(loc + '%s_img_filenames.txt' % data_split, 'r') as f:
            for line in f:
                self.image_filenames.append(line.strip())

        # Image features
        self.images = np.load(loc + '%s_img.npy' % data_split)
        self.length = len(self.captions)
        # multiple indexes for one image
        if self.images.shape[0] != self.length:
            self.im_div = 5
        else:
            self.im_div = 1
Example #7
def prepare_data(args: PrepareDataArgs, train_output, test_output, vocab_output):
    """Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    logger.info("Processing raw data to final data set")
    train_df = load_raw_csv(to_absolute_path(args.train_file))
    test_df = load_raw_csv(to_absolute_path(args.test_file))

    label_encoder = LabelEncoder()
    label_encoder.fit(train_df.label.values)

    tokenizer = get_tokenizer(args.tokenizer_name)
    vocab = build_vocab(
        train_df.text.values,
        tokenizer,
        args.pretrained_vectors,
        to_absolute_path(args.vectors_cache_directory),
    )
    logger.info(f"Save vocab into {vocab_output}")
    torch.save(vocab, vocab_output)

    train_ds = make_dataset(
        train_df, label_encoder, transforms=None, tokenizer=tokenizer, vocab=vocab
    )
    save_dataset(train_ds, train_output)

    test_ds = make_dataset(
        test_df, label_encoder, transforms=None, tokenizer=tokenizer, vocab=vocab
    )
    save_dataset(test_ds, test_output)
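
A minimal invocation sketch for the function above; PrepareDataArgs is assumed to be a simple args container exposing the fields read in the function body, and every path and name below is a placeholder.

# Hypothetical call; paths, tokenizer name and vector names are placeholders.
args = PrepareDataArgs(
    train_file="data/raw/train.csv",
    test_file="data/raw/test.csv",
    tokenizer_name="basic_english",
    pretrained_vectors="glove.6B.100d",
    vectors_cache_directory=".vector_cache",
)
prepare_data(args, "data/processed/train.pt", "data/processed/test.pt", "data/processed/vocab.pt")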
Example #8
    def __init__(self,
                 tokenize=data.get_tokenizer('spacy'),
                 eos_token='<pad>',
                 include_lengths=True):
        super(SSUField, self).__init__(tokenize=tokenize,
                                       eos_token=eos_token,
                                       include_lengths=include_lengths)
Example #9
    def generate_fields(self):
        src_field = data.Field(tokenize=data.get_tokenizer('spacy'),
                               init_token=SOS_WORD,
                               eos_token=EOS_WORD,
                               pad_token=PAD_WORD,
                               include_lengths=True,
                               batch_first=True)

        trg_field = data.Field(tokenize=data.get_tokenizer('spacy'),
                               init_token=SOS_WORD,
                               eos_token=EOS_WORD,
                               pad_token=PAD_WORD,
                               include_lengths=True,
                               batch_first=True)

        return src_field, trg_field
Example #10
    def test_BasicEnglishNormalize(self):
        test_sample = '\'".<br />,()!?;:   Basic English Normalization for a Line of Text   \'".<br />,()!?;:'
        ref_results = [
            "'", '.', ',', '(', ')', '!', '?', 'basic', 'english',
            'normalization', 'for', 'a', 'line', 'of', 'text', "'", '.', ',',
            '(', ')', '!', '?'
        ]

        basic_eng_norm = basic_english_normalize()
        experimental_eager_tokens = basic_eng_norm(test_sample)

        jit_basic_eng_norm = torch.jit.script(basic_eng_norm.to_ivalue())
        experimental_jit_tokens = jit_basic_eng_norm(test_sample)

        basic_english_tokenizer = data.get_tokenizer("basic_english")
        eager_tokens = basic_english_tokenizer(test_sample)

        assert not basic_eng_norm.is_jitable
        assert basic_eng_norm.to_ivalue().is_jitable

        self.assertEqual(experimental_jit_tokens, ref_results)
        self.assertEqual(eager_tokens, ref_results)
        self.assertEqual(experimental_eager_tokens, ref_results)

        # test load and save
        save_path = os.path.join(self.test_dir, 'basic_english_normalize.pt')
        torch.save(basic_eng_norm.to_ivalue(), save_path)
        loaded_basic_eng_norm = torch.load(save_path)

        loaded_eager_tokens = loaded_basic_eng_norm(test_sample)
        self.assertEqual(loaded_eager_tokens, ref_results)
Example #11
def save_data_imdb():
    train_val_data, test_data = torchtext.datasets.IMDB()

    tokenize = get_tokenizer("basic_english")

    tokens = []
    labels = []
    for label, line in train_val_data:
        tokens.append(tokenize(line))
        labels.append(label)

    train_val_text_dataset = TextDataset(tokens, labels)
    with open('train_val_data_imdb.pt', 'wb') as f:
        torch.save(train_val_text_dataset, f)

    tokens = []
    labels = []
    for label, line in test_data:
        tokens.append(tokenize(line))
        labels.append(label)

    test_text_dataset = TextDataset(tokens, labels)

    with open('test_data_imdb.pt', 'wb') as f:
        torch.save(test_text_dataset, f)
Example #12
    def __init__(self,
                 url,
                 root='.data',
                 text_field=None,
                 label_field=None,
                 ngrams=1):
        """Initiate text-classification dataset.

        Arguments:
            url: url of the online raw data files.
            root: Directory where the dataset are saved. Default: ".data"
            text_field: The field that will be used for the sentence. If not given,
                'spacy' token will be used.
            label_field: The field that will be used for the label. If not given,
                'float' token will be used.
            ngrams: a contiguous sequence of n items from s string text.
                Default: 1
        """

        super(TextClassificationDataset, self).__init__()
        fields = []
        fields.append(('text', text_field if text_field is not None else
                       data.Field(tokenize=data.get_tokenizer('spacy'),
                                  init_token='<SOS>',
                                  eos_token='<EOS>')))
        fields.append(
            ('label',
             label_field if label_field is not None else data.LabelField(
                 dtype=torch.float)))
        self.fields = dict(fields)

        self.dataset_name = self.__class__.__name__
        self.root = root
        self.raw_folder = os.path.join(root, self.__class__.__name__, 'raw')
        self.processed_folder = os.path.join(self.root,
                                             self.__class__.__name__,
                                             'processed')
        filepath = os.path.join(self.processed_folder,
                                self.dataset_name + '.train')
        if not os.path.isfile(filepath):
            download_extract_archive(url, self.raw_folder, self.dataset_name)
            _preprocess(self.raw_folder, self.processed_folder,
                        self.dataset_name)
        with open(filepath) as src_data:
            self.train_examples = _load_text_classification_data(
                src_data, self.fields, ngrams)

        filepath = os.path.join(self.processed_folder,
                                self.dataset_name + '.test')
        with open(filepath) as src_data:
            self.test_examples = _load_text_classification_data(
                src_data, self.fields, ngrams)

        self.examples = self.train_examples + self.test_examples
        self.fields['text'].vocab = build_dictionary(self, self.fields['text'],
                                                     'text')
        self.fields['label'].vocab = build_dictionary(self,
                                                      self.fields['label'],
                                                      'label')
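
A hedged instantiation sketch for the class above; the URL is a placeholder, and download_extract_archive, _preprocess, _load_text_classification_data and build_dictionary are assumed to come from the surrounding project.

# Hypothetical usage with a placeholder archive URL.
dataset = TextClassificationDataset(
    url="https://example.com/ag_news_csv.tar.gz",
    root=".data",
    ngrams=2,
)
print(len(dataset.train_examples), len(dataset.test_examples))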
Example #13
    def __init__(
        self,
        ravdess_path,
        acoustic_length,
        glove,
        train_prop=0.6,
        test_prop=0.2,
        f_end="IS10.csv",
        use_cols=None,
        add_avging=True,
        avgd=False,
    ):
        # path to dataset--all within acoustic files for ravdess
        self.path = ravdess_path

        # get tokenizer
        self.tokenizer = get_tokenizer("basic_english")

        # get data tensors
        self.all_data = make_ravdess_data_tensors(self.path,
                                                  glove,
                                                  f_end,
                                                  use_cols,
                                                  add_avging=add_avging,
                                                  avgd=avgd)

        (
            self.train_data,
            self.dev_data,
            self.test_data,
        ) = create_data_folds_list(self.all_data, train_prop, test_prop)

        # pull out ys from train to get class weights
        self.train_y_emotion = torch.tensor(
            [item[4] for item in self.train_data])
        self.train_y_intensity = torch.tensor(
            [item[5] for item in self.train_data])

        # set the emotion and intensity class weights
        self.emotion_weights = get_class_weights(self.train_y_emotion)
        self.intensity_weights = get_class_weights(self.train_y_intensity)

        # pull out acoustic data and gender data from train for normalization
        self.train_acoustic = torch.tensor(
            [item[0].tolist() for item in self.train_data])
        self.train_genders = [item[3] for item in self.train_data]

        # acoustic feature normalization based on train
        # todo: incorporate acoustic means into data!!
        self.all_acoustic_means = self.train_acoustic.mean(dim=0,
                                                           keepdim=False)
        self.all_acoustic_deviations = self.train_acoustic.std(dim=0,
                                                               keepdim=False)

        self.male_acoustic_means, self.male_deviations = get_gender_avgs(
            self.train_acoustic, self.train_genders, gender=2)
        self.female_acoustic_means, self.female_deviations = get_gender_avgs(
            self.train_acoustic, self.train_genders, gender=1)
Example #14
    def __init__(self, fix_length, lower, tokenize=(lambda s: list(s))):
        self.fix_length = fix_length
        self.lower = lower
        self.tokenize = get_tokenizer(tokenize)
        self.alphabet = config['alphabet']
        # self.alphabet.append("'")
        super(CharField, self).__init__(fix_length=self.fix_length,
                                        lower=self.lower,
                                        tokenize=self.tokenize)
Example #15
    def test_get_tokenizer_moses(self):
        # Test Moses option.
        # Note that internally, MosesTokenizer converts to unicode if applicable
        moses_tokenizer = data.get_tokenizer("moses")
        assert moses_tokenizer(self.TEST_STR) == [
            "A", "string", ",", "particularly", "one", "with", "slightly",
            "complex", "punctuation", "."]

        # Nonbreaking prefixes should tokenize the final period.
        assert moses_tokenizer("abc def.") == ["abc", "def", "."]
Example #16
def fill_vocab(txt: List[Tuple]):
    tokenizer = get_tokenizer("spacy")
    list_v = []
    for i in txt:
        tok = tokenizer(i)
        for j in tok:
            if list_v.count(j) == 0:
                list_v.append(j)
    vocab = Vocabulary(tokens=list_v)
    return vocab
Example #17
    def __init__(self, root, json, lang, vocab, transform=None, ids=None):
        self.data_name = 'coco'
        self.data_path = root
        self.vocab = vocab
        self.lang = lang
        self.transform = transform
        self.tokenizer = get_tokenizer('spacy', language=self.lang)
        self.dataset = jsonmod.load(open(json, 'r'))['images']
        self.coco = COCO(json)

        self.ids = ids
Example #18
def parse_lines(fname):
    tokenizer = get_tokenizer("basic_english")
    vocab = defaultdict(int)
    data = []
    with open(fname) as f:
        for line in f.readlines():
            line = tokenizer(line.strip())
            if len(line) <= 1:
                continue
            data.append(line)
            for word in line:
                vocab[word] += 1
    return data
Example #19
def tokenize_english_text(input_path: str, output_path: str, col_index: int = 0):
  """Tokenize and lowercase text"""
  # Download the English tokenizer with `python -m spacy download en_core_web_sm`
  en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
  tokenized_rows = []
  with open(input_path) as f:
    reader = csv.reader(f, delimiter='\t')
    header_text = next(reader)
    for row in reader:
      tokenized_rows.append([' '.join(en_tokenizer(row[col_index].lower()))])
  with open(output_path, 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(header_text)
    writer.writerows(tokenized_rows)
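
A possible call for the helper above, using placeholder file paths; it assumes the en_core_web_sm spaCy model has been downloaded.

# Placeholder paths; run `python -m spacy download en_core_web_sm` beforehand.
tokenize_english_text("data/captions.tsv", "data/captions.tokenized.tsv", col_index=0)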
Example #20
    def test_BasicEnglishNormalize(self):
        test_sample = 'Basic English Normalization for a Line of Text'
        ref_results = [
            'basic', 'english', 'normalization', 'for', 'a', 'line', 'of',
            'text'
        ]

        basic_english_normalize = BasicEnglishNormalize()
        experimental_eager_tokens = basic_english_normalize(test_sample)

        basic_english_tokenizer = data.get_tokenizer("basic_english")
        tokens_eager = basic_english_tokenizer(test_sample)

        self.assertEqual(experimental_eager_tokens, ref_results)
        self.assertEqual(experimental_eager_tokens, tokens_eager)
Example #21
def predict_rnn():
    model = load_model_rnn('rnn-model.pt')
    tokenize = get_tokenizer("basic_english")
    min_length = 10
    while True:
        text = input('Please input: ')
        if text:
            x = vector.get_vecs_by_tokens(tokenize(text)).to(device).squeeze(1)
            if x.size(0) < min_length:
                padded = torch.zeros((min_length, x.size(1)))
                padded[:x.size(0), :] = x
                x = padded
            X = x.unsqueeze(0)
            print(predict(X, model))
Example #22
def basic_english():
    """
    ### Basic English tokenizer

    The experiment uses a character-level tokenizer by default.
    You can switch to this tokenizer by setting

    ```
    'tokenizer': 'basic_english',
    ```

    in the configurations dictionary when starting the experiment.
    """

    from torchtext.data import get_tokenizer
    return get_tokenizer('basic_english')
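
For illustration, a quick check of what the returned callable produces; the expected tokens follow torchtext's basic_english behaviour (lowercasing and splitting off common punctuation).

tokenizer = basic_english()
print(tokenizer("A Line of Text!"))
# expected output: ['a', 'line', 'of', 'text', '!']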
Example #23
    def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
        super(Articles, self).__init__()
        '''Initialization'''
        self.vocab = Vocab(vocab_path, voc_size)
        self.tokenizer = data.get_tokenizer('basic_english')
        self.max_len_story = MAX_LEN_STORY
        self.max_len_highlight = MAX_LEN_HIGHLIGHT

        is_test = {
            False: os.path.join(data_dir, "train.pkl"),
            True: os.path.join(data_dir, "test.pkl")
        }
        self.data_path = is_test.get(test, "Wrong set name.")

        with open(self.data_path, 'rb') as f:
            self.data = load(f)
Example #24
    def __init__(self, root, json, lang, split, vocab, transform=None):
        self.data_name = 'f8k'
        self.data_path = root
        self.vocab = vocab
        self.lang = lang
        self.split = split
        self.transform = transform
        self.tokenizer = get_tokenizer('spacy', language=self.lang)
        self.dataset = jsonmod.load(open(json, 'r'))['images']
        # TODO self.captions
        # TODO self.images_filenames

        self.ids = []
        for i, d in enumerate(self.dataset):
            if d['split'] == split:
                self.ids += [(i, x) for x in range(len(d['sentences']))]
Example #25
    def process_text(self, text):
        """Transform each description into vectors."""
        # filter text
        text = text.apply(lambda doc: self.filter_text(doc))
        tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
        # get idf (inverse document frequency)
        print('Calculating tf-idf...')
        warnings.filterwarnings("ignore")
        tfidf = TfidfVectorizer(tokenizer=tokenizer)
        tfidf.fit(text.dropna())
        idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        print('Converting text to document embedding...')
        # get document embedding
        w2v = FastText(language='en')
        self.text_dim = w2v.dim
        text = text.apply(lambda doc: self.doc2vec(doc, tokenizer, idf, w2v))
        return text
Example #26
def make_datasets(
    labels,
    texts,
    test_size: float,
    pretrained_vectors: Optional[str],
    tokenizer_name: str,
    vector_cache_dir: str,
) -> Tuple[Dataset, Dataset, Vocab]:
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=test_size, random_state=RANDOM_SEED, shuffle=True
    )
    tokenizer = get_tokenizer(tokenizer_name)
    vocab = build_vocab(train_texts, tokenizer, pretrained_vectors, vector_cache_dir)
    transforms = None
    return (
        MyTextDataset(train_texts, train_labels, transforms, tokenizer, vocab),
        MyTextDataset(test_texts, test_labels, transforms, tokenizer, vocab),
        vocab,
    )
Example #27
def load_data_semeval_join():

    tokenize = get_tokenizer('basic_english')
    data = []
    lines = []
    with open('semeval/gold/twitter-2016dev-A.txt') as f:
        lines += list(f)
    with open('semeval/gold/twitter-2016test-A.txt') as f:
        lines += list(f)
    with open('semeval/gold/twitter-2016dev-A.txt') as f:
        lines += list(f)
    
    for line in lines:
        id_, label, text, *rest = line.split('\t')
        if label == 'negative':
            y = 1
        elif label == 'neutral':
            continue
        else:
            y = 0
        token = tokenize(text)
        data.append((token, y))
    negatives = [d for d in data if d[1] == 1]
    non_negatives = [d for d in data if d[1] != 1]
    non_negatives = random.sample(non_negatives, len(negatives))
    data = non_negatives + negatives
    random.shuffle(data)

    test_ratio = 0.2
    test_length = round(len(data) * test_ratio)
    train_length = len(data) - test_length

    test_data = data[:test_length]
    test_dataset = TextDataset([d[0] for d in test_data], [d[1] for d in test_data])

    train_data = data[test_length:test_length+train_length]
    train_dataset = TextDataset([d[0] for d in train_data], [d[1] for d in train_data])

    train_dataset = train_dataset.sample(TRAIN_SAMPLE_SIZE)

    return test_dataset, train_dataset, test_dataset
Example #28
    def __getitem__(self, index: int) -> (torch.Tensor, torch.Tensor):
        """
        This function converts an example to a Tensor containing the indices

        :param index: position of example to be retrieved.
        """
        # retrieve sentence and label (correct class index)
        example, label = self.examples[index], self.labels[index]

        # tokenize sentence into words and other symbols
        tokenizer = get_tokenizer("spacy")
        tokens = tokenizer(example)

        # convert tokens to their corresponding indices, according to
        # vocabulary
        token_indices = []
        for i in tokens:
            token_indices.append(self.vocab.get_index_of_token(i))

        # wrap the scalar class index in a list so LongTensor builds a
        # one-element tensor rather than an uninitialized tensor of size `label`
        return torch.LongTensor(token_indices), torch.LongTensor([label])
Example #29
    def test_text_nomalize_function(self):
        # Test text_nomalize function in torchtext.datasets.text_classification
        ref_lines = []
        test_lines = []

        tokenizer = data.get_tokenizer("basic_english")
        data_path = 'test/asset/text_normalization_ag_news_test.csv'
        with io.open(data_path, encoding="utf8") as f:
            reader = unicode_csv_reader(f)
            for row in reader:
                test_lines.append(tokenizer(' , '.join(row)))

        data_path = 'test/asset/text_normalization_ag_news_ref_results.test'
        with io.open(data_path, encoding="utf8") as ref_data:
            for line in ref_data:
                line = line.split()
                self.assertEqual(line[0][:9], '__label__')
                line[0] = line[0][9:]  # remove '__label__'
                ref_lines.append(line)

        self.assertEqual(ref_lines, test_lines)
Example #30
def translate_f8k(data_path, lang_in, lang_out):
    path_in = os.path.join(data_path, 'f8k',
                           'dataset_flickr8k_%s.json' % lang_in)
    path_out = os.path.join(data_path, 'f8k',
                            'dataset_flickr8k_%s.json' % lang_out)
    dataset = jsonmod.load(open(path_in, 'r'))
    tokenizer = get_tokenizer('spacy', language=lang_out)

    for img_id in range(len(dataset['images'])):
        for sent_id in range(len(dataset['images'][img_id]['sentences'])):
            sentence_to_translate = dataset['images'][img_id]['sentences'][
                sent_id]['raw']
            translated_sentence = translate(sentence_to_translate)
            dataset['images'][img_id]['sentences'][sent_id][
                'raw'] = translated_sentence
            tokens = tokenizer(str(translated_sentence).lower())
            if tokens[-1] == '.':
                tokens = tokens[:-1]
            dataset['images'][img_id]['sentences'][sent_id]['tokens'] = tokens

    jsonmod.dump(dataset, open(path_out, 'w+'))
Example #31
def init(config):
    ''' Loads the GloVe embeddings for the words
        which occur in the IMDB train set vocab 
        and uses that vocab to create train, validation
        and test sets for the IMDB dataset. Extracts the
        pad_id token.
    '''
    import os
    if not os.path.isdir('.data'):
        os.mkdir('.data')

    # Extract the initial vocab from the IMDB dataset
    vocab = IMDB(data_select='train')[0].get_vocab()
    # Create GloVe embeddings based on original vocab
    # word freqs
    glove_vocab = torchtext.vocab.Vocab(
        counter=vocab.freqs,
        max_size=MAX_VOCAB_SIZE,
        min_freq=MIN_FREQ,
        vectors=torchtext.vocab.GloVe(name='6B'))
    # Acquire 'Spacy' tokenizer for the vocab words
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
    # Acquire train and test IMDB sets with previously created
    # GloVe vocab and 'Spacy' tokenizer
    train_set, test_set = IMDB(tokenizer=tokenizer, vocab=glove_vocab)

    # Extract the vocab of the acquired train set
    vocab = train_set.get_vocab()
    # Extract the token used for padding
    pad_id = vocab['<pad>']

    # Split the train set into train and validation sets
    train_set, valid_set = split_train_val(train_set)

    config['train'] = train_set
    config['val'] = valid_set
    config['test'] = test_set
    config['vocab'] = vocab
    config['pad_id'] = pad_id
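
A minimal driver sketch for init; it assumes MAX_VOCAB_SIZE, MIN_FREQ and split_train_val are defined in the surrounding module and that the same (legacy) experimental torchtext IMDB API used above is installed.

# Hypothetical driver for the function above.
config = {}
init(config)
print(len(config['vocab']), 'tokens in vocab; pad id =', config['pad_id'])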