def test_get_tokenizer(self):
    # Test the default case with str.split
    assert data.get_tokenizer(str.split) == str.split
    test_str = "A string, particularly one with slightly complex punctuation."
    assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)

    # Test SpaCy option, and verify it properly handles punctuation.
    assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
        "A", "string", ",", "particularly", "one", "with", "slightly",
        "complex", "punctuation", "."]

    # Test Moses option.
    # Note that internally, MosesTokenizer converts to unicode if applicable
    moses_tokenizer = data.get_tokenizer("moses")
    assert moses_tokenizer(test_str) == [
        "A", "string", ",", "particularly", "one", "with", "slightly",
        "complex", "punctuation", "."]

    # Nonbreaking prefixes should tokenize the final period.
    assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]

    # Test Toktok option. Test strings taken from NLTK doctests.
    # Note that internally, the Toktok tokenizer converts to unicode if applicable
    toktok_tokenizer = data.get_tokenizer("toktok")
    assert toktok_tokenizer(test_str) == [
        "A", "string", ",", "particularly", "one", "with", "slightly",
        "complex", "punctuation", "."]

    # Test that errors are raised for invalid input arguments.
    with self.assertRaises(ValueError):
        data.get_tokenizer(1)
    with self.assertRaises(ValueError):
        data.get_tokenizer("some other string")
def test_get_tokenizer(self):
    # Test the default case with str.split
    assert data.get_tokenizer(str.split) == str.split
    test_str = "A string, particularly one with slightly complex punctuation."
    assert data.get_tokenizer(str.split)(test_str) == str.split(test_str)

    # Test SpaCy option, and verify it properly handles punctuation.
    assert data.get_tokenizer("spacy")(six.text_type(test_str)) == [
        "A", "string", ",", "particularly", "one", "with", "slightly",
        "complex", "punctuation", "."
    ]

    # Test Moses option. Test strings taken from NLTK doctests.
    # Note that internally, MosesTokenizer converts to unicode if applicable
    moses_tokenizer = data.get_tokenizer("moses")
    assert moses_tokenizer(test_str) == [
        "A", "string", ",", "particularly", "one", "with", "slightly",
        "complex", "punctuation", "."
    ]

    # Nonbreaking prefixes should tokenize the final period.
    assert moses_tokenizer(
        six.text_type("abc def.")) == ["abc", "def", "."]

    # Test that errors are raised for invalid input arguments.
    with self.assertRaises(ValueError):
        data.get_tokenizer(1)
    with self.assertRaises(ValueError):
        data.get_tokenizer("some other string")
def __init__(self):
    LOGGER.info('Loading field for source (en), target (fr)')
    self.src = data.Field(tokenize=data.get_tokenizer('spacy'),
                          init_token='<start>', eos_token='<eos>',
                          include_lengths=True, batch_first=True, lower=True)
    self.trg = data.Field(tokenize=data.get_tokenizer('spacy'),
                          init_token='<start>', eos_token='<eos>',
                          include_lengths=True, batch_first=True, lower=True)
def save_data_yahoo_answers():
    train_val_data, test_data = torchtext.datasets.YahooAnswers()
    TRAIN_LEN = 25000
    TEST_LEN = 25000
    tokenize = get_tokenizer("basic_english")

    tokens = []
    labels = []
    for label, line in random.sample(list(train_val_data), TRAIN_LEN):
        tokens.append(tokenize(line))
        labels.append(label)
    train_val_text_dataset = TextDataset(tokens, labels)
    with open('train_val_data_yahoo.pt', 'wb') as f:
        torch.save(train_val_text_dataset, f)

    tokens = []
    labels = []
    for label, line in random.sample(list(test_data), TEST_LEN):
        tokens.append(tokenize(line))
        labels.append(label)
    test_text_dataset = TextDataset(tokens, labels)
    with open('test_data_yahoo.pt', 'wb') as f:
        torch.save(test_text_dataset, f)
def __init__(self, path_to_json, num_samples=10000, p2nr=0.5,
             mixing_strategy="random"):
    with open(path_to_json) as in_:
        train_df, dev_df, test_df = json.load(in_)
    print(f'training on {len(train_df)} authors...')
    self.tokenizer = get_tokenizer("spacy")
    self.embedding = fasttext.load_model("wiki.simple/wiki.simple.bin")
    self.p2n_ratio = p2nr
    self.num_samples = num_samples
    self.SEP = self.embedding.get_word_vector("<SEP>")

    df = {}
    # merge train and dev into one & do this split using torch built-in
    for author in dev_df:
        df[author] = train_df[author] + dev_df[author]

    tokenized_df = self._preprocess(df)
    positives, negatives = self._example_mixer(
        df=tokenized_df, mixing_strategy=mixing_strategy)
    X = np.array(positives + negatives)
    y = np.array([1] * len(positives) + [0] * len(negatives))

    indexes = list(range(self.num_samples))
    random.shuffle(indexes)
    self.X = X[indexes]
    self.y = y[indexes]
def __init__(self, data_path, data_split, lang, vocab):
    self.vocab = vocab
    self.lang = lang
    self.tokenizer = get_tokenizer('spacy', language=self.lang)
    self.data_path = data_path
    loc = data_path + '/'
    self.image_path = os.path.join(
        self.data_path.removesuffix('_precomp'), 'images')

    # Captions
    self.captions = []
    with open(loc + '%s_caps_%s.txt' % (data_split, lang), 'r') as f:
        for line in f:
            self.captions.append(line.strip())

    # Image filenames
    self.image_filenames = []
    with open(loc + '%s_img_filenames.txt' % data_split, 'r') as f:
        for line in f:
            self.image_filenames.append(line.strip())

    # Image features
    self.images = np.load(loc + '%s_img.npy' % data_split)
    self.length = len(self.captions)

    # multiple indexes for one image
    if self.images.shape[0] != self.length:
        self.im_div = 5
    else:
        self.im_div = 1
def prepare_data(args: PrepareDataArgs, train_output, test_output, vocab_output):
    """Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    logger.info("Processing raw data to final data set")
    train_df = load_raw_csv(to_absolute_path(args.train_file))
    test_df = load_raw_csv(to_absolute_path(args.test_file))

    label_encoder = LabelEncoder()
    label_encoder.fit(train_df.label.values)

    tokenizer = get_tokenizer(args.tokenizer_name)
    vocab = build_vocab(
        train_df.text.values,
        tokenizer,
        args.pretrained_vectors,
        to_absolute_path(args.vectors_cache_directory),
    )
    logger.info(f"Save vocab into {vocab_output}")
    torch.save(vocab, vocab_output)

    train_ds = make_dataset(
        train_df, label_encoder, transforms=None, tokenizer=tokenizer, vocab=vocab
    )
    save_dataset(train_ds, train_output)

    test_ds = make_dataset(
        test_df, label_encoder, transforms=None, tokenizer=tokenizer, vocab=vocab
    )
    save_dataset(test_ds, test_output)
def __init__(self, tokenize=data.get_tokenizer('spacy'), eos_token='<pad>',
             include_lengths=True):
    super(SSUField, self).__init__(tokenize=tokenize,
                                   eos_token=eos_token,
                                   include_lengths=include_lengths)
def generate_fields(self):
    src_field = data.Field(tokenize=data.get_tokenizer('spacy'),
                           init_token=SOS_WORD,
                           eos_token=EOS_WORD,
                           pad_token=PAD_WORD,
                           include_lengths=True,
                           batch_first=True)
    trg_field = data.Field(tokenize=data.get_tokenizer('spacy'),
                           init_token=SOS_WORD,
                           eos_token=EOS_WORD,
                           pad_token=PAD_WORD,
                           include_lengths=True,
                           batch_first=True)
    return src_field, trg_field
def test_BasicEnglishNormalize(self): test_sample = '\'".<br />,()!?;: Basic English Normalization for a Line of Text \'".<br />,()!?;:' ref_results = [ "'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization', 'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?' ] basic_eng_norm = basic_english_normalize() experimental_eager_tokens = basic_eng_norm(test_sample) jit_basic_eng_norm = torch.jit.script(basic_eng_norm.to_ivalue()) experimental_jit_tokens = jit_basic_eng_norm(test_sample) basic_english_tokenizer = data.get_tokenizer("basic_english") eager_tokens = basic_english_tokenizer(test_sample) assert not basic_eng_norm.is_jitable assert basic_eng_norm.to_ivalue().is_jitable self.assertEqual(experimental_jit_tokens, ref_results) self.assertEqual(eager_tokens, ref_results) self.assertEqual(experimental_eager_tokens, ref_results) # test load and save save_path = os.path.join(self.test_dir, 'basic_english_normalize.pt') torch.save(basic_eng_norm.to_ivalue(), save_path) loaded_basic_eng_norm = torch.load(save_path) loaded_eager_tokens = loaded_basic_eng_norm(test_sample) self.assertEqual(loaded_eager_tokens, ref_results)
def save_data_imdb():
    train_val_data, test_data = torchtext.datasets.IMDB()
    tokenize = get_tokenizer("basic_english")

    tokens = []
    labels = []
    for label, line in train_val_data:
        tokens.append(tokenize(line))
        labels.append(label)
    train_val_text_dataset = TextDataset(tokens, labels)
    with open('train_val_data_imdb.pt', 'wb') as f:
        torch.save(train_val_text_dataset, f)

    tokens = []
    labels = []
    for label, line in test_data:
        tokens.append(tokenize(line))
        labels.append(label)
    test_text_dataset = TextDataset(tokens, labels)
    with open('test_data_imdb.pt', 'wb') as f:
        torch.save(test_text_dataset, f)
def __init__(self, url, root='.data', text_field=None, label_field=None, ngrams=1):
    """Initiate text-classification dataset.

    Arguments:
        url: url of the online raw data files.
        root: Directory where the dataset is saved. Default: ".data"
        text_field: The field that will be used for the sentence. If not given,
            a 'spacy' tokenizer will be used.
        label_field: The field that will be used for the label. If not given,
            a 'float' label field will be used.
        ngrams: a contiguous sequence of n items from a string of text. Default: 1
    """
    super(TextClassificationDataset, self).__init__()
    fields = []
    fields.append(('text', text_field if text_field is not None
                   else data.Field(tokenize=data.get_tokenizer('spacy'),
                                   init_token='<SOS>', eos_token='<EOS>')))
    fields.append(('label', label_field if label_field is not None
                   else data.LabelField(dtype=torch.float)))
    self.fields = dict(fields)

    self.dataset_name = self.__class__.__name__
    self.root = root
    self.raw_folder = os.path.join(root, self.__class__.__name__, 'raw')
    self.processed_folder = os.path.join(self.root, self.__class__.__name__,
                                         'processed')

    filepath = os.path.join(self.processed_folder, self.dataset_name + '.train')
    if not os.path.isfile(filepath):
        download_extract_archive(url, self.raw_folder, self.dataset_name)
        _preprocess(self.raw_folder, self.processed_folder, self.dataset_name)

    with open(filepath) as src_data:
        self.train_examples = _load_text_classification_data(
            src_data, self.fields, ngrams)

    filepath = os.path.join(self.processed_folder, self.dataset_name + '.test')
    with open(filepath) as src_data:
        self.test_examples = _load_text_classification_data(
            src_data, self.fields, ngrams)

    self.examples = self.train_examples + self.test_examples
    self.fields['text'].vocab = build_dictionary(self, self.fields['text'], 'text')
    self.fields['label'].vocab = build_dictionary(self, self.fields['label'], 'label')
def __init__(
    self,
    ravdess_path,
    acoustic_length,
    glove,
    train_prop=0.6,
    test_prop=0.2,
    f_end="IS10.csv",
    use_cols=None,
    add_avging=True,
    avgd=False,
):
    # path to dataset--all within acoustic files for ravdess
    self.path = ravdess_path

    # get tokenizer
    self.tokenizer = get_tokenizer("basic_english")

    # get data tensors
    self.all_data = make_ravdess_data_tensors(self.path, glove, f_end,
                                              use_cols, add_avging=add_avging,
                                              avgd=avgd)
    (
        self.train_data,
        self.dev_data,
        self.test_data,
    ) = create_data_folds_list(self.all_data, train_prop, test_prop)

    # pull out ys from train to get class weights
    self.train_y_emotion = torch.tensor(
        [item[4] for item in self.train_data])
    self.train_y_intensity = torch.tensor(
        [item[5] for item in self.train_data])

    # set the emotion and intensity class weights
    self.emotion_weights = get_class_weights(self.train_y_emotion)
    self.intensity_weights = get_class_weights(self.train_y_intensity)

    # pull out acoustic data and gender data from train for normalization
    self.train_acoustic = torch.tensor(
        [item[0].tolist() for item in self.train_data])
    self.train_genders = [item[3] for item in self.train_data]

    # acoustic feature normalization based on train
    # todo: incorporate acoustic means into data!!
    self.all_acoustic_means = self.train_acoustic.mean(dim=0, keepdim=False)
    self.all_acoustic_deviations = self.train_acoustic.std(dim=0, keepdim=False)

    self.male_acoustic_means, self.male_deviations = get_gender_avgs(
        self.train_acoustic, self.train_genders, gender=2)
    self.female_acoustic_means, self.female_deviations = get_gender_avgs(
        self.train_acoustic, self.train_genders, gender=1)
def __init__(self, fix_length, lower, tokenize=(lambda s: list(s))):
    self.fix_length = fix_length
    self.lower = lower
    self.tokenize = get_tokenizer(tokenize)
    self.alphabet = config['alphabet']
    # self.alphabet.append("'")
    super(CharField, self).__init__(fix_length=self.fix_length,
                                    lower=self.lower,
                                    tokenize=self.tokenize)
def test_get_tokenizer_moses(self):
    # Test Moses option.
    # Note that internally, MosesTokenizer converts to unicode if applicable
    moses_tokenizer = data.get_tokenizer("moses")
    assert moses_tokenizer(self.TEST_STR) == [
        "A", "string", ",", "particularly", "one", "with", "slightly",
        "complex", "punctuation", "."]

    # Nonbreaking prefixes should tokenize the final period.
    assert moses_tokenizer("abc def.") == ["abc", "def", "."]
def fill_vocab(txt: List[Tuple]):
    tokenizer = get_tokenizer("spacy")
    list_v = []
    for i in txt:
        tok = tokenizer(i)
        for j in tok:
            if list_v.count(j) == 0:
                list_v.append(j)
    vocab = Vocabulary(tokens=list_v)
    return vocab
def __init__(self, root, json, lang, vocab, transform=None, ids=None):
    self.data_name = 'coco'
    self.data_path = root
    self.vocab = vocab
    self.lang = lang
    self.transform = transform
    self.tokenizer = get_tokenizer('spacy', language=self.lang)
    self.dataset = jsonmod.load(open(json, 'r'))['images']
    self.coco = COCO(json)
    self.ids = ids
def parse_lines(fname):
    tokenizer = get_tokenizer("basic_english")
    vocab = defaultdict(int)
    data = []
    with open(fname) as f:
        for line in f.readlines():
            line = tokenizer(line.strip())
            if len(line) <= 1:
                continue
            data.append(line)
            for word in line:
                vocab[word] += 1
    return data
def tokenize_english_text(input_path: str, output_path: str, col_index: int = 0):
    """Tokenize and lowercase text"""
    # Download the English model with `python -m spacy download en_core_web_sm`
    en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
    tokenized_rows = []
    with open(input_path) as f:
        reader = csv.reader(f, delimiter='\t')
        header_text = next(reader)
        for row in reader:
            tokenized_rows.append(
                [' '.join(en_tokenizer(row[col_index].lower()))])
    with open(output_path, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(header_text)
        writer.writerows(tokenized_rows)
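# A minimal, self-contained usage sketch of the spacy-backed tokenizer used in the
# snippets above (an illustration, not part of any original snippet). It assumes a
# torchtext version where get_tokenizer is importable from torchtext.data, as in the
# other snippets here, and that the en_core_web_sm model is installed; the expected
# output mirrors the punctuation-splitting behavior asserted in the get_tokenizer
# tests earlier in this file.
from torchtext.data import get_tokenizer

spacy_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
tokens = spacy_tokenizer("A string, particularly one with slightly complex punctuation.")
print(tokens)
# expected: ['A', 'string', ',', 'particularly', 'one', 'with', 'slightly',
#            'complex', 'punctuation', '.']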
def test_BasicEnglishNormalize(self):
    test_sample = 'Basic English Normalization for a Line of Text'
    ref_results = ['basic', 'english', 'normalization',
                   'for', 'a', 'line', 'of', 'text']

    basic_english_normalize = BasicEnglishNormalize()
    experimental_eager_tokens = basic_english_normalize(test_sample)

    basic_english_tokenizer = data.get_tokenizer("basic_english")
    tokens_eager = basic_english_tokenizer(test_sample)

    self.assertEqual(experimental_eager_tokens, ref_results)
    self.assertEqual(experimental_eager_tokens, tokens_eager)
def predict_rnn():
    model = load_model_rnn('rnn-model.pt')
    tokenize = get_tokenizer("basic_english")
    min_length = 10
    while True:
        text = input('Please input: ')
        if text:
            # embed the tokens; the lookup was duplicated in the original
            x = vector.get_vecs_by_tokens(tokenize(text)).to(device).squeeze(1)
            # pad short inputs up to min_length rows
            if x.size(0) < min_length:
                padded = torch.zeros((min_length, x.size(1)))
                padded[:x.size(0), :] = x
                x = padded
            X = x.unsqueeze(0)
            print(predict(X, model))
def basic_english():
    """
    ### Basic english tokenizer

    We use a character-level tokenizer in this experiment. You can switch by setting,

    ```
    'tokenizer': 'basic_english',
    ```

    in the configurations dictionary when starting the experiment.
    """
    from torchtext.data import get_tokenizer
    return get_tokenizer('basic_english')
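# A minimal usage sketch of the tokenizer returned above (an illustration, not part
# of the original snippet). The 'basic_english' tokenizer lowercases its input and
# splits punctuation into separate tokens, matching the reference results used in
# the BasicEnglishNormalize tests earlier in this file.
from torchtext.data import get_tokenizer

tokenizer = get_tokenizer('basic_english')
print(tokenizer('Basic English Normalization for a Line of Text'))
# expected: ['basic', 'english', 'normalization', 'for', 'a', 'line', 'of', 'text']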
def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
    '''Initialization'''
    super(Articles, self).__init__()
    self.vocab = Vocab(vocab_path, voc_size)
    self.tokenizer = data.get_tokenizer('basic_english')
    self.max_len_story = MAX_LEN_STORY
    self.max_len_highlight = MAX_LEN_HIGHLIGHT

    is_test = {
        False: os.path.join(data_dir, "train.pkl"),
        True: os.path.join(data_dir, "test.pkl")
    }
    self.data_path = is_test.get(test, "Wrong set name.")
    with open(self.data_path, 'rb') as f:
        self.data = load(f)
def __init__(self, root, json, lang, split, vocab, transform=None):
    self.data_name = 'f8k'
    self.data_path = root
    self.vocab = vocab
    self.lang = lang
    self.split = split
    self.transform = transform
    self.tokenizer = get_tokenizer('spacy', language=self.lang)
    self.dataset = jsonmod.load(open(json, 'r'))['images']
    # TODO self.captions
    # TODO self.images_filenames
    self.ids = []
    for i, d in enumerate(self.dataset):
        if d['split'] == split:
            self.ids += [(i, x) for x in range(len(d['sentences']))]
def process_text(self, text):
    """Transform each description into vectors"""
    # filter text
    text = text.apply(lambda doc: self.filter_text(doc))
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

    # get idf (inverse document frequency)
    print('Calculating tf-idf...')
    warnings.filterwarnings("ignore")
    tfidf = TfidfVectorizer(tokenizer=tokenizer)
    tfidf.fit(text.dropna())
    idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

    print('Converting text to document embedding...')
    # get document embedding
    w2v = FastText(language='en')
    self.text_dim = w2v.dim
    text = text.apply(lambda doc: self.doc2vec(doc, tokenizer, idf, w2v))
    return text
def make_datasets(
    labels,
    texts,
    test_size: float,
    pretrained_vectors: Optional[str],
    tokenizer_name: str,
    vector_cache_dir: str,
) -> Tuple[Dataset, Dataset, Vocab]:
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=test_size, random_state=RANDOM_SEED, shuffle=True
    )
    tokenizer = get_tokenizer(tokenizer_name)
    vocab = build_vocab(train_texts, tokenizer, pretrained_vectors, vector_cache_dir)
    transforms = None
    return (
        MyTextDataset(train_texts, train_labels, transforms, tokenizer, vocab),
        MyTextDataset(test_texts, test_labels, transforms, tokenizer, vocab),
        vocab,
    )
def load_data_semeval_join():
    tokenize = get_tokenizer('basic_english')
    data = []
    lines = []
    with open('semeval/gold/twitter-2016dev-A.txt') as f:
        lines += list(f)
    with open('semeval/gold/twitter-2016test-A.txt') as f:
        lines += list(f)
    with open('semeval/gold/twitter-2016dev-A.txt') as f:
        lines += list(f)

    for line in lines:
        id_, label, text, *rest = line.split('\t')
        if label == 'negative':
            y = 1
        elif label == 'neutral':
            continue
        else:
            y = 0
        token = tokenize(text)
        data.append((token, y))

    negatives = [d for d in data if d[1] == 1]
    non_negatives = [d for d in data if d[1] != 1]
    non_negatives = random.sample(non_negatives, len(negatives))
    data = non_negatives + negatives
    random.shuffle(data)

    test_ratio = 0.2
    test_length = round(len(data) * test_ratio)
    train_length = len(data) - test_length

    test_data = data[:test_length]
    test_dataset = TextDataset([d[0] for d in test_data],
                               [d[1] for d in test_data])

    train_data = data[test_length:test_length + train_length]
    train_dataset = TextDataset([d[0] for d in train_data],
                                [d[1] for d in train_data])
    train_dataset = train_dataset.sample(TRAIN_SAMPLE_SIZE)

    return test_dataset, train_dataset, test_dataset
def __getitem__(self, index: int) -> (torch.Tensor, torch.Tensor):
    """
    This function converts an example to a Tensor containing the indices

    :param index: position of example to be retrieved.
    """
    # retrieve sentence and label (correct class index)
    example, label = self.examples[index], self.labels[index]

    # tokenize sentence into words and other symbols
    tokenizer = get_tokenizer("spacy")
    tokens = tokenizer(example)

    # convert tokens to their corresponding indices, according to vocabulary
    token_indices = []
    for i in tokens:
        token_indices.append(self.vocab.get_index_of_token(i))

    return torch.LongTensor(token_indices), torch.LongTensor(label)
def test_text_nomalize_function(self):
    # Test text_nomalize function in torchtext.datasets.text_classification
    ref_lines = []
    test_lines = []

    tokenizer = data.get_tokenizer("basic_english")
    data_path = 'test/asset/text_normalization_ag_news_test.csv'
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            test_lines.append(tokenizer(' , '.join(row)))

    data_path = 'test/asset/text_normalization_ag_news_ref_results.test'
    with io.open(data_path, encoding="utf8") as ref_data:
        for line in ref_data:
            line = line.split()
            self.assertEqual(line[0][:9], '__label__')
            line[0] = line[0][9:]  # remove '__label__'
            ref_lines.append(line)

    self.assertEqual(ref_lines, test_lines)
def translate_f8k(data_path, lang_in, lang_out):
    path_in = os.path.join(data_path, 'f8k',
                           'dataset_flickr8k_%s.json' % lang_in)
    path_out = os.path.join(data_path, 'f8k',
                            'dataset_flickr8k_%s.json' % lang_out)
    dataset = jsonmod.load(open(path_in, 'r'))
    tokenizer = get_tokenizer('spacy', language=lang_out)

    for img_id in range(len(dataset['images'])):
        for sent_id in range(len(dataset['images'][img_id]['sentences'])):
            sentence_to_translate = dataset['images'][img_id]['sentences'][
                sent_id]['raw']
            translated_sentence = translate(sentence_to_translate)
            dataset['images'][img_id]['sentences'][sent_id][
                'raw'] = translated_sentence
            tokens = tokenizer(str(translated_sentence).lower())
            if tokens[-1] == '.':
                tokens = tokens[:-1]
            dataset['images'][img_id]['sentences'][sent_id]['tokens'] = tokens

    jsonmod.dump(dataset, open(path_out, 'w+'))
def init(config):
    '''
    Loads the GloVe embeddings for the words which occur in the IMDB train set
    vocab and uses that vocab to create train, validation and test sets for
    the IMDB dataset. Extracts the pad_id token.
    '''
    import os
    if not os.path.isdir('.data'):
        os.mkdir('.data')

    # Extract the initial vocab from the IMDB dataset
    vocab = IMDB(data_select='train')[0].get_vocab()
    # Create GloVe embeddings based on original vocab word freqs
    glove_vocab = torchtext.vocab.Vocab(
        counter=vocab.freqs,
        max_size=MAX_VOCAB_SIZE,
        min_freq=MIN_FREQ,
        vectors=torchtext.vocab.GloVe(name='6B'))
    # Acquire 'Spacy' tokenizer for the vocab words
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
    # Acquire train and test IMDB sets with previously created
    # GloVe vocab and 'Spacy' tokenizer
    train_set, test_set = IMDB(tokenizer=tokenizer, vocab=glove_vocab)

    # Extract the vocab of the acquired train set
    vocab = train_set.get_vocab()
    # Extract the token used for padding
    pad_id = vocab['<pad>']

    # Split the train set into train and validation sets
    train_set, valid_set = split_train_val(train_set)

    config['train'] = train_set
    config['val'] = valid_set
    config['test'] = test_set
    config['vocab'] = vocab
    config['pad_id'] = pad_id