def attach_tokenizer(field: Field, tokenizer: PreTrainedTokenizer) -> None:
    """Attaches a pretrained tokenizer to a Corpus Field.

    Parameters
    ----------
    field : Field
        Field to which the tokenizer's vocabulary will be attached.
    tokenizer : PreTrainedTokenizer
        Tokenizer that will convert tokens to their index.
    """

    def preprocess(value: Union[str, List[str]]) -> List[str]:
        """We only perform the splitting as a preprocessing step.

        This allows us to still have access to the original tokens,
        including those that will be mapped to <unk> later.
        """
        if isinstance(value, list):
            value = " ".join(value)

        return [tokenizer.convert_ids_to_tokens(t) for t in tokenizer.encode(value)]

    field.preprocessing = preprocess
    field.pad_token = tokenizer.pad_token
    field.vocab = tokenizer
    field.vocab.stoi = tokenizer.vocab
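
# A minimal usage sketch for attach_tokenizer, assuming torchtext's legacy
# Field and a HuggingFace transformers tokenizer. The model name and the
# _demo_* helper are illustrative assumptions, not part of the original code.
def _demo_attach_tokenizer():
    from torchtext.data import Field
    from transformers import BertTokenizer

    text_field = Field()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    attach_tokenizer(text_field, tokenizer)

    # The field now splits raw strings with the pretrained subword tokenizer,
    # so every token maps to the id the pretrained model was trained with.
    tokens = text_field.preprocess('The quick brown fox')
    return [text_field.vocab.stoi[tok] for tok in tokens]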
def create_tabular_dataset(data_info, **args):
    # Note: the original list was missing a comma after 'textcat', which
    # silently concatenated it with 'entity_ruler'.
    disable = [
        'tagger', 'parser', 'ner', 'textcat',
        'entity_ruler', 'sentencizer',
        'merge_noun_chunks', 'merge_entities', 'merge_subtokens'
    ]
    lang = args.get('lang', 'en')
    pretrained_emb = args.get('pretrained_emb', 'glove.6B.300d')

    _, path_train_dataset, path_valid_dataset = analyze_datainfo_paths(data_info)

    try:
        spacy_en = spacy.load(f'{lang}_core_web_sm', disable=disable)
    except OSError:
        # Model not installed yet: download it, then import and load it.
        log(f"Download {lang}")
        import importlib
        os.system(f"python -m spacy download {lang}")
        spacy_en = importlib.import_module(f'{lang}_core_web_sm').load(disable=disable)

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Create fields for text and label
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)

    print('Preprocessing the text...')
    # Clean the text
    TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

    print('Creating tabular datasets... It might take a while to finish!')
    train_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_train = TabularDataset(path=path_train_dataset, format='csv',
                                   skip_header=True, fields=train_datafield)

    valid_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_valid = TabularDataset(path=path_valid_dataset, format='csv',
                                   skip_header=True, fields=valid_datafield)

    print('Building vocabulary...')
    TEXT.build_vocab(tabular_train, vectors=pretrained_emb)
    LABEL.build_vocab(tabular_train)

    return tabular_train, tabular_valid, TEXT.vocab
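
# A hedged usage sketch: the exact shape of data_info depends on the project's
# analyze_datainfo_paths helper, so the dict below is only an assumption.
def _demo_create_tabular_dataset_from_info():
    data_info = {'data_path': 'dataset/text_classification'}  # hypothetical layout
    tabular_train, tabular_valid, vocab = create_tabular_dataset(
        data_info, lang='en', pretrained_emb='glove.6B.300d')
    print(len(tabular_train), len(tabular_valid), len(vocab))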
def create_tabular_dataset(path_train, path_valid,
                           lang='en', pretrained_emb='glove.6B.300d'):
    # Load the model for the requested language (the original hard-coded 'en',
    # ignoring the lang parameter) and fix the missing comma after 'textcat'.
    spacy_en = spacy.load(lang, disable=['tagger', 'parser', 'ner', 'textcat',
                                         'entity_ruler', 'sentencizer',
                                         'merge_noun_chunks', 'merge_entities',
                                         'merge_subtokens'])

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Create fields for text and label
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)

    print('Preprocessing the text...')
    # Clean the text
    TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

    print('Creating tabular datasets... It might take a while to finish!')
    train_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_train = TabularDataset(path=path_train, format='csv',
                                   skip_header=True, fields=train_datafield)

    valid_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_valid = TabularDataset(path=path_valid, format='csv',
                                   skip_header=True, fields=valid_datafield)

    print('Building vocabulary...')
    TEXT.build_vocab(tabular_train, vectors=pretrained_emb)
    LABEL.build_vocab(tabular_train)

    return tabular_train, tabular_valid, TEXT.vocab
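
# The returned datasets are plain torchtext TabularDatasets, so a typical next
# step is batching with BucketIterator. The CSV paths here are illustrative.
def _demo_batching(path_train='data/train.csv', path_valid='data/valid.csv'):
    from torchtext.data import BucketIterator

    tabular_train, tabular_valid, vocab = create_tabular_dataset(
        path_train, path_valid)
    train_iter, valid_iter = BucketIterator.splits(
        (tabular_train, tabular_valid),
        batch_size=32,
        sort_key=lambda ex: len(ex.text),  # batch examples of similar length
        sort_within_batch=True)
    batch = next(iter(train_iter))
    return batch.text, batch.label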
def __init__(self, text_field: Field, label_field: Field, score_field: Field,
             lexicon, dataset='sst-2'):
    fields = [('text', text_field), ('score', score_field), ('label', label_field)]

    # Defer tokenization: swap in an identity tokenizer so the raw string is
    # preserved while the splits are loaded; the original is restored below.
    text_tokenizer = identity
    text_tokenizer, text_field.tokenize = text_field.tokenize, text_tokenizer

    tokenizer = get_tokenizer(score_field.tokenize)
    if lexicon == 'wordnet':
        net = SentiWordNet(default=-1, exclude_stop_words=False)
    else:
        net = Sentiment(lexicon, default=-1)

    def build_score(text):
        texts = tokenizer(text)
        return [net[tok] for tok in texts]

    def build_tag_score(text):
        texts = tokenizer(text)
        tags = nltk.pos_tag(texts)
        return [net.get_score(tok, tag) for tok, tag in tags]

    score_field.tokenize = identity

    if 'sst' in dataset:
        phases = torchtext.datasets.SST.splits(text_field=text_field,
                                               label_field=label_field,
                                               fine_grained=True,
                                               root=const.data_path)
        if dataset == 'sst-2':
            mapping = {'very positive': 1, 'positive': 1,
                       'negative': 0, 'very negative': 0}
        else:
            mapping = {'very positive': 0, 'positive': 1, 'negative': 2,
                       'very negative': 3, 'neutral': 4}
    elif dataset == 'imdb':
        phases = torchtext.datasets.IMDB.splits(text_field=text_field,
                                                label_field=label_field,
                                                root=const.data_path)
        mapping = {'pos': 1, 'neg': 0}
    else:
        raise LookupError(f'Unknown dataset: {dataset}')

    self.n_class = len(set(mapping.values()))

    # Restore the original tokenizer and attach the lexicon-based score builder.
    label_field.preprocessing = None
    text_field.tokenize = text_tokenizer
    score_field.tokenize = build_tag_score if lexicon == 'wordnet' else build_score

    def process(sample):
        # The raw text is passed twice: once for the text field and once for
        # the score field, whose tokenizer converts tokens to lexicon scores.
        return Example.fromlist([sample.text, sample.text, mapping[sample.label]],
                                fields)

    self.dataset = []
    for phase in phases:
        # Keep only examples whose label is covered by the mapping (e.g. drop
        # 'neutral' for sst-2).
        examples = [process(example) for example in phase.examples
                    if example.label in mapping]
        self.dataset.append(Dataset(examples, fields))
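
# A hedged construction sketch: the enclosing class is not shown above, so
# LexiconDataset is a stand-in name; the fields follow the __init__ signature,
# and the use_vocab settings are assumptions (labels and scores are numeric).
def _demo_lexicon_dataset():
    text_field = Field(sequential=True, lower=True)
    label_field = Field(sequential=False, use_vocab=False)
    score_field = Field(sequential=True, use_vocab=False)

    data = LexiconDataset(text_field, label_field, score_field,
                          lexicon='wordnet', dataset='sst-2')
    train, dev, test = data.dataset  # SST.splits yields train/dev/test phases
    print(data.n_class, len(train), len(dev), len(test))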