Example #1
from typing import List, Union

from torchtext.data import Field
from transformers import PreTrainedTokenizer


def attach_tokenizer(field: Field, tokenizer: PreTrainedTokenizer) -> None:
    """Attaches a tokenizer to a Corpus Field.

    Parameters
    ----------
    field : Field
        Field to which the tokenizer will be attached as its vocabulary.
    tokenizer : PreTrainedTokenizer
        Tokenizer that will convert tokens to their indices.
    """

    def preprocess(value: Union[str, List[str]]) -> List[str]:
        """We only perform the splitting as a preprocessing step.

        This allows us to still have access to the original tokens,
        including those that will be mapped to <unk> later.
        """
        if isinstance(value, list):
            value = " ".join(value)

        return [tokenizer.convert_ids_to_tokens(t) for t in tokenizer.encode(value)]

    # Use the tokenizer itself as the Field's vocabulary, so index lookups
    # (stoi) resolve through the pretrained vocab.
    field.preprocessing = preprocess
    field.pad_token = tokenizer.pad_token
    field.vocab = tokenizer
    field.vocab.stoi = tokenizer.vocab
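
A minimal usage sketch, assuming a BERT tokenizer from HuggingFace; the Field arguments here are illustrative, not taken from the original code:

from torchtext.data import Field
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_field = Field(pad_token=tokenizer.pad_token)
attach_tokenizer(text_field, tokenizer)

# The text is now split into BERT wordpieces, with special tokens included.
print(text_field.preprocess('hello world'))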
Example #2
import importlib
import os

import spacy
import torchtext
from torchtext.data import Field, TabularDataset

# `analyze_datainfo_paths`, `clean_str`, and `log` are project-local helpers.


def create_tabular_dataset(data_info, **args):
    # Only the tokenizer is needed, so disable every other pipeline component.
    # (A comma after 'textcat' is required; without it, Python concatenates
    # the two adjacent string literals into the bogus 'textcatentity_ruler'.)
    disable = [
        'tagger', 'parser', 'ner', 'textcat',
        'entity_ruler', 'sentencizer', 'merge_noun_chunks', 'merge_entities',
        'merge_subtokens'
    ]

    lang = args.get('lang', 'en')
    pretrained_emb = args.get('pretrained_emb', 'glove.6B.300d')

    _, path_train_dataset, path_valid_dataset = analyze_datainfo_paths(
        data_info)

    try:
        spacy_en = spacy.load(f'{lang}_core_web_sm', disable=disable)
    except OSError:
        # Model not installed yet: download it, then import and load it.
        log(f"Download {lang}")
        os.system(f"python -m spacy download {lang}_core_web_sm")
        spacy_en = importlib.import_module(f'{lang}_core_web_sm').load(
            disable=disable)

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Creating field for text and label
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)

    print('Preprocessing the text...')
    # Clean the text with the project-local clean_str helper.
    TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

    print('Creating tabular datasets... It might take a while to finish!')
    train_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_train = TabularDataset(path=path_train_dataset,
                                   format='csv',
                                   skip_header=True,
                                   fields=train_datafield)

    valid_datafield = [('text', TEXT), ('label', LABEL)]

    tabular_valid = TabularDataset(path=path_valid_dataset,
                                   format='csv',
                                   skip_header=True,
                                   fields=valid_datafield)

    print('Building vocabulary...')
    TEXT.build_vocab(tabular_train, vectors=pretrained_emb)
    LABEL.build_vocab(tabular_train)

    return tabular_train, tabular_valid, TEXT.vocab
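
A hypothetical call; `data_info`, the batch size, and the sort key are illustrative, and BucketIterator comes from legacy torchtext:

train_ds, valid_ds, vocab = create_tabular_dataset(data_info, lang='en')

train_iter, valid_iter = torchtext.data.BucketIterator.splits(
    (train_ds, valid_ds), batch_size=32,
    sort_key=lambda ex: len(ex.text))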
Example #3
def create_tabular_dataset(path_train, path_valid,
                           lang='en', pretrained_emb='glove.6B.300d'):

    # Only the tokenizer is needed, so disable every other pipeline component.
    spacy_en = spacy.load(lang, disable=['tagger', 'parser', 'ner', 'textcat',
                                         'entity_ruler', 'sentencizer',
                                         'merge_noun_chunks', 'merge_entities',
                                         'merge_subtokens'])

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]
    
    # Creating field for text and label
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)

    print('Preprocessing the text...')
    # Clean the text with the project-local clean_str helper.
    TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

    print('Creating tabular datasets... It might take a while to finish!')
    train_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_train = TabularDataset(path=path_train,
                                   format='csv',
                                   skip_header=True,
                                   fields=train_datafield)

    valid_datafield = [('text', TEXT), ('label', LABEL)]

    tabular_valid = TabularDataset(path=path_valid,
                                   format='csv',
                                   skip_header=True,
                                   fields=valid_datafield)
    
    print('Building vocabulary...')
    TEXT.build_vocab(tabular_train, vectors=pretrained_emb)
    LABEL.build_vocab(tabular_train)

    return tabular_train, tabular_valid, TEXT.vocab
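
After build_vocab with pretrained vectors, the returned vocab carries the GloVe embedding matrix, which can seed an embedding layer. A sketch, with hypothetical CSV paths:

import torch.nn as nn

train_ds, valid_ds, vocab = create_tabular_dataset('train.csv', 'valid.csv')

embedding = nn.Embedding(len(vocab), 300)   # 300-d to match glove.6B.300d
embedding.weight.data.copy_(vocab.vectors)  # rows for <unk>/<pad> stay zero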
Example #4
    def __init__(self, text_field: Field, label_field: Field, score_field: Field, lexicon, dataset='sst-2'):
        fields = [('text', text_field), ('score', score_field), ('label', label_field)]

        # Tokenize later: stash the Field's real tokenizer and temporarily
        # replace it with identity so preprocessing sees the raw text.
        text_tokenizer = identity
        text_tokenizer, text_field.tokenize = text_field.tokenize, text_tokenizer
        tokenizer = get_tokenizer(score_field.tokenize)

        if lexicon == 'wordnet':
            net = SentiWordNet(default=-1, exclude_stop_words=False)
        else:
            net = Sentiment(lexicon, default=-1)

        def build_score(text):
            texts = tokenizer(text)
            return [net[tok] for tok in texts]

        def build_tag_score(text):
            texts = tokenizer(text)
            tags = nltk.pos_tag(texts)
            return [net.get_score(tok, tag) for tok, tag in tags]

        score_field.tokenize = identity

        if 'sst' in dataset:
            phases = torchtext.datasets.SST.splits(text_field=text_field, label_field=label_field,
                                                   fine_grained=True, root=const.data_path)
            if dataset == 'sst-2':
                mapping = {'very positive': 1, 'positive': 1, 'negative': 0, 'very negative': 0}
            else:
                mapping = {'very positive': 0, 'positive': 1, 'negative': 2, 'very negative': 3, 'neutral': 4}
        elif dataset == 'imdb':
            phases = torchtext.datasets.IMDB.splits(text_field=text_field, label_field=label_field,
                                                    root=const.data_path)
            mapping = {
                'pos': 1,
                'neg': 0
            }
        else:
            raise LookupError(f'Unknown dataset: {dataset}')

        self.n_class = len(set(mapping.values()))
        label_field.preprocessing = None
        text_field.tokenize = text_tokenizer
        score_field.tokenize = build_tag_score if lexicon == 'wordnet' else build_score

        def process(sample):
            # The raw text feeds both the text and the score field;
            # the label is mapped to its integer class.
            return Example.fromlist([sample.text, sample.text, mapping[sample.label]], fields)

        self.dataset = []
        for phase in phases:
            examples = [process(example)
                        for example in phase.examples
                        if example.label in mapping]
            self.dataset.append(Dataset(examples, fields))
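
Construction might look like the sketch below; the surrounding class name (LexiconDataset) is hypothetical, since the snippet shows only its __init__, and the Field arguments are illustrative:

import torch
from torchtext.data import Field

text_field = Field(lower=True)
label_field = Field(sequential=False, use_vocab=False)
score_field = Field(sequential=True, use_vocab=False, dtype=torch.float)

data = LexiconDataset(text_field, label_field, score_field,
                      lexicon='wordnet', dataset='sst-2')
train, dev, test = data.dataset  # SST.splits yields train/dev/test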