Example #1
    def __init__(
            self,
            question_path,
            paragraph_path,
            ratio,
            batch_size,
            vocab: Vocab = Ref("model.vocab"),
            batch_first=Ref("model.batch_first", True),
    ):
        self.vocab = vocab
        question = Field(include_lengths=True,
                         batch_first=batch_first,
                         pad_token=vocab.pad_token)
        question.vocab = vocab
        paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_token)
        paragraph.vocab = vocab
        paragraphs = NestedField(paragraph, include_lengths=True)
        paragraphs.vocab = vocab
        target = Field(sequential=False, use_vocab=False, is_target=True)

        fields = [("question", question), ("paragraphs", paragraphs),
                  ("target", target)]
        examples = []
        with open(paragraph_path) as paragraph_file, open(
                question_path) as question_file:
            for q in question_file:
                q = q.strip()
                ps = [paragraph_file.readline().strip() for _ in range(ratio)]
                examples.append(Example.fromlist([q, ps, 0], fields))

        BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
        TorchTextDataset.__init__(self, examples, fields)
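Aside: because both the question Field and the paragraphs NestedField above set include_lengths=True, each batch attribute comes back as data plus lengths, and a NestedField reports lengths at both nesting levels. A minimal runnable sketch of that behavior, not taken from the source project (on torchtext >= 0.9 these imports live under torchtext.legacy.data):

from torchtext.data import Field, NestedField

word = Field(batch_first=True, pad_token="<pad>")
paragraphs = NestedField(word, include_lengths=True)

minibatch = [
    [["the", "cat", "sat"], ["it", "purred"]],  # example 1: two paragraphs
    [["dogs", "bark"]],                         # example 2: one paragraph
]
# pad() returns the padded batch plus paragraph counts and per-paragraph word counts
padded, para_lens, word_lens = paragraphs.pad(minibatch)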
Example #2
def make_fields(vocab_count, binary=True):
    text_field = Field(batch_first=True,
                       include_lengths=True,
                       tokenize=lambda x: x.split(' '))
    text_field.vocab = Vocab(vocab_count['text'])
    char_nesting_field = Field(batch_first=True, tokenize=list)
    char_field = NestedField(char_nesting_field,
                             tokenize=lambda x: x.split(' '))
    char_nesting_field.vocab = Vocab(vocab_count['chars'])
    char_field.vocab = Vocab(vocab_count['chars'])
    pos1_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos2_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos1_rel_field = Field(sequential=True, batch_first=True)
    pos1_rel_field.vocab = Vocab(vocab_count['pos1_rel'])
    pos2_rel_field = Field(sequential=True, batch_first=True)
    pos2_rel_field.vocab = Vocab(vocab_count['pos2_rel'])
    # the label field is identical for binary and multi-class relation labels
    label_field = Field(sequential=False, batch_first=True)
    label_field.vocab = Vocab(vocab_count['relation'], specials=[])
    reltype_field = Field(batch_first=True, sequential=False)
    reltype_field.vocab = Vocab(vocab_count['rel_type'])
    fields_dict = {
        'text': [('text', text_field), ('chars', char_field)],
        'pos1': ('pos1', pos1_field),
        'pos2': ('pos2', pos2_field),
        'pos1_rel': ('pos1_rel', pos1_rel_field),
        'pos2_rel': ('pos2_rel', pos2_rel_field),
        'relation': ('relation', label_field),
        'rel_type': ('rel_type', reltype_field)
    }
    return fields_dict
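A fields dict keyed by raw column name, like the one returned above, is the shape TabularDataset(format="json") and Example.fromdict consume; a list value (as for 'text' above) routes one input column through several fields at once. A small sketch with stand-in fields, not the ones from this example:

from torchtext.data import Example, Field

fields_dict = {
    "text": ("text", Field()),
    "relation": ("relation", Field(sequential=False)),
}
ex = Example.fromdict({"text": "drug causes effect", "relation": "cause"},
                      fields_dict)
# ex.text -> ['drug', 'causes', 'effect']; ex.relation -> 'cause'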
Example #3
def prepare_fields(pad_t):
    WORD_field = data.Field(use_vocab=False,
                            batch_first=True,
                            sequential=True,
                            pad_token=pad_t)
    WORD_nested_field = NestedField(
        data.Field(use_vocab=False,
                   batch_first=True,
                   sequential=True,
                   pad_token=pad_t))
    PAD_field = data.Field(use_vocab=False,
                           batch_first=True,
                           sequential=True,
                           pad_token=0)
    PAD_nested_field = NestedField(
        data.Field(use_vocab=False,
                   batch_first=True,
                   sequential=True,
                   pad_token=0))
    MASK_nested_field = NestedField(
        data.Field(use_vocab=False,
                   batch_first=True,
                   sequential=True,
                   pad_token=1.))
    fields = {
        'id': data.RawField(),
        'question': data.RawField(),
        'answers': data.RawField(),
        'src': WORD_nested_field,
        'src_mask': PAD_nested_field,
        'doc_mask': MASK_nested_field,
        'target': WORD_field,
        'target_mask': PAD_field,
    }
    return fields
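Note the use_vocab=False pattern above: the inputs are assumed to be numericalized already, so the numeric pad_token is inserted directly and no vocabulary lookup happens. A runnable sketch of the same idea with stand-in data (recent legacy torchtext assumed):

from torchtext import data

f = data.Field(use_vocab=False, batch_first=True, sequential=True, pad_token=0)
tensor = f.process([[5, 3, 9], [7]])  # pad to equal length, then tensorize
# tensor([[5, 3, 9],
#         [7, 0, 0]])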
Example #4
    def initialize_fields(self):
        ''' initializes fields '''
        # initialize the text field with the spacy tokenizer and no casing
        self.text_field = Field(tokenize='spacy', lower=True, batch_first=True)
        # initialize the tag field without an unknown token (hopefully the train set contains all of the tags)
        self.tag_field = Field(unk_token=None, batch_first=True)
        # initialize the character field
        char_nesting_field = Field(tokenize=list, batch_first=True)
        self.char_field = NestedField(char_nesting_field)
        self.pad_token = self.text_field.pad_token
Example #5
def lattice_fields(**kwargs):
    """Create text fields.

    Args:
        base_name (str): Name associated with the field.
        n_feats (int): Number of word level feats (not counting the tokens)
        include_lengths (bool): Optionally return the sequence lengths.
        pad (str, optional): Defaults to ``"<blank>"``.
        bos (str or NoneType, optional): Defaults to ``"<s>"``.
        eos (str or NoneType, optional): Defaults to ``"</s>"``.
        truncate (bool or NoneType, optional): Defaults to ``None``.

    Returns:
        LatticeMultiField
    """

    n_feats = kwargs["n_feats"]
    include_lengths = kwargs["include_lengths"]
    base_name = kwargs["base_name"]
    pad = kwargs.get("pad", "<blank>")
    truncate = kwargs.get("truncate", None)
    fields_ = []
    use_len = include_lengths
    nesting_field_text = Field(pad_token=pad, use_vocab=True)
    text_field = NestedField(nesting_field_text,
                             pad_token=pad,
                             include_lengths=use_len,
                             use_vocab=True)

    nesting_field_scores = Field(pad_token=0.0,
                                 use_vocab=False,
                                 dtype=torch.float64)
    scores_field = NestedField(nesting_field_scores,
                               use_vocab=False,
                               tokenize=None,
                               dtype=torch.float64,
                               include_lengths=use_len)

    fields_.append(('confnet', text_field))
    fields_.append(('scores', scores_field))

    confnet_field = LatticeMultiField('confnet', text_field, [])
    score_field = LatticeMultiField('score', scores_field, [])
    return confnet_field, score_field
Example #6
    def define_fields(self):

        self.id_field = Field(sequential=False,
                              tokenize=lambda x: x,
                              use_vocab=True)

        self.tweet_field = Field(sequential=True,
                                 tokenize=DataLoader.tokenize_text,
                                 include_lengths=False,
                                 lower=True,
                                 fix_length=self.max_length,
                                 use_vocab=True)

        self.timestamp_field = Field(sequential=False,
                                     include_lengths=False,
                                     use_vocab=False)

        self.structure_field = Field(
            sequential=True,
            tokenize=lambda x: DataLoader.tokenize_structure(x),
            include_lengths=False,
            fix_length=self.config.max_tweets,
            pad_token=self.config.num_structure_index,
            use_vocab=False)

        self.label_field = Field(sequential=False, use_vocab=False)

        self.tweet_lst_field = NestedField(self.tweet_field,
                                           fix_length=self.config.max_tweets)

        self.timestamp_lst_field = NestedField(
            self.timestamp_field,
            pad_token=str(self.config.size),
            fix_length=self.config.max_tweets)

        self.structure_lst_field = NestedField(
            self.structure_field, fix_length=self.config.max_tweets)

        data_fields = {}

        for key, val in self.config.keys_order.items():

            if key == "post_id":
                data_fields[val] = (val, self.id_field)
            if key == "content":
                data_fields[val] = (val, self.tweet_lst_field)
            elif key == "label":
                data_fields[val] = (val, self.label_field)
            elif key == "time_delay":
                data_fields[val] = (val, self.timestamp_lst_field)
            elif key == "structure":
                data_fields[val] = (val, self.structure_lst_field)

        self.data_fields = data_fields
Example #7
class CharCorpus(object):
    def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
        # list all the fields
        self.word_field = Field(lower=True)  # [sent len, batch_size]
        self.tag_field = Field(unk_token=None)  # [sent len, batch_size]
        ### BEGIN MODIFIED SECTION: CHARACTER EMBEDDING ###
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(
            self.char_nesting_field)  # [batch_size, sent len, word len]
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train.txt",
            test="test.txt",
            fields=((("word", "char"), (self.word_field, self.char_field)),
                    ("tag", self.tag_field)))
        ### END MODIFIED SECTION ###
        # convert fields to vocabulary list
        if wv_file:
            self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {
                word: self.wv_model.wv.vocab[word].count
                for word in self.wv_model.wv.vocab
            }
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
            vectors = []
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab.keys():
                    vectors.append(
                        torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi,
                                              vectors=vectors,
                                              dim=self.embedding_dim)
        else:
            self.word_field.build_vocab(self.train_dataset.word,
                                        min_freq=min_word_freq)
        # build vocab for tag and characters
        self.char_field.build_vocab(self.train_dataset.char)  # NEWLY ADDED
        self.tag_field.build_vocab(self.train_dataset.tag)
        # create iterator for batch input
        self.train_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.test_dataset),
            batch_size=batch_size)
        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[
            self.word_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[
            self.char_field.pad_token]  # NEWLY ADDED
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
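Aside: the (("word", "char"), (self.word_field, self.char_field)) entry in the fields spec is legacy torchtext's mechanism for feeding one input column through two fields, so every example carries both .word and .char. A runnable sketch of the mechanics (stand-in fields):

from torchtext.data import Example, Field, NestedField

word_field = Field(lower=True)
char_field = NestedField(Field(tokenize=list))
fields = [(("word", "char"), (word_field, char_field))]
ex = Example.fromlist(["The cat"], fields)
# ex.word -> ['the', 'cat']; ex.char -> [['T', 'h', 'e'], ['c', 'a', 't']]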
Example #8
File: robust45.py Project: achyudh/castor
class Robust45Hierarchical(Robust45):
    @staticmethod
    def clean_sentence(string):
        return clean_string(string, sentence_droprate=0, max_length=100)

    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
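The NESTING_FIELD/TEXT_FIELD pairing above is the usual hierarchical document -> sentence -> word setup. A runnable sketch with trivial stand-ins for the project's clean_string and split_sents:

from torchtext.data import Field, NestedField

nesting = Field(batch_first=True, tokenize=lambda s: s.split())
text = NestedField(nesting, tokenize=lambda d: d.split(". "))

sents = text.preprocess("the cat sat. dogs bark loudly")
# [['the', 'cat', 'sat'], ['dogs', 'bark', 'loudly']]
padded = text.pad([sents])  # one document -> shape [1, n_sents, max_words] after padding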
Example #9
def get_data_fields(fixed_lengths: int) -> dict:
    """"
    Creates torchtext fields for the I/O pipeline.
    """
    language = Field(
        batch_first=True, init_token=None, eos_token=None, pad_token=None, unk_token=None)

    characters = Field(include_lengths=True, batch_first=True, init_token=None,
                       eos_token=END_TOKEN, pad_token=PAD_TOKEN, fix_length=fixed_lengths)

    nesting_field = Field(tokenize=list, pad_token=PAD_TOKEN, batch_first=True,
                          init_token=None, eos_token=END_TOKEN)
    paragraph = NestedField(nesting_field, pad_token=PAD_TOKEN, eos_token=END_TOKEN,
                            include_lengths=True)

    fields = {
        'characters': ('characters', characters),
        'paragraph':   ('paragraph', paragraph),
        'language':    ('language', language)
    }

    return fields
Example #10
def gen_language_model_corpus(dataset_cls: torchtext.datasets.LanguageModelingDataset):
    field_char = NestedField(Field(
        pad_token=PAD_WORD,
        tokenize=list,
        init_token=SOS_WORD,
        eos_token=EOS_WORD,
        batch_first=True),
        pad_token=PAD_WORD,
    )

    field_word = Field(batch_first=True)
    dataset_char = dataset_cls.splits(field_char)
    dataset_word = dataset_cls.splits(field_word)
    field_char.build_vocab(dataset_char[0])
    field_word.build_vocab(dataset_word[0])
    return list(zip(dataset_word, dataset_char)), field_word, field_char
Example #11
    def __init__(self,
                 path,
                 batch_size,
                 vocab: Vocab = Ref("model.vocab"),
                 batch_first=Ref("model.batch_first", True)):
        self.vocab = vocab
        question = Field(include_lengths=True,
                         use_vocab=False,
                         pad_token=vocab.pad_index,
                         batch_first=batch_first)
        paragraph = Field(batch_first=batch_first,
                          pad_token=vocab.pad_index,
                          use_vocab=False)
        paragraphs = NestedField(paragraph, include_lengths=True)
        target = Field(sequential=False, use_vocab=False, is_target=True)

        fields = [("question", question), ("paragraphs", paragraphs),
                  ("target", target)]

        import h5py
        self.data = h5py.File(path, "r")
        ds = self.data["examples"]
        ratio = ds.attrs["ratio"]

        TorchTextDataset.__init__(self, self.ExampleWrapper(ds, ratio, fields),
                                  fields)
        BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
Example #12
def get_data_fields():
    """Creates torchtext fields for the I/O pipeline."""
    form = Field(include_lengths=True,
                 batch_first=True,
                 init_token=None,
                 eos_token=None,
                 pad_token=PAD_TOKEN,
                 lower=True)
    pos = Field(include_lengths=True,
                batch_first=True,
                init_token=ROOT_TOKEN,
                eos_token=END_TOKEN,
                pad_token=PAD_TOKEN,
                unk_token=None)
    nesting_field = Field(tokenize=list,
                          pad_token=PAD_TOKEN,
                          batch_first=True,
                          init_token=None,
                          eos_token=None)
    chars = NestedField(nesting_field,
                        init_token=None,
                        pad_token=PAD_TOKEN,
                        eos_token=None,
                        include_lengths=True)

    fields = {
        'form': ('form', form),
        'pos': ('pos', pos),
        'chars': ('chars', chars)
    }

    return fields
Example #13
def create_fields(use_prefix=False,
                  use_suffix=False,
                  use_chars=False,
                  lower_words=True,
                  lower_prefixes=True,
                  lower_suffixes=True,
                  lower_chars=False):
    WORDS = Field(batch_first=True,
                  lower=lower_words,
                  init_token='<s>',
                  eos_token='</s>')
    TAGS = Field(batch_first=True, init_token='<s>', eos_token='</s>')
    PREFIXES_2 = Field(batch_first=True,
                       lower=lower_prefixes,
                       init_token='<s>',
                       eos_token='</s>')
    PREFIXES_3 = Field(batch_first=True,
                       lower=lower_prefixes,
                       init_token='<s>',
                       eos_token='</s>')
    SUFFIXES_2 = Field(batch_first=True,
                       lower=lower_suffixes,
                       init_token='<s>',
                       eos_token='</s>')
    SUFFIXES_3 = Field(batch_first=True,
                       lower=lower_suffixes,
                       init_token='<s>',
                       eos_token='</s>')
    CHARS = NestedField(Field(batch_first=True,
                              lower=lower_chars,
                              pad_token='<cpad>',
                              unk_token='<cunk>',
                              tokenize=list,
                              init_token='<w>',
                              eos_token='</w>'),
                        init_token='<s>',
                        eos_token='</s>')

    field_odict = collections.OrderedDict([
        ('words', WORDS),
        ('tags', TAGS),
        ('prefs_2', None),
        ('prefs_3', None),
        ('suffs_2', None),
        ('suffs_3', None),
        ('chars', None),
    ])
    if use_prefix:
        field_odict['prefs_2'] = PREFIXES_2
        field_odict['prefs_3'] = PREFIXES_3
    if use_suffix:
        field_odict['suffs_2'] = SUFFIXES_2
        field_odict['suffs_3'] = SUFFIXES_3
    if use_chars:
        field_odict['chars'] = CHARS

    return field_odict
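Downstream consumption isn't shown here, but the None placeholders are safe to pass straight through: legacy torchtext skips (name, None) pairs when building examples, so a disabled feature's column is simply dropped. A hedged sketch:

fields = list(create_fields(use_chars=True).items())
# [('words', WORDS), ('tags', TAGS), ('prefs_2', None), ..., ('chars', CHARS)]
# Example.fromlist ignores pairs whose field is None, dropping those columns.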
Example #14
class IMDB_HAN(IMDB):
    NESTING = Field(sequential=True,
                    batch_first=True,
                    lower=True,
                    use_vocab=True,
                    tokenize=clean_string)
    TEXT = NestedField(NESTING, tokenize=split_sents, include_lengths=True)
    LABEL = Field(sequential=False,
                  use_vocab=False,
                  batch_first=True,
                  preprocessing=process_labels)
Example #15
    def _init_fields(self):
        self.words = Field(batch_first=True,
                           init_token='<s>',
                           eos_token='</s>')
        self.lab = Field(batch_first=True, unk_token=None, pad_token=None)
        self.char = NestedField(Field(batch_first=True,
                                      tokenize=list,
                                      unk_token='<cunk>',
                                      init_token='<w>',
                                      eos_token='</w>'),
                                init_token='<s>',
                                eos_token='</s>')

        self.labeled_fields = [(self.WORDS_NAME, self.words),
                               (self.CHAR_NAME, self.char),
                               (self.LAB_NAME, self.lab)]
        self.unlabeled_fields = [(self.WORDS_NAME, self.words),
                                 (self.CHAR_NAME, self.char)]
        self.logger.info('fields initialized successfully')
Example #16
class FakeHealth_HAN(FakeHealth):
    NESTING = Field(sequential=True,
                    batch_first=True,
                    lower=True,
                    use_vocab=True,
                    tokenize=clean_string)
    TEXT = NestedField(NESTING, tokenize=sent_tokenize, include_lengths=True)
    LABEL = Field(sequential=False,
                  use_vocab=False,
                  batch_first=True,
                  preprocessing=process_labels)
    ID = Field(sequential=False,
               use_vocab=False,
               batch_first=True,
               preprocessing=process_ids)
Example #17
def load_dataset(config, device):

    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab=False,
                  sequential=False,
                  dtype=torch.long,
                  preprocessing=lambda x: label_dict[x.strip()])

    SEQ = Field(dtype=torch.long,
                lower=True,
                batch_first=True,
                preprocessing=lambda x: x[:45],
                include_lengths=True)
    SENT = Field(dtype=torch.long,
                 lower=True,
                 batch_first=True,
                 preprocessing=lambda x: x[:45],
                 include_lengths=False)

    DOC = NestedField(SENT,
                      tokenize=lambda s: s.strip().split(' </s> '),
                      preprocessing=lambda s: [x for x in s[:45] if x],
                      dtype=torch.long,
                      include_lengths=True)

    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),
              ('abst', SEQ), ('body', DOC)]

    train, test = TabularDataset.splits(path="../stance_data/",
                                        format="tsv",
                                        fields=fields,
                                        train=config.train_file,
                                        test=config.test_file)
    train, val = train.split(split_ratio=0.80)

    vectors = GloVe(name="6B",
                    dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)
    SEQ.vocab = DOC.vocab  # share the document vocabulary with the flat fields

    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits(
        (train, val, test),
        batch_sizes=(config.batch_size, 256, 256),
        sort_key=lambda x: len(x.body),
        sort=True,
        device=device,
        shuffle=True,
        repeat=False)

    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
Example #18
def load_dataset(config, device):

    LABEL = Field(sequential=False,
                  dtype=torch.long,
                  use_vocab=False,
                  batch_first=True,
                  preprocessing=lambda x: 1 if float(x) > 0. else 0)
    TARGET = Field(batch_first=True,
                   lower=True,
                   dtype=torch.long,
                   preprocessing=lambda x: x[0].split('_'),
                   include_lengths=True)

    # truncate each sentence to 50 tokens
    TEXT = Field(dtype=torch.long,
                 lower=True,
                 batch_first=True,
                 preprocessing=lambda x: x[:50])

    LEADS = NestedField(TEXT,
                        dtype=torch.long,
                        include_lengths=True,
                        tokenize=lambda s: s.split('</s>'),
                        preprocessing=lambda x: x[-5:])

    DOC = NestedField(TEXT,
                      dtype=torch.long,
                      include_lengths=True,
                      tokenize=lambda s: s.split('</s>'),
                      preprocessing=lambda x: [s for s in x[1:50] if s])
    DOCS = NestNestedField(DOC,
                           dtype=torch.long,
                           include_lengths=True,
                           tokenize=lambda s: s.split('</p>'),
                           preprocessing=lambda x: x[-5:])

    fields = [('label', LABEL), ('target', TARGET), ('leads', LEADS),
              ('docs', DOCS)]
    train, val, test = TabularDataset.splits(path="../abrt_data/",
                                             format="tsv",
                                             fields=fields,
                                             train=config.train_file,
                                             validation=config.dev_file,
                                             test=config.test_file)

    TARGET.build_vocab(train, val, test)
    DOCS.build_vocab(train, val, test)

    config.wvocab_size = len(DOCS.vocab)
    config.tvocab_size = len(TARGET.vocab)
    train_loader, val_loader, test_loader = BucketIterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.docs),
        sort=True,
        batch_sizes=(config.batch_size, 32, 32),
        device=device,
        repeat=False)
    return (train_loader, val_loader, test_loader)
Example #19
def get_data_fields(model_type) -> dict:
    """"
    Creates torchtext fields for the I/O pipeline.
    """
    language_per_word = Field(include_lengths=True,
                              batch_first=True,
                              init_token=None,
                              eos_token=END_TOKEN,
                              pad_token=PAD_TOKEN)
    language_per_char = Field(include_lengths=True,
                              batch_first=True,
                              init_token=None,
                              eos_token=END_TOKEN,
                              pad_token=PAD_TOKEN)
    characters = Field(include_lengths=True,
                       batch_first=True,
                       init_token=None,
                       eos_token=END_TOKEN,
                       pad_token=PAD_TOKEN)

    nesting_field = Field(tokenize=list,
                          pad_token=PAD_TOKEN,
                          batch_first=True,
                          eos_token=None)

    if model_type != "recurrent":
        paragraph = NestedField(nesting_field,
                                pad_token=PAD_TOKEN,
                                eos_token=END_TOKEN,
                                include_lengths=True)
    else:
        paragraph = Field(include_lengths=True,
                          batch_first=True,
                          init_token=None,
                          eos_token=END_TOKEN,
                          pad_token=PAD_TOKEN)  # FIXME BACK

    fields = {
        'characters': ('characters', characters),
        'paragraph': ('paragraph', paragraph),
        'language_per_word': ('language_per_word', language_per_word),
        'language_per_char': ('language_per_char', language_per_char)
    }

    return fields
Example #20
    def __init__(self, glove=True, device=device):
        self.device = device

        nlp = spacy.load("en_core_web_sm")

        char_nesting = Field(batch_first=True, tokenize=list, lower=True)
        char = NestedField(char_nesting,
                           init_token="<sos>",
                           eos_token="<eos>",
                           tokenize="spacy")
        word = Field(init_token="<sos>",
                     eos_token="<eos>",
                     lower=True,
                     tokenize="spacy")
        label = Field(sequential=False, is_target=True, use_vocab=False)

        self.fields = [("question_char", char), ("question_word", word),
                       ("context_char", char), ("context_word", word),
                       ("answer", label)]

        self.dict_fields = {
            "question": [("question_char", char), ("question_word", word)],
            "context": [("context_char", char), ("context_word", word)],
            "answer": ("answer", label)
        }

        self.train_data = self._get_data("../data/train.jsonl")
        self.dev_data = self._get_data("../data/dev.jsonl")

        char.build_vocab(self.train_data)
        if glove:
            word.build_vocab(self.train_data,
                             vectors=GloVe(name="6B", dim=100))
        else:
            word.build_vocab(self.train_data,
                             vectors=FastText(language='en',
                                              max_vectors=30000))

        self.char_vocab = char.vocab
        self.word_vocab = word.vocab

        pos = []
        ner = []

        ind2pos = []
        ind2ner = []

        for data in tqdm(self.train_data):
            doc = nlp(' '.join(data.question_word + data.context_word))

            # t - token
            pos.extend([t.pos_ for t in doc])
            ner.extend([t.label_ for t in doc.ents])

            ind2pos.extend([[self.word_vocab.stoi[str(t)], t.pos_]
                            for t in doc])
            ind2ner.extend([[self.word_vocab.stoi[str(t)], t.label_]
                            for t in doc.ents])

        self.pos_voc = {tag: i for i, tag in enumerate(set(pos))}
        self.ner_voc = {tag: i + 1 for i, tag in enumerate(set(ner))}
        self.ner_voc['None'] = 0

        # default values, used in the DrQA model
        self.ind2pos = defaultdict(lambda: self.pos_voc['X'])  # fall back to the 'X' (other) tag
        self.ind2ner = defaultdict(lambda: self.ner_voc['None'])  # falls back to 0

        self.ind2pos.update({tag[0]: self.pos_voc[tag[1]] for tag in ind2pos})
        self.ind2ner.update({tag[0]: self.ner_voc[tag[1]] for tag in ind2ner})
Example #21
class LyricsArtistHierarchical(LyricsArtist):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
Example #22
class JiraHierarchical(Jira):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
Example #23
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    data_file = tempfile.NamedTemporaryFile(delete=True)

    # TODO Need to be decoded in Python 3
    data_file.write(requests.get(CORA_URL).content)

    fields = [(('text', 'char'),
               (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]

    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)

    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    tf = tempfile.NamedTemporaryFile(delete=False)
    # the temp file is opened in binary mode, so encode the joined text first
    tf.write("\n\n".join(
        ["\n".join(example.text) for example in dataset.examples]).encode())
    tf.close()

    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])

    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []

    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]

        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []

    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)

    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }

    data_file.close()

    assert eval_metrics == pytest.approx(
        {
            'macro_f1': 0.984,
            'micro_f1': 0.993
        }, abs=0.001)
Example #24
class SpringDiffTokenHierarchical(SpringDiffToken):
    NESTING_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE_FIELD = NestedField(NESTING_FIELD, tokenize=split_json)
Example #25
class AppReviewsHierarchical(AppReviews):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
Example #26
class VulasPairedTokenHierarchical(VulasPairedToken):
    NESTING1_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE1_FIELD = NestedField(NESTING1_FIELD, tokenize=split_json)
    NESTING2_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE2_FIELD = NestedField(NESTING2_FIELD, tokenize=split_json)
Example #27
class YELP14Hierarchical(YELP14):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents, include_lengths=True)
Example #28
    def __init__(self, args):
        # list all the fields
        self.word_field = Field(lower=True)
        self.event_field = Field(unk_token=None)
        self.entity_field = Field(unk_token=None)
        self.argument_field = Field(unk_token=None)
        self.trigger_pos_field = Field(unk_token=None)
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(self.char_nesting_field)

        self.wv = args.wv_file
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=args.input_folder,
            train="train.txt",
            validation="dev.txt",
            test="test.txt",
            fields=((("word", "char"), (self.word_field, self.char_field)),
                    ("event", self.event_field), ("entity", self.entity_field),
                    ("argument", self.argument_field),
                    ("trigger_pos", self.trigger_pos_field)),
        )
        # convert fields to vocabulary list
        # self.word_field.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
        self.event_field.build_vocab(self.train_dataset.event)
        # create iterator for batch input

        if args.wv_file:
            print("start loading embedding")
            self.wv_model = gensim.models.KeyedVectors.load_word2vec_format(
                args.wv_file, binary=True)
            print("done loading embedding")
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {
                word: self.wv_model.wv.vocab[word].count
                for word in self.wv_model.wv.vocab
            }
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter,
                                          min_freq=args.min_word_freq)
            # mapping each vector/embedding from word2vec model to word_field vocabs
            vectors = []
            print("start loading vec", len(self.word_field.vocab.stoi))
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab.keys():
                    vectors.append(
                        torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            print("done loading vec")
            del self.wv_model
            self.word_field.vocab.set_vectors(
                stoi=self.word_field.vocab.stoi,
                # list of vector embedding, orderred according to word_field.vocab
                vectors=vectors,
                dim=self.embedding_dim)

        else:
            self.word_field.build_vocab(self.train_dataset.word,
                                        min_freq=args.min_word_freq)
        self.char_field.build_vocab(self.train_dataset.char)
        self.entity_field.build_vocab(self.train_dataset.entity)
        self.argument_field.build_vocab(self.train_dataset.argument)
        self.trigger_pos_field.build_vocab(self.train_dataset.trigger_pos)

        self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
            batch_size=args.batch_size,
            shuffle=False,
        )

        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[
            self.word_field.pad_token]
        self.event_pad_idx = self.event_field.vocab.stoi[
            self.event_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[
            self.char_field.pad_token]
        self.entity_pad_idx = self.entity_field.vocab.stoi[
            self.entity_field.pad_token]
        self.argument_pad_idx = self.argument_field.vocab.stoi[
            self.argument_field.pad_token]
Example #29
class Corpus(object):
    def __init__(self, data_path, vector_path, glove6b, embedding_dim,
                 min_word_freq, max_vocab_size, batch_size, device, test,
                 prefix):
        '''

        class for interacting with dataset

        data_path: root path for dataset directory
        vector_path: path for vector_cache
        glove6b: switch for using glove.6b pretrained embeddings
        embedding_dim: dimension of embedding (50, 100, 200, or 300 for glove.6b)
        min_word_freq: ignore words that don't meet the frequency threshold in the text field
        max_vocab_size: maximum size of the vocabulary of the text field
        batch_size: batch size for data iterators
        device: torch device
        test: switch for using a built-in torchtext dataset (hopefully more likely to work)
        prefix: prefix to be appended to data path
        
        '''
        # set all of the attributes
        self.data_path, self.vector_path, self.glove6b = data_path, vector_path, glove6b
        self.embedding_dim, self.min_word_freq, self.max_vocab_size = embedding_dim, min_word_freq, max_vocab_size
        self.batch_size = batch_size
        self.device, self.test, self.prefix = device, test, prefix
        # initialize text and tag fields
        self.initialize_fields()
        # load dataset
        self.load_data()
        # build vocabularies from text and tag data
        self.build_vocabularies()
        # build iterators for batches of train, dev, and test sets
        self.initialize_iterators()
        # initialize indices of padding and unknown tokens
        self.init_idxs()

    def initialize_fields(self):
        ''' initializes fields '''
        # initialize the text field with the spacy tokenizer and no casing
        self.text_field = Field(tokenize='spacy', lower=True, batch_first=True)
        # initialize the tag field without an unknown token (hopefully the train set contains all of the tags)
        self.tag_field = Field(unk_token=None, batch_first=True)
        # initialize the character field
        char_nesting_field = Field(tokenize=list, batch_first=True)
        self.char_field = NestedField(char_nesting_field)
        self.pad_token = self.text_field.pad_token

    def load_data(self):
        ''' load data from file using torchtext '''
        if self.test:
            # built-in datasets
            if self.prefix == 'udpos':
                self.train_set, self.valid_set, self.test_set = UDPOS.splits(
                    fields=((('text', 'char'), (self.text_field,
                                                self.char_field)),
                            ('tag', self.tag_field), ('pos', None)),
                    root=self.data_path)
            if self.prefix == 'conll2000':
                self.train_set, self.valid_set, self.test_set = CoNLL2000Chunking.splits(
                    fields=((('text', 'char'), (self.text_field,
                                                self.char_field)),
                            ('pos', None), ('tag', self.tag_field)),
                    root=self.data_path)
        else:
            # load datasets from pre-prepared tsv files
            self.train_set, self.valid_set, self.test_set = SequenceTaggingDataset.splits(
                fields=((('text', 'char'), (self.text_field, self.char_field)),
                        ('tag', self.tag_field)),
                path=self.data_path + '/{}'.format(self.prefix),
                train='train.tsv',
                validation='dev.tsv',
                test='test.tsv')

    def build_vocabularies(self):
        ''' builds vocabularies for the text and tag data '''
        # if a vector path is provided, then we have to make sure that the word vectors are handled
        if self.vector_path:
            if self.glove6b:
                # the way to do this is built-in with glove.6b
                self.text_field.build_vocab(self.train_set.text,
                                            max_size=self.max_vocab_size,
                                            min_freq=self.min_word_freq,
                                            vectors='glove.6B.{}d'.format(
                                                self.embedding_dim),
                                            vectors_cache=self.vector_path,
                                            unk_init=torch.Tensor.normal_)
            else:
                # not sure if this is working
                self.text_field.build_vocab(self.train_set.text,
                                            max_size=self.max_vocab_size,
                                            min_freq=self.min_word_freq,
                                            vectors_cache=self.vector_path)
            ###########################################################################
            # not currently working due to conflict between gensim and python version #
            ###########################################################################
            #     self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
            #     self.embedding_dim = self.wv_model.vector_size
            #     word_freq = {word: self.wv_model.wv.vocab[word].count for word in self.wv_model.wv.vocab}
            #     word_counter = Counter(word_freq)
            #     self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
            #     vectors = []
            #     for word, idx in self.word_field.vocab.stoi.items():
            #         if word in self.wv_model.wv.vocab.keys():
            #             vectors.append(torch.as_tensor(self.wv_model.wv[word].tolist()))
            #         else:
            #             vectors.append(torch.zeros(self.embedding_dim))
            #     self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi, vectors=vectors, dim=self.embedding_dim)
        else:
            # no vectors required
            self.text_field.build_vocab(self.train_set.text,
                                        max_size=self.max_vocab_size,
                                        min_freq=self.min_word_freq)
        # build vocabulary for the tags (nothing fancy needed)
        self.char_field.build_vocab(self.train_set.char)
        self.tag_field.build_vocab(self.train_set.tag)

    def initialize_iterators(self):
        ''' build iterators for data (by batches) using the bucket iterator '''
        self.train_iter, self.valid_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_set, self.valid_set, self.test_set),
            batch_size=self.batch_size,
            device=self.device)

    def init_idxs(self):
        ''' saves indices for padding and unknown tokens '''
        self.text_pad_idx = self.text_field.vocab.stoi[
            self.text_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[
            self.char_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
        self.text_unk_idx = self.text_field.vocab.stoi[
            self.text_field.unk_token]
Example #30
File: imdb.py Project: Mrmoore98/hedwig
class IMDBHierarchical(IMDB):
    NESTING_FIELD = Field(batch_first=True,
                          tokenize=clean_string,
                          fix_length=50)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=Sentence_Tokenize())
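Closing note on fix_length in a nesting field, as used above: each sentence is padded or truncated to exactly that many tokens before documents are nested. A runnable sketch with a small fix_length and stand-in tokenizers, not the project's:

from torchtext.data import Field, NestedField

nesting = Field(batch_first=True, tokenize=lambda s: s.split(), fix_length=5)
text = NestedField(nesting, tokenize=lambda d: d.split(". "))
padded = text.pad([text.preprocess("one two. three four five six seven")])
# every inner sentence comes out exactly 5 tokens long:
# [[['one', 'two', '<pad>', '<pad>', '<pad>'],
#   ['three', 'four', 'five', 'six', 'seven']]]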