    def _get_seq_metadata(self, num_doc_classes, num_word_classes):
        """Assemble a CommonMetadata with a single sequence-token feature and an
        optional document-label target (num_word_classes is unused here)."""
        labels = []
        if num_doc_classes:
            vocab = Vocab(Counter())
            vocab.itos = ["C_{}".format(i) for i in range(num_doc_classes)]
            label_meta = FieldMeta()
            label_meta.vocab_size = num_doc_classes
            label_meta.vocab = vocab
            labels.append(label_meta)

        w_vocab = Vocab(Counter())
        w_vocab.itos = W_VOCAB

        seq_feat_meta = FieldMeta()
        seq_feat_meta.unk_token_idx = UNK_IDX
        seq_feat_meta.pad_token_idx = PAD_IDX
        seq_feat_meta.vocab_size = W_VOCAB_SIZE
        seq_feat_meta.vocab = w_vocab
        seq_feat_meta.vocab_export_name = "seq_tokens_vals"
        seq_feat_meta.pretrained_embeds_weight = None
        seq_feat_meta.dummy_model_input = SeqFeatureField.dummy_model_input

        meta = CommonMetadata()
        meta.features = {DatasetFieldName.TEXT_FIELD: seq_feat_meta}
        meta.target = labels
        # Unwrap a single-label target from the list to a bare FieldMeta.
        if len(labels) == 1:
            [meta.target] = meta.target
        meta.label_names = [label.vocab.itos for label in labels]
        meta.feature_itos_map = {
            f.vocab_export_name: f.vocab.itos for f in meta.features.values()
        }
        return meta
Example No. 2
from collections import Counter

from torchtext.vocab import Vocab


def load_vocab(file_path: str = 'vocab.txt') -> Vocab:
    """Load a Vocab from a text file with one whitespace-separated
    "word index" pair per line."""
    stoi = {}
    itos = {}
    with open(file_path, 'r') as f:
        for word, idx in (line.split() for line in f):
            stoi[word] = int(idx)
            itos[int(idx)] = word

    vocab = Vocab(Counter())
    vocab.stoi = stoi
    # Vocab.itos is a list indexed by token id; this assumes indices run 0..len-1.
    vocab.itos = [itos[i] for i in range(len(itos))]
    return vocab
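A quick usage sketch (the file name and its contents are illustrative, not taken from the original example). Given a vocab.txt such as

    the 0
    cat 1
    sat 2

the loader returns a Vocab with both lookup directions populated:

    vocab = load_vocab('vocab.txt')
    assert vocab.stoi['cat'] == 1
    assert vocab.itos[1] == 'cat'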
Example No. 3
import json
import os
from collections import Counter

from torchtext.vocab import Vocab


def load_vocab(dir_name):
    """Rebuild a Vocab from the freqs.json, itos.json and stoi.json files
    stored in dir_name."""
    freqs_path = os.path.join(dir_name, 'freqs.json')
    itos_path = os.path.join(dir_name, 'itos.json')
    stoi_path = os.path.join(dir_name, 'stoi.json')

    with open(freqs_path, 'r', encoding='utf-8') as finp:
        freqs = Counter(json.load(finp))
    with open(itos_path, 'r', encoding='utf-8') as finp:
        itos = json.load(finp)
    with open(stoi_path, 'r', encoding='utf-8') as finp:
        stoi = json.load(finp)

    # The constructor rebuilds itos/stoi from freqs, so overwrite them with the
    # serialized versions to preserve the original index order exactly.
    vocab = Vocab(freqs)
    vocab.itos = itos
    vocab.stoi = stoi

    return vocab
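The loader above assumes freqs.json, itos.json and stoi.json already exist on disk. A minimal counterpart writer, sketched here for illustration (save_vocab is an assumed helper, not part of the original example):

def save_vocab(vocab: Vocab, dir_name: str) -> None:
    # Persist exactly the three pieces that load_vocab reads back.
    os.makedirs(dir_name, exist_ok=True)
    with open(os.path.join(dir_name, 'freqs.json'), 'w', encoding='utf-8') as fout:
        json.dump(dict(vocab.freqs), fout)
    with open(os.path.join(dir_name, 'itos.json'), 'w', encoding='utf-8') as fout:
        json.dump(vocab.itos, fout)
    with open(os.path.join(dir_name, 'stoi.json'), 'w', encoding='utf-8') as fout:
        json.dump(dict(vocab.stoi), fout)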
Example No. 4
    def dict_to_field(self, dicted: Dict) -> Field:
        """Rebuild a torchtext Field (and its Vocab, if one was serialized)
        from a plain dict."""
        # Resolve the field class and dtype from their dotted-path names.
        field = locate(dicted['type'])(dtype=locate(dicted['dtype']))
        for k in self.FIELDS_ATTRS:
            setattr(field, k, dicted[k])

        if 'vocab' in dicted:
            # Restore the serialized vocabulary onto an empty Vocab.
            v_dict = dicted['vocab']
            vocab = Vocab(Counter())
            vocab.itos = v_dict['itos']
            vocab.stoi.update(v_dict['stoi'])
            vocab.unk_index = v_dict['unk_index']
            if 'freqs' in v_dict:
                vocab.freqs = Counter(v_dict['freqs'])
        else:
            # Nothing was serialized: attach an empty Vocab and mark the field
            # as not using one.
            vocab = Vocab(Counter())
            field.use_vocab = False
        field.vocab = vocab

        return field
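For illustration, the serialized payload that dict_to_field consumes looks roughly like the sketch below; the per-field attribute names actually come from self.FIELDS_ATTRS, which is not shown here, so every key is an assumption rather than the real schema.

    # dicted = {
    #     'type': 'torchtext.data.Field',   # dotted path resolved via locate()
    #     'dtype': 'torch.int64',
    #     # ... one entry per name in FIELDS_ATTRS ...
    #     'vocab': {                         # optional; when absent, use_vocab is set to False
    #         'itos': [...],
    #         'stoi': {...},
    #         'unk_index': 0,
    #         'freqs': {...},                # optional
    #     },
    # }
    # field = serializer.dict_to_field(dicted)   # 'serializer' is a hypothetical instance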
    def _get_metadata(self, num_doc_classes, num_word_classes):
        """Assemble a CommonMetadata with text, dict, char and dense feature
        metadata plus optional document- and word-label targets."""
        labels = []
        if num_doc_classes:
            vocab = Vocab(Counter())
            vocab.itos = ["C_{}".format(i) for i in range(num_doc_classes)]
            label_meta = FieldMeta()
            label_meta.vocab_size = num_doc_classes
            label_meta.vocab = vocab
            labels.append(label_meta)

        if num_word_classes:
            vocab = Vocab(Counter())
            vocab.itos = ["W_{}".format(i) for i in range(num_word_classes)]
            label_meta = FieldMeta()
            label_meta.vocab_size = num_word_classes
            label_meta.vocab = vocab
            label_meta.pad_token_idx = 0
            labels.append(label_meta)

        w_vocab = Vocab(Counter())
        dict_vocab = Vocab(Counter())
        c_vocab = Vocab(Counter())
        d_vocab = Vocab(Counter())
        w_vocab.itos = W_VOCAB
        dict_vocab.itos = DICT_VOCAB
        c_vocab.itos = CHAR_VOCAB
        d_vocab.itos = []

        text_feat_meta = FieldMeta()
        text_feat_meta.unk_token_idx = UNK_IDX
        text_feat_meta.pad_token_idx = PAD_IDX
        text_feat_meta.vocab_size = W_VOCAB_SIZE
        text_feat_meta.vocab = w_vocab
        text_feat_meta.vocab_export_name = "tokens_vals"
        text_feat_meta.pretrained_embeds_weight = None
        text_feat_meta.dummy_model_input = TextFeatureField.dummy_model_input

        dict_feat_meta = FieldMeta()
        dict_feat_meta.vocab_size = DICT_VOCAB_SIZE
        dict_feat_meta.vocab = dict_vocab
        dict_feat_meta.vocab_export_name = "dict_vals"
        dict_feat_meta.pretrained_embeds_weight = None
        dict_feat_meta.dummy_model_input = DictFeatureField.dummy_model_input

        char_feat_meta = FieldMeta()
        char_feat_meta.vocab_size = CHAR_VOCAB_SIZE
        char_feat_meta.vocab = c_vocab
        char_feat_meta.vocab_export_name = "char_vals"
        char_feat_meta.pretrained_embeds_weight = None
        char_feat_meta.dummy_model_input = CharFeatureField.dummy_model_input

        dense_feat_meta = FieldMeta()
        dense_feat_meta.vocab_size = 0
        dense_feat_meta.vocab = d_vocab
        dense_feat_meta.vocab_export_name = "dense_vals"
        dense_feat_meta.pretrained_embeds_weight = None
        # Dense feature dims are fixed, so the dummy input is built by hand.
        dense_feat_meta.dummy_model_input = torch.tensor(
            [[1.0] * DENSE_FEATURE_DIM, [1.0] * DENSE_FEATURE_DIM],
            dtype=torch.float,
            device="cpu",
        )

        meta = CommonMetadata()
        meta.features = {
            DatasetFieldName.TEXT_FIELD: text_feat_meta,
            DatasetFieldName.DICT_FIELD: dict_feat_meta,
            DatasetFieldName.CHAR_FIELD: char_feat_meta,
            DatasetFieldName.DENSE_FIELD: dense_feat_meta,
        }
        meta.target = labels
        # Unwrap a single-label target from the list to a bare FieldMeta.
        if len(labels) == 1:
            [meta.target] = meta.target
        meta.label_names = [label.vocab.itos for label in labels]
        meta.feature_itos_map = {
            f.vocab_export_name: f.vocab.itos for f in meta.features.values()
        }
        return meta