def build_vocab(self):
    """Build word, character and label vocabularies from all data splits.

    Walks every sentence in ``self.train_data_obj``, ``self.valid_data_obj``
    and ``self.test_data_obj`` (in that order) and collects:

    * ``token.word`` for every token in ``sentence.tokens``,
    * ``char.char`` for every entry in ``token.chars``,
    * ``sentence.label``.

    ``PAD_TOKEN`` and ``PAD_CHAR`` are then added to the word and character
    vocabularies respectively. Each vocabulary is passed to
    ``get_vocab_dict`` (presumably item -> integer index; confirm against its
    definition) and the inverse mapping is derived from the result.

    Items are kept in first-seen order rather than in ``set`` iteration
    order. ``set`` order for strings changes between interpreter runs because
    of hash randomization, which made the index assignment irreproducible.

    Sets the following attributes on ``self``: ``word2idx``, ``idx2word``,
    ``char2idx``, ``idx2char``, ``label2idx``, ``idx2label``.
    """
    # Plain dicts serve as insertion-ordered sets (guaranteed on Python 3.7+;
    # on older interpreters the order is arbitrary, exactly as it was before).
    seen_words = {}
    seen_chars = {}
    seen_labels = {}

    splits = (self.train_data_obj, self.valid_data_obj, self.test_data_obj)
    for split in splits:
        for sentence in split:
            # Single pass over the tokens collects both words and characters.
            for token in sentence.tokens:
                seen_words[token.word] = None
                for char_entry in token.chars:
                    seen_chars[char_entry.char] = None
            seen_labels[sentence.label] = None

    # Padding symbols must be indexable even if they never occur in the data.
    seen_words[PAD_TOKEN] = None
    seen_chars[PAD_CHAR] = None

    word2idx = get_vocab_dict(list(seen_words))
    char2idx = get_vocab_dict(list(seen_chars))
    label2idx = get_vocab_dict(list(seen_labels))

    # Inverse lookups: index -> item.
    idx2word = {idx: word for word, idx in word2idx.items()}
    idx2char = {idx: char for char, idx in char2idx.items()}
    idx2label = {idx: label for label, idx in label2idx.items()}

    # Assign only after every mapping has been built, so a failure above
    # does not leave the instance with a partially updated vocabulary.
    self.word2idx = word2idx
    self.idx2word = idx2word
    self.char2idx = char2idx
    self.idx2char = idx2char
    self.label2idx = label2idx
    self.idx2label = idx2label
def build_vocab(self):
    """Build word, POS-tag and dependency-label vocabularies from training data.

    Walks every token of every sentence in ``self.train_data`` and collects
    its ``word``, ``pos`` and ``dep`` attributes. The corresponding attributes
    of ``ROOT_TOKEN``, ``NULL_TOKEN`` and ``UNK_TOKEN`` are then added to each
    vocabulary. Each vocabulary is passed to ``get_vocab_dict`` (presumably
    item -> integer index; confirm against its definition) and the inverse
    mapping is derived from the result.

    Items are kept in first-seen order rather than in ``set`` iteration
    order. ``set`` order for strings changes between interpreter runs because
    of hash randomization, which made the index assignment irreproducible.

    Sets the following attributes on ``self``: ``word2idx``, ``idx2word``,
    ``pos2idx``, ``idx2pos``, ``dep2idx``, ``idx2dep``.

    Side effect: also rebinds the module-level global ``dep2idx``.
    """
    # The dependency mapping is published as a module global as well as on
    # ``self``; kept so that any module-level readers keep working.
    global dep2idx

    # Plain dicts serve as insertion-ordered sets (guaranteed on Python 3.7+;
    # on older interpreters the order is arbitrary, exactly as it was before).
    seen_words = {}
    seen_pos = {}
    seen_dep = {}

    for sentence in self.train_data:
        # One pass per sentence, so a one-shot ``tokens`` iterator still
        # contributes to all three vocabularies.
        for token in sentence.tokens:
            seen_words[token.word] = None
            seen_pos[token.pos] = None
            seen_dep[token.dep] = None

    # Special tokens must be indexable even if absent from the training data.
    for special in (ROOT_TOKEN, NULL_TOKEN, UNK_TOKEN):
        seen_words[special.word] = None
        seen_pos[special.pos] = None
        seen_dep[special.dep] = None

    word2idx = get_vocab_dict(list(seen_words))
    idx2word = {idx: word for word, idx in word2idx.items()}

    pos2idx = get_vocab_dict(list(seen_pos))
    idx2pos = {idx: pos for pos, idx in pos2idx.items()}

    dep2idx = get_vocab_dict(list(seen_dep))
    idx2dep = {idx: dep for dep, idx in dep2idx.items()}

    self.word2idx = word2idx
    self.idx2word = idx2word
    self.pos2idx = pos2idx
    self.idx2pos = idx2pos
    self.dep2idx = dep2idx
    self.idx2dep = idx2dep