def make_vocab_label(self, sents, vocab_label_init=None):
    """Build a label vocabulary from the BIO label sequences of ``sents``.

    Args:
        sents: Sequence of sentence objects. Each one exposes ``has_prds``
            and ``prd_bio_labels`` (an iterable of label sequences).
        vocab_label_init: Optional existing vocabulary. When truthy, a deep
            copy of it is extended; otherwise a new ``Vocab`` seeded with
            the ``'O'`` label is created.

    Returns:
        The populated vocabulary, or ``None`` when ``sents`` is empty.
    """
    if len(sents) == 0:
        return None

    if vocab_label_init:
        # Work on a copy so the caller's vocabulary is never mutated.
        vocab = deepcopy(vocab_label_init)
    else:
        vocab = Vocab()
        vocab.add_word('O')

    # Only sentences flagged as having predicates contribute labels.
    tag_counts = Counter(
        tag
        for sent in sents
        if sent.has_prds
        for tags in sent.prd_bio_labels
        for tag in tags
    )

    # Labels are registered from most to least frequent.
    for tag, _ in tag_counts.most_common():
        vocab.add_word(tag)

    return vocab
def make_vocab_label(self, sents, vocab_label_init=None):
    """Build a vocabulary of role labels with their BIO prefix removed.

    Args:
        sents: Sequence of sentence objects exposing ``prd_bio_labels``
            (an iterable of BIO label sequences).
        vocab_label_init: Optional existing vocabulary. When truthy, a deep
            copy of it is extended; otherwise a new ``Vocab`` is created and
            seeded with six core argument labels chosen by
            ``self.argv.data_type``.

    Returns:
        The populated vocabulary, or ``None`` when ``sents`` is empty.
    """
    if len(sents) == 0:
        return None

    if vocab_label_init:
        # Work on a copy so the caller's vocabulary is never mutated.
        vocab = deepcopy(vocab_label_init)
    else:
        vocab = Vocab()
        if self.argv.data_type == 'conll05':
            seed_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
        else:
            seed_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
        for seed in seed_labels:
            vocab.add_word(seed)

    tag_counts = Counter(
        tag
        for sent in sents
        for tags in sent.prd_bio_labels
        for tag in tags
    )

    # Visit labels from most to least frequent.
    for tag, _ in tag_counts.most_common():
        # Skip single-character labels (e.g. 'O') and labels ending in '-V'.
        if len(tag) <= 1 or tag.endswith('-V'):
            continue
        # Drop the two leading characters (the 'B-' / 'I-' style prefix).
        vocab.add_word(tag[2:])

    return vocab
# Percentage of the unique corpus words that have no GloVe embedding.
missing_ratio = round((1.0 * missing_words / len(word_counts)) * 100, 4)
print('Number of words missing from GloVe:', missing_words)
print('Percent of words that are missing from vocabulary: {}%'.format(
    missing_ratio))

# Limit the vocab that we will use to words that appear >= threshold or are in GloVe
vocab = Vocab()  # Dictionary to convert words to integers
threshold = 10
for word, count in word_counts.items():
    if count >= threshold or word in glove_embeddings:
        vocab.add_word(word)

# Special tokens that will be added to our vocab
codes = ["<UNK>", "<EOS>", "<GO>", "<PAD>"]

# Add codes to vocab
for code in codes:
    vocab.add_word(code)

# Percentage of the unique corpus words kept in the vocab. The previous
# expression added 4 to the ratio before scaling, which always reported more
# than 400%; it also multiplied after rounding, which let float noise into
# the printed value. This now mirrors the missing_ratio formula above.
# NOTE(review): len(vocab) presumably also counts the special tokens added
# above, so the figure can sit slightly above the true share - confirm.
usage_ratio = round((1.0 * len(vocab) / len(word_counts)) * 100, 4)
print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab))
print("Percent of words we will use: {}%".format(usage_ratio))
# save vocabulary
def make_vocab_word(word_list):
    """Create a word vocabulary from ``word_list``.

    ``UNK`` is added before any entry of ``word_list``; the remaining words
    are added in iteration order.

    Args:
        word_list: Iterable of words to register.

    Returns:
        A new ``Vocab`` holding ``UNK`` followed by the given words.
    """
    vocab = Vocab()
    vocab.add_word(UNK)
    for word in word_list:
        vocab.add_word(word)
    return vocab
def make_vocab_from_ids(key_value_format):
    """Create a vocabulary from the keys of ``(key, value)`` pairs.

    Args:
        key_value_format: Iterable of two-element pairs. Only the first
            element of each pair is added; the second is ignored.

    Returns:
        A new ``Vocab`` holding the keys in iteration order.
    """
    result = Vocab()
    for key, _ in key_value_format:
        result.add_word(key)
    return result