from collections import Counter
from copy import deepcopy


def make_vocab_label(corpus, vocab_label_tmp=None, cut_label=0):
    # Start from a copy of an existing label vocabulary if one is given;
    # otherwise create a fresh one seeded with the underscore placeholder.
    if vocab_label_tmp:
        vocab_label = deepcopy(vocab_label_tmp)
    else:
        vocab_label = Vocab()
        vocab_label.add_word(UNDER_BAR)

    # Collect every label from sentences that contain predicates.
    labels = []
    for sent in corpus:
        if sent.has_prds:
            for prop in sent.prd_props:
                labels += prop

    # Keep labels whose frequency exceeds the cut-off, most frequent first.
    cnt = Counter(labels)
    labels = [(w, c) for w, c in sorted(cnt.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
              if c > cut_label]
    for label, _count in labels:
        vocab_label.add_word(label)
    return vocab_label
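# The snippets in this file assume a project-level `Vocab` class and the
# constants `UNK` / `UNDER_BAR`, none of which are shown. As a point of
# reference only, here is a minimal sketch consistent with how `add_word` and
# `word_to_index` are used below; everything except the `word_to_index` name
# is an assumption, not the project's actual API. (The `prepare` function
# further below uses a richer `Vocab` with char/pinyin support that this
# sketch does not cover.)

UNK = '<unk>'
UNDER_BAR = '_'


class Vocab(object):
    """Minimal word <-> index mapping; a hypothetical stand-in."""

    def __init__(self):
        self.word_to_index = {}
        self.index_to_word = []

    def add_word(self, word):
        # Assign the next free index to an unseen word; no-op otherwise.
        if word not in self.word_to_index:
            self.word_to_index[word] = len(self.index_to_word)
            self.index_to_word.append(word)
        return self.word_to_index[word]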
def make_vocab_from_ids(key_value_format):
    vocab = Vocab()
    for key, _value in key_value_format:
        vocab.add_word(key)
    return vocab
if __name__ == "__main__": #Create a set of all words all_words = set() vocab = Vocab() count_files = 0 for name in ['test', 'train', 'val']: filename = name + '_tokens.txt' f = open(filename, 'r') for line in f: sp_line = line.strip().split() for token in sp_line: all_words.add(token) vocab.add_word(token) f.close() glove_dir = '/media/sf_kickstarter/CS224D/Project/glove.840B.300d' glove_f = open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r') embedding_matrix = np.zeros((len(vocab.word_to_index),300)) count = 0 for line in glove_f: line_sp = line.strip().split() word = line_sp[0] line_sp_vec = [float(line_num) for line_num in line_sp[1:]] if word in vocab.word_to_index: line_sp_vec = [float(line_num) for line_num in line_sp[1:]] index = vocab.word_to_index[word]
def make_vocab_word(word_list):
    vocab_word = Vocab()
    vocab_word.add_word(UNK)
    for w in word_list:
        vocab_word.add_word(w)
    return vocab_word
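# A hypothetical usage of the two helpers above; the example data is invented
# for illustration:

words = ['the', 'cat', 'sat', 'the']
vocab_word = make_vocab_word(words)             # indices: UNK, 'the', 'cat', 'sat'

id_pairs = [('the', 0), ('cat', 1), ('sat', 2)]
vocab_from_ids = make_vocab_from_ids(id_pairs)  # keys only; values are ignored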
# Build a vocabulary from the first token of every line in every parsed file.
# `dir_list`, `docs_dir`, and `root_dir` come from earlier in the script.
all_words = set()
vocab = Vocab()
count_files = 0
for dir_ in dir_list:
    print('In directory:', dir_)
    curr_dir = os.path.join(docs_dir, dir_)
    file_list = os.listdir(curr_dir)
    print(curr_dir)
    for fname in file_list:
        count_files += 1
        full_fname = os.path.join(curr_dir, fname)
        with open(full_fname, 'r') as f:
            for line in f:
                sp_line = line.split()
                if not sp_line:  # skip blank lines
                    continue
                word = sp_line[0]
                all_words.add(word)  # add word to the set
                vocab.add_word(word)
print("Total number of unique words (case sensitive):", len(all_words))
print("Total number of parsed files looked at:", count_files)

# Dump the unique words, one per line.
dir_for_all_words = os.path.join(root_dir, 'data_tools')
file_for_all_words = os.path.join(dir_for_all_words, 'all_words.txt')
with open(file_for_all_words, 'w') as all_words_f:
    for word in all_words:
        all_words_f.write(word + '\n')

# Fill an embedding matrix with the GloVe vector of every in-vocabulary word.
glove_dir = os.path.join(root_dir, 'glove.840B.300d')
embedding_matrix = np.zeros((len(vocab.word_to_index), 300))
count = 0
with open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r') as glove_f:
    for line in glove_f:
        line_sp = line.strip().split()
        word = line_sp[0]
        if word in vocab.word_to_index:
            vec = [float(v) for v in line_sp[1:]]
            embedding_matrix[vocab.word_to_index[word], :] = vec
            count += 1
print("GloVe vectors found for", count, "of", len(vocab.word_to_index), "words")
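# Note: whitespace-splitting each line is fragile for glove.840B.300d, since
# a few tokens in that file contain internal spaces and therefore split into
# more than 301 fields. A safer pattern (a sketch, not part of the original
# script) is to take the trailing 300 fields as the vector and rejoin the
# rest as the token:

def parse_glove_line(line, dim=300):
    fields = line.rstrip().split(' ')
    word = ' '.join(fields[:-dim])           # the token may itself contain spaces
    vec = [float(v) for v in fields[-dim:]]  # the last `dim` fields are floats
    return word, vec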
import logging
import os
import pickle


def prepare(config):
    """Checks the data, creates the directories, and prepares the
    vocabulary and embeddings.
    """
    logger = logging.getLogger('qarc')
    logger.info('Checking the data files...')
    for data_path in config.train_files + config.dev_files + config.test_files:
        assert os.path.exists(data_path), \
            '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    train_summary_dir = os.path.join(config.summary_dir, 'train')
    dev_summary_dir = os.path.join(config.summary_dir, 'dev')
    for dir_path in [config.vocab_dir, config.model_dir, config.result_dir,
                     train_summary_dir, dev_summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Loading the dataset...')
    if config.dataset_name.startswith('cmrc2018'):
        qarc_data = CMRCDataset(config.max_p_len, config.max_q_len,
                                config.max_char_len, config.max_py_len,
                                config.train_files, config.dev_files,
                                config.test_files)
    else:
        qarc_data = BRCDataset(config.max_p_num, config.max_p_len,
                               config.max_q_len, config.max_char_len,
                               config.train_files, config.dev_files,
                               config.test_files)

    logger.info('Building vocabulary...')
    vocab = Vocab(lower=True)
    for word in qarc_data.word_iter('train'):
        vocab.add_word(word)
    for char in qarc_data.char_iter('train'):
        vocab.add_char(char)
    for py in qarc_data.py_iter('train'):
        vocab.add_py(py)

    # Drop rare words, characters, and pinyin units (fewer than 2 occurrences).
    unfiltered_vocab_word_size = vocab.word_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_word_num = unfiltered_vocab_word_size - vocab.word_size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_word_num, vocab.word_size()))
    unfiltered_vocab_char_size = vocab.char_size()
    vocab.filter_chars_by_cnt(min_cnt=2)
    filtered_char_num = unfiltered_vocab_char_size - vocab.char_size()
    logger.info('After filtering {} chars, the final char vocab size is {}'.format(
        filtered_char_num, vocab.char_size()))
    unfiltered_vocab_py_size = vocab.py_size()
    vocab.filter_pys_by_cnt(min_cnt=2)
    filtered_py_num = unfiltered_vocab_py_size - vocab.py_size()
    logger.info('After filtering {} pys, the final py vocab size is {}'.format(
        filtered_py_num, vocab.py_size()))

    logger.info('Assigning word embeddings...')
    vocab.load_pretrained_word_embeddings(
        config.word2vec, config.word_embed_dim)
    logger.info('Assigning char embeddings...')
    # vocab.randomly_init_char_embeddings(config.char_embed_dim)
    vocab.load_pretrained_char_embeddings(
        config.word2vec, config.char_embed_dim)
    logger.info('Assigning py embeddings...')
    vocab.randomly_init_py_embeddings(config.py_embed_dim)

    logger.info('Saving vocab...')
    with open(os.path.join(config.vocab_dir,
                           config.dataset_name + '_vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
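# `prepare` expects `config` to expose every attribute read above. One
# hypothetical way to assemble such an object; the values are placeholders
# for illustration, not the project's defaults:

from types import SimpleNamespace

config = SimpleNamespace(
    dataset_name='cmrc2018',
    train_files=['data/train.json'],
    dev_files=['data/dev.json'],
    test_files=['data/test.json'],
    vocab_dir='vocab/', model_dir='models/', result_dir='results/',
    summary_dir='summary/',
    max_p_num=5, max_p_len=500, max_q_len=60,
    max_char_len=8, max_py_len=8,
    word2vec='embeddings/word2vec.txt',
    word_embed_dim=300, char_embed_dim=64, py_embed_dim=64,
)
prepare(config)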
import os
import pickle

import numpy as np

if __name__ == "__main__":
    # Create a set of all words and a vocabulary over the token files.
    all_words = set()
    vocab = Vocab()
    for name in ['test', 'train', 'val']:
        filename = name + '_tokens.txt'
        with open(filename, 'r') as f:
            for line in f:
                for token in line.strip().split():
                    all_words.add(token)
                    vocab.add_word(token)

    # Copy the GloVe vector of every in-vocabulary word into the matrix.
    glove_dir = '/media/sf_kickstarter/CS224D/Project/glove.840B.300d'
    embedding_matrix = np.zeros((len(vocab.word_to_index), 300))
    with open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r') as glove_f:
        for line in glove_f:
            line_sp = line.strip().split()
            word = line_sp[0]
            if word in vocab.word_to_index:
                # Parse the vector only for words we actually need.
                line_sp_vec = [float(v) for v in line_sp[1:]]
                index = vocab.word_to_index[word]
                embedding_matrix[index, :] = line_sp_vec
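    # The script imports `pickle` but the excerpt above never uses it, so the
    # embedding matrix is presumably serialized afterwards. A minimal sketch;
    # the 'embedding_matrix.pkl' filename is an assumption:
    with open('embedding_matrix.pkl', 'wb') as fout:
        pickle.dump(embedding_matrix, fout)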