Example #1
from collections import Counter
from copy import deepcopy

# Vocab and UNDER_BAR are defined elsewhere in the project.
def make_vocab_label(corpus, vocab_label_tmp=None, cut_label=0):
    # Reuse an existing label vocabulary if one is given; otherwise start a
    # fresh one seeded with the underscore symbol.
    if vocab_label_tmp:
        vocab_label = deepcopy(vocab_label_tmp)
    else:
        vocab_label = Vocab()
        vocab_label.add_word(UNDER_BAR)

    # Collect every label that occurs in sentences containing predicates.
    labels = []
    for sent in corpus:
        if sent.has_prds:
            for prop in sent.prd_props:
                labels += prop

    # Keep only labels whose count exceeds the cut-off, most frequent first.
    cnt = Counter(labels)
    labels = [(w, c) for w, c in sorted(
        cnt.items(), key=lambda x: x[1], reverse=True)
              if c > cut_label]

    for label, _ in labels:
        vocab_label.add_word(label)

    return vocab_label
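
Every example on this page calls add_word on a project-specific Vocab class that is not shown here. Below is a minimal sketch of such a class, assuming only the word_to_index / add_word interface the snippets actually use; it is an illustration, not the real implementation, which also tracks characters, pinyin, counts and pretrained embeddings.

class Vocab(object):
    """Minimal word vocabulary: maps each distinct word to an integer id.

    Illustrative stand-in only; the project's real Vocab class also handles
    characters, pinyin, frequency filtering and embedding loading.
    """

    def __init__(self, lower=False):
        self.lower = lower
        self.word_to_index = {}
        self.index_to_word = []

    def add_word(self, word):
        # Assign the next free index to an unseen word and return its id.
        if self.lower:
            word = word.lower()
        if word not in self.word_to_index:
            self.word_to_index[word] = len(self.index_to_word)
            self.index_to_word.append(word)
        return self.word_to_index[word]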
Example #2
import os

import numpy as np

def make_vocab_from_ids(key_value_format):
    # Build a vocabulary from (key, value) pairs, keeping only the keys.
    vocab = Vocab()
    for key, value in key_value_format:
        vocab.add_word(key)
    return vocab

if __name__ == "__main__":

    # Create a set of all words and a word vocabulary over the token files.
    all_words = set()
    vocab = Vocab()
    count_files = 0
    for name in ['test', 'train', 'val']:
        filename = name + '_tokens.txt'
        f = open(filename, 'r')
        for line in f:
            sp_line = line.strip().split()
            for token in sp_line:
                all_words.add(token)
                vocab.add_word(token)
        f.close()

    # Copy pretrained GloVe vectors into an embedding matrix, one row per
    # vocabulary word.
    glove_dir = '/media/sf_kickstarter/CS224D/Project/glove.840B.300d'
    glove_f = open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r')
    embedding_matrix = np.zeros((len(vocab.word_to_index), 300))

    count = 0
    for line in glove_f:
        line_sp = line.strip().split()
        word = line_sp[0]
        if word in vocab.word_to_index:
            # Parse the 300-dimensional vector only for in-vocabulary words.
            line_sp_vec = [float(line_num) for line_num in line_sp[1:]]
            index = vocab.word_to_index[word]
            embedding_matrix[index, :] = line_sp_vec
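
make_vocab_from_ids above is never called in this snippet; here is a tiny hypothetical usage with made-up (word, id) pairs, just to show the expected input shape.

# Hypothetical usage of make_vocab_from_ids; the pairs are invented and could
# come from any iterable of (word, id) tuples.
id_pairs = [('the', 0), ('cat', 1), ('sat', 2)]
vocab_from_ids = make_vocab_from_ids(id_pairs)
print(len(vocab_from_ids.word_to_index))  # 3 distinct words added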
Example #4
def make_vocab_word(word_list):
    # UNK (the unknown-word symbol) comes from the surrounding project.
    vocab_word = Vocab()
    vocab_word.add_word(UNK)
    for w in word_list:
        vocab_word.add_word(w)
    return vocab_word
# This snippet starts partway through the original script: all_words,
# dir_list, docs_dir, root_dir and the os / numpy imports are defined
# earlier in that file.
vocab = Vocab()
count_files = 0
for dir_ in dir_list:
    print('In directory:', dir_)
    curr_dir = os.path.join(docs_dir, dir_)
    file_list = os.listdir(curr_dir)
    print(curr_dir)
    for fname in file_list:
        count_files += 1
        full_fname = os.path.join(curr_dir, fname)
        f = open(full_fname, 'r')
        for line in f:
            sp_line = line.split()
            word = sp_line[0]
            all_words.add(word)  # Add word to the set
            vocab.add_word(word)
        f.close()
print("Total number of unique words (case sensitive):", len(all_words))
print("Total number of parsed files looked at:", count_files)

dir_for_all_words = os.path.join(root_dir, 'data_tools')
file_for_all_words = os.path.join(dir_for_all_words, 'all_words.txt')
all_words_f = open(file_for_all_words, 'w')
for word in all_words:
    all_words_f.write(word + '\n')
all_words_f.close()

glove_dir = os.path.join(root_dir, 'glove.840B.300d')
glove_f = open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r')
embedding_matrix = np.zeros((len(vocab.word_to_index), 300))

count = 0
Example #6
import logging
import os
import pickle

# Vocab, CMRCDataset and BRCDataset are project-specific classes defined
# elsewhere in the repository.
def prepare(config):
    """
    Checks the data files, creates the directories, and prepares the
    vocabulary and embeddings.
    """
    logger = logging.getLogger('qarc')
    logger.info('Checking the data files...')
    for data_path in config.train_files + config.dev_files + config.test_files:
        assert os.path.exists(data_path),\
            '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    train_summary_dir = os.path.join(config.summary_dir, 'train')
    dev_summary_dir = os.path.join(config.summary_dir, 'dev')
    for dir_path in [config.vocab_dir, config.model_dir, config.result_dir, train_summary_dir, dev_summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Load dataset...')
    if config.dataset_name.startswith('cmrc2018'):
        qarc_data = CMRCDataset(config.max_p_len, config.max_q_len, config.max_char_len, config.max_py_len,
                                config.train_files, config.dev_files, config.test_files)
    else:
        qarc_data = BRCDataset(config.max_p_num, config.max_p_len, config.max_q_len, config.max_char_len,
                               config.train_files, config.dev_files, config.test_files)

    logger.info('Building vocabulary...')
    vocab = Vocab(lower=True)
    for word in qarc_data.word_iter('train'):
        vocab.add_word(word)
    for char in qarc_data.char_iter('train'):
        vocab.add_char(char)
    for py in qarc_data.py_iter('train'):
        vocab.add_py(py)

    unfiltered_vocab_word_size = vocab.word_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_word_num = unfiltered_vocab_word_size - vocab.word_size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_word_num, vocab.word_size()))

    unfiltered_vocab_char_size = vocab.char_size()
    vocab.filter_chars_by_cnt(min_cnt=2)
    filtered_char_num = unfiltered_vocab_char_size - vocab.char_size()
    logger.info('After filtering {} chars, the final char vocab size is {}'.format(
        filtered_char_num, vocab.char_size()))

    unfiltered_vocab_py_size = vocab.py_size()
    vocab.filter_pys_by_cnt(min_cnt=2)
    filtered_py_num = unfiltered_vocab_py_size - vocab.py_size()
    logger.info('After filtering {} pys, the final py vocab size is {}'.format(
        filtered_py_num, vocab.py_size()))

    logger.info('Assigning word embeddings...')
    vocab.load_pretrained_word_embeddings(
        config.word2vec, config.word_embed_dim)

    logger.info('Assigning char embeddings...')
    # vocab.randomly_init_char_embeddings(config.char_embed_dim)
    vocab.load_pretrained_char_embeddings(
        config.word2vec, config.char_embed_dim)

    logger.info('Assigning py embeddings...')
    vocab.randomly_init_py_embeddings(config.py_embed_dim)

    logger.info('Saving vocab...')
    with open(os.path.join(config.vocab_dir, config.dataset_name + '_vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
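
prepare only ever reads plain attributes from config, so it can be driven with an ad-hoc namespace. The sketch below is a hypothetical invocation: the attribute names are taken from the function body above, while every value and path is a placeholder rather than the project's real configuration, and the listed data files must exist for the asserts to pass.

# Hypothetical driver for prepare(); values and paths are placeholders.
import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.INFO)  # make the logger.info calls visible

config = SimpleNamespace(
    train_files=['data/train.json'],
    dev_files=['data/dev.json'],
    test_files=['data/test.json'],
    summary_dir='out/summary', vocab_dir='out/vocab',
    model_dir='out/model', result_dir='out/result',
    dataset_name='cmrc2018',
    max_p_num=5, max_p_len=500, max_q_len=60,
    max_char_len=8, max_py_len=8,
    word2vec='embeddings/vectors.txt',
    word_embed_dim=300, char_embed_dim=64, py_embed_dim=64,
)
prepare(config)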
Example #7
import os
import pickle

import numpy as np

if __name__ == "__main__":

    #Create a set of all words
    all_words = set()
    vocab = Vocab()
    count_files = 0
    for name in ['test', 'train', 'val']:
        filename = name + '_tokens.txt'
        f = open(filename, 'r')
        for line in f:
            sp_line = line.strip().split()
            for token in sp_line:
                all_words.add(token)
                vocab.add_word(token)
        f.close()

    glove_dir = '/media/sf_kickstarter/CS224D/Project/glove.840B.300d'
    glove_f = open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r')
    embedding_matrix = np.zeros((len(vocab.word_to_index), 300))

    count = 0
    for line in glove_f:
        line_sp = line.strip().split()
        word = line_sp[0]
        if word in vocab.word_to_index:
            # Parse the 300-dimensional vector only for in-vocabulary words.
            line_sp_vec = [float(line_num) for line_num in line_sp[1:]]
            index = vocab.word_to_index[word]
            embedding_matrix[index, :] = line_sp_vec
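
The listing ends right after the embedding matrix is filled; pickle is imported but never used in what is shown. A hypothetical continuation, assuming one simply wants to persist the matrix for later training runs (the output filename is invented for illustration, not taken from the original script):

    # Hypothetical continuation, not part of the original listing: close the
    # GloVe file and save the lookup table plus the embedding matrix.
    glove_f.close()
    with open('embedding_matrix.pkl', 'wb') as fout:
        pickle.dump({'word_to_index': vocab.word_to_index,
                     'embedding_matrix': embedding_matrix}, fout)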