def get_word_count_train_validation():
    '''Merge the word counts of the train/validation question and choice
    dictionaries into one combined dict.

    Returns:
        dict: {word: total_count} summed across the four partial counts
        produced by the util helpers.
    '''
    # The four partial word-count dicts built elsewhere by util.
    partial_counts = [
        util.get_d_word_count_train_question(),
        util.get_d_word_count_train_choice(),
        util.get_d_word_count_validation_question(),
        util.get_d_word_count_validation_choice(),
    ]
    # One loop replaces the four copy-pasted setdefault/accumulate loops
    # of the original; behavior (per-word summation) is identical.
    d_word_count = {}
    for partial in partial_counts:
        for word, count in partial.items():
            d_word_count[word] = d_word_count.get(word, 0) + count
    return d_word_count
def statis_word2vec_coverage(): ''' How many words can be searched in the word2vec model ? And which ones can or can not be found ? This can guide us to use more and more data. ''' path_model = 'model/word2vec_4.model' model = gensim.models.Word2Vec.load(path_model) d_word_count = util.get_d_word_count_train_choice() n_found = 0 n_miss = 0 for word in d_word_count.keys(): try: res = model[word] n_found += 1 print "%s\t%d\tFound" % (word, d_word_count[word]) except: n_miss += 1 print "%s\t%d\tMiss" % (word, d_word_count[word]) print "Found\t%d\tMiss\t%d" % (n_found, n_miss)