def get_words2vecs(doc, model, dim, stop_list, is_normalization_word=False, fix_stem=None):
    """Convert a document's words into a 2-D array of word vectors.

    Parameters:
        doc: flat list of words, or list of sentences (each a list of
            words); nested input is flattened first.
        model: mapping word -> vector (e.g. a gensim word2vec model).
        dim: dimensionality of the vectors in `model`.
        stop_list: words to drop entirely.
        is_normalization_word: forwarded to `normalize_sum` for each
            stacked vector.
        fix_stem: optional (refined_oov, stem_map, oov_map) triple used to
            repair stemmed / out-of-vocabulary words.  When given, OOV
            words get one-hot columns prepended to the embedding columns.

    Returns:
        np.ndarray of shape (n_kept_words, dim), or
        (n_kept_words, dim + len(oov_map)) when `fix_stem` is given.
    """
    # Flatten doc: doc[sentence_id][word_id] -> doc[word_id]
    # (comprehension instead of sum(doc, []), which is quadratic).
    if len(doc) > 0 and isinstance(doc[0], list):
        doc = [w for sentence in doc for w in sentence]
    if fix_stem:
        (refined_oov, stem_map, oov_map) = fix_stem
        # Pass 1: drop words neither in the model nor repairable; expand
        # each repairable OOV word into its refined replacement words.
        doc_oov_refined = []
        for word in doc:
            if word not in model and word not in refined_oov:
                continue
            if word in refined_oov and bool(refined_oov[word]):
                doc_oov_refined.extend(re.split(r'\s+', refined_oov[word]))
            else:
                doc_oov_refined.append(word)
        # Pass 2: re-stem each word, then map it to its raw form
        # (stem_map) or to an integer OOV index (oov_map).
        doc = []
        for word in doc_oov_refined:
            stemmed_word = (' '.join(
                my_util.preprocess_content(word, stop_list, is_math=True, is_stemming=True)
            )).strip()
            if not stemmed_word:
                continue
            if stemmed_word in stem_map:
                doc.append(stem_map[stemmed_word])
            elif stemmed_word in oov_map:
                doc.append(oov_map[stemmed_word])
    # Row width: OOV one-hot slots (fix_stem mode only) precede the
    # embedding dimensions.
    n_oov = len(oov_map) if fix_stem else 0
    width = dim + n_oov
    # Py2/py3-compatible integer types (file originally targets Python 2,
    # where oov_map indices could be `int` or `long`).
    try:
        integer_types = (int, long)  # noqa: F821 -- Python 2 only
    except NameError:
        integer_types = (int,)
    # Collect rows in a list and stack once at the end; np.vstack inside
    # the loop reallocates the whole array each iteration (O(n^2)).
    rows = []
    for word in doc:
        if word in stop_list or (not fix_stem and word not in model):
            continue
        if not fix_stem:
            rows.append(normalize_sum(model[word], is_normalization_word))
        elif isinstance(word, integer_types):
            # OOV word: one-hot vector in the leading OOV slots.
            word_arr = np.zeros(width)
            word_arr[word] = 1.0
            rows.append(word_arr)
        else:
            # In-vocabulary word: zero OOV slots + (optionally normalized)
            # embedding from the model.
            rows.append(np.hstack((
                np.zeros(n_oov),
                normalize_sum(model[word], is_normalization_word),
            )))
    # Seeding with an empty float array keeps the original result dtype
    # (float64) and covers the no-rows case.
    return np.vstack([np.empty((0, width))] + rows)
def get_words2vecs(doc, model, dim, stop_list, is_normalization_word=False, fix_stem=None):
    """Stack one embedding row per kept word of a document.

    `doc` may be a flat word list or a list of sentences (lists of words);
    sentences are flattened first.  Words in `stop_list` or absent from
    `model` are skipped.  When `fix_stem` -- a (refined_oov, stem_map,
    oov_map) triple -- is supplied, OOV words are refined, re-stemmed and
    encoded as one-hot columns prepended to the embedding columns.

    Returns an ndarray of width `dim` (or `dim + len(oov_map)` in
    fix_stem mode) with one row per surviving word.
    """
    # flatten doc[sentence][word] -> doc[word]
    if len(doc) > 0 and isinstance(doc[0], list):
        doc = sum(doc, [])
    if fix_stem:
        refined_oov, stem_map, oov_map = fix_stem
        # First pass: keep only repairable / in-model words, expanding
        # refined OOV entries into their replacement words.
        expanded = []
        for token in doc:
            known = token in model
            refinable = token in refined_oov
            if not (known or refinable):
                continue
            if refinable and bool(refined_oov[token]):
                expanded += re.split("\s+", refined_oov[token])
            else:
                expanded += [token]
        # Second pass: re-stem and translate via stem_map / oov_map.
        doc = []
        for token in expanded:
            pieces = my_util.preprocess_content(token, stop_list, is_math=True, is_stemming=True)
            stem = " ".join(pieces).strip()
            if not stem:
                continue
            if stem in stem_map:
                doc.append(stem_map[stem])
            elif stem in oov_map:
                doc.append(oov_map[stem])
    row_width = dim + len(oov_map.keys()) if fix_stem else dim
    arr = np.empty((0, row_width))
    for token in doc:
        if token in stop_list or (not fix_stem and token not in model):
            continue
        if not fix_stem:
            row = normalize_sum(model[token], is_normalization_word)
        elif isinstance(token, (int, long)):
            # integer token = OOV index -> one-hot row
            row = np.zeros(row_width)
            row[token] = 1.0
        else:
            # zero OOV slots followed by the (optionally normalized) vector
            row = np.hstack((np.zeros(len(oov_map.keys())),
                             normalize_sum(model[token], is_normalization_word)))
        arr = np.vstack((arr, row))
    return arr
def _main( ):
    """Script entry point: parse positional argv, load a word2vec model,
    and write word2vec-based similarity features for each lecture via
    print_fea.

    The inline ``# N`` comments give the 1-based argv position of each
    argument; later positions depend on ``task_type``.

    NOTE(review): this body was recovered from whitespace-mangled source;
    the nesting shown is the reviewer's reconstruction from the token
    stream (the ``else:`` pairings force it) -- confirm against the
    original file.
    """
    argv_index = 1
    task_type, argv_index = assign_argv(sys.argv, argv_index) # 1
    corpus_name, argv_index = assign_argv(sys.argv, argv_index) # 2
    model_name, argv_index = assign_argv(sys.argv, argv_index) # 3
    lecture_list_name, argv_index = assign_argv(sys.argv, argv_index) # 4
    fea_dir, argv_index = assign_argv(sys.argv, argv_index) # 5
    dim, argv_index = assign_argv(sys.argv, argv_index, int) # 6
    is_normalization_doc2vec, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x))) # 7
    is_normalization_cos_sim, argv_index = assign_argv(sys.argv, argv_index) # 8
    is_remove_stop, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x))) # 9
    corpus_pos_name, argv_index = assign_argv(sys.argv, argv_index) # 10
    pos_type, argv_index = assign_argv(sys.argv, argv_index) # 11
    # Mode flags, all derived from substrings of task_type below.
    use_external_model = False
    fix_stem = False
    fix_math = False
    keyword_only = False
    is_normalization_word = False
    fix_stem_info = None
    is_pool = False
    if 'pool' in task_type:
        is_pool = True
    if re.match('.*external_model', task_type):
        # External pretrained model in word2vec binary format.
        model = gensim.models.Word2Vec.load_word2vec_format(model_name, binary=True)
        use_external_model = True
        if 'fix_stem' in task_type:
            fix_stem = True
            #if 'pool' in task_type:
            #    is_pool = True
            if 'fix_stem_math' in task_type:
                fix_math = True
            elif 'fix_stem_keyword' in task_type:
                keyword_only = True
    else:
        # Gensim-native model trained/saved by this project.
        model = gensim.models.Word2Vec.load(model_name)
    stop_list = []
    if is_remove_stop or fix_stem:
        stop_list = load_courseware.load_stop_list()
    # Task-specific positional arguments.
    if re.match('.*textbook_lecture', task_type):
        tx_list_name, argv_index = assign_argv(sys.argv, argv_index) # 12
        seg_dir, argv_index = assign_argv(sys.argv, argv_index) # 13
    elif re.match('.*slides_trans', task_type):
        window_size, argv_index = assign_argv(sys.argv, argv_index, int) # 12
    if re.match('.*textbook_lecture', task_type) or re.match('.*slides_trans', task_type):
        if use_external_model:
            if fix_stem:
                is_normalization_word, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x))) # textbook=14/slides=13
                oov_refined, argv_index = assign_argv(sys.argv, argv_index) # t=15/s=14
                stem_to_raw, argv_index = assign_argv(sys.argv, argv_index) # t=16/s=15
                if fix_math:
                    math_words, argv_index = assign_argv(sys.argv, argv_index) # t=17/s=16 ? it depends
                elif keyword_only:
                    keyword_list, argv_index = assign_argv(sys.argv, argv_index) # t=17/s=16 ? it depends
            else:
                oov_dict_name, argv_index = assign_argv(sys.argv, argv_index) # t=14/s=13
        if use_external_model:
            if fix_stem:
                # Build the (refined_oov, stem_map, oov_map) triple consumed
                # by get_words2vecs through its fix_stem parameter.
                refined_oov = {}
                keywords = []
                if keyword_only:
                    # Positive keyword list: only these stemmed words survive.
                    for line in open(keyword_list):
                        keywords += my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True)
                    keywords = list(set(keywords))
                # oov_refined file: word<TAB>replacement; replacement may be empty.
                for line in open(oov_refined):
                    items = line.strip().split('\t')
                    refined_oov[items[0]] = (items[1] if len(items) > 1 else '')
                stem_map = {}
                oov_map = {}
                oov_size = 0
                # stem_to_raw file: three tab-separated columns -- presumably
                # stem / raw words / flag (TODO confirm).  A flag starting
                # with 'n' maps the key to the first whitespace token of the
                # second column; otherwise the key gets a fresh OOV index.
                for line in open(stem_to_raw):
                    items = line.strip().split('\t')
                    # remove words not in positive list (keyword list)
                    if keywords and items[0] not in keywords:
                        continue
                    if re.match('n', items[2]):
                        stem_map[items[0]] = re.split('\s+', items[1])[0]
                    else:
                        oov_map[items[0]] = oov_size
                        oov_size += 1
                # add math words to oov
                if fix_math:
                    for line in open(math_words):
                        stemmed_word = (' '.join(my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True))).strip()
                        # remove words not in positive list (keyword list)
                        if stemmed_word and keywords and stemmed_word not in keywords:
                            continue
                        if stemmed_word and stemmed_word not in (stem_map.keys()+oov_map.keys()):
                            oov_map[stemmed_word] = oov_size
                            oov_size += 1
                fix_stem_info = (refined_oov, stem_map, oov_map)
            else:
                oov_dict = my_util.load_stopwords(oov_dict_name)
        # Remember the base feature dir; pooled runs append a suffix per
        # pool_param below.
        temp_fea_dir = fea_dir
        for pool_param in gen_pool_param(is_pool):
            if is_pool:
                fea_dir = temp_fea_dir + '_%s_%d_%d' % pool_param
            if not os.path.exists(fea_dir):
                os.makedirs(fea_dir)
            corpus = load_corpus(corpus_name, pool_param=pool_param)
            corpus_pos = load_corpus(corpus_pos_name, pool_param=pool_param)
            # One feature file per lecture id listed in lecture_list_name.
            for l_id in open(lecture_list_name):
                l_id = l_id.strip()
                trans_temp = get_docs(trans_doc_prefix, l_id, corpus, corpus_pos, pos_type)
                if re.match('.*slides_trans', task_type):
                    trans = windowing(trans_temp, window_size)
                    targets = get_docs(slides_doc_prefix, l_id, corpus, corpus_pos, pos_type)
                elif re.match('.*textbook_lecture', task_type):
                    trans = window_by_seg(trans_temp, '%s/%s' % (seg_dir, l_id), pool_param=pool_param)
                    targets = get_tx_docs(corpus, corpus_pos, pos_type, tx_list_name)
                print_fea('%s/%s' % (fea_dir, l_id), trans, targets, model, dim, stop_list, is_normalization_doc2vec, is_normalization_cos_sim, is_normalization_word=is_normalization_word, fix_stem=fix_stem_info, pool_param=pool_param)
                # Extra .oov feature file when using an external model
                # without stem fixing.
                if use_external_model and not fix_stem:
                    print_fea('%s/%s.oov' % (fea_dir, l_id), trans, targets, model, dim, stop_list, is_normalization_doc2vec, is_normalization_cos_sim, oov_dict=oov_dict)
def _main():
    """Script entry point: parse positional argv, load a word2vec model,
    and write word2vec-based similarity features for each lecture via
    print_fea.

    The inline ``# N`` comments give the 1-based argv position of each
    argument; later positions depend on ``task_type``.

    NOTE(review): this body was recovered from whitespace-mangled source;
    the nesting shown is the reviewer's reconstruction from the token
    stream (the ``else:`` pairings force it) -- confirm against the
    original file.
    """
    argv_index = 1
    task_type, argv_index = assign_argv(sys.argv, argv_index)  # 1
    corpus_name, argv_index = assign_argv(sys.argv, argv_index)  # 2
    model_name, argv_index = assign_argv(sys.argv, argv_index)  # 3
    lecture_list_name, argv_index = assign_argv(sys.argv, argv_index)  # 4
    fea_dir, argv_index = assign_argv(sys.argv, argv_index)  # 5
    dim, argv_index = assign_argv(sys.argv, argv_index, int)  # 6
    is_normalization_doc2vec, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x)))  # 7
    is_normalization_cos_sim, argv_index = assign_argv(sys.argv, argv_index)  # 8
    is_remove_stop, argv_index = assign_argv(sys.argv, argv_index, lambda x: bool(int(x)))  # 9
    corpus_pos_name, argv_index = assign_argv(sys.argv, argv_index)  # 10
    pos_type, argv_index = assign_argv(sys.argv, argv_index)  # 11
    # Mode flags, all derived from substrings of task_type below.
    use_external_model = False
    fix_stem = False
    fix_math = False
    keyword_only = False
    is_normalization_word = False
    fix_stem_info = None
    is_pool = False
    if "pool" in task_type:
        is_pool = True
    if re.match(".*external_model", task_type):
        # External pretrained model in word2vec binary format.
        model = gensim.models.Word2Vec.load_word2vec_format(model_name, binary=True)
        use_external_model = True
        if "fix_stem" in task_type:
            fix_stem = True
            # if 'pool' in task_type:
            #     is_pool = True
            if "fix_stem_math" in task_type:
                fix_math = True
            elif "fix_stem_keyword" in task_type:
                keyword_only = True
    else:
        # Gensim-native model trained/saved by this project.
        model = gensim.models.Word2Vec.load(model_name)
    stop_list = []
    if is_remove_stop or fix_stem:
        stop_list = load_courseware.load_stop_list()
    # Task-specific positional arguments.
    if re.match(".*textbook_lecture", task_type):
        tx_list_name, argv_index = assign_argv(sys.argv, argv_index)  # 12
        seg_dir, argv_index = assign_argv(sys.argv, argv_index)  # 13
    elif re.match(".*slides_trans", task_type):
        window_size, argv_index = assign_argv(sys.argv, argv_index, int)  # 12
    if re.match(".*textbook_lecture", task_type) or re.match(".*slides_trans", task_type):
        if use_external_model:
            if fix_stem:
                is_normalization_word, argv_index = assign_argv(
                    sys.argv, argv_index, lambda x: bool(int(x))
                )  # textbook=14/slides=13
                oov_refined, argv_index = assign_argv(sys.argv, argv_index)  # t=15/s=14
                stem_to_raw, argv_index = assign_argv(sys.argv, argv_index)  # t=16/s=15
                if fix_math:
                    math_words, argv_index = assign_argv(sys.argv, argv_index)  # t=17/s=16 ? it depends
                elif keyword_only:
                    keyword_list, argv_index = assign_argv(sys.argv, argv_index)  # t=17/s=16 ? it depends
            else:
                oov_dict_name, argv_index = assign_argv(sys.argv, argv_index)  # t=14/s=13
        if use_external_model:
            if fix_stem:
                # Build the (refined_oov, stem_map, oov_map) triple consumed
                # by get_words2vecs through its fix_stem parameter.
                refined_oov = {}
                keywords = []
                if keyword_only:
                    # Positive keyword list: only these stemmed words survive.
                    for line in open(keyword_list):
                        keywords += my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True)
                    keywords = list(set(keywords))
                # oov_refined file: word<TAB>replacement; replacement may be empty.
                for line in open(oov_refined):
                    items = line.strip().split("\t")
                    refined_oov[items[0]] = items[1] if len(items) > 1 else ""
                stem_map = {}
                oov_map = {}
                oov_size = 0
                # stem_to_raw file: three tab-separated columns -- presumably
                # stem / raw words / flag (TODO confirm).  A flag starting
                # with 'n' maps the key to the first whitespace token of the
                # second column; otherwise the key gets a fresh OOV index.
                for line in open(stem_to_raw):
                    items = line.strip().split("\t")
                    # remove words not in positive list (keyword list)
                    if keywords and items[0] not in keywords:
                        continue
                    if re.match("n", items[2]):
                        stem_map[items[0]] = re.split("\s+", items[1])[0]
                    else:
                        oov_map[items[0]] = oov_size
                        oov_size += 1
                # add math words to oov
                if fix_math:
                    for line in open(math_words):
                        stemmed_word = (
                            " ".join(my_util.preprocess_content(line.strip(), stop_list, is_math=True, is_stemming=True))
                        ).strip()
                        # remove words not in positive list (keyword list)
                        if stemmed_word and keywords and stemmed_word not in keywords:
                            continue
                        if stemmed_word and stemmed_word not in (stem_map.keys() + oov_map.keys()):
                            oov_map[stemmed_word] = oov_size
                            oov_size += 1
                fix_stem_info = (refined_oov, stem_map, oov_map)
            else:
                oov_dict = my_util.load_stopwords(oov_dict_name)
        # Remember the base feature dir; pooled runs append a suffix per
        # pool_param below.
        temp_fea_dir = fea_dir
        for pool_param in gen_pool_param(is_pool):
            if is_pool:
                fea_dir = temp_fea_dir + "_%s_%d_%d" % pool_param
            if not os.path.exists(fea_dir):
                os.makedirs(fea_dir)
            corpus = load_corpus(corpus_name, pool_param=pool_param)
            corpus_pos = load_corpus(corpus_pos_name, pool_param=pool_param)
            # One feature file per lecture id listed in lecture_list_name.
            for l_id in open(lecture_list_name):
                l_id = l_id.strip()
                trans_temp = get_docs(trans_doc_prefix, l_id, corpus, corpus_pos, pos_type)
                if re.match(".*slides_trans", task_type):
                    trans = windowing(trans_temp, window_size)
                    targets = get_docs(slides_doc_prefix, l_id, corpus, corpus_pos, pos_type)
                elif re.match(".*textbook_lecture", task_type):
                    trans = window_by_seg(trans_temp, "%s/%s" % (seg_dir, l_id), pool_param=pool_param)
                    targets = get_tx_docs(corpus, corpus_pos, pos_type, tx_list_name)
                print_fea(
                    "%s/%s" % (fea_dir, l_id),
                    trans,
                    targets,
                    model,
                    dim,
                    stop_list,
                    is_normalization_doc2vec,
                    is_normalization_cos_sim,
                    is_normalization_word=is_normalization_word,
                    fix_stem=fix_stem_info,
                    pool_param=pool_param,
                )
                # Extra .oov feature file when using an external model
                # without stem fixing.
                if use_external_model and not fix_stem:
                    print_fea(
                        "%s/%s.oov" % (fea_dir, l_id),
                        trans,
                        targets,
                        model,
                        dim,
                        stop_list,
                        is_normalization_doc2vec,
                        is_normalization_cos_sim,
                        oov_dict=oov_dict,
                    )