def makedocvec(Folda = "businesstexts",clusternumber = DimentionN,word2vecdic = word2vecdic):
    """Build a normalised cluster-count vector for every document in a folder.

    Each document returned by ``maketextdoc(Folda)`` is turned into a vector
    of length ``clusternumber`` in which position ``i`` counts the tokens
    that ``word2vecdic`` maps to index ``i``.  Vectors with at least one
    non-zero count are scaled to unit Euclidean length.

    Note that the defaults for ``clusternumber`` and ``word2vecdic`` are
    bound to the module-level ``DimentionN`` / ``word2vecdic`` objects at
    the moment this function is defined, not at call time.

    Args:
        Folda: Folder name handed to ``maketextdoc``.
        clusternumber: Length of every document vector.
        word2vecdic: Mapping from token to the vector index to increment.

    Returns:
        dict mapping document key -> numpy array of length ``clusternumber``.
        Documents with fewer than two tokens are left out.  Documents whose
        vector stays all-zero are reported on stdout (folder and key) and
        kept un-normalised, which avoids a division by a zero norm.
    """
    businesspreprocessed_docs = maketextdoc(Folda)

    docvec = {}
    for k, sentence in businesspreprocessed_docs.items():
        if len(sentence) < 2:
            continue
        counts = np.zeros(clusternumber)
        for m in sentence:
            try:
                labelnum = word2vecdic[m]
                counts[labelnum] += 1
            except (KeyError, IndexError):
                # Token without a cluster label, or a label that does not
                # fit into the vector: ignore the token (best effort).
                continue
        docvec[k] = counts

    # Only values of existing keys are reassigned below, so the dict size
    # does not change while it is being iterated.
    for key, counts in docvec.items():
        if not counts.any():
            # Single-argument form prints "<Folda> <key>" on Python 2 and 3.
            print("%s %s" % (Folda, key))
        else:
            docvec[key] = counts / np.linalg.norm(counts)
    return docvec
# NOTE(review): the statements down to the `return` are the tail of a function
# whose `def` line is outside this excerpt (presumably evaluate_with_SVM_3_k_fold,
# judging by the commented-out call further down -- confirm). They are laid out
# as a function body; the nesting was reconstructed from the logic and should be
# checked against the original file.
    # Call evaluate_with_SVM_3 once per mod_value in range(mod_number) and pool
    # the per-key results of all calls (looks like one call per fold -- TODO confirm).
    for mod_value in range(mod_number):
        if mod_value == 0:
            # First call: its two result dicts become the accumulators.
            # The result is unpacked in (pred, test) order.
            all_target_pred_list_dic, all_target_test_list_dic = evaluate_with_SVM_3(topdocvec,toptarget_dic, newl_dic,func2,func4,func1, date = '07302015',mod_number = mod_number, mod_value = mod_value)
        else:
            target_pred_list_dic, target_test_list_dic = evaluate_with_SVM_3(topdocvec,toptarget_dic, newl_dic,func2,func4,func1, date = '07302015',mod_number = mod_number, mod_value = mod_value)
            # Add this call's values onto the accumulators, key by key
            # (presumably list concatenation -- verify the value type).
            for dic_key in all_target_test_list_dic:
                all_target_test_list_dic[dic_key] += target_test_list_dic[dic_key]
                all_target_pred_list_dic[dic_key] += target_pred_list_dic[dic_key]
    # Print one classification report per key over the pooled results.
    for dic_key in all_target_test_list_dic:
        print dic_key
        print classification_report(all_target_test_list_dic[dic_key],all_target_pred_list_dic[dic_key],digits=4)
    # Returned in (test, pred) order -- the reverse of the (pred, test) order
    # in which evaluate_with_SVM_3's result is unpacked above.
    return all_target_test_list_dic, all_target_pred_list_dic
#all_target_test_list_dic, all_target_pred_list_dic = evaluate_with_SVM_3_k_fold(topdocvec,toptarget_dic, newl_dic)

#BOW
# Load the documents of three folders via maketextdoc; the folder without a
# year suffix is bound to the 2014 name.
topdoc2014 = maketextdoc(Folda = "toptexts_kaigyou_kihon2")
topdoc2015 = maketextdoc(Folda = "toptexts_kaigyou_kihon2_2015")
topdoc2013 = maketextdoc(Folda = "toptexts_kaigyou_kihon2_2013")
# Merge the three mappings into one. copy.copy is shallow, so topdoc2013 itself
# is not modified; on duplicate keys the later update wins (2015 over 2014
# over 2013).
topdoc = copy.copy(topdoc2013)
topdoc.update(topdoc2014)
topdoc.update(topdoc2015)
#preprocessed_docs = doc
preprocessed_docs = topdoc
# Build a gensim Dictionary from all merged documents.
dct = gensim.corpora.Dictionary(preprocessed_docs.values())
# Token list before filtering. This file uses Python 2 print statements; on
# Python 2 .keys() returns a list, i.e. a snapshot taken at this point.
unfiltered = dct.token2id.keys()
# Remove tokens that occur in fewer than 5 documents; the other
# filter_extremes thresholds stay at gensim's defaults.
dct.filter_extremes(no_below=5)
# Token list after filtering.
filtered = dct.token2id.keys()
#filtered_out = set(unfiltered) - set(filtered)
# Start out empty; presumably filled by code after this excerpt -- not visible here.
bow_docs = {}
bow_docs_all_zeros = {}