def _interleave_balanced(samples, labels):
    """Return a class-balanced list of samples interleaved as -1, 0, 1.

    `samples` and `labels` are NumPy arrays of equal length. For each label in
    (-1, 0, 1) the matching samples are selected with a boolean mask; the
    i-th sample of every label is then emitted in that label order, stopping
    as soon as the rarest label runs out.
    """
    # Build each boolean-mask selection once instead of on every iteration.
    per_label = [samples[labels == label] for label in (-1, 0, 1)]
    balanced = []
    # zip() stops at the shortest group, which caps every label at the size
    # of the smallest class.
    for triple in zip(*per_label):
        balanced.extend(triple)
    return balanced


def preprocess_NewCategoryVec(newl_dic, toptarget_dic, dic_key, date="07302015"):
    """Build class-balanced train/test targets and per-category matrices.

    Steps:
      1. Build the category/document matrix for ``newl_dic[dic_key]`` via
         ``IIalgorithm_model.makecategorydocMat``.
      2. Split samples and labels into train/test with
         ``devide_train_test_with_date``.
      3. Balance each split so that labels -1, 0 and 1 are equally
         represented and interleaved in that order.
      4. Build per-category matrices for both splits, stack them (train rows
         first, then test rows) and pass the result through
         ``yahoo_data_preprocess_func.create_NewpreW_NewpreWdict_NewDimentionN``.
      5. Slice the resulting matrices back into train and test parts.

    Reads the module-level names ``defaultMatdict``, ``word2vecdic``,
    ``DimentionN``, ``preW`` and ``preWdict``.

    Args:
        newl_dic: mapping indexed by ``dic_key``; the selected value is
            converted with ``np.array`` and treated as the samples.
        toptarget_dic: mapping indexed by ``dic_key``; the selected value is
            converted with ``np.array`` and compared against -1, 0 and 1.
        dic_key: key selecting the entry of both mappings.
        date: forwarded unchanged to ``devide_train_test_with_date``.

    Returns:
        Tuple ``(target_train, target_test, NewtopdocveccategoryMat_train,
        NewtopdocveccategoryMat_test, NewpreWdict, NewpreW,
        NewpreW_namelist_dic, NewDimentionN)``. The two targets are int32
        arrays repeating ``[-1, 0, 1]``; the two matrix dicts are keyed by
        ``range(DimentionN)``.
    """
    topdocveccategoryMat = IIalgorithm_model.makecategorydocMat(
        newl_dic[dic_key], defaultMatdict, word2vecdic,
        Folda="toptexts_kaigyou_kihon2", clusternumber=DimentionN)

    # NOTE(review): mod_number/mod_value are passed through as-is; their
    # meaning is defined by devide_train_test_with_date -- confirm there.
    newl_train, newl_test, toptarget_train, toptarget_test = devide_train_test_with_date(
        np.array(newl_dic[dic_key]), np.array(toptarget_dic[dic_key]),
        date=date, mod_number=5, mod_value=1)

    newl_train_balanced = _interleave_balanced(newl_train, toptarget_train)
    newl_test_balanced = _interleave_balanced(newl_test, toptarget_test)

    topdocveccategoryMat3_train = create_topdocveccategoryMat3(
        topdocveccategoryMat, newl_train_balanced)
    topdocveccategoryMat3_test = create_topdocveccategoryMat3(
        topdocveccategoryMat, newl_test_balanced)

    # Stack train rows on top of test rows so both splits go through the same
    # preprocessing call; they are separated again below by row count.
    topdocveccategoryMat3_all = {}
    for n in range(DimentionN):
        topdocveccategoryMat3_all[n] = np.r_[topdocveccategoryMat3_train[n],
                                             topdocveccategoryMat3_test[n]]

    (NewtopdocveccategoryMat, NewpreWdict, NewpreW, NewpreW_namelist_dic,
     NewDimentionN) = yahoo_data_preprocess_func.create_NewpreW_NewpreWdict_NewDimentionN(
        topdocveccategoryMat3_all, preW, preWdict, DimentionN)

    # The balanced lists are interleaved as -1, 0, 1, so the targets are that
    # pattern repeated. Floor division keeps the repeat count an int on both
    # Python 2 and Python 3 (true division would make the list repetition
    # raise TypeError).
    target_train = np.array([-1, 0, 1] * (len(newl_train_balanced) // 3)).astype(np.int32)
    target_test = np.array([-1, 0, 1] * (len(newl_test_balanced) // 3)).astype(np.int32)

    # NOTE(review): this assumes the preprocessing call preserves row order,
    # so the first len(target_train) rows are still the training rows.
    n_train = len(target_train)
    NewtopdocveccategoryMat_train, NewtopdocveccategoryMat_test = {}, {}
    for n in range(DimentionN):
        NewtopdocveccategoryMat_train[n] = NewtopdocveccategoryMat[n][0:n_train]
        NewtopdocveccategoryMat_test[n] = NewtopdocveccategoryMat[n][n_train:]

    return (target_train, target_test,
            NewtopdocveccategoryMat_train, NewtopdocveccategoryMat_test,
            NewpreWdict, NewpreW, NewpreW_namelist_dic, NewDimentionN)
#NewpreW_namelist_dic[labelnum] = np.array(preWdict[labelnum].keys())[topdocveccategoryMatbollenlist[labelnum]].T NewpreW_namelist_dic[labelnum] = np.array(namelist_dic[labelnum][topdocveccategoryMatbollenlist[labelnum]]) NewpreWdict[labelnum] = dict(zip(NewpreW_namelist_dic[labelnum], NewpreW[labelnum].T)) #print NewpreW[labelnum].shape if (NewpreW[labelnum].shape[1]) != 0: NewDimentionN += 1 return NewtopdocveccategoryMat,NewpreWdict, NewpreW, NewpreW_namelist_dic,NewDimentionN """ preWdict_copy = copy.deepcopy(preWdict) preW_copy = {} for i in range(DimentionN): preW_copy[i] = ((np.array(preWdict_copy[i].values())[:,np.newaxis]).T).astype(np.float32) NewtopdocveccategoryMat,NewpreWdict, NewpreW, NewpreW_namelist_dic,NewDimentionN = yahoo_data_preprocess_func.create_NewpreW_NewpreWdict_NewDimentionN(topdocveccategoryMat3,preW_copy,preWdict_copy,DimentionN) #NewtopdocveccategoryMat_1,NewpreWdict_1, NewpreW_1, NewpreW_namelist_dic_1,NewDimentionN = create_NewpreW_NewpreWdict_NewDimentionN(topdocveccategoryMat3,new_preW,namelist_dic,DimentionN) NewpreW_zeroadded, NewtopdocveccategoryMat_zeroadded, max_numwords_in_class = yahoo_data_preprocess_func.Create_NewpreW_zero_add(NewpreW, NewtopdocveccategoryMat) scale = 0.01 count = 0 for i in range(3000):count += topdocveccategoryMat[274].values()[i][u"急減"] initialW = np.r_[np.random.randn(3,len(NewpreW_zeroadded)) * scale] k2list = [] for i in range(5): #initialW = np.r_[np.random.random((3,len(NewpreW_zeroadded))) * scale * 2 - scale] #initialW = np.r_[np.random.randn(3,len(NewpreW_zeroadded)) * scale] initialW = np.r_[-scale * np.random.random((1,len(NewpreW_zeroadded))), scale * np.random.random((1,len(NewpreW_zeroadded)))] #initialW = np.r_[-scale * np.ones((1,len(NewpreW_zeroadded))),