def deleteCharaterBymissRate(originPath, savePath, rate):
    """
    Remove the characters (matrix columns) with the most missing values and save the result.

    The number of columns removed is ``int(number_of_columns * rate)``; the
    columns chosen are the ones holding the highest count of NaN entries.
    Every string in the header list returned by ``readNex`` has its
    ``nchar = <old count>`` text rewritten to the new column count before the
    matrix is handed to ``saveNex``.

    NOTE(review): the previous description said that characters whose missing
    rate exceeds ``rate`` are deleted. The code has always removed a fixed
    fraction of the columns instead; that behaviour is kept here - confirm
    which one is intended.

    :param originPath: path of the original file, passed to ``readNex``
    :param savePath: path the reduced matrix is saved to, passed to ``saveNex``
    :param rate: fraction of the columns to remove
    :return: None
    """
    originData, missRow, speciesName, begin, end = readNex(originPath)
    charCount = len(originData[0])
    dropCount = int(charCount * rate)

    # Count the NaN entries of every column in one vectorised pass.
    nanPerColumn = np.isnan(np.asarray(originData, dtype=float)).sum(axis=0)
    missNumber = [(int(count), j) for j, count in enumerate(nanPerColumn)]

    # nlargest keeps the input order among equal counts, so ties are resolved
    # in favour of the lower column index.
    worst = heapq.nlargest(dropCount, missNumber, key=lambda x: x[0])
    dropColumns = sorted(j for _, j in worst)

    # Pattern and replacement do not depend on the header line: build them once.
    pattern = "nchar *= *{}".format(str(charCount))
    substring = "nchar = {}".format(str(charCount - len(dropColumns)))
    for index, headerLine in enumerate(begin):
        begin[index] = re.sub(pattern, substring, headerLine)

    # Drop all selected columns in a single call; skipped when nothing is
    # selected so the matrix is passed on untouched.
    if dropColumns:
        originData = np.delete(originData, dropColumns, 1)
    saveNex(savePath, speciesName, originData, begin, end)
def dirtyhand():
    """
    Pass hand-written reference trees to ``mouse2Tre`` for a fixed set of data files.

    For each (file name, parenthesised tree string) pair below, the file is
    read with ``readNex`` from the hard-coded data directory and the tree
    string is passed to ``mouse2Tre`` together with the species names of that
    file. Pairs are processed in the listed order; an exception raised for one
    pair propagates and stops the run.

    :return: None
    """
    path = r'C:\Users\pro\Desktop\int_data'

    # Disabled earlier run, kept for reference:
    # file='Aria2015.nex'
    # data, miss_row, speciesName = readNex(os.path.join(path, file))
    # mou='(Aysheaia,Anomalocaris,Hurdia,(Isoxys, Surusicaris,(((Canadaspis,Fuxianhuia),Occacaris),(Kunmingella,(((Martinssonia,Cephalocarida),Rehbachiella),((Agnostus,Kiisortoqia),((((Olenoides,Naraoia),Xandarella,Aglaspis),Emeraldella),((Jianfengia,Fortiforceps),(Yohoia,((((Leanchoilia_superlata,Leanchoilia_persephone),Leanchoilia_illecebrosa,(Oestokerkus,Yawunik),Actaeus,Oelandocaris),Alalcomenaeus),Haikoucaris)),((Offacolus,Dibasterium),(Weinbergina,Eurypterida))))))))));'
    # mouse2Tre(mou,speciesName)
    # mou2='(Aysheaia,(Anomalocaris ,Hurdia),(Isoxys,(((Canadaspis,Fuxianhuia),Occacaris),(Surusicaris,(Jianfengia,(Fortiforceps,((Yohoia,(((((Leanchoilia_superlata , Leanchoilia_persephone),Leanchoilia_illecebrosa,Actaeus,Oelandocaris),(Oestokerkus,Yawunik)),Alalcomenaeus),Haikoucaris)),((((Kunmingella,Agnostus),((Martinssonia,Cephalocarida),Rehbachiella)),(((Olenoides,Naraoia),Xandarella,Aglaspis),Emeraldella)),(((Offacolus,Dibasterium),(Weinbergina,Eurypterida)),Kiisortoqia)))))))))'
    # mouse2Tre(mou2, speciesName)

    # (data file, reference tree) pairs, processed in this order.
    referenceTrees = [
        (
            'Longrich2010.nex',
            '(Thescelosaurus_neglectus,Psittacosaurus_spp,((Stegoceras_validum,(Gravitholus_albertae,Colepiocephale_lambei)),Texacephale_langstoni,Hanssuesia_sternbergi,(Sphaerotholus_brevis,(Sphaerotholus_goodwini,(Sphaerotholus_edmontonense,Sphaerotholus_buchholtzae)),((Alaskacephale_gangloffi,Pachycephalosaurus_wyomingensis,(Stygimoloch_spinifer,Dracorex_hogwartsi)),(Tylocephale_gilmorei,Prenocephale_prenes,(Homalocephale_calathocercos,Goyocephale_lattimorei,Wannanosaurus_yansiensis))))));',
        ),
        (
            'Dikow2009.nex',
            # Two adjacent literals: Python joins them into one string.
            '(Bombylius_major,((Apsilocephala_longistyla,(Prorates_sp_Escalante,(Phycus_frommeri,Hemigephyra_atra))),((Apiocera_painteri,((Opomydas_townsendi,Mydas_clavatus),(Mitrodetus_dentitarsis,(Nemomydas_brachyrhynchus,Afroleptomydas_sp_Clanwilliam)))),((Rhipidocephala_sp_HaroldJohnson,(Holcocephala_calva,Holcocephala_abdominalis)),((Perasis_transvaalensis,(Laphystia_tolandi,(Trichardis_effrena,(Nusa_infumata,((Laxenecera_albicincta,Hoplistomerus_nobilis),((Pilica_formidolosa,(Cerotainia_albipilosa,Atomosia_puella)),((Stiphrolamyra_angularis,Lamyra_gulo),(Laphria_aktis,Choerades_bella)))))))),((((Damalis_monochaetes,Damalis_annulata),(Rhabdogaster_pedion,Acnephalum_cylindricum)),(((Pegesimallus_laticornis,(Diogmites_grossus,(Plesiomma_sp_Guanacaste,(Dasypogon_diadema,(Saropogon_luteus,Lestomyia_fraudiger))))),((Trichoura_sp_Tierberg,Ablautus_coquilletti),(Molobratia_teutonus,(Nicocles_politus,(Leptarthrus_brevirostris,(Cyrtopogon_rattus,Ceraturgus_fasciatus)))))),((Willistonina_bilineata,(Eudioctria_albius,(Dioctria_hyalipennis,(Dioctria_rufipes,Dioctria_atricapillus)))),((Gonioscelis_ventralis,(Stenopogon_rufibarbis,Ospriocerus_aeacus)),((Tillobroma_punctipennis,(Prolepsis_tristis,Microstylum_sp_Karkloof)),(Lycostommyia_albifacies,(Scylaticus_costalis,Connomyia_varipennis))))))),(((Lasiopogon_cinctus,Lasiopogon_aldrichii),(Stichopogon_punctum,(Stichopogon_trifasciatus,Stichopogon_elegantulus))),(((Euscelidia_pulchra,Beameromyia_bifida),((Leptogaster_cylindrica,Leptogaster_arida),(Tipulogaster_glabrata,Lasiocnemus_lugens))),(((Emphysomera_pallidapex,Emphysomera_conopsoides),(Ommatius_tibialis,Afroestricus_chiastoneurus)),((Proctacanthus_philadelphicus,Pogonioefferia_pogonias),((Philodicus_tenuipes,(Promachus_amastrus,Megaphorus_pulchrus)),((Neolophonotus_bimaculatus,Dasophrys_crenulatus),(Neoitamus_cyanurus,(Clephydroneura_sp_Kepong,(Dysmachus_trigonus,(Philonicus_albiceps,(Machimus_occidentalis,(Tolmerus_atricapillus,(Asilus_sericeus,Asilus_crabroniformis)))))))))))))))))'
            '));',
        ),
        (
            'Liu2011.nex',
            '(Cycloneuralia,((Aysheaia,(Tardigrada,(Orstenotubulus,(Paucipodia,((Hadranax,Xenusion),(Microdictyon,(Cardiodictyon,(Hallucigenia,(Onychodictyon,(Luolishania,(Collins_monster,(Miraluolishania,Onychophora)))))))))))),(Jianshanopodia,(Megadictyon,(Kerygmachela,(Pambdelurion,(Opabinia,(((Anomalocaris,Laggania),Hurdia),(Diania,(Schinderhannes,(Fuxianhuia,(Leanchoilia,Euarthropoda))))))))))));',
        ),
    ]

    for file, mou in referenceTrees:
        # Only the species names (third item) are needed. Indexing instead of
        # unpacking into exactly three names keeps this working whether
        # readNex returns three values or the five values unpacked elsewhere.
        speciesName = readNex(os.path.join(path, file))[2]
        mouse2Tre(mou, speciesName)
def main():
    """
    Run ``readTre2mouse`` on every tree variant of every ``nex`` file in the data directory.

    For each directory entry whose name ends with ``nex`` the species names are
    read with ``readNex`` and printed with their index. ``readTre2mouse`` is
    then called once per tree variant (plain, ``_ii``, ``_knn``, ``_me``,
    ``_sf``, ``_auto``, ``_newTech``) with the ``.tre`` path, the species names
    and the matching ``.txt`` path.

    Any exception raised while handling one file is printed and the loop moves
    on to the next file; the remaining variants of the failing file are
    skipped.

    :return: None
    """
    path = r'C:\Users\pro\Desktop\int_data'
    variantTags = ('_ii', '_knn', '_me', '_sf', '_auto', '_newTech')

    for file in os.listdir(path):
        try:
            if not file.endswith('nex'):
                continue
            # file='Liu2011.nex'
            # Only the species names (third item) are needed. Indexing instead
            # of unpacking into exactly three names keeps this working whether
            # readNex returns three values or the five values unpacked
            # elsewhere; a length mismatch would otherwise be swallowed by the
            # except below and silently skip the file.
            speciesName = readNex(os.path.join(path, file))[2]
            for ind, i in enumerate(speciesName):
                print(ind, i)
            print((file[:-3] + 'tre'))

            # Plain tree first (extension swapped in place), then the tagged
            # variants built from the name without its last four characters.
            namePairs = [(file[:-3] + 'tre', file[:-3] + 'txt')]
            namePairs.extend(
                (file[:-4] + tag + '.tre', file[:-4] + tag + '.txt')
                for tag in variantTags
            )
            for treName, txtName in namePairs:
                treeSpecies = os.path.join(path, treName)
                SvaetreeSpecies = os.path.join(path, txtName)
                readTre2mouse(treeSpecies, speciesName, SvaetreeSpecies)
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(e)
# Script fragment: for every file in the input directory, read the matrix,
# fill it with random_inpute and start setting up per-missing-pattern result
# containers.
from utils.base_impute import random_inpute
from utils.base_tools import shear_dile
from utils.read_file import readNex
from ycimpute.imputer import knnimput, mice, EM
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler, SimpleFill

# Input directory and output locations (relative paths).
path = r'nexus_files'
pciturePath = r'result/picture'
save_path = r'result/ped'
total_result_half = {}
total_result_all = {}
for file in os.listdir(path):
    try:
        logger.info(
            "**********************{}********************".format(file))
        data, misss_row, speciesname, begin, end = readNex(
            os.path.join(path, file))
        # Shift every value by 10.
        # NOTE(review): the reason for the offset is not visible here - confirm.
        data = data + 10
    except ValueError:
        # Message text: "the data may contain polymorphic states".
        print("可能存在数据多态问题")
        #shear_dile(os.path.join(path, file), os.path.join("G:\labWork\cladistic-data-master\可能无用数据"))
        # Message text: "file moved successfully".
        # NOTE(review): this is printed although the move call above is
        # commented out, so no file is actually moved.
        print("文件移动成功")
        continue
    originData = random_inpute(data)
    for missPattern in ['block', 'normal', 'taxa', 'chara']:
        # Missing ratio only up to 0.5
        half = []
        methed_names_half = [
            'mice_misc', 'ii_misc', 'median_misc', 'random_misc', 'mida_misc',
            'gain_misc'
        ]
        # Missing ratio up to 0.9
# Script fragment: for every file in the complete-data directory, generate
# artificially masked copies of the matrix at several missing rates.
from baseline.myMethod import imputeMethod as TAI
from utils.handle_missingdata import gene_missingdata, gene_missingdata_taxa_bias, gene_missingdata_chara_bias, \
    gene_missingdata_block_bias

#originDataPath=r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\01起始含缺失数据'
# Directory that is iterated below; every file in it is read with readNex.
noMissingDataPath = r'C:\Users\pro\Desktop\实验二自编码器建树\古生物数据集测试\02随机插补无缺失数据集'
#simDataPath=r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\03无缺失随机缺失模拟数据'
# Generate the simulated dataset (disabled)
# for file in os.listdir(originDataPath):
#     data,missRow,speciesName,begin,end=readNex(os.path.join(originDataPath,file))
#     noMissingData = impyute.imputation.cs.random(data)
#     saveData(noMissingDataPath,file,speciesName,noMissingData,begin,end)
imputedDataPath = r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\04缺失插补结果'
for file in tqdm.tqdm(os.listdir(noMissingDataPath)):
    originData, missRow, speciesName, begin, end = readNex(
        os.path.join(noMissingDataPath, file))
    # Only the 'normal' pattern is listed, so the 'taxa', 'chara' and 'block'
    # branches below are not reached with the current list.
    for missPattern in ['normal']:
        # Fresh result dictionary for each missing pattern.
        result = {}
        for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            # Pick the generator that matches the pattern name.
            if missPattern == 'normal':
                missData = gene_missingdata(rate=missRate, data=originData)
            elif missPattern == 'taxa':
                missData = gene_missingdata_taxa_bias(rate=missRate,
                                                      data=originData)
            elif missPattern == 'chara':
                missData = gene_missingdata_chara_bias(rate=missRate,
                                                       data=originData)
            elif missPattern == 'block':
                missData = gene_missingdata_block_bias(rate=missRate,
                                                       data=originData)
else: imputedData = mice.MICE().complete(missData) result = addResult(result, missRate, missPattern, imputationMethod, evaluate.RMSE(originData, imputedData), MAE(originData, imputedData), masked_mape_np(originData, imputedData)) except Exception as e: print(e) imputedData = 'none' result = addResult(result, missRate, missPattern, imputationMethod, np.inf, np.inf, np.inf) return result, imputedData if __name__ == "__main__": dataPath = r'C:\Users\pro\Desktop\实验二自编码器建树\古生物数据集测试\01起始数据集\01_Yang2015.nex' missData, missRow, speciesName, begin, end = readNex(dataPath) #missData = lableEncoder(originData) result = {} missRate = 0.3 missPattern = "normal" print(missData) s = set() for i in range(len(missData)): for j in range(len(missData[0])): s.add(missData[i][j]) print(s) print(np.isnan(missData).any()) print(np.isfinite(missData).all()) t = mice.MICE().complete(missData)
# filePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\usefulData' # modelSavePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\Model' # fileSavePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\fixed' filePath = r'G:\labWork\imputed_experiment_data' modelSavePath = r'G:\labWork\imputed_experiment_data\model' fileSavePath = r'G:\labWork\imputed_experiment_data\fix' for file in os.listdir(filePath): if file.endswith('tnt'): file = file[:-4] for i in [0.1, 0.2, 0.4, 0.5]: try: # file='02Bennett94pterosaurs' # file='Liu2011' # originData,miss_mask,speciesName=readNex(r'C:\Users\pro\Desktop\all_nex_data\{}.nex'.format(file)) originData, miss_mask, speciesName, begin, end = readNex( os.path.join(filePath, '{}.tnt'.format(file))) missData, miss_mask = gene_missingdata(rate=i, data=originData) try: min_max_scaler = preprocessing.MinMaxScaler() data = min_max_scaler.fit_transform(missData) miss_location = get_miss_location(data[miss_mask]) modelName = file + str(i) inp = interpolation(modelName=modelName, completeData=np.delete(data, miss_mask, axis=0)) if not os.path.exists( os.path.join(modelSavePath, '{}.pkl'.format(modelName))):