Пример #1
0
def deleteCharaterBymissRate(originPath,savePath,rate):
    """
    Remove the characters (columns) holding the most missing values and save the result.

    ``rate`` is applied as a fraction of the column count: the
    ``int(n_columns * rate)`` columns with the highest NaN counts are removed.
    No per-column missing ratio is compared against ``rate``. When several
    columns have the same NaN count, the one with the lower index is picked
    first.

    Every string in the ``begin`` list returned by ``readNex`` has its
    ``nchar = <old column count>`` text rewritten to the new column count
    before the data is handed to ``saveNex``.

    :param originPath: path of the source file, passed to ``readNex``
    :param savePath: output path, passed to ``saveNex``
    :param rate: fraction of the columns to remove
    :return: None
    """
    originData, missRow, speciesName, begin, end = readNex(originPath)
    charCount = len(originData[0])
    dropCount = int(charCount * rate)

    # Count the NaNs of every column in a single vectorised pass instead of
    # testing each cell separately.
    nanPerColumn = np.isnan(np.asarray(originData, dtype=float)).sum(axis=0)

    # nlargest keeps the first-seen column among equal counts, so the
    # selection does not depend on how the counts are stored.
    ranked = heapq.nlargest(dropCount, enumerate(nanPerColumn),
                            key=lambda pair: pair[1])
    dropColumns = sorted(column for column, _ in ranked)

    # The search and replacement strings are the same for every header line.
    pattern = "nchar *= *{}".format(charCount)
    substring = "nchar = {}".format(charCount - len(dropColumns))
    for index, headerLine in enumerate(begin):
        begin[index] = re.sub(pattern, substring, headerLine)

    if dropColumns:
        # One call removes all selected columns; deleting them one by one
        # copies the whole array once per column.
        originData = np.delete(originData,
                               np.asarray(dropColumns, dtype=int),
                               axis=1)

    saveNex(savePath,speciesName, originData, begin, end)
Пример #2
0
def dirtyhand():
    """
    Pass hand-written parenthesised tree strings for three data sets to ``mouse2Tre``.

    For each of ``Longrich2010.nex``, ``Dikow2009.nex`` and ``Liu2011.nex``
    in the hard-coded directory, the species names are read with ``readNex``
    and the matching tree string is handed to ``mouse2Tre`` together with
    those names.

    :return: None
    """
    path = r'C:\Users\pro\Desktop\int_data'
    # file='Aria2015.nex'
    # data, miss_row, speciesName = readNex(os.path.join(path, file))
    # mou='(Aysheaia,Anomalocaris,Hurdia,(Isoxys, Surusicaris,(((Canadaspis,Fuxianhuia),Occacaris),(Kunmingella,(((Martinssonia,Cephalocarida),Rehbachiella),((Agnostus,Kiisortoqia),((((Olenoides,Naraoia),Xandarella,Aglaspis),Emeraldella),((Jianfengia,Fortiforceps),(Yohoia,((((Leanchoilia_superlata,Leanchoilia_persephone),Leanchoilia_illecebrosa,(Oestokerkus,Yawunik),Actaeus,Oelandocaris),Alalcomenaeus),Haikoucaris)),((Offacolus,Dibasterium),(Weinbergina,Eurypterida))))))))));'
    # mouse2Tre(mou,speciesName)
    # mou2='(Aysheaia,(Anomalocaris ,Hurdia),(Isoxys,(((Canadaspis,Fuxianhuia),Occacaris),(Surusicaris,(Jianfengia,(Fortiforceps,((Yohoia,(((((Leanchoilia_superlata , Leanchoilia_persephone),Leanchoilia_illecebrosa,Actaeus,Oelandocaris),(Oestokerkus,Yawunik)),Alalcomenaeus),Haikoucaris)),((((Kunmingella,Agnostus),((Martinssonia,Cephalocarida),Rehbachiella)),(((Olenoides,Naraoia),Xandarella,Aglaspis),Emeraldella)),(((Offacolus,Dibasterium),(Weinbergina,Eurypterida)),Kiisortoqia)))))))))'
    # mouse2Tre(mou2, speciesName)
    file = 'Longrich2010.nex'
    data, miss_row, speciesName = readNex(os.path.join(path, file))
    mou = '(Thescelosaurus_neglectus,Psittacosaurus_spp,((Stegoceras_validum,(Gravitholus_albertae,Colepiocephale_lambei)),Texacephale_langstoni,Hanssuesia_sternbergi,(Sphaerotholus_brevis,(Sphaerotholus_goodwini,(Sphaerotholus_edmontonense,Sphaerotholus_buchholtzae)),((Alaskacephale_gangloffi,Pachycephalosaurus_wyomingensis,(Stygimoloch_spinifer,Dracorex_hogwartsi)),(Tylocephale_gilmorei,Prenocephale_prenes,(Homalocephale_calathocercos,Goyocephale_lattimorei,Wannanosaurus_yansiensis))))));'
    mouse2Tre(mou, speciesName)
    file = 'Dikow2009.nex'
    data, miss_row, speciesName = readNex(os.path.join(path, file))
    # The literal below must stay on one physical line: a single-quoted
    # string cannot contain a raw line break.
    mou = '(Bombylius_major,((Apsilocephala_longistyla,(Prorates_sp_Escalante,(Phycus_frommeri,Hemigephyra_atra))),((Apiocera_painteri,((Opomydas_townsendi,Mydas_clavatus),(Mitrodetus_dentitarsis,(Nemomydas_brachyrhynchus,Afroleptomydas_sp_Clanwilliam)))),((Rhipidocephala_sp_HaroldJohnson,(Holcocephala_calva,Holcocephala_abdominalis)),((Perasis_transvaalensis,(Laphystia_tolandi,(Trichardis_effrena,(Nusa_infumata,((Laxenecera_albicincta,Hoplistomerus_nobilis),((Pilica_formidolosa,(Cerotainia_albipilosa,Atomosia_puella)),((Stiphrolamyra_angularis,Lamyra_gulo),(Laphria_aktis,Choerades_bella)))))))),((((Damalis_monochaetes,Damalis_annulata),(Rhabdogaster_pedion,Acnephalum_cylindricum)),(((Pegesimallus_laticornis,(Diogmites_grossus,(Plesiomma_sp_Guanacaste,(Dasypogon_diadema,(Saropogon_luteus,Lestomyia_fraudiger))))),((Trichoura_sp_Tierberg,Ablautus_coquilletti),(Molobratia_teutonus,(Nicocles_politus,(Leptarthrus_brevirostris,(Cyrtopogon_rattus,Ceraturgus_fasciatus)))))),((Willistonina_bilineata,(Eudioctria_albius,(Dioctria_hyalipennis,(Dioctria_rufipes,Dioctria_atricapillus)))),((Gonioscelis_ventralis,(Stenopogon_rufibarbis,Ospriocerus_aeacus)),((Tillobroma_punctipennis,(Prolepsis_tristis,Microstylum_sp_Karkloof)),(Lycostommyia_albifacies,(Scylaticus_costalis,Connomyia_varipennis))))))),(((Lasiopogon_cinctus,Lasiopogon_aldrichii),(Stichopogon_punctum,(Stichopogon_trifasciatus,Stichopogon_elegantulus))),(((Euscelidia_pulchra,Beameromyia_bifida),((Leptogaster_cylindrica,Leptogaster_arida),(Tipulogaster_glabrata,Lasiocnemus_lugens))),(((Emphysomera_pallidapex,Emphysomera_conopsoides),(Ommatius_tibialis,Afroestricus_chiastoneurus)),((Proctacanthus_philadelphicus,Pogonioefferia_pogonias),((Philodicus_tenuipes,(Promachus_amastrus,Megaphorus_pulchrus)),((Neolophonotus_bimaculatus,Dasophrys_crenulatus),(Neoitamus_cyanurus,(Clephydroneura_sp_Kepong,(Dysmachus_trigonus,(Philonicus_albiceps,(Machimus_occidentalis,(Tolmerus_atricapillus,(Asilus_sericeus,Asilus_crabroniformis)))))))))))))))))));'
    mouse2Tre(mou, speciesName)
    file = 'Liu2011.nex'
    data, miss_row, speciesName = readNex(os.path.join(path, file))
    mou = '(Cycloneuralia,((Aysheaia,(Tardigrada,(Orstenotubulus,(Paucipodia,((Hadranax,Xenusion),(Microdictyon,(Cardiodictyon,(Hallucigenia,(Onychodictyon,(Luolishania,(Collins_monster,(Miraluolishania,Onychophora)))))))))))),(Jianshanopodia,(Megadictyon,(Kerygmachela,(Pambdelurion,(Opabinia,(((Anomalocaris,Laggania),Hurdia),(Diania,(Schinderhannes,(Fuxianhuia,(Leanchoilia,Euarthropoda))))))))))));'
    mouse2Tre(mou, speciesName)
Пример #3
0
def main():
    """
    Run ``readTre2mouse`` on every tree file that accompanies a ``nex`` file.

    For each directory entry ending in ``nex`` the species names are read
    with ``readNex`` and printed with their index. ``readTre2mouse`` is then
    called for the plain ``.tre`` file and for each suffixed variant
    (``_ii``, ``_knn``, ``_me``, ``_sf``, ``_auto``, ``_newTech``), each time
    with a ``.txt`` path of the same stem as third argument.

    Any exception raised while handling one entry is printed and the
    remaining steps for that entry are abandoned; the loop then moves on.

    :return: None
    """
    path = r'C:\Users\pro\Desktop\int_data'
    variantSuffixes = ('_ii', '_knn', '_me', '_sf', '_auto', '_newTech')
    for entry in os.listdir(path):
        try:
            if not entry.endswith('nex'):
                continue
            # file='Liu2011.nex'
            _data, _missRow, speciesName = readNex(os.path.join(path, entry))
            for position, name in enumerate(speciesName):
                print(position, name)

            # Plain tree: swap the three-letter extension, keeping the dot.
            plainTreeName = entry[:-3] + 'tre'
            print(plainTreeName)
            readTre2mouse(os.path.join(path, plainTreeName),
                          speciesName,
                          os.path.join(path, entry[:-3] + 'txt'))

            # Suffixed variants: strip ".nex" entirely, then append the
            # suffix and the extension.
            stem = entry[:-4]
            for suffix in variantSuffixes:
                readTre2mouse(os.path.join(path, stem + suffix + '.tre'),
                              speciesName,
                              os.path.join(path, stem + suffix + '.txt'))
        except Exception as e:
            print(e)
Пример #4
0
from utils.base_impute import random_inpute
from utils.base_tools import shear_dile
from utils.read_file import readNex
from ycimpute.imputer import knnimput, mice, EM
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler, SimpleFill

# Input directory plus two output locations; only `path` is used in the
# visible part of this script.
path = r'nexus_files'
pciturePath = r'result/picture'
save_path = r'result/ped'
# Result accumulators; nothing in the visible part of this script fills them.
total_result_half = {}
total_result_all = {}
for file in os.listdir(path):
    try:
        # Log a banner carrying the current file name.
        logger.info(
            "**********************{}********************".format(file))
        data, misss_row, speciesname, begin, end = readNex(
            os.path.join(path, file))
        # NOTE(review): every value is shifted by 10; the reason is not
        # visible here - confirm before changing.
        data = data + 10
    except ValueError:
        # A ValueError skips this file. The move call below is commented
        # out, so the second message is printed without any file being moved.
        print("可能存在数据多态问题")
        #shear_dile(os.path.join(path, file), os.path.join("G:\labWork\cladistic-data-master\可能无用数据"))
        print("文件移动成功")
        continue
    originData = random_inpute(data)
    for missPattern in ['block', 'normal', 'taxa', 'chara']:
        # Missing rate only goes up to 0.5
        half = []
        methed_names_half = [
            'mice_misc', 'ii_misc', 'median_misc', 'random_misc', 'mida_misc',
            'gain_misc'
        ]
        # Missing rate goes up to 0.9
from baseline.myMethod import imputeMethod as TAI

from utils.handle_missingdata import gene_missingdata, gene_missingdata_taxa_bias, gene_missingdata_chara_bias, \
    gene_missingdata_block_bias
#originDataPath=r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\01起始含缺失数据'
# Directory whose files are read by the loop below.
noMissingDataPath = r'C:\Users\pro\Desktop\实验二自编码器建树\古生物数据集测试\02随机插补无缺失数据集'
#simDataPath=r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\03无缺失随机缺失模拟数据'
# Generate the simulated data sets
# for file in os.listdir(originDataPath):
#     data,missRow,speciesName,begin,end=readNex(os.path.join(originDataPath,file))
#     noMissingData = impyute.imputation.cs.random(data)
#     saveData(noMissingDataPath,file,speciesName,noMissingData,begin,end)

# Not referenced in the visible part of this script.
imputedDataPath = r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\04缺失插补结果'
# Walk every file of noMissingDataPath, with a tqdm progress bar.
for file in tqdm.tqdm(os.listdir(noMissingDataPath)):
    originData, missRow, speciesName, begin, end = readNex(
        os.path.join(noMissingDataPath, file))
    # Only the 'normal' pattern is listed, so the other branches of the
    # if/elif chain below are never taken as written.
    for missPattern in ['normal']:
        # Fresh result container for each pattern.
        result = {}
        for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            # Pick the generator that matches the pattern name.
            # NOTE(review): missData is overwritten on every pass and not
            # consumed in the visible lines - confirm against the full file.
            if missPattern == 'normal':
                missData = gene_missingdata(rate=missRate, data=originData)
            elif missPattern == 'taxa':
                missData = gene_missingdata_taxa_bias(rate=missRate,
                                                      data=originData)
            elif missPattern == 'chara':
                missData = gene_missingdata_chara_bias(rate=missRate,
                                                       data=originData)
            elif missPattern == 'block':

                missData = gene_missingdata_block_bias(rate=missRate,
                                                       data=originData)
        else:
            imputedData = mice.MICE().complete(missData)
        result = addResult(result, missRate, missPattern, imputationMethod,
                           evaluate.RMSE(originData, imputedData),
                           MAE(originData, imputedData),
                           masked_mape_np(originData, imputedData))
    except Exception as e:
        print(e)
        imputedData = 'none'
        result = addResult(result, missRate, missPattern, imputationMethod,
                           np.inf, np.inf, np.inf)
    return result, imputedData


if __name__ == "__main__":
    dataPath = r'C:\Users\pro\Desktop\实验二自编码器建树\古生物数据集测试\01起始数据集\01_Yang2015.nex'

    missData, missRow, speciesName, begin, end = readNex(dataPath)
    #missData = lableEncoder(originData)
    result = {}
    missRate = 0.3
    missPattern = "normal"
    print(missData)
    s = set()
    for i in range(len(missData)):
        for j in range(len(missData[0])):
            s.add(missData[i][j])
    print(s)
    print(np.isnan(missData).any())
    print(np.isfinite(missData).all())
    t = mice.MICE().complete(missData)
    # filePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\usefulData'
    # modelSavePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\Model'
    # fileSavePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\fixed'
    filePath = r'G:\labWork\imputed_experiment_data'
    modelSavePath = r'G:\labWork\imputed_experiment_data\model'
    fileSavePath = r'G:\labWork\imputed_experiment_data\fix'
    for file in os.listdir(filePath):
        if file.endswith('tnt'):
            file = file[:-4]
            for i in [0.1, 0.2, 0.4, 0.5]:
                try:
                    # file='02Bennett94pterosaurs'

                    # file='Liu2011'
                    # originData,miss_mask,speciesName=readNex(r'C:\Users\pro\Desktop\all_nex_data\{}.nex'.format(file))
                    originData, miss_mask, speciesName, begin, end = readNex(
                        os.path.join(filePath, '{}.tnt'.format(file)))
                    missData, miss_mask = gene_missingdata(rate=i,
                                                           data=originData)

                    try:
                        min_max_scaler = preprocessing.MinMaxScaler()
                        data = min_max_scaler.fit_transform(missData)
                        miss_location = get_miss_location(data[miss_mask])
                        modelName = file + str(i)
                        inp = interpolation(modelName=modelName,
                                            completeData=np.delete(data,
                                                                   miss_mask,
                                                                   axis=0))
                        if not os.path.exists(
                                os.path.join(modelSavePath,
                                             '{}.pkl'.format(modelName))):