def mainWork(path, savePath):
    """Run the MICE + TAI imputation experiment over every xlsx/csv file in *path*.

    For each data file, missing pattern and missing rate, a missing-data
    matrix is simulated, imputed with MICE, then refined with the TAI
    autoencoder; accumulated metrics are written to a timestamped JSON
    file under *savePath*.

    Args:
        path: Directory containing the input ``.xlsx``/``.csv`` data files.
        savePath: Output directory for the result JSON files (created on
            demand).
    """
    pbar = tqdm.tqdm(os.listdir(path), desc='dirs')
    for file in pbar:
        pbar.set_description("Processing %s" % file)
        if file.endswith('xlsx') or file.endswith('csv'):
            originData = readAllTypeFile(os.path.join(path, file))
            for missPattern in ['normal']:
                result = {}
                for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
                    # Simulate missingness with the requested pattern.
                    if missPattern == 'normal':
                        missData = gene_missingdata(rate=missRate, data=originData)
                    elif missPattern == 'taxa':
                        missData = gene_missingdata_taxa_bias(rate=missRate, data=originData)
                    elif missPattern == 'chara':
                        missData = gene_missingdata_chara_bias(rate=missRate, data=originData)
                    elif missPattern == 'block':
                        missData = gene_missingdata_block_bias(rate=missRate, data=originData)
                    else:
                        raise Exception("缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")

                    # MICE both merges its metrics into `result` and yields the
                    # imputed matrix reused as the TAI warm start below.
                    result, MICEImputedData = MICE(result, originData, missData, missRate, missPattern)
                    for firstImputedMethod in ['mice']:
                        # NOTE(review): firstImputedData is only assigned for
                        # 'mice'; extending this list needs a matching branch.
                        if firstImputedMethod == 'mice':
                            firstImputedData = MICEImputedData
                        for loss in ['MSELoss']:
                            # for autoMethod in ['Autoencoder','ResAutoencoder','StockedAutoencoder','StockedResAutoencoder']:
                            for autoMethod in ['StockedResAutoencoder']:
                                start = time.time()
                                result, _ = TAI(result=result, firstImputedMethod=firstImputedMethod,
                                                firstImputedData=firstImputedData,
                                                loss=loss, autoMethod=autoMethod,
                                                originData=originData, missData=missData,
                                                missRate=missRate, missPattern=missPattern,
                                                )
                                logger.info("{}-{}-{}训练耗时:{}".format(firstImputedMethod, loss, autoMethod, time.time() - start))
                # exist_ok=True replaces the check-then-create race of the
                # original `if not os.path.exists(...)` guard.
                os.makedirs(savePath, exist_ok=True)
                saveJson(result, os.path.join(savePath, "{}_{}_{}_{}.json".format("allMethod", missPattern, file, datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))))
# ---- Exemplo n.º 2 (scraped-example separator; stray artifact lines) ----
        # NOTE(review): fragment of a larger function — `missPattern`,
        # `originData`, `methed_names_half` and `methed_names_all` come from
        # an enclosing scope that is not visible in this chunk.
        # Pre-register one accumulator per (first-imputer, loss, autoencoder)
        # combination as a dynamically named module-level global.
        for first_imputed_method in ['ii', 'mice']:
            for loss in ['MSELoss']:
                for method in [
                        'Autoencoder', 'ResAutoencoder', 'StockedAutoencoder',
                        'StockedResAutoencoder'
                ]:
                    varname = "{}_{}_{}".format(first_imputed_method, loss,
                                                method)
                    # HACK: results live in globals()[varname] as a
                    # list-of-4-lists; names are also collected for lookup.
                    globals()[varname] = [[] for _ in range(4)]
                    methed_names_half.append(varname)
                    methed_names_all.append(varname)

        # for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
            # Simulate missing data at rate i under the selected pattern.
            if missPattern == 'normal':
                missData = gene_missingdata(rate=i, data=originData)
            elif missPattern == 'taxa':
                missData = gene_missingdata_taxa_bias(rate=i, data=originData)
            elif missPattern == 'chara':
                missData = gene_missingdata_chara_bias(rate=i, data=originData)
            elif missPattern == 'block':
                missData = gene_missingdata_block_bias(rate=i, data=originData)
            else:
                raise Exception(
                    "缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")

            # Distinct non-NaN values present in missData — presumably the
            # category/character codes; TODO confirm against the caller.
            mark = [
                temp[0] for temp in pd.DataFrame(np.unique(missData)).dropna(
                    axis=0).values
            ]
# ---- Exemplo n.º 3 (scraped-example separator; stray artifact lines) ----
def mainWork(path, savePath):
    """Benchmark the baseline imputers and the TAI autoencoder refinement.

    For each input file, missing pattern and missing rate, a missing matrix
    is simulated, the baseline methods (Random/Median/KNN/EM/II/GAIN/MIDA/
    MICE) are evaluated, TAI is trained on the KNN/II/MICE warm starts, and
    the accumulated metrics are saved as one JSON per (file, pattern).

    Args:
        path: Directory with input files readable by ``readAllTypeFile``.
        savePath: Output directory for the result JSON files — now created
            on demand (fix: the original wrote without ensuring it exists).
    """
    for file in os.listdir(path):
        originData = readAllTypeFile(os.path.join(path, file))
        for missPattern in ['normal']:
            # for missPattern in ['normal','block',  'taxa', 'chara']:
            # for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            result = {}
            for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
                # Simulate missingness with the requested pattern.
                if missPattern == 'normal':
                    missData = gene_missingdata(rate=missRate, data=originData)
                elif missPattern == 'taxa':
                    missData = gene_missingdata_taxa_bias(rate=missRate,
                                                          data=originData)
                elif missPattern == 'chara':
                    missData = gene_missingdata_chara_bias(rate=missRate,
                                                           data=originData)
                elif missPattern == 'block':
                    missData = gene_missingdata_block_bias(rate=missRate,
                                                           data=originData)
                else:
                    raise Exception(
                        "缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")

                # Baselines: each call merges its metrics into `result`; the
                # KNN/II/MICE matrices are retained as TAI warm starts.
                result, _ = Random(result, originData, missData, missRate,
                                   missPattern, 'disperse')
                result, _ = Medain(result, originData, missData, missRate,
                                   missPattern, 'disperse')
                result, KNNImputedData = KNN(result, originData, missData,
                                             missRate, missPattern, 'disperse')
                result, EMImputedData = EM(result, originData, missData,
                                           missRate, missPattern, 'disperse')
                result, IIImputedData = II(result, originData, missData,
                                           missRate, missPattern, 'disperse')
                result, _ = GAIN(result, originData, missData, missRate,
                                 missPattern, 'disperse')
                result, _ = MIDA(result, originData, missData, missRate,
                                 missPattern, 'disperse')
                result, MICEImputedData = MICE(result, originData, missData,
                                               missRate, missPattern,
                                               'disperse')
                # for firstImputedMethod in ['ii', 'mice']:
                for firstImputedMethod in ['knn', 'ii', 'mice']:
                    if firstImputedMethod == 'knn':
                        firstImputedData = KNNImputedData
                    elif firstImputedMethod == 'ii':
                        firstImputedData = IIImputedData
                    elif firstImputedMethod == 'mice':
                        firstImputedData = MICEImputedData
                    for loss in ['MSELoss']:
                        # for autoMethod in ['Autoencoder','ResAutoencoder','StockedAutoencoder','StockedResAutoencoder']:
                        for autoMethod in ['Autoencoder']:
                            start = time.time()
                            # .copy() keeps the warm start pristine for the
                            # next (loss, autoMethod) combination.
                            result = TAI(
                                result=result,
                                firstImputedMethod=firstImputedMethod,
                                firstImputedData=firstImputedData.copy(),
                                loss=loss,
                                autoMethod=autoMethod,
                                originData=originData,
                                missData=missData,
                                missRate=missRate,
                                missPattern=missPattern)
                            logger.info("改后{}-{}-{}训练耗时:{}".format(
                                firstImputedMethod, loss, autoMethod,
                                time.time() - start))
            # Fix: ensure the output directory exists before writing;
            # saveJson would otherwise fail on a missing savePath.
            os.makedirs(savePath, exist_ok=True)
            saveJson(
                result,
                os.path.join(
                    savePath, "{}_{}_{}_{}.json".format(
                        "allmethod", missPattern, file,
                        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))))
#simDataPath=r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\03无缺失随机缺失模拟数据'
#生成模拟数据集
# for file in os.listdir(originDataPath):
#     data,missRow,speciesName,begin,end=readNex(os.path.join(originDataPath,file))
#     noMissingData = impyute.imputation.cs.random(data)
#     saveData(noMissingDataPath,file,speciesName,noMissingData,begin,end)

# NOTE(review): top-level script fragment — `noMissingDataPath`, `readNex`,
# the gene_missingdata* helpers and `lableEncoder` (sic) are defined
# elsewhere in the file/project.
imputedDataPath = r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\04缺失插补结果'
for file in tqdm.tqdm(os.listdir(noMissingDataPath)):
    # Parse the Nexus file into the data matrix plus the bookkeeping needed
    # to rewrite it later (species names and header/footer span).
    originData, missRow, speciesName, begin, end = readNex(
        os.path.join(noMissingDataPath, file))
    for missPattern in ['normal']:
        result = {}
        for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            # Simulate missingness with the requested pattern and rate.
            if missPattern == 'normal':
                missData = gene_missingdata(rate=missRate, data=originData)
            elif missPattern == 'taxa':
                missData = gene_missingdata_taxa_bias(rate=missRate,
                                                      data=originData)
            elif missPattern == 'chara':
                missData = gene_missingdata_chara_bias(rate=missRate,
                                                       data=originData)
            elif missPattern == 'block':

                missData = gene_missingdata_block_bias(rate=missRate,
                                                       data=originData)
            else:
                raise Exception(
                    "缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")
#            saveData(simDataPath, "{}_{}".format(str(missRate),file), speciesName, missData, begin, end)
            # NOTE(review): the encoded matrix is overwritten on every
            # iteration and `result` is never consumed — looks truncated/WIP.
            missData = lableEncoder(missData)
    # fileSavePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\fixed'
    # NOTE(review): indented fragment of a larger, not-visible scope; this
    # snippet is also cut off mid-statement — the final inp.fit(...) call is
    # never closed. Hard-coded Windows paths should be parameterized.
    filePath = r'G:\labWork\imputed_experiment_data'
    modelSavePath = r'G:\labWork\imputed_experiment_data\model'
    fileSavePath = r'G:\labWork\imputed_experiment_data\fix'
    for file in os.listdir(filePath):
        if file.endswith('tnt'):
            # Drop the '.tnt' extension to get the dataset base name.
            file = file[:-4]
            for i in [0.1, 0.2, 0.4, 0.5]:
                try:
                    # file='02Bennett94pterosaurs'

                    # file='Liu2011'
                    # originData,miss_mask,speciesName=readNex(r'C:\Users\pro\Desktop\all_nex_data\{}.nex'.format(file))
                    originData, miss_mask, speciesName, begin, end = readNex(
                        os.path.join(filePath, '{}.tnt'.format(file)))
                    # NOTE(review): here gene_missingdata returns a
                    # (missData, miss_mask) pair, unlike the single-return
                    # usage elsewhere in this file — confirm the API.
                    missData, miss_mask = gene_missingdata(rate=i,
                                                           data=originData)

                    try:
                        # Scale to [0, 1], locate missing cells, and fit (or
                        # reuse) one interpolation model per (file, rate).
                        min_max_scaler = preprocessing.MinMaxScaler()
                        data = min_max_scaler.fit_transform(missData)
                        miss_location = get_miss_location(data[miss_mask])
                        modelName = file + str(i)
                        inp = interpolation(modelName=modelName,
                                            completeData=np.delete(data,
                                                                   miss_mask,
                                                                   axis=0))
                        if not os.path.exists(
                                os.path.join(modelSavePath,
                                             '{}.pkl'.format(modelName))):
                            inp.fit(
                                os.path.join(modelSavePath,
    # # Normalize the data and drop the label column (orig: 归一化,去掉标签).
    # NOTE(review): indented fragment — the 4-space indent implies an
    # enclosing scope that is not visible in this chunk.
    file = r'public_data/1_Iris.xlsx'
    fileSavePath = r'G:\labWork\imputed_experiment_data\fix'
    # file='AhyongOM04crabs'

    modelSavePath = r'G:\labWork\imputed_experiment_data\model'

    logger.info("**********************{}********************".format(file))
    data = pd.read_excel(file, sheet_name="dataset")
    dt = np.array(data.values)
    data = dt.astype('float')
    # NOTE(review): likely a bug — `data[:-1]` drops the last ROW, then
    # `target` reads the last row of the already-truncated array; a
    # feature/label split usually slices COLUMNS. TODO confirm intent.
    data = data[:-1]
    target = data[-1]
    for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:

        # Simulate a missing matrix at rate i and record where values miss.
        missData = gene_missingdata(rate=i, data=data)

        mask = get_mask(missData)
        miss_location = get_miss_location(missData)
        # Standardize the data (orig: 数据均值化).
        # min_max_scaler = preprocessing.MinMaxScaler()
        # mm_missData = min_max_scaler.fit_transform(missData)
        min_max_scaler = preprocessing.StandardScaler()
        mm_missData = min_max_scaler.fit_transform(missData)
        # NOTE(review): modelName embeds the path separator from `file`,
        # yielding a nested .pkl path under modelSavePath — confirm intended.
        modelName = file + str(i)
        inp = interpolation_mask(modelName=modelName,
                                 completeData=random_inpute(mm_missData))
        # Train and persist the model only if it was not saved previously.
        if not os.path.exists(
                os.path.join(modelSavePath, '{}.pkl'.format(modelName))):
            inp.fit(os.path.join(modelSavePath, '{}.pkl'.format(modelName)),
                    mask)