def getIVForCross(df1, combination, target, number, loc, groupByKey=None):
    """
    Compute the IV of one batch of cross variables and append the per-variable
    totals to ``loc + str(number) + ".csv"``.

    :param df1: dataframe|data holding the variables to be crossed and the target column
    :param combination: list|variable combinations handed to crossVariable
    :param target: string|name of the target column
    :param number: int|suffix of the output file the result is appended to
    :param loc: string|path prefix of the output file
    :param groupByKey: string|when given, the crossed data is aggregated
                       (min/max/sum/mean) per key and binned before the IV is computed
    :return: None|the summed IV per variable is appended to the CSV (no header row)
    """
    if groupByKey is None:
        # Plain case: IV is computed directly on the crossed variables.
        crossed = crossVariable(df1,
                                combination,
                                target=target,
                                varlist=None,
                                ignoreList=None)
        ivData = iv_all(crossed, target, modeBinary=0)
    else:
        crossed = crossVariable(df1,
                                combination,
                                target=target,
                                varlist=None,
                                ignoreList=None,
                                numVar=1,
                                memmorSaving=0)
        aggregated = crossed.groupby(groupByKey).agg(['min', 'max', 'sum', 'mean'])
        # Flatten the (column, statistic) MultiIndex into "column_statistic".
        aggregated.columns = [
            "_".join(col).strip() for col in aggregated.columns.values
        ]
        # Of the target's aggregates only the mean is kept, under the target's own name.
        unwanted = [target + "_" + stat for stat in ['min', 'max', 'sum']]
        aggregated = aggregated.drop(unwanted, axis=1)
        aggregated = aggregated.rename(columns={target + "_mean": target})
        binned = binning(aggregated,
                         target,
                         qCut=10,
                         maxobjectFeatures=50,
                         varCatConvert=1)
        ivData = iv_all(binned, target, modeBinary=0)

    ivPerVariable = ivData.groupby('variable')['ivValue'].sum()
    ivPerVariable.to_csv(loc + str(number) + ".csv", mode='a', header=False)
# Script: bin the final cross-variable dataset, compute IV for every variable
# against TARGET and write a detailed and a per-variable summary CSV.
import warnings

from dataExploration import distReports, plotGrabh
from iv import iv_all, binning
import pandas as pd

# SettingWithCopyWarning is public in pandas.errors since pandas 1.5 and can no
# longer be imported from the private pandas.core.common module there; older
# pandas versions only offer the private location.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Directory holding the input dataset; the reports are written next to it.
loc = "/home/pooja/PycharmProjects/datanalysis/finalDatasets/"

train = pd.read_csv(loc + "final_withCross.csv")

binned = binning(train, 'TARGET', maxobjectFeatures=300)
ivData = iv_all(binned, 'TARGET')

# Detailed IV rows, then the IV summed per variable.
ivData.to_csv(loc + "iv_detailed_cross.csv")
ivData.groupby('variable')['ivValue'].sum().to_csv(loc + "iv_sum_cross.csv")
) prev = prev.set_index('SK_ID_CURR')[['AMT_ANNUITY', 'AMT_APPLICATION']] #prev['mult']=prev['AMT_ANNUITY']*prev['AMT_APPLICATION'] prev['div'] = prev['AMT_APPLICATION'] / prev['AMT_ANNUITY'] #prev=prev.round(2) b = prev.dtypes prev = prev.replace(np.inf, np.nan) #prev=normalize(prev) #prev =prev.astype(np.float32) prev = prev.join(target) prev = prev.groupby('SK_ID_CURR').agg(['min', 'max', 'sum', 'mean']) #prev.round(1).to_csv('/home/pooja/PycharmProjects/datanalysis/featureEngeering/test.csv') a = prev.memory_usage(deep=True) b = prev.dtypes prev.columns = [ str("_").join(col).strip() for col in prev.columns.values ] prev = prev.drop(['TARGET' + "_" + f for f in ['min', 'max', 'sum']], axis=1) prev.rename(columns={'TARGET' + "_mean": 'TARGET'}, inplace=True) binned = binning(prev, 'TARGET', qCut=10, maxobjectFeatures=50, varCatConvert=1) ivData = iv_all(binned, 'TARGET', modeBinary=0) F = ivData.groupby('variable')['ivValue'].sum() N = 0
# Script: for every configured folder, attach the application target to the
# folder's dataset, compute IV per variable and write the IV and distribution
# reports back into that folder.
import warnings

from dataExploration import distReports, plotGrabh
from iv import iv_all, binning
import pandas as pd

# SettingWithCopyWarning is public in pandas.errors since pandas 1.5 and can no
# longer be imported from the private pandas.core.common module there; older
# pandas versions only offer the private location.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

pk = 'SK_ID_CURR'
loc = "/home/pooja/PycharmProjects/datanalysis/"
# folder[k] is the sub-directory that holds fileName[k].
folder = ['previous_application']
fileName = ['previous_application.csv']
target = 'TARGET'

train = pd.read_csv(loc + "rawDatas" + "/application_train.csv")
# Target looked up by primary key; identical for every folder, so built once.
targetByPk = train[[target, pk]].set_index(pk)

for folderName, csvName in zip(folder, fileName):
    outDir = loc + folderName + "/"

    data = pd.read_csv(outDir + csvName)
    # Keep only rows whose key exists in the training applications.
    data = data[data[pk].isin(train[pk])]
    data = data.join(targetByPk, on=[pk], how='left')
    if target not in data.columns:
        data[target] = 0

    binned = binning(data, target)
    ivData = iv_all(binned, target)
    ivData.to_csv(outDir + "iv_detailed.csv")
    ivData.groupby('variable')['ivValue'].sum().to_csv(outDir + "iv3.csv")

    # The summary is read back from disk so distReports receives it as a
    # plain dataframe with 'variable' as a regular column.
    ivInfo = pd.read_csv(outDir + "iv3.csv")
    distRepo = distReports(data, ivInfo)
    distRepo.to_csv(outDir + "summary.csv")
def crossVariablelowRam(df1,
                        train=None,
                        varlist=None,
                        ignoreList=None,
                        target=None,
                        batch=10,
                        loc=None,
                        groupByKey=None):
    """
    Compute the IV of every pairwise cross variable of df1 in worker processes,
    a slice of ``batch`` combinations per task, and merge the results into
    ``loc + ".csv"``.

    :param df1: dataframe|df for which cross variables are to be calculated
    :param train: dataframe|dataframe containing the target variable; it is joined
                  to the working data on the index, so it is required
    :param varlist: string list|variables in df1 for which crosses will be calculated.
                    If None then all variables of the dataframe are considered
    :param ignoreList: string list|variables which will not be considered for cross vars
    :param target: string|target variable
    :param batch: int|number of combinations handed to one worker task; also the
                  number of intermediate output files (``loc + "0.csv"`` ...).
                  Should be chosen considering the available memory
    :param loc: string|path prefix under which all outputs are written
    :param groupByKey: string|key with which groupBy is to be done before IV calculation.
                       Only applicable for numeric variables (object columns are dropped)
    :return: None|the merged result is written to ``loc + ".csv"``
    :raises ValueError: if loc, train or target is missing, or batch is smaller than 1
    """
    if loc is None or train is None or target is None:
        raise ValueError("loc, train and target must all be provided")
    if batch < 1:
        raise ValueError("batch must be at least 1")

    start = time.time()

    # Seed one output file per slot with the header; workers append rows to them.
    outputFile = pd.DataFrame(columns=['ivValue'])
    for i in range(batch):
        outputFile.to_csv(loc + str(i) + ".csv")

    cores = cpu_count()
    pool = Pool(processes=cores)

    if varlist is None:
        varlist = df1.columns
    if ignoreList is not None:
        # Filter the requested variables instead of rebuilding the list from
        # df1.columns, which would discard a caller-supplied varlist.
        ignored = set(ignoreList)
        varlist = [var for var in varlist if var not in ignored]

    excludes = []
    if groupByKey is None:
        binned = binning(df1,
                         qCut=10,
                         maxobjectFeatures=50,
                         varCatConvert=1,
                         excludedList=excludes)
        varlist = list(set(varlist) - set(excludes))
        binned = binned.astype(str)
        binned.columns = [
            col.replace('n_', "").replace('c_', "") for col in binned.columns
        ]
    else:
        objectCols = list(df1.select_dtypes(include=['object']).columns)
        varlist = list(set(varlist) - set(objectCols))
        binned = df1[varlist]

    combs = list(combinations(varlist, 2))
    total_cross = len(combs)
    numberOfBatches = total_cross // batch
    binned = binned.join(train[[target]])
    print(numberOfBatches)

    # Used so that a limited number of processes run in memory: the pool is
    # drained and recreated every `drainEvery` tasks. Never 0, so batch=1 works.
    drainEvery = max(1, batch // 2)
    for i in range(numberOfBatches):
        cross = combs[i * batch:(i + 1) * batch]
        _submitCrossBatch(pool, binned, cross, target, i % batch, loc,
                          groupByKey)
        if i % drainEvery == 0:
            pool.close()
            pool.join()
            print(i)
            pool = Pool(processes=cores)

    # Combinations left over after the full batches. Starting at
    # numberOfBatches * batch (not at the last loop index) keeps the final
    # full batch from being computed and written a second time.
    leftover = combs[numberOfBatches * batch:]
    if leftover:
        _submitCrossBatch(pool, binned, leftover, target,
                          numberOfBatches % batch, loc, groupByKey)
    pool.close()
    pool.join()

    # Merge the per-slot files: the first one creates the output with its
    # header, the rest are appended without one.
    for i in range(batch):
        part = pd.read_csv(loc + str(i) + ".csv")
        if i == 0:
            part.to_csv(loc + ".csv")
        else:
            part.to_csv(loc + ".csv", mode='a', header=False)

    end = time.time()
    print("total Time taken" + str((end - start) / 60))


def _submitCrossBatch(pool, binned, cross, target, slot, loc, groupByKey):
    """Queue getIVForCross for one slice of combinations, sending only the columns it needs."""
    columns = list({var for pair in cross for var in pair}) + [target]
    pool.apply_async(getIVForCross,
                     args=(binned[columns], cross, target, slot, loc,
                           groupByKey),
                     error_callback=_reportWorkerError)


def _reportWorkerError(error):
    """Print an exception raised in a worker so a failed batch does not vanish silently."""
    print("cross variable batch failed: " + repr(error))