Example #1
def getIVForCross(df1, combination, target, number, loc, groupByKey=None):
    """Compute the IV of each crossed variable pair in `combination` and append
    the per-variable IV sums to loc + str(number) + ".csv"."""
    if groupByKey is not None:
        temp = crossVariable(df1,
                             combination,
                             target=target,
                             varlist=None,
                             ignoreList=None,
                             numVar=1,
                             memmorSaving=0)
        #temp=temp.fillna(-579579)
        temp = temp.groupby(groupByKey).agg(['min', 'max', 'sum', 'mean'])
        temp.columns = [
            str("_").join(col).strip() for col in temp.columns.values
        ]
        temp = temp.drop([target + "_" + f for f in ['min', 'max', 'sum']],
                         axis=1)
        temp.rename(columns={target + "_mean": target}, inplace=True)
        binned = binning(temp,
                         target,
                         qCut=10,
                         maxobjectFeatures=50,
                         varCatConvert=1)
        ivData = iv_all(binned, target, modeBinary=0)
    else:
        temp = crossVariable(df1,
                             combination,
                             target=target,
                             varlist=None,
                             ignoreList=None)
        ivData = iv_all(temp, target, modeBinary=0)
    ivData.groupby('variable')['ivValue'].sum().to_csv(loc + str(number) +
                                                       ".csv",
                                                       mode='a',
                                                       header=False)
Example #2
import warnings
from dataExploration import distReports, plotGrabh
from iv import iv_all, binning
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
loc = "/home/pooja/PycharmProjects/datanalysis/finalDatasets/"
#

#train = pd.read_csv(loc + "relevantDatasets/" + "train.csv")
#loc = "/home/pooja/PycharmProjects/datanalysis/bureau/"
train = pd.read_csv('/home/pooja/PycharmProjects/datanalysis/finalDatasets/final_withCross.csv')  #pd.read_csv(loc + "bur.csv")
#a = plotGrabh(train, 'TARGET', loc + "images/")
binned = binning(train, 'TARGET', maxobjectFeatures=300)

ivData = iv_all(binned, 'TARGET')

#writer = pd.ExcelWriter(loc + "iv3.xlsx")
#ivData.to_excel(writer, sheet_name="iv_detailed")
#ivData.groupby('variable')['ivValue'].sum().to_excel(writer, sheet_name="iv_summary")
ivData.to_csv(loc + "iv_detailed_cross.csv")
ivData.groupby('variable')['ivValue'].sum().to_csv(loc + "iv_sum_cross.csv")

# ivInfo=pd.read_csv(loc+"iv3.csv")
# distRepo=distReports(train,ivInfo)
# distRepo.to_csv(loc+"summary.csv")
#writer.save()
#writer.close()
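
A small hedged follow-up, not part of the source: reading the IV summary back and keeping variables above a conventional cut-off (0.02 is a common "weak predictor" threshold in credit scoring, not something defined by this project). It assumes the summary Series was written without a header row, as older pandas versions do by default.

iv_sum = pd.read_csv(loc + "iv_sum_cross.csv", header=None, names=['variable', 'ivValue'])
keep = iv_sum[iv_sum['ivValue'] >= 0.02]['variable'].tolist()
print(len(keep), "variables with IV >= 0.02")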
Example #3
        # (snippet starts mid-function: `prev`, `target` and `np` (numpy) are defined above)
        prev = prev.set_index('SK_ID_CURR')[['AMT_ANNUITY', 'AMT_APPLICATION']]
        #prev['mult']=prev['AMT_ANNUITY']*prev['AMT_APPLICATION']
        prev['div'] = prev['AMT_APPLICATION'] / prev['AMT_ANNUITY']
        #prev=prev.round(2)
        b = prev.dtypes
        prev = prev.replace(np.inf, np.nan)
        #prev=normalize(prev)
        #prev    =prev.astype(np.float32)
        prev = prev.join(target)
        prev = prev.groupby('SK_ID_CURR').agg(['min', 'max', 'sum', 'mean'])
        #prev.round(1).to_csv('/home/pooja/PycharmProjects/datanalysis/featureEngeering/test.csv')
        a = prev.memory_usage(deep=True)
        b = prev.dtypes
        prev.columns = [
            str("_").join(col).strip() for col in prev.columns.values
        ]

        prev = prev.drop(['TARGET' + "_" + f for f in ['min', 'max', 'sum']],
                         axis=1)
        prev.rename(columns={'TARGET' + "_mean": 'TARGET'}, inplace=True)
        binned = binning(prev,
                         'TARGET',
                         qCut=10,
                         maxobjectFeatures=50,
                         varCatConvert=1)

        ivData = iv_all(binned, 'TARGET', modeBinary=0)
        F = ivData.groupby('variable')['ivValue'].sum()
        N = 0
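
The snippet is cut off here; a hedged way to continue would be to rank the summed IVs held in F, for example:

print(F.sort_values(ascending=False).head(10))  # ten strongest engineered features by IV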
Example #4
import warnings
from dataExploration import distReports, plotGrabh
from iv import iv_all, binning
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pk = 'SK_ID_CURR'
loc = "/home/pooja/PycharmProjects/datanalysis/"
folder = ['previous_application']
fileName = ['previous_application.csv']
target = 'TARGET'
train = pd.read_csv(loc + "rawDatas" + "/application_train.csv")
for i in range(0, len(folder)):
    data = pd.read_csv(loc + folder[i] + "/" + fileName[i])
    data = data[data[pk].isin(train[pk])]
    data = data.join(train[[target, pk]].set_index(pk), on=[pk], how='left')
    #a = plotGrabh(data, target, loc + folder[i] + "/images/")

    if target not in data.columns: data[target] = 0

    binned = binning(data, target)
    ivData = iv_all(binned, target)
    ivData.to_csv(loc + folder[i] + "/" + "iv_detailed.csv")
    ivData.groupby('variable')['ivValue'].sum().to_csv(loc + folder[i] + "/" +
                                                       "iv3.csv")
    ivInfo = pd.read_csv(loc + folder[i] + "/" + "iv3.csv")
    distRepo = distReports(data, ivInfo)
    distRepo.to_csv(loc + folder[i] + "/" + "summary.csv")
Example #5
import time
from itertools import combinations
from multiprocessing import Pool, cpu_count

import pandas as pd

from iv import binning  # assumed location of binning, as in Example #2; getIVForCross (Example #1) is assumed to live in this same module


def crossVariablelowRam(df1,
                        train=None,
                        varlist=None,
                        ignoreList=None,
                        target=None,
                        batch=10,
                        loc=None,
                        groupByKey=None):
    """

    :param df1: dataframe|df for which cross variable to be calculated
    :param train: dataframe|dataframe containing target variable. Only needed if IV needs to be calculated
    :param varlist: string list|variables in df1 for which cross will be calculated. If None then all variables of dataframe will be considered
    :param ignoreList:string list| varibles which will not be considered fr cross vars
    :param target: string|target variable
    :param batch: int|for parallel processing how many combination will be parrallely processed
    :param loc: string|where all the outputs will be made
    :param groupByKey: string| key with which groupBy to be done before IV calculation .Only applicable for numeric variable
    :return: CSV file| a file consisting of all the cross variable and its IV value
    """
    start = time.time()
    outputFile = pd.DataFrame(columns=['ivValue'])
    for i in range(batch):
        outputFile.to_csv(loc + str(i) + ".csv")
    cores = cpu_count()
    pool = Pool(processes=cores)
    if varlist is None: varlist = df1.columns
    if ignoreList is not None:
        varlist = list(set(df1.columns) - set(ignoreList))
    coreNum = 0
    excludes = []
    if groupByKey is None:
        binned = binning(df1,
                         qCut=10,
                         maxobjectFeatures=50,
                         varCatConvert=1,
                         excludedList=excludes)
        varlist = list(set(varlist) - set(excludes))
        binned = binned.astype(str)
        binned.columns = [
            col.replace('n_', "").replace('c_', "") for col in binned.columns
        ]

    else:
        objectCols = list(df1.select_dtypes(include=['object']).columns)
        varlist = list(set(varlist) - set(objectCols))
        binned = df1[varlist]
    combs = combinations(varlist, 2)
    total_cross = len(list(combs))
    numberOfBatches = int(total_cross / batch)
    binned = binned.join(train[[target]])
    #print(binned.dtypes)
    print(numberOfBatches)
    combs = list(combinations(varlist, 2))
    i = 0
    for i in range(0, numberOfBatches):
        cross = combs[i * batch:i * batch + batch]
        vars = list(
            set([com[0]
                 for com in cross]).union(set([com[1]
                                               for com in cross]))) + [target]
        pool.apply_async(getIVForCross,
                         args=(binned[vars], cross, target, i % batch, loc,
                               groupByKey))
        #print(i)
        coreNum = coreNum + 1
        #gc.collect()
        if i % int(
                batch / 2
        ) == 0:  # used so that only a limited number of processes run in memory at once; batch should be chosen according to available memory
            pool.close()
            pool.join()
            print(i)
            pool = Pool(processes=cores)

    # dispatch the remaining combinations that did not fill a full batch
    cross = combs[numberOfBatches * batch:total_cross]
    if cross:
        vars = list(
            set([com[0]
                 for com in cross]).union(set([com[1]
                                               for com in cross]))) + [target]
        pool.apply_async(getIVForCross,
                         args=(binned[vars], cross, target, i % batch, loc,
                               groupByKey))

    pool.close()
    pool.join()
    # merge the per-batch CSV outputs into a single file
    for i in range(batch):
        if i == 0:
            main = pd.read_csv(loc + str(i) + ".csv")
            main.to_csv(loc + ".csv")
        else:
            main = pd.read_csv(loc + str(i) + ".csv")
            main.to_csv(loc + ".csv", mode='a', header=False)
    end = time.time()
    print("total Time taken" + str((end - start) / 60))