def fit(self, X, y):
    """
    Fit one classifier per output column.

    Classifier ``self.clfs[i]`` is trained on ``X`` against the integer value
    of the i-th character of every label string in ``y``.

    @param X: training samples, passed unchanged to each ``clf.fit``
    @param y: an array of strings such as '0100122'; the first string must
              have exactly ``len(self.clfs)`` characters
    @return: self, so that calls can be chained as in
             ``est.fit(X, y).predict(X)``
    @raise ValueError: if ``y`` is empty or its first string does not have
                       one character per classifier
    """
    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    if len(y) == 0:
        raise ValueError("y must contain at least one label string")
    if len(y[0]) != len(self.clfs):
        raise ValueError("outputTable must have the same num of columns as len(self.clfs)")

    for col, clf in enumerate(self.clfs):
        t0 = time()
        # Single-argument print: same output under the Python 2 print
        # statement and the Python 3 print function.
        print('Fitting %s' % col)
        curY = np.array([int(s[col]) for s in y])
        clf.fit(X, curY)
        printDoneTime(t0)
    return self
def buildModel(data, testData, fieldMaps, n_jobs, useJJ, selectedClfs=None, colNames='all',
               random_states=None, writeResults=True, cvNumSplits=50, test_size=0.25,
               verbose=False, **fitArgs):
    """
    Fit the best classifier for every data split, predict the test data and
    compute an overall cross-validation score.

    @type data DatasetPair
    @type testData DatasetPair
    @type fieldMaps dict
    @param fieldMaps: maps a field name to that field's value mapping; the
                      reversed mapping (``reverseDict``) is only used to
                      display readable split names
    @param n_jobs: forwarded to fitClassifiers
    @param useJJ: forwarded to fitClassifiers
    @param selectedClfs: the classifiers to run. if None, runs all classifiers
    @param colNames: if 'all' no splicing is done; otherwise is a list of
                     fields to splice by
    @param random_states: random states used both by fitClassifiers and by
                          the overall CV; None (default) means ``[None]``
    @param writeResults: if True, the test predictions are written out with
                         writeTestingResToFile
    @param cvNumSplits: number of shuffle splits per random state
    @param test_size: test fraction of every shuffle split
    @param verbose: forwarded to fitClassifiers
    @param fitArgs: extra keyword arguments forwarded to fitClassifiers
    @return: (test predictions, overall CV score)
    @rtype: tuple
    @raise ValueError: if colNames is neither 'all' nor an iterable
    """
    if random_states is None:  # avoid a shared mutable default argument
        random_states = [None]

    useAll = colNames == 'all'
    if not useAll and not isinstance(colNames, Iterable):
        raise ValueError("colNames of type %s isn't one of the recognized types." % type(colNames))

    # Only show maps for the splice columns that have one. Iterating the
    # string 'all' would look up its single characters.
    if not useAll:
        pprint({k: fieldMaps[k] for k in colNames if k in fieldMaps})
    print(list(fieldMaps.keys()))

    t0 = time()
    res_all = {}
    # {colVals: bestClf}. If bestClf is a scalar, just use it as predictions
    # regardless of the input
    bestClf_by_split = {}

    # ------- set up data -------
    if useAll:
        colIndices = range(len(data.fieldNames))
        reverseMaps = {}
    else:
        colIndices = [data.fieldNames.index(name) for name in colNames]
        # reversed once here instead of once per split
        reverseMaps = {name: reverseDict(fieldMaps[name]) for name in colNames if name in fieldMaps}
    splits_all = _splitByColumns(data, colNames)
    splits_test = _splitByColumns(testData, colNames)

    # ------- fit classifiers -------
    for colVals in splits_all.keys():
        if useAll:
            colVal_names = 'all'
        else:
            colVal_names = tuple(reverseMaps[name][colVal] if name in reverseMaps else colVal
                                 for name, colVal in zip(colNames, colVals))

        train_cur = splits_all[colVals]
        test_cur = splits_test[colVals] if colVals in splits_test else None
        testCount = 0 if test_cur is None else test_cur.dataCount

        print('%s %s %s %s training data.' % ('=' * 10, colVal_names, '=' * 10, train_cur.dataCount))

        # NOTE(review): leftover debugging dump for one hard-coded category;
        # kept so the console output is unchanged.
        if colVal_names == ('female', 'other', 'Q'):
            print(mask2DArrayByCol(testData.X, dict(zip(colIndices, colVals)))[1])
            parts = [train_cur.X, train_cur.Y]
            if test_cur is not None:
                parts += [test_cur.X, test_cur.Y]
            print(' '.join(str(part) for part in parts))

        if train_cur.dataCount == 0:
            if testCount == 0:
                print('Irrelevant category. Skipping...')
            else:
                # No training data for this split but there is testing data:
                # predict the most common label of the whole training set.
                v = _mostCommonLabel(data.Y)
                res_all[colVals] = np.repeat(v, testCount)
                bestClf_by_split[colVals] = v
                print('Nothing training data. Using the mode of all training Y values %s' % (v,))
            continue
        if testCount == 0:
            # The original compared the dataset object itself to 0, so an
            # empty test split was never skipped.
            print('No testing data for this category. Skipping...')
            continue

        print('%s has %d training data, %d testing data' % (colVal_names, train_cur.dataCount, testCount))

        # fit
        if len(np.unique(train_cur.Y)) == 1 or train_cur.dataCount <= 5:
            # nothing to fit if the training data has only one class (or is tiny)
            v = _mostCommonLabel(train_cur.Y)
            print('Using the most common one class (%d) in training data for prediction.' % v)
            res_all[colVals] = np.repeat(v, testCount)
            bestClf_by_split[colVals] = v
        else:
            _, (bestClfName, bestClf, bestscore) = fitClassifiers(
                train_cur, selectedClfs=selectedClfs, random_states=random_states, useJJ=useJJ,
                n_jobs=n_jobs, overwriteSavedResult=True, cvSplitNum=cvNumSplits,
                test_size=test_size, verbose=verbose, **fitArgs)
            print('>>>>>>> The best classifier for %s is %s, with score %f.'
                  % (colVal_names, bestClfName, bestscore))
            res_all[colVals] = bestClf.fit(*train_cur.getPair()).predict(test_cur.X)
            bestClf_by_split[colVals] = bestClf

    # ------- compute overall cv score ---------
    cvScore = _overallCvScore(data, colNames, bestClf_by_split, random_states, cvNumSplits, test_size)
    print('OVERALL CV SCORE = %s' % cvScore)

    # ------- collect results -------
    if useAll:
        testRes = res_all['all']
    else:
        # 99 marks rows that no split has filled in yet
        testRes = np.repeat(99, testData.dataCount)
        for colVals, res in res_all.items():
            _, curMask = mask2DArrayByCol(testData.X, dict(zip(colIndices, colVals)))
            testRes[curMask] = res

    assert np.logical_or(testRes == 0, testRes == 1).all()  # make sure all values are filled

    # ------- output results -------
    if writeResults:
        # '_'.join('all') would produce 'a_l_l'
        suffix = 'all' if useAll else '_'.join(colNames)
        writeTestingResToFile("by" + suffix, testRes)

    print('Total amount of time spent:')
    printDoneTime(t0)
    return testRes, cvScore


def _mostCommonLabel(Y):
    """Return the mode of Y, converted to the type of Y's first element."""
    return type(Y[0])(mode(Y)[0])


def _splitByColumns(pair, colNames):
    """Return {colVals: DatasetPair}: a single 'all' entry, or pair spliced by colNames."""
    if colNames == 'all':
        return {'all': pair}
    return pair.spliceByColumnNames(colNames, removeColumns=True)


def _overallCvScore(data, colNames, bestClf_by_split, random_states, cvNumSplits, test_size):
    """
    Return the mean, over all shuffle splits, of the size-weighted accuracy
    obtained by re-fitting each split's chosen predictor on the CV training
    fold and scoring it on the CV test fold.
    """
    cvResults = []
    for randomState in random_states:
        cvObj = StratifiedShuffleSplit(data.Y, cvNumSplits, test_size=test_size, random_state=randomState)
        for trainInds, testInds in cvObj:
            trainSplits = _splitByColumns(
                DatasetPair(data.X[trainInds], data.Y[trainInds], data.fieldNames), colNames)
            testSplits = _splitByColumns(
                DatasetPair(data.X[testInds], data.Y[testInds], data.fieldNames), colNames)

            curTotalCount = len(testInds)
            curScore = 0
            for colVals in trainSplits.keys():
                if colVals not in bestClf_by_split or colVals not in testSplits:
                    continue
                trainD = trainSplits[colVals]
                testD = testSplits[colVals]
                if testD.dataCount == 0 or trainD.dataCount == 0:
                    continue

                predictor = bestClf_by_split[colVals]
                if np.isscalar(predictor):
                    # constant prediction; isscalar also covers numpy scalars,
                    # which are not always instances of int/float
                    ypred = [predictor] * len(testD.Y)
                elif len(np.unique(trainD.Y)) == 1:
                    # a one-class fold cannot be fitted: predict that class
                    # (the original predicted the classifier object itself)
                    ypred = [trainD.Y[0]] * len(testD.Y)
                else:
                    # fit a copy so the classifier kept for the split is untouched
                    ypred = deepcopy(predictor).fit(*trainD.getPair()).predict(testD.X)

                curScore += accuracy_score(testD.Y, ypred) * testD.dataCount / curTotalCount
            cvResults.append(curScore)
    return np.mean(cvResults)
if customerId in np.array(historyAndOffers.id): blockTransDict[customerId] = pandas.read_csv(IterStreamer(rawData), names = transHeaders) # ---- finished building. run the pool for this major block if len(blockTransDict) == chunkSize_major or rowNum == transIndexData.shape[0]-1: totalTime = time() - t_dict print "Building transactions dict total:", totalTime print "Building transactions dict each:", 1000000.* totalTime/ sum(df.shape[0] for df in blockTransDict.values()) print '--------- Finished building. Running pool. --------' t0 = time() pool = MyPool(processes=16, initializer = initStep, initargs = (historyAndOffers, compEmptyDf, blockTransDict)) printDoneTime(t0, "Making the pool") t0 = time() poolOutputs = runPool(pool, innerFunc, chunks(blockTransDict.keys(), chunkSize_minor)) printDoneTime(t0, 'Running the pool') # dump pool output to file for chunk in poolOutputs: chunk.to_csv(compressedTransFile, header=False, index=False) print '--- dumping ---', len(poolOutputs), sum(chunk.shape[0] for chunk in poolOutputs) pool.close() pool.join() pool.terminate()