def generateFeatures(year, bipartite, unipartite, newToOldIDs, adjMatrix):
    """Build the combined feature dictionary for every donor in `bipartite`.

    Bipartite features come from extractBipartiteFeatures(bipartite).
    Unipartite features come from extractUnipartiteFeatures(unipartite,
    adjMatrix) and are re-keyed with convertNewToOldIDs(..., newToOldIDs).

    Args:
        year: only used to label the timer output.
        bipartite: graph handed to extractBipartiteFeatures and
            graph_funcs.getDonors.
        unipartite: graph handed to extractUnipartiteFeatures.
        newToOldIDs: mapping handed to convertNewToOldIDs.
        adjMatrix: matrix handed to extractUnipartiteFeatures.

    Returns:
        dict keyed by donor node ID; each value is that donor's bipartite
        features concatenated (with `+`) with its unipartite features.
    """
    timer = Timer('generating features for %d' % year)

    biFeatures = extractBipartiteFeatures(bipartite)
    timer.markEvent('Extracted bipartite features.')

    extracted = extractUnipartiteFeatures(unipartite, adjMatrix)
    rawUni, componentFeatureFunc, CNMFeatureFunc = extracted
    uniFeatures = convertNewToOldIDs(rawUni, newToOldIDs)
    timer.markEvent('Extracted unipartite features.')

    # A donor that is absent from the unipartite features (it fell below the
    # unipartite threshold from sqlToGraphs) gets the default unipartite
    # values instead.
    combined = {}
    for donor in graph_funcs.getDonors(bipartite):
        donorID = donor.GetId()
        inUnipartite = donorID in uniFeatures
        head = biFeatures[donorID]
        if inUnipartite:
            tail = uniFeatures[donorID]
        else:
            tail = defaultUnipartiteFeatures(componentFeatureFunc,
                                             CNMFeatureFunc)
        combined[donorID] = head + tail

    timer.finish()
    return combined
def trainAndTestModels(year, extension, X=None, Y=None, k=10, clf=None,
                       transF=None, decomp_func=None):
    """Cross-validate a model over k folds and return the per-fold scores.

    Args:
        year, extension: label the timer and, when X or Y is missing, form
            the path 'Data/Recip-Features/<year>.<extension>' that is loaded
            with pickler.load to obtain (X, Y).
        X, Y: features and targets; both are indexed with the fold index
            sets produced by KFold.
        k: number of folds passed to KFold.
        clf: estimator exposing fit(X, Y) and score(X, Y). It is refit in
            place on every fold. When omitted, a new
            linear_model.LinearRegression() is created for this call, so
            separate calls no longer share one default estimator object.
        transF: optional callable applied to Y before the folds are built.
        decomp_func: optional transformer exposing fit(X) and transform(X).
            It is fit on each training fold only and then applied to both
            the training and the test fold.

    Returns:
        list with one clf.score(X_test, Y_test) value per fold.
    """
    if clf is None:
        # Built per call: a default instance in the signature would be
        # created once at definition time and shared by every caller.
        clf = linear_model.LinearRegression()

    timing = Timer('Running regression for %d.%s' % (year, extension))

    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF:
        Y = transF(Y)
    timing.markEvent('Loaded X and Y')

    rsquareds = []

    # Train and test the regression model on each k-fold set.
    # NOTE(review): KFold is called as KFold(n_samples, n_folds) and iterated
    # directly, which looks like the legacy sklearn.cross_validation API;
    # confirm against the file's imports.
    kf = KFold(len(Y), k)
    for train, test in kf:
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

        if decomp_func:
            decomp_func.fit(X_train)
            X_train = decomp_func.transform(X_train)
            X_test = decomp_func.transform(X_test)

        clf.fit(X_train, Y_train)
        rsquareds.append(clf.score(X_test, Y_test))

    timing.markEvent('Ran regression')
    timing.finish()
    return rsquareds
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    """Compute and save recipient features for each graph file of a year.

    Args:
        year, weightF: label the timer and, when graphFiles is falsy, are
            passed to getGraphFiles to obtain the list of graph files.
        graphFiles: optional iterable of graph file names; each one is used
            both in 'Data/Features/<gf>.features' (input) and in
            'Data/Recip-Features/<gf>' (output).
        bigraph: optional bipartite graph; when falsy it is loaded from
            'Data/Bipartite-Graphs/<year>.graph'.

    Returns:
        None; results are written through recip_feature_extractor.saveFeatures.
    """
    timer = Timer('Generating recip features for %d %s' % (year, weightF))

    graphFiles = graphFiles or getGraphFiles(year, weightF)
    bigraph = bigraph or graph_funcs.loadGraph(
        'Data/Bipartite-Graphs/%d.graph' % year)

    # Quantities that depend only on the bipartite graph are computed once
    # and reused for every graph file.
    donationAmounts = recip_feature_extractor.getDonationAmounts(bigraph)
    receiptsFromDonor, totalReceipts, totalDonations = donationAmounts
    categorical = recip_feature_extractor.getCategoricalGraphFeatures(bigraph)
    partialFeatures, fullFeatures = categorical
    timer.markEvent(
        'Loaded bigraph, donor amounts, and categorical feature funcs')

    for gf in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % gf)
        timer.markEvent('Loaded donor features for graph %s' % gf)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph, donorFeatures, receiptsFromDonor, totalReceipts,
            totalDonations, partialFeatures, fullFeatures)
        timer.markEvent('Calculated recip features')

        recip_feature_extractor.saveFeatures(
            bigraph, recipFeatures, 'Data/Recip-Features/%s' % gf)
        timer.markEvent('Saved recip features')

    timer.finish()
def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None,
                     adjMat=None, newToOldIDs=None):
    """Generate and pickle donor features for each unipartite graph file.

    Args:
        year, weightF: label the timer, select the default input paths, and
            are passed to getGraphFiles when graphFiles is falsy.
        graphFiles: optional iterable of graph file names, read from
            'Data/Unipartite-Graphs/<gf>' and written to
            'Data/Features/<gf>.features'.
        bigraph: optional bipartite graph; loaded from
            'Data/Bipartite-Graphs/<year>.graph' when falsy.
        adjMat: optional adjacency matrix; loaded from
            'Data/Unipartite-Matrix/<year>.<weightF>' when None.
        newToOldIDs: optional node ID mapping; loaded from
            'Data/Unipartite-NodeMappings/<year>.newToOld' when None.

    Returns:
        None; each feature dictionary is saved with pickler.save.
    """
    timer = Timer('Generating donor features for %d %s' % (year, weightF))

    graphFiles = graphFiles or getGraphFiles(year, weightF)
    bigraph = bigraph or graph_funcs.loadGraph(
        'Data/Bipartite-Graphs/%d.graph' % year)

    if adjMat is None:
        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
    # NOTE(review): the source's indentation was lost, so it is unclear
    # whether this conversion applied only to a freshly loaded matrix or to
    # caller-supplied ones as well; it is applied to both here - confirm.
    adjMat = adjMat.tocsc()

    if newToOldIDs is None:
        newToOldIDs = pickler.load(
            'Data/Unipartite-NodeMappings/%d.newToOld' % year)
    timer.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for gf in graphFiles:
        unigraph = graph_funcs.loadGraph('Data/Unipartite-Graphs/%s' % gf,
                                         snap.TUNGraph)
        timer.markEvent('Loaded graph %s' % gf)

        donorFeatures = feature_extractor.generateFeatures(
            year, bigraph, unigraph, newToOldIDs, adjMat)
        timer.markEvent('Generated features')

        pickler.save(donorFeatures, 'Data/Features/%s.features' % gf)
        timer.markEvent('Saved features')

    timer.finish()
def calcAverageWeights(graph, adjMat):
    """Return the mean adjacency-matrix weight over each node's neighbors.

    Args:
        graph: object whose Edges() yields edges exposing GetSrcNId() and
            GetDstNId().
        adjMat: matrix indexed as adjMat[rows, col], where rows is a list of
            neighbor IDs and col is a node ID.

    Returns:
        dict mapping each node ID that appears on at least one edge to
        adjMat[neighbors, node].sum() / number_of_neighbors. Nodes without
        edges are not present in the result.
    """
    timing = Timer('Calculating average weights')

    # Get all the nodes that a node borders in the graph. Each edge is
    # recorded in both directions.
    neighbors = defaultdict(list)
    for edge in graph.Edges():
        nodeid1 = edge.GetSrcNId()
        nodeid2 = edge.GetDstNId()
        neighbors[nodeid1].append(nodeid2)
        neighbors[nodeid2].append(nodeid1)
    timing.markEvent('Gotten all neighbors')

    # Get the average weight per node connected to.
    weights = {}
    total = len(neighbors)
    for done, nodeid in enumerate(neighbors, 1):
        rows = neighbors[nodeid]
        # rows is never empty: a key only exists once an edge was appended.
        weights[nodeid] = adjMat[rows, nodeid].sum() / float(len(rows))
        if done % 1000 == 0:
            timing.markEvent('Done with %d out of %d' % (done, total))

    # Close the timer like the other Timer-using functions do.
    timing.finish()
    return weights
def extractUnipartiteFeatures(unipartiteGraph, adjMat):
    """Collect per-node features of the unipartite graph.

    getUnipartiteSurfaceFeatures fills the feature lists first; this function
    then appends, in order, the node's value from calcAverageWeights, from
    calcCommunities, and from snap.GetPageRank.

    Args:
        unipartiteGraph: graph passed to the surface-feature, average-weight
            and PageRank computations.
        adjMat: adjacency matrix passed to the surface-feature and
            average-weight computations.

    Returns:
        (features, componentFeatureFunc, CNMFeatureFunc): features is a
        defaultdict(list) keyed by node ID; the two functions are returned
        unchanged from getUnipartiteSurfaceFeatures.
    """
    timer = Timer('extracting unipartite features')
    nodeFeatures = defaultdict(list)

    surface = getUnipartiteSurfaceFeatures(unipartiteGraph, adjMat,
                                           nodeFeatures)
    componentFeatureFunc, CNMFeatureFunc, idToCNM = surface
    timer.markEvent('1. Extracted surface features')

    # Average weight of edges.
    meanEdgeWeights = calcAverageWeights(unipartiteGraph, adjMat)
    timer.markEvent('2. Computed average weights.')

    # The connected-component size feature is not computed here; the timer
    # event is still emitted so the numbered sequence stays intact.
    timer.markEvent('3. Computed connected components.')

    # CNM community feature (presumably the community size - confirm against
    # calcCommunities).
    cnmValues = calcCommunities(idToCNM)
    timer.markEvent('4. Computed CNM communities.')

    # PageRank: snap writes the scores into the supplied hash.
    ranks = snap.TIntFltH()
    snap.GetPageRank(unipartiteGraph, ranks)
    timer.markEvent('5. Computed PageRank.')

    # Append the graph-wide features after the existing surface features.
    for nid in nodeFeatures:
        vector = nodeFeatures[nid]
        vector.append(meanEdgeWeights[nid])
        vector.append(cnmValues[nid])
        vector.append(ranks[nid])

    timer.finish()
    return nodeFeatures, componentFeatureFunc, CNMFeatureFunc
def runFullPipeline(year):
    """Run donor features, recipient features and results for every weighting.

    The bipartite graph and the new-to-old node mapping are loaded once for
    the year. For each of the five weightings, the adjacency matrix is
    loaded, donor and recipient features are generated, and the output of
    getResults is pickled to 'Data/pruning_optimizations.<year>.<weightF>'.

    Args:
        year: integer year used in every input and output path.

    Returns:
        None.
    """
    timer = Timer('Running pipeline for %d' % year)

    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph'% year)
    newToOldIDs = pickler.load(
        'Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in ('adamic', 'cosine', 'jaccard', 'jaccard2',
                    'weighted_adamic'):
        graphFiles = getGraphFiles(year, weightF)
        matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
        adjMat = pickler.load(matrixPath)
        timer.markEvent('Loaded everything for donor features')

        genDonorFeatures(year, weightF, graphFiles=graphFiles,
                         bigraph=bigraph, adjMat=adjMat,
                         newToOldIDs=newToOldIDs)
        # Drop this function's reference to the adjacency matrix before the
        # remaining stages run, to free its memory.
        del adjMat

        genRecipFeatures(year, weightF, graphFiles=graphFiles,
                         bigraph=bigraph)
        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results,
                     'Data/pruning_optimizations.%d.%s' % (year, weightF))
        timer.markEvent('Finished with %s' % weightF)

    timer.finish()
def processYearAndWeight(year, weighting, percents=None, thresholds=None):
    """Prune one year's weighted graph at each requested level and save it.

    The sorted matrix values are read once from
    'Data/Unipartite-Matrix/<year>.<weighting>' and reused for every pruning
    level.

    Args:
        year, weighting: select the adjacency matrix file and name the
            output graphs.
        percents: optional iterable; each entry p is passed to
            pruneGraphByPercent and the result is saved as
            'Data/Unipartite-Graphs/<year>.<weighting>_percent_<p>.graph'.
        thresholds: optional iterable; each entry t is passed to
            pruneGraphByThreshold and the result is saved as
            'Data/Unipartite-Graphs/<year>.<weighting>_threshold_<t>.graph'.

    Returns:
        None.
    """
    timer = Timer('Running for year %d and weight %s' % (year, weighting))

    matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weighting)
    sortedVals, N = getSortedMatrixVals(matrixPath)
    timer.markEvent('Got sorted vals')

    # A falsy argument (None, empty) means that kind of pruning is skipped.
    for pct in percents or ():
        target = ('Data/Unipartite-Graphs/%d.%s_percent_%f.graph'
                  % (year, weighting, pct))
        pruned = pruneGraphByPercent(sortedVals, N, pct)
        graph_funcs.saveGraph(pruned, target)
        timer.markEvent('Finished for %f percent' % pct)

    for cutoff in thresholds or ():
        target = ('Data/Unipartite-Graphs/%d.%s_threshold_%f.graph'
                  % (year, weighting, cutoff))
        pruned = pruneGraphByThreshold(sortedVals, N, cutoff)
        graph_funcs.saveGraph(pruned, target)
        timer.markEvent('Finished for threshold %f' % cutoff)

    timer.finish()
}  # closes a literal opened above this view (presumably decompFunctions)

# Regressors to evaluate, keyed by the display name stored in the results.
clfs = {
    'OLS': linear_model.LinearRegression(),
    'Random Forest': ensemble.RandomForestRegressor(),
}

# Feature-file extensions; each is passed to trainAndTestModels as `extension`.
extensions = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')

# Scores are kept twice: nested as results[year][extension][clfname][decompname]
# and flat as tuples in resultsList.
results = {}
resultsList = []

# Every command-line argument is interpreted as a year.
years = [int(arg) for arg in sys.argv[1:]]

timing = Timer('Runnign everything')
for year in years:
    timing.markEvent('Running for year %d' % year)
    results[year] = {}
    for extension in extensions:
        timing.markEvent('Running for extension %s' % extension)
        results[year][extension] = {}
        for clfname, clf in clfs.iteritems():
            timing.markEvent('Running for classifier %s' % clfname)
            results[year][extension][clfname] = {}
            for decompname, decompFunction in decompFunctions.iteritems():
                timing.markEvent('Running for decomp function %s' % decompname)
                # The same clf and decompFunction objects are reused for
                # every year/extension combination.
                rsquareds = cfscore_predictions.trainAndTestModels(
                    year, extension, clf=clf, decomp_func=decompFunction)
                resultsList.append(
                    (year, extension, clfname, decompname, tuple(rsquareds)))
                results[year][extension][clfname][decompname] = tuple(
                    rsquareds)
def createDonorDonorGraph(year, weightF):
    """Build the donor-donor graph for a year plus one matrix per weighting.

    Two donors are connected when the sets of candidates stored for them in
    `cands` intersect. For each such pair the weights returned by weightF are
    stored symmetrically, at (i, j) and (j, i), in every adjacency matrix.

    Args:
        year: selects 'Data/Bipartite-Graphs/<year>.graph' and labels the
            timer.
        weightF: callable invoked as weightF(oldID1, oldID2, sharedCands,
            numDonations, totalAmount, cands, transactions, amounts,
            totalReceipts). Its result must be indexable by 'jaccard',
            'jaccard2', 'affinity', 'cosine', 'adamic' and 'weighted_adamic'.

    Returns:
        (unipartiteGraph, jaccardAdjMat, jaccard2AdjMat, affinityAdjMat,
        cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat, newToOld, oldToNew).
        Every matrix is an N x N sp.csr_matrix with N = len(newToOld).
    """
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph.
    bipartiteGraph = graph_funcs.loadGraph(
        'Data/Bipartite-Graphs/%d.graph' % year)

    # Load the info about each donor and their recipients.
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = \
        getDonorInfos(bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create initial unipartite graph with just nodes and node attributes.
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(
        bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    # One value list per weighting, kept in the order of the return tuple.
    weightNames = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic',
                   'weighted_adamic')
    weightData = dict((name, []) for name in weightNames)
    r = []
    c = []

    # Materialize the ID list once: the original rebuilt it with keys() on
    # every outer iteration, and slicing keys() only works where keys()
    # returns a list.
    newIDs = list(newToOld.keys())

    # Add the weighted edges for every relevant pair of donor nodes.
    nodesDone = 0
    for i, newID1 in enumerate(newIDs):
        oldID1 = newToOld[newID1]
        cands1 = cands[oldID1]

        for newID2 in newIDs[i + 1:]:
            oldID2 = newToOld[newID2]

            sharedCands = cands1.intersection(cands[oldID2])
            if not sharedCands:
                continue

            # Calculate the weight.
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            # Record both (newID1, newID2) and (newID2, newID1) so the
            # matrices come out symmetric.
            r.append(newID1)
            r.append(newID2)
            c.append(newID2)
            c.append(newID1)
            for name in weightNames:
                value = weights[name]
                weightData[name].append(value)
                weightData[name].append(value)

            # Add the edge between the two nodes.
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone += 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' %
                             (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
    matrices = [sp.csr_matrix((weightData[name], (r, c)), shape=(N, N))
                for name in weightNames]
    (jaccardAdjMat, jaccard2AdjMat, affinityAdjMat, cosineAdjMat,
     adamicAdjMat, weightedAdamicAdjMat) = matrices

    timing.finish()

    return (unipartiteGraph, jaccardAdjMat, jaccard2AdjMat, affinityAdjMat,
            cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat, newToOld,
            oldToNew)
#weightings = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic') #weightings = ('adamic', 'weighted_adamic') weightings = ('jaccard2', ) for year in sys.argv[1:]: year = int(year) timing = Timer('Generating features for %d' % year) graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year) receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts( graph) partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph) baselineFeatures = \ getBaselineFeatures(graph, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures) saveFeatures(graph, baselineFeatures, 'Data/Recip-Features/%d.baseline' % year) timing.markEvent('Generated baseline features') for weighting in weightings: donorFeatures = pickler.load('Data/Features/%d%s.features' \ % (year, weighting)) recipFeatures = getRecipFeatures(graph, donorFeatures, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures) saveFeatures(graph, recipFeatures, 'Data/Recip-Features/%d.%s' \ % (year, weighting)) timing.markEvent('Calculated main recipient features for %s' \ % weighting) timing.finish()
################################################################################ # Module command-line behavior # ################################################################################ if __name__ == '__main__': for arg in sys.argv[1:]: year = int(arg) timing = Timer('creating unipartite graph for %d' % year) bipartiteGraph = graph_funcs.loadGraph( 'Data/Bipartite-Graphs/%d.graph' % year) unipartiteGraph = graph_funcs.loadGraph( 'Data/Unipartite-Graphs/%d.graph' % year, snap.TUNGraph) newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year) timing.markEvent('Loaded input graphs/matrices.') #for weightF in ['jaccard', 'affinity', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic']: for weightF in ['jaccard2']: print '******* %s *******' % weightF adjMatrix = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF)) adjMatrix = adjMatrix.tocsc() features = generateFeatures(year, bipartiteGraph, unipartiteGraph, newToOldIDs, adjMatrix) pickler.save(features, 'Data/Features/%d%s.features' % (year, weightF)) timing.markEvent('Processed %s weight function' % weightF)
def getSortedMatrixVals(filename):
    """Load an adjacency matrix and return its nonzero entries, largest first.

    Args:
        filename: path passed to pickler.load. The loaded object must expose
            .shape, .nonzero() and paired-index lookup adjMat[rows, cols].

    Returns:
        (vals, N): vals is a list of (row, col, value) tuples sorted by value
        in descending order (ties keep their nonzero() order); N is
        adjMat.shape[0]. vals is empty when the matrix has no nonzero entry.
    """
    timing = Timer('Gettin sorted matrix vals')

    adjMat = pickler.load(filename)
    timing.markEvent('Loaded adjacency matrix')

    N = adjMat.shape[0]
    xIndices, yIndices = adjMat.nonzero()
    timing.markEvent('Loaded nonzero indices')

    data = adjMat[xIndices, yIndices]
    timing.markEvent('Loaded nonzero vals')

    flat = np.ravel(data)
    timing.markEvent('Flattened data')

    # list() keeps this a sortable list where zip returns an iterator.
    vals = list(zip(xIndices, yIndices, flat))
    timing.markEvent('Zipped values')

    vals.sort(key=lambda v: v[2], reverse=True)
    timing.markEvent('Sorted values')

    # The leftover debug prints and the unconditional
    # `raise ValueError("LOL")` were removed: they made the return below
    # unreachable and failed with IndexError on fewer than two entries.
    timing.finish()
    return vals, N