def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None, adjMat=None, newToOldIDs=None):
    """Generate and save donor features for each unipartite graph file.

    Any input the caller does not supply is loaded from its file under
    ``Data/``.  For every entry of ``graphFiles`` the unipartite graph is
    loaded, features are computed by ``feature_extractor.generateFeatures``
    and the result is written to ``Data/Features/<graph file>.features``.

    Args:
        year: Integer year; interpolated with ``%d`` into the data paths.
        weightF: Weighting identifier; interpolated with ``%s`` into the
            adjacency-matrix path and passed to ``getGraphFiles``.
        graphFiles: Iterable of file names under ``Data/Unipartite-Graphs/``.
            When falsy (``None`` or empty) it is obtained from
            ``getGraphFiles(year, weightF)``.
        bigraph: Bipartite graph for ``year``.  When falsy it is loaded from
            ``Data/Bipartite-Graphs/<year>.graph``.
        adjMat: Sparse adjacency matrix exposing ``tocsc()``.  When ``None``
            it is loaded from ``Data/Unipartite-Matrix/<year>.<weightF>``.
        newToOldIDs: Node-ID mapping.  When ``None`` it is loaded from
            ``Data/Unipartite-NodeMappings/<year>.newToOld``.

    Returns:
        None.  Results are persisted through ``pickler.save``.
    """
    timing = Timer('Generating donor features for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    # ``is None`` rather than truthiness: the matrix and the mapping are
    # containers, so an explicit None test is the only unambiguous check.
    if adjMat is None:
        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
    # Convert unconditionally so the feature extractor always receives a CSC
    # matrix, whether the matrix was loaded here or handed in by the caller.
    # NOTE(review): for a caller-supplied non-CSC matrix this builds a second
    # copy while the caller still holds the first -- confirm memory headroom.
    adjMat = adjMat.tocsc()
    if newToOldIDs is None:
        newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for gf in graphFiles:
        unigraph = graph_funcs.loadGraph('Data/Unipartite-Graphs/%s' % gf, snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % gf)

        features = feature_extractor.generateFeatures(
            year, bigraph, unigraph, newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        pickler.save(features, 'Data/Features/%s.features' % gf)
        timing.markEvent('Saved features')

    timing.finish()
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    """Compute and save recipient features for each unipartite graph file.

    The donation amounts and categorical feature data are derived once from
    the bipartite graph.  Then, for every graph file, the previously saved
    donor features are loaded from ``Data/Features/<graph file>.features``,
    turned into recipient features, and written under
    ``Data/Recip-Features/<graph file>``.

    Args:
        year: Integer year; interpolated with ``%d`` into the data paths.
        weightF: Weighting identifier; used in the timer label and passed to
            ``getGraphFiles``.
        graphFiles: Iterable of graph file names.  When falsy it is obtained
            from ``getGraphFiles(year, weightF)``.
        bigraph: Bipartite graph for ``year``.  When falsy it is loaded from
            ``Data/Bipartite-Graphs/<year>.graph``.

    Returns:
        None.  Results are persisted by ``recip_feature_extractor.saveFeatures``.
    """
    timer = Timer('Generating recip features for %d %s' % (year, weightF))

    # Fill in whatever the caller left out.
    graphFiles = graphFiles if graphFiles else getGraphFiles(year, weightF)
    if not bigraph:
        bigraphPath = 'Data/Bipartite-Graphs/%d.graph' % year
        bigraph = graph_funcs.loadGraph(bigraphPath)

    # These depend only on the bipartite graph, so compute them once up front.
    amounts = recip_feature_extractor.getDonationAmounts(bigraph)
    receiptsFromDonor, totalReceipts, totalDonations = amounts
    categorical = recip_feature_extractor.getCategoricalGraphFeatures(bigraph)
    partialFeatures, fullFeatures = categorical
    timer.markEvent('Loaded bigraph, donor amounts, and categorical feature funcs')

    for graphFile in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % graphFile)
        timer.markEvent('Loaded donor features for graph %s' % graphFile)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph,
            donorFeatures,
            receiptsFromDonor,
            totalReceipts,
            totalDonations,
            partialFeatures,
            fullFeatures,
        )
        timer.markEvent('Calculated recip features')

        outputPath = 'Data/Recip-Features/%s' % graphFile
        recip_feature_extractor.saveFeatures(bigraph, recipFeatures, outputPath)
        timer.markEvent('Saved recip features')

    timer.finish()
def runFullPipeline(year):
    """Run donor features, recipient features and results for every weighting.

    The bipartite graph and the node-ID mapping are loaded once and shared
    across weightings.  For each weighting the adjacency matrix is loaded,
    donor features are generated, the matrix reference is dropped, recipient
    features are generated, and the output of ``getResults`` is pickled to
    ``Data/pruning_optimizations.<year>.<weighting>``.

    Args:
        year: Integer year; interpolated with ``%d`` into the data paths.

    Returns:
        None.
    """
    timer = Timer('Running pipeline for %d' % year)

    # Inputs that do not depend on the weighting are loaded a single time.
    sharedBigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph'% year)
    idMapping = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic'):
        pathArgs = (year, weightF)
        graphFiles = getGraphFiles(year, weightF)
        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % pathArgs)
        timer.markEvent('Loaded everything for donor features')

        genDonorFeatures(
            year,
            weightF,
            graphFiles=graphFiles,
            bigraph=sharedBigraph,
            adjMat=adjMat,
            newToOldIDs=idMapping,
        )
        del adjMat  # free the incredible amount of memory for the adjacency matrix

        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=sharedBigraph)

        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results, 'Data/pruning_optimizations.%d.%s' % pathArgs)
        timer.markEvent('Finished with %s' % weightF)

    timer.finish()
def createDonorDonorGraph(year, weightF):
    """Build the donor-donor graph and one adjacency matrix per edge weighting.

    The bipartite graph for ``year`` is loaded, its nodes are cloned into a
    new unipartite graph by ``cloneBipartiteNodes``, and every pair of nodes
    in the ``newToOld`` mapping whose ``cands`` entries intersect gets an
    edge.  The weights of each edge are recorded symmetrically -- at
    (row, col) and at (col, row) -- in six sparse matrices.

    Args:
        year: Integer year; interpolated with ``%d`` into the graph path.
        weightF: Callable invoked as ``weightF(oldID1, oldID2, sharedCands,
            numDonations, totalAmount, cands, transactions, amounts,
            totalReceipts)``.  It must return a mapping with the keys
            'jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic' and
            'weighted_adamic'.

    Returns:
        A 9-tuple ``(unipartiteGraph, jaccardAdjMat, jaccard2AdjMat,
        affinityAdjMat, cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat,
        newToOld, oldToNew)``.  Each matrix is an N x N ``csr_matrix`` with
        N = ``len(newToOld)``.
    """
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph
    bipartiteGraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)

    # Load the info about each donor and their recipients
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = \
        getDonorInfos(bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create initial unipartite graph with just nodes and node attributes
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    # One value list per weighting, kept in the order of the returned matrices.
    weightNames = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic',
                   'weighted_adamic')
    weightData = dict((name, []) for name in weightNames)
    rows = []
    cols = []

    # Materialise the key list once.  Slicing ``dict.keys()`` directly only
    # works on Python 2 and rebuilt the whole list on every outer iteration.
    newIDs = list(newToOld.keys())

    # Add the weighted edges for every relevant pair of donor nodes
    for i, newID1 in enumerate(newIDs):
        oldID1 = newToOld[newID1]
        cands1 = cands[oldID1]

        # Only look at later IDs so each unordered pair is visited once.
        for newID2 in newIDs[i + 1:]:
            oldID2 = newToOld[newID2]

            sharedCands = cands1.intersection(cands[oldID2])
            if not sharedCands:
                continue

            # Calculate the weight
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            # Store both (i, j) and (j, i) so the matrices are symmetric.
            rows.extend((newID1, newID2))
            cols.extend((newID2, newID1))
            for name in weightNames:
                value = weights[name]
                weightData[name].extend((value, value))

            # Add the edge between the two nodes
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone = i + 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' %
                             (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
    (jaccardAdjMat, jaccard2AdjMat, affinityAdjMat, cosineAdjMat,
     adamicAdjMat, weightedAdamicAdjMat) = [
        sp.csr_matrix((weightData[name], (rows, cols)), shape=(N, N))
        for name in weightNames
    ]

    timing.finish()

    return (unipartiteGraph, jaccardAdjMat, jaccard2AdjMat, affinityAdjMat,
            cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat, newToOld,
            oldToNew)
return partialFeatures, fullFeatures ################################################################################ # Module command-line behavior # ################################################################################ if __name__ == '__main__': #weightings = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic') #weightings = ('adamic', 'weighted_adamic') weightings = ('jaccard2', ) for year in sys.argv[1:]: year = int(year) timing = Timer('Generating features for %d' % year) graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year) receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts( graph) partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph) baselineFeatures = \ getBaselineFeatures(graph, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures) saveFeatures(graph, baselineFeatures, 'Data/Recip-Features/%d.baseline' % year) timing.markEvent('Generated baseline features') for weighting in weightings: donorFeatures = pickler.load('Data/Features/%d%s.features' \ % (year, weighting)) recipFeatures = getRecipFeatures(graph, donorFeatures, receiptsFromDonor, totalReceipts,
communities[nid] = communityIndex communityIndex += 1 return communities ################################################################################ # Module command-line behavior # ################################################################################ if __name__ == '__main__': for arg in sys.argv[1:]: year = int(arg) timing = Timer('creating unipartite graph for %d' % year) bipartiteGraph = graph_funcs.loadGraph( 'Data/Bipartite-Graphs/%d.graph' % year) unipartiteGraph = graph_funcs.loadGraph( 'Data/Unipartite-Graphs/%d.graph' % year, snap.TUNGraph) newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year) timing.markEvent('Loaded input graphs/matrices.') #for weightF in ['jaccard', 'affinity', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic']: for weightF in ['jaccard2']: print '******* %s *******' % weightF adjMatrix = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF)) adjMatrix = adjMatrix.tocsc() features = generateFeatures(year, bipartiteGraph, unipartiteGraph, newToOldIDs, adjMatrix)