示例#1
0
def generateFeatures(year, bipartite, unipartite, newToOldIDs, adjMatrix):
    """Build the combined feature vector for every donor in the bipartite graph.

    Each donor's bipartite features are joined (with ``+``) to its unipartite
    features.  A donor that has no entry in the unipartite feature table gets
    the default unipartite features instead; per the original author's note
    this happens when the node fell below the unipartite threshold applied
    in sqlToGraphs.

    Args:
        year: Integer year; only used to label the timer.
        bipartite: Graph passed to extractBipartiteFeatures and
            graph_funcs.getDonors.
        unipartite: Graph passed to extractUnipartiteFeatures.
        newToOldIDs: Mapping passed to convertNewToOldIDs to re-key the raw
            unipartite features.
        adjMatrix: Adjacency matrix passed to extractUnipartiteFeatures.

    Returns:
        Dict keyed by each donor node's GetId() value, holding the combined
        bipartite + unipartite features.
    """
    timing = Timer('generating features for %d' % year)

    bipartiteFeatures = extractBipartiteFeatures(bipartite)
    timing.markEvent('Extracted bipartite features.')

    unipartiteResult = extractUnipartiteFeatures(unipartite, adjMatrix)
    rawUnifeatures, componentFeatureFunc, CNMFeatureFunc = unipartiteResult
    unipartiteFeatures = convertNewToOldIDs(rawUnifeatures, newToOldIDs)
    timing.markEvent('Extracted unipartite features.')

    combined = {}
    for donor in graph_funcs.getDonors(bipartite):
        nid = donor.GetId()
        # Look up the bipartite half first so the evaluation order matches
        # the left-to-right order of the concatenation.
        bipartitePart = bipartiteFeatures[nid]
        if nid in unipartiteFeatures:
            unipartitePart = unipartiteFeatures[nid]
        else:
            # Donor is missing from the unipartite features: use defaults.
            unipartitePart = defaultUnipartiteFeatures(componentFeatureFunc,
                                                       CNMFeatureFunc)
        combined[nid] = bipartitePart + unipartitePart
    timing.finish()

    return combined
示例#2
0
def trainAndTestModels(year,
                       extension,
                       X=None,
                       Y=None,
                       k=10,
                       clf=linear_model.LinearRegression(),
                       transF=None,
                       decomp_func=None):
    """Cross-validate a model on the recipient features and return its scores.

    Args:
        year: Integer year; used in the timer label and the default data path.
        extension: String suffix; used in the timer label and the default
            data path.
        X: Feature data, indexed with the fold index sets.  Loaded from
            'Data/Recip-Features/<year>.<extension>' when X or Y is None.
        Y: Target data, indexed the same way as X.
        k: Second argument handed to KFold (number of folds).
        clf: Estimator exposing fit(X, Y) and score(X, Y).  It is refit on
            every fold.
        transF: Optional callable applied to Y before the folds are built.
        decomp_func: Optional transformer exposing fit(X) and transform(X).
            It is fit on the training split only and then applied to both
            splits.

    Returns:
        List with one clf.score(...) value per fold, in fold order.
    """
    timing = Timer('Running regression for %d.%s' % (year, extension))
    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF:
        Y = transF(Y)
    timing.markEvent('Loaded X and Y')

    foldScores = []
    # Fit on each training split and score on the matching held-out split.
    for trainIdx, testIdx in KFold(len(Y), k):
        trainX, testX = X[trainIdx], X[testIdx]
        trainY, testY = Y[trainIdx], Y[testIdx]

        if decomp_func:
            # The decomposition only ever sees training data when fitting.
            decomp_func.fit(trainX)
            trainX = decomp_func.transform(trainX)
            testX = decomp_func.transform(testX)

        clf.fit(trainX, trainY)
        foldScores.append(clf.score(testX, testY))
    timing.markEvent('Ran regression')

    timing.finish()
    return foldScores
示例#3
0
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    """Compute and save recipient features for every graph file.

    For each graph file the pickled donor features are loaded from
    'Data/Features/<gf>.features', turned into recipient features, and
    written to 'Data/Recip-Features/<gf>'.

    Args:
        year: Integer year; used in the timer label and default paths.
        weightF: Weighting name; used in the timer label and to look up the
            default graph files.
        graphFiles: Iterable of graph file names.  When falsy it is obtained
            from getGraphFiles(year, weightF).
        bigraph: Bipartite graph.  When falsy it is loaded from
            'Data/Bipartite-Graphs/<year>.graph'.

    Returns:
        None; results are written through recip_feature_extractor.saveFeatures.
    """
    timing = Timer('Generating recip features for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraphPath = 'Data/Bipartite-Graphs/%d.graph' % year
        bigraph = graph_funcs.loadGraph(bigraphPath)

    # These depend only on the bipartite graph, so compute them once and
    # reuse them for every graph file.
    donationAmounts = recip_feature_extractor.getDonationAmounts(bigraph)
    receiptsFromDonor, totalReceipts, totalDonations = donationAmounts
    categorical = recip_feature_extractor.getCategoricalGraphFeatures(bigraph)
    partialFeatures, fullFeatures = categorical

    timing.markEvent('Loaded bigraph, donor amounts, and categorical feature funcs')

    for graphFile in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % graphFile)
        timing.markEvent('Loaded donor features for graph %s' % graphFile)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph, donorFeatures, receiptsFromDonor, totalReceipts,
            totalDonations, partialFeatures, fullFeatures)
        timing.markEvent('Calculated recip features')

        outputPath = 'Data/Recip-Features/%s' % graphFile
        recip_feature_extractor.saveFeatures(bigraph, recipFeatures, outputPath)
        timing.markEvent('Saved recip features')

    timing.finish()
示例#4
0
def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None, adjMat=None, newToOldIDs=None):
    """Generate and pickle donor features for every unipartite graph file.

    Args:
        year: Integer year; used in the timer label and default paths.
        weightF: Weighting name; used in the timer label and default paths.
        graphFiles: Iterable of unipartite graph file names.  When falsy it
            is obtained from getGraphFiles(year, weightF).
        bigraph: Bipartite graph.  When falsy it is loaded from
            'Data/Bipartite-Graphs/<year>.graph'.
        adjMat: Adjacency matrix.  When None it is unpickled from
            'Data/Unipartite-Matrix/<year>.<weightF>' and converted with
            tocsc(); a matrix supplied by the caller is used as given.
        newToOldIDs: Node-id mapping.  When None it is unpickled from
            'Data/Unipartite-NodeMappings/<year>.newToOld'.

    Returns:
        None; each feature dict is saved to 'Data/Features/<gf>.features'.
    """
    timing = Timer('Generating donor features for %d %s' % (year, weightF))

    # graphFiles and bigraph fall back on truthiness; the matrix and the
    # mapping are compared against None explicitly.
    graphFiles = graphFiles or getGraphFiles(year, weightF)
    bigraph = bigraph or graph_funcs.loadGraph(
        'Data/Bipartite-Graphs/%d.graph' % year)
    if adjMat is None:
        matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
        adjMat = pickler.load(matrixPath).tocsc()
    if newToOldIDs is None:
        mappingPath = 'Data/Unipartite-NodeMappings/%d.newToOld' % year
        newToOldIDs = pickler.load(mappingPath)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for graphFile in graphFiles:
        unigraph = graph_funcs.loadGraph(
            'Data/Unipartite-Graphs/%s' % graphFile, snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % graphFile)

        donorFeatures = feature_extractor.generateFeatures(
            year, bigraph, unigraph, newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        pickler.save(donorFeatures, 'Data/Features/%s.features' % graphFile)
        timing.markEvent('Saved features')

    timing.finish()
示例#5
0
def calcAverageWeights(graph, adjMat):
    """Return the average adjacency weight between each node and its neighbors.

    Args:
        graph: Graph whose Edges() yields edges exposing GetSrcNId() and
            GetDstNId().
        adjMat: Matrix indexed as adjMat[rowIds, columnId]; the selected
            entries are summed with .sum().

    Returns:
        Dict mapping node id to the sum of adjMat[neighbors, node] divided by
        the number of neighbor entries.  Only nodes that appear as an
        endpoint of at least one edge get a key.
    """
    neighbors = defaultdict(list)
    timing = Timer('Calculating average weights')
    # Record both endpoints of every edge in each other's neighbor list.
    for edge in graph.Edges():
        src = edge.GetSrcNId()
        dst = edge.GetDstNId()
        neighbors[src].append(dst)
        neighbors[dst].append(src)
    timing.markEvent('Gotten all neighbors')

    weights = {}
    total = len(neighbors)
    for done, (nodeid, rows) in enumerate(neighbors.items(), 1):
        # rows is never empty here: a key only exists once something was
        # appended to it, so the division cannot hit zero.
        weights[nodeid] = adjMat[rows, nodeid].sum() / float(len(rows))
        if done % 1000 == 0:
            timing.markEvent('Done with %d out of %d' % (done, total))

    # Close the timer, as the other timed functions in this code do.
    timing.finish()
    return weights
示例#6
0
def extractUnipartiteFeatures(unipartiteGraph, adjMat):
    """Compute per-node features for the unipartite graph.

    The surface features come from getUnipartiteSurfaceFeatures, which is
    handed the `features` dict (presumably filling it in place -- nothing
    else here adds keys before the final loop).  Three graph-wide values are
    then appended to each node's list, in this order: average edge weight,
    the calcCommunities value for the node, and PageRank.

    Args:
        unipartiteGraph: Graph passed to the feature helpers and to
            snap.GetPageRank.
        adjMat: Adjacency matrix passed to the surface-feature and
            average-weight helpers.

    Returns:
        Tuple (features, componentFeatureFunc, CNMFeatureFunc), where
        features is a defaultdict(list) keyed by node id and the two
        functions are returned unchanged from getUnipartiteSurfaceFeatures.
    """
    timing = Timer('extracting unipartite features')

    features = defaultdict(list)
    surfaceResult = getUnipartiteSurfaceFeatures(unipartiteGraph, adjMat,
                                                 features)
    componentFeatureFunc, CNMFeatureFunc, idToCNM = surfaceResult
    timing.markEvent('1. Extracted surface features')

    # Average weight of the edges touching each node.
    avgWeights = calcAverageWeights(unipartiteGraph, adjMat)
    timing.markEvent('2. Computed average weights.')

    # The connected-component computation is disabled; only its timer event
    # is still emitted.
    timing.markEvent('3. Computed connected components.')

    # Per-node value derived from the CNM community assignment.
    communities = calcCommunities(idToCNM)
    timing.markEvent('4. Computed CNM communities.')

    # PageRank, written by snap into the hash table passed to it.
    pageRanks = snap.TIntFltH()
    snap.GetPageRank(unipartiteGraph, pageRanks)
    timing.markEvent('5. Computed PageRank.')

    # Append the graph-wide values after the existing surface features.
    for nid, featureList in features.items():
        featureList.append(avgWeights[nid])
        featureList.append(communities[nid])
        featureList.append(pageRanks[nid])

    timing.finish()

    return features, componentFeatureFunc, CNMFeatureFunc
示例#7
0
def runFullPipeline(year):
    """Run donor features, recipient features and evaluation for one year.

    For each weighting the adjacency matrix is loaded, donor features are
    generated, the matrix is released, recipient features are generated, and
    the getResults output is pickled to
    'Data/pruning_optimizations.<year>.<weightF>'.

    Args:
        year: Integer year selecting the input files under 'Data/'.

    Returns:
        None.
    """
    timing = Timer('Running pipeline for %d' % year)

    weightings = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')
    # The bipartite graph and the id mapping do not depend on the weighting,
    # so load them once.
    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightings:
        graphFiles = getGraphFiles(year, weightF)

        # Convert to CSC like the other places that load this matrix:
        # genDonorFeatures only converts a matrix it loads itself, and the
        # feature code indexes the matrix by column (adjMat[rows, nodeid]).
        # Converting on the loaded value keeps no extra reference to the
        # unconverted matrix.
        adjMat = pickler.load(
            'Data/Unipartite-Matrix/%d.%s' % (year, weightF)).tocsc()
        timing.markEvent('Loaded everything for donor features')
        genDonorFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph,
                         adjMat=adjMat, newToOldIDs=newToOldIDs)
        del adjMat  # free the incredible amount of memory for the adjacency matrix

        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph)
        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results, 'Data/pruning_optimizations.%d.%s' % (year, weightF))
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
示例#8
0
def processYearAndWeight(year, weighting, percents=None, thresholds=None):
    """Prune the unipartite graph at each requested cutoff and save the results.

    Args:
        year: Integer year; selects the adjacency matrix and names outputs.
        weighting: Weighting name; selects the adjacency matrix and names
            outputs.
        percents: Optional iterable of values passed to pruneGraphByPercent.
            Each result is saved to
            'Data/Unipartite-Graphs/<year>.<weighting>_percent_<p>.graph'.
        thresholds: Optional iterable of values passed to
            pruneGraphByThreshold.  Each result is saved to
            'Data/Unipartite-Graphs/<year>.<weighting>_threshold_<t>.graph'.

    Returns:
        None.
    """
    timing = Timer('Running for year %d and weight %s' % (year, weighting))
    sortedVals, N = getSortedMatrixVals(
        'Data/Unipartite-Matrix/%d.%s' % (year, weighting))
    timing.markEvent('Got sorted vals')

    # `or ()` turns a missing or empty argument into an empty loop.
    for pct in percents or ():
        percentPath = 'Data/Unipartite-Graphs/%d.%s_percent_%f.graph' % (
            year, weighting, pct)
        graph_funcs.saveGraph(pruneGraphByPercent(sortedVals, N, pct),
                              percentPath)
        timing.markEvent('Finished for %f percent' % pct)

    for cutoff in thresholds or ():
        thresholdPath = 'Data/Unipartite-Graphs/%d.%s_threshold_%f.graph' % (
            year, weighting, cutoff)
        graph_funcs.saveGraph(pruneGraphByThreshold(sortedVals, N, cutoff),
                              thresholdPath)
        timing.markEvent('Finished for threshold %f' % cutoff)

    timing.finish()
示例#9
0
}

# Models to evaluate, keyed by the display name used in the results.
clfs = {
    'OLS': linear_model.LinearRegression(),
    'Random Forest': ensemble.RandomForestRegressor(),
}

# Feature-file extensions (one per edge weighting) to evaluate.
extensions = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')

# results[year][extension][clfname][decompname] -> tuple of per-fold scores;
# resultsList holds the same data as flat tuples.
results = {}
resultsList = []

# Every command-line argument is a year to process.
years = [int(arg) for arg in sys.argv[1:]]
timing = Timer('Runnign everything')
for year in years:
    timing.markEvent('Running for year %d' % year)
    yearResults = results[year] = {}
    for extension in extensions:
        timing.markEvent('Running for extension %s' % extension)
        extensionResults = yearResults[extension] = {}
        for clfname, clf in clfs.iteritems():
            timing.markEvent('Running for classifier %s' % clfname)
            clfResults = extensionResults[clfname] = {}
            for decompname, decompFunction in decompFunctions.iteritems():
                timing.markEvent('Running for decomp function %s' % decompname)
                rsquareds = cfscore_predictions.trainAndTestModels(
                    year, extension, clf=clf, decomp_func=decompFunction)
                foldScores = tuple(rsquareds)
                resultsList.append(
                    (year, extension, clfname, decompname, foldScores))
                clfResults[decompname] = foldScores
示例#10
0
def createDonorDonorGraph(year, weightF):
    """Build the donor-donor graph and its weighted adjacency matrices.

    Two donors are connected when the intersection of their candidate sets
    (from getDonorInfos) is non-empty.  For each connected pair, weightF is
    called once and its values are stored symmetrically in six sparse
    matrices.

    Args:
        year: Integer year selecting 'Data/Bipartite-Graphs/<year>.graph'.
        weightF: Callable invoked as weightF(oldID1, oldID2, sharedCands,
            numDonations, totalAmount, cands, transactions, amounts,
            totalReceipts).  It must return a mapping with the keys
            'jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic' and
            'weighted_adamic'.

    Returns:
        Tuple (unipartiteGraph, jaccardAdjMat, jaccard2AdjMat,
        affinityAdjMat, cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat,
        newToOld, oldToNew).  Each matrix is an N x N scipy CSR matrix with
        N = len(newToOld).
    """
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph
    bipartiteGraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' %
                                           year)

    # Load the info about each donor and their recipients
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = \
        getDonorInfos(bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create initial unipartite graph with just nodes and node attributes
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(
        bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    # One value list per weighting.  The tuple order fixes the order of the
    # returned matrices.
    weightNames = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic',
                   'weighted_adamic')
    weightData = dict((name, []) for name in weightNames)
    r = []
    c = []

    # Take the key list once.  The mapping is not modified in the loop, so
    # this yields the same pairs as asking for keys() on every iteration.
    nodeIDs = list(newToOld.keys())

    # Add the weighted edges for every relevant pair of donor nodes
    for i, newID1 in enumerate(nodeIDs):
        oldID1 = newToOld[newID1]
        for newID2 in nodeIDs[i + 1:]:
            oldID2 = newToOld[newID2]

            sharedCands = cands[oldID1].intersection(cands[oldID2])
            if not sharedCands:
                continue

            # Calculate the weights
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            # Store both (i, j) and (j, i) so the matrices are symmetric.
            r.extend((newID1, newID2))
            c.extend((newID2, newID1))
            for name in weightNames:
                weight = weights[name]
                weightData[name].extend((weight, weight))

            # Add the edge between the two nodes
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone = i + 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' %
                             (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
    adjMats = tuple(
        sp.csr_matrix((weightData[name], (r, c)), shape=(N, N))
        for name in weightNames)

    timing.finish()
    return (unipartiteGraph,) + adjMats + (newToOld, oldToNew)
示例#11
0
    # Script body: for every year given on the command line, compute and save
    # the baseline recipient features, then the recipient features for each
    # selected weighting.
    #
    # Alternative weighting selections, kept for reference:
    #weightings = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic')
    #weightings = ('adamic', 'weighted_adamic')
    weightings = ('jaccard2', )
    for year in sys.argv[1:]:
        # Command-line arguments arrive as strings; the paths below use %d.
        year = int(year)
        timing = Timer('Generating features for %d' % year)
        graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
        # These depend only on the bipartite graph and are reused for the
        # baseline and for every weighting.
        receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts(
            graph)
        partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph)

        # Baseline features are built without any donor feature file.
        baselineFeatures = \
            getBaselineFeatures(graph, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures)
        saveFeatures(graph, baselineFeatures,
                     'Data/Recip-Features/%d.baseline' % year)
        timing.markEvent('Generated baseline features')

        for weighting in weightings:
            # The donor feature file name has no separator between the year
            # and the weighting name.
            donorFeatures = pickler.load('Data/Features/%d%s.features' \
                    % (year, weighting))
            recipFeatures = getRecipFeatures(graph, donorFeatures,
                                             receiptsFromDonor, totalReceipts,
                                             totalDonations, partialFeatures,
                                             fullFeatures)
            saveFeatures(graph, recipFeatures, 'Data/Recip-Features/%d.%s' \
                    % (year, weighting))
            timing.markEvent('Calculated main recipient features for %s' \
                    % weighting)

        timing.finish()
示例#12
0
################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    # Every command-line argument is a year whose donor features are built.
    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('creating unipartite graph for %d' % year)

        bipartitePath = 'Data/Bipartite-Graphs/%d.graph' % year
        bipartiteGraph = graph_funcs.loadGraph(bipartitePath)
        unipartitePath = 'Data/Unipartite-Graphs/%d.graph' % year
        unipartiteGraph = graph_funcs.loadGraph(unipartitePath, snap.TUNGraph)
        mappingPath = 'Data/Unipartite-NodeMappings/%d.newToOld' % year
        newToOldIDs = pickler.load(mappingPath)
        timing.markEvent('Loaded input graphs/matrices.')

        # Only 'jaccard2' is processed here.  The full set used previously
        # was: 'jaccard', 'affinity', 'jaccard2', 'cosine', 'adamic',
        # 'weighted_adamic'.
        for weightF in ['jaccard2']:
            # A single parenthesized argument prints the same under the
            # Python 2 print statement.
            print('******* %s *******' % weightF)
            matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
            adjMatrix = pickler.load(matrixPath)
            adjMatrix = adjMatrix.tocsc()

            features = generateFeatures(year, bipartiteGraph, unipartiteGraph,
                                        newToOldIDs, adjMatrix)
            featuresPath = 'Data/Features/%d%s.features' % (year, weightF)
            pickler.save(features, featuresPath)

            timing.markEvent('Processed %s weight function' % weightF)
示例#13
0
def getSortedMatrixVals(filename):
    """Load an adjacency matrix and return its nonzero entries, largest first.

    Args:
        filename: Path handed to pickler.load.  The loaded object must
            expose .shape, .nonzero() and paired-index lookup
            (adjMat[xIndices, yIndices]).

    Returns:
        Tuple (vals, N).  vals is a list of (rowIndex, colIndex, value)
        triples sorted by value in descending order; N is adjMat.shape[0].
    """
    timing = Timer('Gettin sorted matrix vals')
    adjMat = pickler.load(filename)
    timing.markEvent('Loaded adjacency matrix')
    N = adjMat.shape[0]
    xIndices, yIndices = adjMat.nonzero()
    timing.markEvent('Loaded nonzero indices')
    data = adjMat[xIndices, yIndices]
    timing.markEvent('Loaded nonzero vals')
    # Flatten the lookup result to one dimension so it zips with the indices.
    flat = np.ravel(data)
    timing.markEvent('Flattened data')

    # list(...) keeps this an in-place-sortable list whether zip returns a
    # list or an iterator.
    vals = list(zip(xIndices, yIndices, flat))
    timing.markEvent('Zipped values')
    vals.sort(key=lambda v: v[2], reverse=True)
    timing.markEvent('Sorted values')

    # The debug prints and the unconditional raise that used to sit here made
    # the return unreachable, and they also failed on matrices with fewer
    # than two nonzero entries.
    timing.finish()
    return vals, N