Пример #1
0
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    """Compute and save recipient features for each graph file.

    Args:
        year: Year used in the timer label and the bipartite graph path.
        weightF: Weighting name used in the timer label and, when
            `graphFiles` is falsy, to look up the graph files.
        graphFiles: Optional iterable of graph file names; when falsy it is
            obtained from getGraphFiles(year, weightF).
        bigraph: Optional pre-loaded bipartite graph; when falsy it is
            loaded from 'Data/Bipartite-Graphs/<year>.graph'.
    """
    timing = Timer('Generating recip features for %d %s' % (year, weightF))

    # Fall back to on-disk inputs for anything the caller did not supply.
    graphFiles = graphFiles or getGraphFiles(year, weightF)
    bigraph = bigraph or graph_funcs.loadGraph(
        'Data/Bipartite-Graphs/%d.graph' % year)

    # These depend only on the bipartite graph, so compute them once up front.
    donationAmounts = recip_feature_extractor.getDonationAmounts(bigraph)
    receiptsFromDonor, totalReceipts, totalDonations = donationAmounts
    categoricalFuncs = recip_feature_extractor.getCategoricalGraphFeatures(bigraph)
    partialFeatures, fullFeatures = categoricalFuncs

    timing.markEvent('Loaded bigraph, donor amounts, and categorical feature funcs')

    for gf in graphFiles:
        donorFeaturePath = 'Data/Features/%s.features' % gf
        donorFeatures = pickler.load(donorFeaturePath)
        timing.markEvent('Loaded donor features for graph %s' % gf)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph,
            donorFeatures,
            receiptsFromDonor,
            totalReceipts,
            totalDonations,
            partialFeatures,
            fullFeatures,
        )
        timing.markEvent('Calculated recip features')

        outputPath = 'Data/Recip-Features/%s' % gf
        recip_feature_extractor.saveFeatures(bigraph, recipFeatures, outputPath)
        timing.markEvent('Saved recip features')

    timing.finish()
Пример #2
0
def loadRecipients(dbNames, filepath):
    """Load the recipients CSV at `filepath` into every database in `dbNames`.

    Each database first has its recipient table initialised. The CSV is then
    read in chunks; each chunk is passed through filterRecipients (sharing
    one set of observed keys across chunks) and committed to every database.

    Args:
        dbNames: Iterable of database names.
        filepath: Path of the CSV file; its first row is skipped as a header.
    """
    timing = Timer('loading Recipients table')

    # (CSV column index, conversion applied to that column), in output order.
    columnSpec = [
        (0, int), (7, str), (8, safeInt), (10, party), (12, str), (13, str),
        (14, incumb), (15, float), (16, float), (22, int), (23, gender),
        (39, safeInt), (46, winner), (47, safeFloat), (61, safeFloat),
        (62, safeFloat), (63, candStatus), (64, int), (65, candOrComm),
    ]
    extractors = [column for column, _ in columnSpec]
    transforms = [convert for _, convert in columnSpec]
    observedKeys = set()

    for db in dbNames:
        initRecipientTable(db)

    with open(filepath, 'r') as csvFile:
        rows = csv.reader(csvFile)
        next(rows)  # skip column headers
        for chunk in generateChunk(rows, extractors, transforms):
            filtered = filterRecipients(chunk, observedKeys)
            for db in dbNames:
                commitRecipBlock(db, filtered)

    timing.finish()
Пример #3
0
def trainAndTestModels(year,
                       extension,
                       X=None,
                       Y=None,
                       k=10,
                       clf=None,
                       transF=None,
                       decomp_func=None):
    """Run k-fold training/testing of a model and return the per-fold scores.

    Args:
        year: Year used in the timer label and the default data path.
        extension: File extension used in the timer label and the default
            data path.
        X: Feature matrix, indexable by the fold index arrays. When either
            X or Y is None, both are loaded from
            'Data/Recip-Features/<year>.<extension>'.
        Y: Target values, indexable by the fold index arrays.
        k: Number of folds passed to KFold.
        clf: Estimator exposing fit(X, Y) and score(X, Y). Defaults to a
            fresh linear_model.LinearRegression() per call.
        transF: Optional callable applied to Y before splitting.
        decomp_func: Optional transformer exposing fit(X) and transform(X);
            it is fitted on each training fold only and then applied to both
            the training and the test fold.

    Returns:
        A list with one clf.score(...) value per fold.
    """
    timing = Timer('Running regression for %d.%s' % (year, extension))

    # Build the default estimator per call rather than once at definition
    # time, so separate calls never share (and re-fit) a single instance.
    if clf is None:
        clf = linear_model.LinearRegression()

    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF:
        Y = transF(Y)
    timing.markEvent('Loaded X and Y')
    rsquareds = []

    # Train and test the regression model on each k-fold set
    kf = KFold(len(Y), k)
    for train, test in kf:
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

        if decomp_func:
            # Fit the decomposition on the training fold only, so the test
            # fold does not influence the fitted transform.
            decomp_func.fit(X_train)
            X_train = decomp_func.transform(X_train)
            X_test = decomp_func.transform(X_test)

        clf.fit(X_train, Y_train)
        rsquareds.append(clf.score(X_test, Y_test))
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds
Пример #4
0
def getDonationAmounts(graph):
    """Tally donation amounts over every edge of `graph`.

    Each edge's source node id is treated as the donor, its destination node
    id as the recipient, and its integer edge attribute 'amount' as the
    donated amount.

    Returns:
        A 3-tuple (receiptsFromDonor, totalReceipts, totalDonations):
          * receiptsFromDonor[recipient][donor] -> summed amount for the pair
          * totalReceipts[recipient] -> summed amount received
          * totalDonations[donor] -> summed amount given
        All three are defaultdicts whose missing entries read as 0.
    """
    timing = Timer('Getting candidate, donor, and cand-donor donation amounts')

    receiptsFromDonor = defaultdict(lambda: defaultdict(int))
    totalReceipts = defaultdict(int)
    totalDonations = defaultdict(int)

    # Record each donation in all three tallies.
    for donation in graph.Edges():
        donorId, recipId = donation.GetSrcNId(), donation.GetDstNId()
        given = graph.GetIntAttrDatE(donation.GetId(), 'amount')

        receiptsFromDonor[recipId][donorId] += given
        totalReceipts[recipId] += given
        totalDonations[donorId] += given

    timing.finish()
    return receiptsFromDonor, totalReceipts, totalDonations
Пример #5
0
def generateFeatures(year, bipartite, unipartite, newToOldIDs, adjMatrix):
    """Build a combined feature entry for every donor in the bipartite graph.

    Each donor's bipartite features are concatenated with its unipartite
    features. A donor with no unipartite entry gets the output of
    defaultUnipartiteFeatures(...) instead.

    Args:
        year: Year, used only in the timer label.
        bipartite: Bipartite graph passed to extractBipartiteFeatures and
            graph_funcs.getDonors.
        unipartite: Unipartite graph passed to extractUnipartiteFeatures.
        newToOldIDs: Mapping handed to convertNewToOldIDs to re-key the raw
            unipartite features.
        adjMatrix: Adjacency matrix passed to extractUnipartiteFeatures.

    Returns:
        Dict from donor node id to the concatenated features.
    """
    timing = Timer('generating features for %d' % year)

    bipartiteFeatures = extractBipartiteFeatures(bipartite)
    timing.markEvent('Extracted bipartite features.')

    unipartiteResult = extractUnipartiteFeatures(unipartite, adjMatrix)
    rawUnifeatures, componentFeatureFunc, CNMFeatureFunc = unipartiteResult
    unipartiteFeatures = convertNewToOldIDs(rawUnifeatures, newToOldIDs)
    timing.markEvent('Extracted unipartite features.')

    # Donors absent from the unipartite features fall back to the defaults.
    features = {}
    for donorNode in graph_funcs.getDonors(bipartite):
        donorId = donorNode.GetId()
        if donorId not in unipartiteFeatures:
            unipartitePart = defaultUnipartiteFeatures(componentFeatureFunc,
                                                       CNMFeatureFunc)
            features[donorId] = bipartiteFeatures[donorId] + unipartitePart
            continue
        features[donorId] = (bipartiteFeatures[donorId] +
                             unipartiteFeatures[donorId])
    timing.finish()

    return features
Пример #6
0
def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None, adjMat=None, newToOldIDs=None):
    """Generate and pickle donor features for every unipartite graph file.

    Any input not supplied by the caller is loaded from disk.

    Args:
        year: Year used in the timer label and the data paths.
        weightF: Weighting name used in the timer label and the data paths.
        graphFiles: Optional iterable of graph file names; when falsy it is
            obtained from getGraphFiles(year, weightF).
        bigraph: Optional pre-loaded bipartite graph; loaded when falsy.
        adjMat: Optional adjacency matrix. When None it is loaded and
            converted with .tocsc(); a caller-supplied matrix is used as-is.
        newToOldIDs: Optional node id mapping; loaded when None.
    """
    timing = Timer('Generating donor features for %d %s' % (year, weightF))

    graphFiles = graphFiles or getGraphFiles(year, weightF)
    bigraph = bigraph or graph_funcs.loadGraph(
        'Data/Bipartite-Graphs/%d.graph' % year)
    if adjMat is None:
        loadedMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        adjMat = loadedMat.tocsc()
    if newToOldIDs is None:
        newToOldIDs = pickler.load(
            'Data/Unipartite-NodeMappings/%d.newToOld' % year)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for gf in graphFiles:
        unigraphPath = 'Data/Unipartite-Graphs/%s' % gf
        unigraph = graph_funcs.loadGraph(unigraphPath, snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % gf)

        donorFeatures = feature_extractor.generateFeatures(
            year, bigraph, unigraph, newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        pickler.save(donorFeatures, 'Data/Features/%s.features' % gf)
        timing.markEvent('Saved features')

    timing.finish()
Пример #7
0
def loadContributors(dbNames, filepath):
    """Load the contributors CSV at `filepath` into every database in `dbNames`.

    Each database first has its contributors table initialised. The CSV is
    then read in chunks and every chunk is committed to every database.

    Args:
        dbNames: Iterable of database names.
        filepath: Path of the CSV file; its first row is skipped as a header.
    """
    timing = Timer('loading Contributors table')
    extractors = [0, 1, 2, 3]
    transforms = [int, indiv, str, safeFloat]

    for db in dbNames:
        initContributorsTable(db)

    # Open through a context manager so the handle is closed even if a
    # commit fails part-way through. The original 'rb' mode is kept as-is.
    with open(filepath, 'rb') as f:
        reader = csv.reader(f)
        next(reader)  # skip column headers
        for block in generateChunk(reader, extractors, transforms):
            for db in dbNames:
                commitContribBlock(db, block)

    timing.finish()
Пример #8
0
def getResults(year, weightF, graphFiles=None):
    """Run the regression for each graph file and collect the scores.

    Args:
        year: Year used in the timer label and forwarded to the regression.
        weightF: Weighting name; forwarded to trainAndTestModels as its
            second positional argument and stored in each result row.
        graphFiles: Optional iterable of graph file names; when falsy it is
            obtained from getGraphFiles(year, weightF).

    Returns:
        A list of [weightF, graphFile, rsquareds] rows, one per graph file.
    """
    timing = Timer('Running regressions for %d %s' % (year, weightF))

    graphFiles = graphFiles or getGraphFiles(year, weightF)

    results = []
    for gf in graphFiles:
        features, targets = pickler.load('Data/Recip-Features/%s' % gf)
        scores = cfscore_predictions.trainAndTestModels(
            year, weightF, X=features, Y=targets)
        results.append([weightF, gf, scores])

    timing.finish()

    return results
Пример #9
0
def loadTransactionFile(dbName, csvName, year):
    """Load one transactions CSV into the transactions table of `dbName`.

    The table is initialised first. The CSV is then read in chunks; each
    chunk is passed through filterTransactions and committed.

    Args:
        dbName: Name of the database to write to.
        csvName: Path of the CSV file; its first row is skipped as a header.
        year: Year, used only in the timer label.
    """
    timing = Timer('loading Transactions_%d into table' % year)

    # CSV column indices to extract, and the conversion applied to each.
    extractors = [
        0, 1, 2, 3, 4, 5, 13,
        27, 28, 29, 33, 34, 36, 37,
    ]
    transforms = [
        int, str, str, strToFltToInt, str, strToFltToInt, indiv,
        str, party, candOrComm, str, str, safeFloat, safeFloat,
    ]
    initTransactionsTable(dbName)

    with open(csvName, 'r') as csvFile:
        rows = csv.reader(csvFile)
        next(rows)  # skip column headers
        for chunk in generateChunk(rows, extractors, transforms):
            commitTransBlock(dbName, filterTransactions(chunk))

    timing.finish()
Пример #10
0
def extractUnipartiteFeatures(unipartiteGraph, adjMat):
    """Extract per-node features from the unipartite graph.

    getUnipartiteSurfaceFeatures is called with a defaultdict(list) as its
    `features` argument. Every node id present in that dict afterwards then
    gets three more values appended, in this order: average edge weight,
    the calcCommunities value, and PageRank.

    Args:
        unipartiteGraph: Graph passed to the surface-feature, average-weight
            and PageRank computations.
        adjMat: Adjacency matrix passed to the surface-feature and
            average-weight computations.

    Returns:
        (features, componentFeatureFunc, CNMFeatureFunc), where the two
        funcs are returned unchanged from getUnipartiteSurfaceFeatures.
    """
    timing = Timer('extracting unipartite features')

    features = defaultdict(list)
    surface = getUnipartiteSurfaceFeatures(unipartiteGraph, adjMat, features)
    componentFeatureFunc, CNMFeatureFunc, idToCNM = surface
    timing.markEvent('1. Extracted surface features')

    # Average weight of edges.
    avgWeights = calcAverageWeights(unipartiteGraph, adjMat)
    timing.markEvent('2. Computed average weights.')

    # No connected-component computation runs here; only the event is
    # recorded, so the step numbering stays the same.
    timing.markEvent('3. Computed connected components.')

    # Size of CNM community.
    communities = calcCommunities(idToCNM)
    timing.markEvent('4. Computed CNM communities.')

    # PageRank.
    pageRanks = snap.TIntFltH()
    snap.GetPageRank(unipartiteGraph, pageRanks)
    timing.markEvent('5. Computed PageRank.')

    # Append the graph-wide features to the existing surface features.
    for nid in features:
        nodeFeatures = features[nid]
        nodeFeatures.append(avgWeights[nid])
        nodeFeatures.append(communities[nid])
        nodeFeatures.append(pageRanks[nid])

    timing.finish()

    return features, componentFeatureFunc, CNMFeatureFunc
Пример #11
0
def processYearAndWeight(year, weighting, percents=None, thresholds=None):
    """Prune one year's weighted adjacency matrix and save the resulting graphs.

    One graph is saved per entry in `percents` (via pruneGraphByPercent)
    and one per entry in `thresholds` (via pruneGraphByThreshold).

    Args:
        year: Year used in the timer label and the file names.
        weighting: Weighting name used in the timer label and the file names.
        percents: Optional iterable of percent values; skipped when falsy.
        thresholds: Optional iterable of threshold values; skipped when falsy.
    """
    timing = Timer('Running for year %d and weight %s' % (year, weighting))
    sortedVals, N = getSortedMatrixVals(
        'Data/Unipartite-Matrix/%d.%s' % (year, weighting))
    timing.markEvent('Got sorted vals')

    for pct in percents or ():
        pctOutfile = ('Data/Unipartite-Graphs/%d.%s_percent_%f.graph'
                      % (year, weighting, pct))
        graph_funcs.saveGraph(pruneGraphByPercent(sortedVals, N, pct),
                              pctOutfile)
        timing.markEvent('Finished for %f percent' % pct)

    for cutoff in thresholds or ():
        cutoffOutfile = ('Data/Unipartite-Graphs/%d.%s_threshold_%f.graph'
                         % (year, weighting, cutoff))
        graph_funcs.saveGraph(pruneGraphByThreshold(sortedVals, N, cutoff),
                              cutoffOutfile)
        timing.markEvent('Finished for threshold %f' % cutoff)

    timing.finish()
Пример #12
0
def getRecipFeatures(graph,
                     donorFeatures,
                     receiptsFromDonor,
                     totalReceipts,
                     totalDonations,
                     partialFeatures,
                     fullFeatures,
                     includeDonorFeatures=False):
    """Build a feature entry for every recipient returned by getRecipients.

    While a recipient is being processed, each of its donors temporarily
    gets one extra feature appended to its donorFeatures entry: the fraction
    of that donor's total donations that went to this recipient. The extra
    feature is popped again before the next recipient, so `donorFeatures`
    is mutated during the call and restored on normal completion.

    Args:
        graph: Graph passed to graph_funcs.getRecipients (with cfs=True).
        donorFeatures: Mapping from donor id to a list of features.
        receiptsFromDonor: receiptsFromDonor[recipient][donor] -> amount.
        totalReceipts: Not used by this function.
        totalDonations: totalDonations[donor] -> total amount given.
        partialFeatures: Not used by this function.
        fullFeatures: Not used by this function.
        includeDonorFeatures: When true, the output of
            getPartialNodeRecipSpecificFeatures is placed in front of the
            processed donor features via np.append.

    Returns:
        Dict from recipient node id to its features.
    """
    timing = Timer('Getting recipient features')
    recipFeatures = {}

    for recipNode in graph_funcs.getRecipients(graph, cfs=True):
        rnodeid = recipNode.GetId()
        donorAmounts = receiptsFromDonor[rnodeid]

        # Temporarily add each donor's share-of-wallet for this recipient.
        for donor in donorAmounts:
            share = donorAmounts[donor] / float(totalDonations[donor])
            donorFeatures[donor].append(share)

        if not includeDonorFeatures:
            recipFeatures[rnodeid] = processDonorFeaturesForRecip(
                donorFeatures, donorAmounts)
        else:
            nodeSpecific = getPartialNodeRecipSpecificFeatures(graph, rnodeid)
            aggregated = processDonorFeaturesForRecip(donorFeatures,
                                                      donorAmounts)
            recipFeatures[rnodeid] = np.append(nodeSpecific, aggregated)

        # Undo the temporary feature so the next recipient starts clean.
        for donor in donorAmounts:
            donorFeatures[donor].pop()

    timing.finish()
    return recipFeatures
Пример #13
0
def runFullPipeline(year):
    """Run donor features, recipient features and regressions for one year.

    For each of the five weightings, in order: generate donor features,
    generate recipient features, run the regressions, and pickle the results
    to 'Data/pruning_optimizations.<year>.<weighting>'. The bipartite graph
    and the node id mapping are loaded once and reused for every weighting.

    Args:
        year: Year used in the timer label and all data paths.
    """
    timing = Timer('Running pipeline for %d' % year)

    weightings = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')
    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightings:
        graphFiles = getGraphFiles(year, weightF)
        sharedInputs = dict(graphFiles=graphFiles, bigraph=bigraph)

        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        timing.markEvent('Loaded everything for donor features')
        genDonorFeatures(year, weightF, adjMat=adjMat,
                         newToOldIDs=newToOldIDs, **sharedInputs)
        # Drop this reference to the adjacency matrix before the next stages.
        del adjMat

        genRecipFeatures(year, weightF, **sharedInputs)
        results = getResults(year, weightF, graphFiles=graphFiles)
        resultsPath = 'Data/pruning_optimizations.%d.%s' % (year, weightF)
        pickler.save(results, resultsPath)
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
Пример #14
0
def getCorrel(year, weightFs):
    """Return np.corrcoef over the stacked nonzero elements of each weighting.

    getNonzeroElems(year, weightF) is evaluated for every entry of
    `weightFs`, the results are joined pairwise along axis 0 with np.append,
    and the stacked data is passed to np.corrcoef.

    Args:
        year: Year used in the timer label and forwarded to getNonzeroElems.
        weightFs: Iterable of weighting names.
    """
    timing = Timer('Getting correlation matrix for year %d' % year)

    def stackRows(stacked, nextRows):
        return np.append(stacked, nextRows, axis=0)

    perWeighting = [getNonzeroElems(year, weightF) for weightF in weightFs]
    data = reduce(stackRows, perWeighting)
    # The timer is stopped before the correlation itself is computed.
    timing.finish()
    return np.corrcoef(data)
Пример #15
0
def getNonzeroElems(year, weightF):
    """Load one year's adjacency matrix and return its nonzero entries.

    Args:
        year: Year used in the timer label and the matrix path.
        weightF: Weighting name used in the timer label and the matrix path.

    Returns:
        The matrix indexed by the result of its own nonzero() call.
    """
    timing = Timer('Loading nonzero elems for year %d and weightf %s ' % (year, weightF))
    matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
    adjMat = pickler.load(matrixPath)
    timing.finish()
    nonzeroIdx = adjMat.nonzero()
    return adjMat[nonzeroIdx]
Пример #16
0
# Driver script: for every year given on the command line, run the regression
# for each (extension, classifier, decomposition) combination and pickle the
# collected scores twice: as a nested dict and as a flat list of tuples.
extensions = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')

# results[year][extension][clfname][decompname] -> tuple of per-fold scores
results = {}
# Flat rows of (year, extension, clfname, decompname, scores)
resultsList = []

years = [int(arg) for arg in sys.argv[1:]]
timing = Timer('Running everything')
for year in years:
    timing.markEvent('Running for year %d' % year)
    results[year] = {}
    for extension in extensions:
        timing.markEvent('Running for extension %s' % extension)
        results[year][extension] = {}
        for clfname, clf in clfs.iteritems():
            timing.markEvent('Running for classifier %s' % clfname)
            results[year][extension][clfname] = {}
            for decompname, decompFunction in decompFunctions.iteritems():
                timing.markEvent('Running for decomp function %s' % decompname)
                rsquareds = cfscore_predictions.trainAndTestModels(
                    year, extension, clf=clf, decomp_func=decompFunction)
                # Store an immutable copy in both result containers.
                scores = tuple(rsquareds)
                resultsList.append(
                    (year, extension, clfname, decompname, scores))
                results[year][extension][clfname][decompname] = scores
                timing.markEvent('Done')

timing.finish()
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(results)
pickler.save(results, 'results')
pickler.save(resultsList, 'resultsList')
Пример #17
0
        # Save the weight matrices:
        matrixPrefix = 'Data/Unipartite-Matrix/%d' % year
        pickler.save(wmat1, matrixPrefix + '.jaccard')
        pickler.save(wmat2, matrixPrefix + '.jaccard2')
        pickler.save(wmat3, matrixPrefix + '.affinity')
        pickler.save(wmat4, matrixPrefix + '.cosine')
        pickler.save(wmat5, matrixPrefix + '.adamic')
        pickler.save(wmat6, matrixPrefix + '.weighted_adamic')

        # Save the bipartite-unipartite corresponding node ID dictionaries:
        mappingPrefix = 'Data/Unipartite-NodeMappings/%d' % year
        pickler.save(newToOld, mappingPrefix + '.newToOld')
        pickler.save(oldToNew, mappingPrefix + '.oldToNew')

        timing.finish()

    overallTiming.finish()

# ------ OLD CODE: ------

# weight = weightF(
#     len(sharedCands),
#     sharedTransactions,
#     sharedAmount,
#     len(cands[id1].union(cands[id2])),
#     numDonations[id1] + numDonations[id2],
#     totalAmount[id1] + totalAmount[id2],
# )

# Get the number of transactions and the total amount given to shared
Пример #18
0
def createDonorDonorGraph(year, weightF):
    """Build the donor-donor graph and its weighted adjacency matrices.

    Every pair of donor nodes that shares at least one candidate gets an
    edge in the unipartite graph and a symmetric entry in each of six
    weight matrices.

    Args:
        year: Year used in the timer label and the bipartite graph path.
        weightF: Callable invoked as
            weightF(oldID1, oldID2, sharedCands, numDonations, totalAmount,
                    cands, transactions, amounts, totalReceipts)
            that must return a mapping with the keys 'jaccard', 'jaccard2',
            'affinity', 'cosine', 'adamic' and 'weighted_adamic'.

    Returns:
        A 9-tuple: (unipartiteGraph, jaccardAdjMat, jaccard2AdjMat,
        affinityAdjMat, cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat,
        newToOld, oldToNew). Each matrix is an N x N scipy CSR matrix with
        N = len(newToOld).
    """
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph
    bipartiteGraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' %
                                           year)

    # Load the info about each donor and their recipients
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = \
        getDonorInfos(bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create initial unipartite graph with just nodes and node attributes
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(
        bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    # One data list per weighting, in the order the matrices are returned.
    weightNames = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic',
                   'weighted_adamic')
    weightData = dict((name, []) for name in weightNames)
    rows = []
    cols = []

    # Materialise the id list once: the original rebuilt it with keys() on
    # every outer iteration, and dict views cannot be sliced on Python 3.
    nodeIDs = list(newToOld.keys())

    # Add the weighted edges for every relevant pair of donor nodes
    for i, newID1 in enumerate(nodeIDs):
        oldID1 = newToOld[newID1]
        for newID2 in nodeIDs[i + 1:]:
            oldID2 = newToOld[newID2]

            sharedCands = cands[oldID1].intersection(cands[oldID2])
            if not sharedCands:
                continue

            # Calculate the weights
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            # Record both (i, j) and (j, i) so each matrix is symmetric.
            rows.extend((newID1, newID2))
            cols.extend((newID2, newID1))
            for name in weightNames:
                weight = weights[name]
                weightData[name].extend((weight, weight))

            # Add the edge between the two nodes
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone = i + 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' % \
                    (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
    adjMats = tuple(
        sp.csr_matrix((weightData[name], (rows, cols)), shape=(N, N))
        for name in weightNames)

    timing.finish()
    return (unipartiteGraph,) + adjMats + (newToOld, oldToNew)