예제 #1
0
def main(args):
    """Run a per-group self similarity join over a wiki votes file.

    Loads a tab-separated table with integer ``UserId`` and ``AdminId``
    columns, calls ``SelfSimJoinPerGroup`` grouped on ``UserId`` over the
    ``AdminId`` column with ``snap.Jaccard`` and a threshold of 0.5, filters
    the result with ``SelectAtomic(..., snap.NEQ)`` on the two ``UserId``
    columns, passes the table to ``testutils.dump`` and saves it.

    Args:
        args: Argument vector laid out like ``sys.argv``: ``args[1]`` is the
            votes file and ``args[2]`` is the output file.

    Exits with status 1, after printing the usage text, when ``args`` holds
    fewer than three entries.
    """
    if len(args) < 3:
        print(get_usage())
        sys.exit(1)

    # Read the paths from the validated ``args`` rather than from the global
    # ``sys.argv`` so that the length check above guards what is used here.
    votes_path = args[1]
    out_path = args[2]

    t = testutils.Timer(ENABLE_TIMER)
    context = snap.TTableContext()

    vote_schema = snap.Schema()
    for column in ("UserId", "AdminId"):
        vote_schema.Add(snap.TStrTAttrPr(column, snap.atInt))
    # NOTE(review): the trailing TBool(False) is presumably the
    # "file has a title line" flag of LoadSS - confirm against the snap API.
    votes_table = snap.TTable.LoadSS("WikiVotes", vote_schema, votes_path,
                                     context, '\t', snap.TBool(False))
    t.show("load Votes", votes_table)

    group_by = snap.TStrV()
    group_by.Add("UserId")
    joined = votes_table.SelfSimJoinPerGroup(group_by, "AdminId",
                                             DISTANCE_ATTRIBUTE, snap.Jaccard,
                                             0.5)
    t.show("SimJoinPerGroup complete", joined)

    # Compare the two UserId columns of the join with NEQ so a user is not
    # reported as similar to itself.
    joined.SelectAtomic("WikiVotes_1.UserId", "WikiVotes_2.UserId", snap.NEQ)
    t.show("Select complete", joined)

    testutils.dump(joined, 20)
    joined.SaveSS(out_path)
예제 #2
0
def main(args):
    """Self-similarity-join Yelp businesses on their coordinates.

    Loads a tab-separated table with ``Name``, ``City``, ``State`` (strings)
    and ``Latitude``, ``Longitude`` (floats), calls ``SelfSimJoin`` on the two
    coordinate columns with ``snap.Haversine`` and a threshold of 2, projects
    the result down to the name/city/state of both sides plus the distance
    column, passes it to ``testutils.dump`` and saves it.

    Args:
        args: Argument vector laid out like ``sys.argv``: ``args[1]`` is the
            Yelp file and ``args[2]`` is the output file.

    Exits with status 1, after printing the usage text, when ``args`` holds
    fewer than three entries.
    """
    if len(args) < 3:
        print(get_usage())
        sys.exit(1)

    # Read the paths from the validated ``args`` rather than from the global
    # ``sys.argv`` so that the length check above guards what is used here.
    yelp = args[1]
    outFile = args[2]

    t = testutils.Timer(ENABLE_TIMER)
    context = snap.TTableContext()

    YelpS = snap.Schema()
    for column, attr_type in (("Name", snap.atStr),
                              ("City", snap.atStr),
                              ("State", snap.atStr),
                              ("Latitude", snap.atFlt),
                              ("Longitude", snap.atFlt)):
        YelpS.Add(snap.TStrTAttrPr(column, attr_type))

    # NOTE(review): the trailing TBool(True) is presumably the
    # "file has a title line" flag of LoadSS - confirm against the snap API.
    TYelp = snap.TTable.LoadSS("Yelp", YelpS, yelp, context, '\t',
                               snap.TBool(True))
    t.show("load Yelp", TYelp)

    Cols = snap.TStrV()
    Cols.Add("Latitude")
    Cols.Add("Longitude")

    # Get all businesses within the threshold distance of each other.
    # NOTE(review): this comment used to say "within 5 kilometers" while the
    # threshold passed is 2 - confirm the intended radius and its unit.
    JointTable = TYelp.SelfSimJoin(Cols, DISTANCE_ATTRIBUTE, snap.Haversine, 2)
    t.show("SimJoin complete", JointTable)

    ProjectionV = snap.TStrV()
    for column in ("Yelp_1.Name", "Yelp_1.City", "Yelp_1.State",
                   "Yelp_2.Name", "Yelp_2.City", "Yelp_2.State",
                   DISTANCE_ATTRIBUTE):
        ProjectionV.Add(column)

    JointTable.ProjectInPlace(ProjectionV)
    t.show("Project complete")

    testutils.dump(JointTable, 100)
    JointTable.SaveSS(outFile)
예제 #3
0
import snap
import testutils

if __name__ == '__main__':
    # Script: load one table from a binary file and one from a text file,
    # join them, and report time/resource usage after each step.

    # Two positional arguments are required: the binary and the text source.
    if len(sys.argv) < 3:
        print("Usage: %s <srcfile1> <srcfile2>" % sys.argv[0])
        sys.exit(1)

    srcfile1, srcfile2 = sys.argv[1], sys.argv[2]

    context = snap.TTableContext()

    # The timer and resource reporters are created before the first load so
    # the "load bin" report covers it.
    t = testutils.Timer()
    r = testutils.Resource()

    # First table: loaded from the binary file through a TFIn stream.
    FIn = snap.TFIn(srcfile1)
    t1 = snap.TTable.Load(FIn, context)
    t.show("load bin", t1)
    r.show("__loadbin__")

    # Second table: a single integer "Index" column read from tab-separated
    # text.
    schema = snap.Schema()
    index_column = snap.TStrTAttrPr("Index", snap.atInt)
    schema.Add(index_column)
    t2 = snap.TTable.LoadSS(schema, srcfile2, context, "\t", snap.TBool(False))
    t.show("load text", t2)
    r.show("__loadtext__")

    # Join column "Src" of the first table against "Index" of the second.
    t3 = t1.Join("Src", t2, "Index")
    t.show("join", t3)
예제 #4
0
def _str_vector(*values):
    """Return a new ``snap.TStrV`` holding ``values`` in the given order."""
    vec = snap.TStrV()
    for value in values:
        vec.Add(value)
    return vec


def _load_table(name, columns, path, context):
    """Load the tab-separated file at ``path`` into a table called ``name``.

    ``columns`` is a sequence of ``(column_name, snap_attribute_type)`` pairs
    that are added to the schema in order.
    """
    schema = snap.Schema()
    for column, attr_type in columns:
        schema.Add(snap.TStrTAttrPr(column, attr_type))
    return snap.TTable.LoadSS(name, schema, path, context, '\t',
                              snap.TBool(False))


def _merge_user_pairs(table, prefix, match_columns):
    """Self-join ``table`` on ``owner`` and reduce it to user pairs.

    ``prefix`` is the table name used by ``SelfJoin`` to prefix the joined
    columns (``<prefix>_1.`` / ``<prefix>_2.``).  ``match_columns`` lists the
    columns that are compared with ``snap.EQ`` between both sides.  The
    returned table has the columns ``userid1``, ``userid2`` and
    ``created_at``.
    """
    #TODO Better column renaming
    table.Order(_str_vector("created_at"), "", snap.TBool(False),
                snap.TBool(True))
    table.Group(_str_vector("owner", "name", "userid"), "UserRepoId")
    table.Unique(_str_vector("UserRepoId"))

    left = prefix + "_1."
    right = prefix + "_2."

    merged = table.SelfJoin("owner")
    for column in match_columns:
        merged.SelectAtomic(left + column, right + column, snap.EQ)
    merged.SelectAtomic(left + "userid", right + "userid", snap.NEQ)

    # BUGBUG - Without this ColMin call created_at is not present in the
    # merged table. ProjectInPlace will not complain and silently excludes
    # created_at from the result, which leads to the Index:-1 error in
    # SelectAtomicIntConst on created_at later in the code.
    merged.ColMin(left + "created_at", right + "created_at", "created_at")

    merged.ProjectInPlace(
        _str_vector(left + "userid", right + "userid", "created_at"))
    merged.Rename(left + "userid", "userid1")
    merged.Rename(right + "userid", "userid2")
    return merged


def _count_hits(base_graph, delta_graph, ranking, userid):
    """Count accurate predictions among the first N_TOP_RECOS ranked nodes.

    ``ranking`` is walked in key order; ``userid`` itself is skipped and does
    not count towards the N_TOP_RECOS candidates.  A candidate is a hit when
    its edge to ``userid`` is absent from ``base_graph`` but present (in
    either direction) in ``delta_graph``.
    """
    hits = 0
    considered = 0
    position = 0
    while considered < N_TOP_RECOS and position < ranking.Len():
        recoid = ranking.GetKey(position)
        position += 1
        if recoid == userid:
            continue
        considered += 1
        if (not base_graph.IsEdge(userid, recoid)
                and delta_graph.IsNode(userid)
                and delta_graph.IsNode(recoid)
                and (delta_graph.IsEdge(userid, recoid)
                     or delta_graph.IsEdge(recoid, userid))):
            hits += 1
    return hits


def _average_index(delta_graph, ranking, userid):
    """Return the mean ``ranking`` key id over ``userid``'s delta neighbours.

    Returns -1 when ``userid`` is not a node of ``delta_graph`` (no new edges
    were formed) or has no neighbours there.
    """
    if not delta_graph.IsNode(userid):
        return -1
    node = delta_graph.GetNI(userid)
    neighbours = list(node.GetOutEdges()) + list(node.GetInEdges())
    if not neighbours:
        return -1
    total = sum(ranking.GetKeyId(nid) for nid in neighbours)
    # Force float division so the average is not truncated under Python 2.
    return float(total) / len(neighbours)


def main(args):
    """Evaluate random-walk collaborator recommendations on GitHub data.

    Loads the follow, collab and pull tables found under ``args[1]``, derives
    user pairs from them, splits the pairs at ``args[2]`` (converted with
    ``utils.date_to_ticks``) into a base and a delta graph, then runs NITERS
    random walks with restarts on the base graph and prints, per start node,
    the number of hits among the top N_TOP_RECOS and the average index,
    followed by the average precision.

    Args:
        args: Argument vector laid out like ``sys.argv``: ``args[1]`` is the
            directory holding the ``.tsv`` files and ``args[2]`` is the split
            date.

    Exits with status 1, after printing the usage text, when ``args`` holds
    fewer than three entries or a required file is missing.
    """
    if len(args) < 3:
        print(get_usage())
        sys.exit(1)

    # Read from the validated ``args`` rather than the global ``sys.argv``.
    root = args[1]
    mid_date = args[2]
    mid_ticks = utils.date_to_ticks(mid_date)

    file_cache = {
        TCOLLAB: None,
        TPULL: None,
        TREPO: None,
        TFOLLOW: None,
        TWATCH: None,
        TFORK: None,
    }

    for filename in os.listdir(root):
        if filename.endswith(".tsv"):
            file_cache[filename] = os.path.join(root, filename)
            print(file_cache[filename])

    if any(path is None for path in file_cache.values()):
        print("One of the required files not found.")
        print(get_usage())
        sys.exit(1)

    t = testutils.Timer(ENABLE_TIMER)
    context = snap.TTableContext()

    # The follow table is loaded (and timed) but not used further below.
    Tfollow = _load_table("Tfollow", [
        ("userid1", snap.atStr),
        ("userid2", snap.atStr),
        ("created_at", snap.atInt),
    ], file_cache[TFOLLOW], context)
    t.show("load follow")

    Tcollab = _load_table("Tcollab", [
        ("userid", snap.atStr),
        ("owner", snap.atStr),
        ("name", snap.atStr),
        ("created_at", snap.atInt),
    ], file_cache[TCOLLAB], context)
    t.show("load collab")

    Tpull = _load_table("Tpull", [
        ("userid", snap.atStr),
        ("owner", snap.atStr),
        ("name", snap.atStr),
        ("pullid", snap.atInt),
        ("status", snap.atStr),
        ("created_at", snap.atInt),
    ], file_cache[TPULL], context)
    t.show("load pull")

    # If (u,v) collaborated on the same repository - determined by the
    # owner, name pair - they are added as collaborators.
    Tcollab_merge = _merge_user_pairs(Tcollab, "Tcollab", ["name"])
    t.show("merge collab", Tcollab_merge)

    # If (u,v) worked on the same pull request on the same repository, they
    # are added as (soft) collaborators.
    Tpull_merge = _merge_user_pairs(Tpull, "Tpull", ["name", "pullid"])
    t.show("merge pull", Tpull_merge)

    # BUGBUG: UnionAll is returning unexpected result at this point
    #Tmerge = Tcollab_merge.UnionAll(Tpull_merge, "Tmerge")
    Tmerge = Tpull_merge

    # Select the base and delta tables from the merged table.
    # NOTE(review): LTE and GTE both include rows at exactly mid_ticks, so
    # such rows land in both tables - confirm that this is intended.
    Tbase = snap.TTable.New(Tmerge, "Base")
    Tdelta = snap.TTable.New(Tmerge, "Delta")

    Tbase.SelectAtomicIntConst("created_at", mid_ticks, snap.LTE)
    Tdelta.SelectAtomicIntConst("created_at", mid_ticks, snap.GTE)

    #TODO: Union Tbase with collab and pull to include (userid, owner) edge
    t.show("collab union")

    # Convert the base and delta tables to graphs.
    Gbase = snap.ToNetwork(snap.PNEANet, Tbase, "userid1", "userid2",
                           snap.aaFirst)
    Gdelta = snap.ToNetwork(snap.PNEANet, Tdelta, "userid1", "userid2",
                            snap.aaFirst)
    t.show("base graph", Gbase)
    t.show("delta graph", Gdelta)

    NITERS = 20
    total_preck = 0

    print("Userid\tPrec@%d\tAverage Index" % (N_TOP_RECOS))

    # Random walk with restarts
    # BUGBUG: Gbase.GetRndNId() returns the same id every time, so the start
    # node is drawn with random.choice instead. The id list does not change
    # between iterations and is therefore built once.
    node_ids = [node.GetId() for node in Gbase.Nodes()]
    for _ in range(NITERS):
        # Randomly choose a starting node
        userid = random.choice(node_ids)

        # Perform random walk with restarts on base graph
        HT = snap.TIntFltH()
        snap.GetRndWalkRestart_PNEANet(Gbase, ALPHA, userid, HT)
        HT.SortByDat(False)

        preck = _count_hits(Gbase, Gdelta, HT, userid)
        average_index = _average_index(Gdelta, HT, userid)

        total_preck += preck
        print("%d\t%d\t%f" % (userid, preck, average_index))

        #rank = snap.TTable.New("Rank", HT, "User", PAGE_RANK_ATTRIBUTE, context, snap.TBool(True))
    print("Average Precision@%d = %f" %
          (N_TOP_RECOS, total_preck / float(NITERS)))
예제 #5
0
# Three input files (argv[1..3]) are read unconditionally below, so at least
# four argv entries are required; the destination file is optional.
if len(sys.argv) < 4:
    print("""Usage: python 03-StackOverflow-snap.py <posts.tsv> <tags.tsv> <comments.tsv> <dest.tsv>
  posts.tsv: path to posts.tsv file
  tags.tsv: path to tags.tsv file
  comments.tsv: path to comments.tsv file
  dest.tsv: output .tsv file containing expert scores""")

    sys.exit(1)
postsFile = sys.argv[1]
tagsFile = sys.argv[2]
commentsFile = sys.argv[3]
# argv[4] only exists when argv holds at least five entries.
destFile = sys.argv[4] if len(sys.argv) >= 5 else None

context = snap.TTableContext()

t = testutils.Timer(ENABLE_TIMER)

# a) Compute authority scores

# Load posts
# >>> posts = ringo.load('posts.tsv')
S = snap.Schema()
S.Add(snap.TStrTAttrPr("PostId", snap.atInt))
S.Add(snap.TStrTAttrPr("UserId", snap.atInt))
S.Add(snap.TStrTAttrPr("AcceptedAnswerId", snap.atInt))
S.Add(snap.TStrTAttrPr("CreationDate", snap.atStr))
posts = snap.TTable.LoadSS("t1", S, postsFile, context, '\t',
                           snap.TBool(False))
t.show("load posts", posts)

# Load tags
예제 #6
0
import time

sys.path.append("../utils")

import snap
import testutils

if __name__ == '__main__':
    # Script: load a directed graph from a binary file, report load
    # time/resources, then loop ten times over a ranking computation (the
    # loop body is truncated in this excerpt).

    # One positional argument is required: the binary graph file.
    if len(sys.argv) < 2:
        print "Usage: " + sys.argv[0] + " <graph>"
        sys.exit(1)

    srcfile = sys.argv[1]

    # Reporters created before the load so that "load graph" covers it.
    t = testutils.Timer()
    r = testutils.Resource()

    FIn = snap.TFIn(srcfile)
    g = snap.TNGraph.Load(FIn)

    # A second timer/resource pair created after the load.
    # NOTE(review): presumably reported once after the loop as an overall
    # total - the code using them is outside this excerpt.
    tall = testutils.Timer()
    rall = testutils.Resource()

    t.show("load graph", g)
    r.show("__loadbin__")

    for i in xrange(0, 10):
        # Fresh int -> float hash table for every iteration.
        PRankH = snap.TIntFltH()

        # change the comments below to select the algorithm:
예제 #7
0
def main(args):
    """Build a user-to-user network from a pull-request TSV file.

    Loads the file named by ``args[0]``, orders/groups/uniques the rows,
    self-joins the table on ``owner``, filters the join down to rows with the
    same ``name`` and ``pullid`` but different ``userid``, and converts the
    resulting (userid1, userid2, created_at) table into a ``snap.PNEANet``.

    Args:
        args: Positional arguments without the program name; ``args[0]`` is
            the path of the TSV file.

    Exits with status 1, after printing a usage line, when ``args`` is empty.
    """

    def strv(*names):
        # Small local factory for a snap string vector.
        vec = snap.TStrV()
        for name in names:
            vec.Add(name)
        return vec

    if len(args) == 0:
        print("python github-join.py <path_to_tsv_file>")
        sys.exit(1)

    tsv_path = args[0]

    t = testutils.Timer(ENABLE_TIMER)
    context = snap.TTableContext()

    pull_schema = snap.Schema()
    for column, attr_type in (("userid", snap.atStr),
                              ("owner", snap.atStr),
                              ("name", snap.atStr),
                              ("pullid", snap.atInt),
                              ("status", snap.atStr),
                              ("created_at", snap.atInt)):
        pull_schema.Add(snap.TStrTAttrPr(column, attr_type))
    Tpull = snap.TTable.LoadSS("Tpull", pull_schema, tsv_path, context, '\t',
                               snap.TBool(False))
    t.show("load pull")

    Tpull.Order(strv("created_at"), "", snap.TBool(False), snap.TBool(True))
    Tpull.Group(strv("owner", "name", "userid"), "TagId")
    Tpull.Unique(strv("TagId"))

    t.show("Unique", Tpull)

    Tpull_merge = Tpull.SelfJoin("owner")
    t.show("Merge", Tpull_merge)

    # Things work fine till this point
    for column, comparison in (("name", snap.EQ),
                               ("pullid", snap.EQ),
                               ("userid", snap.NEQ)):
        Tpull_merge.SelectAtomic("Tpull_1." + column, "Tpull_2." + column,
                                 comparison)
    Tpull_merge.ColMin("Tpull_1.created_at", "Tpull_2.created_at",
                       "created_at")

    Tpull_merge.ProjectInPlace(
        strv("Tpull_1.userid", "Tpull_2.userid", "created_at"))

    for side, renamed in (("Tpull_1.userid", "userid1"),
                          ("Tpull_2.userid", "userid2")):
        Tpull_merge.Rename(side, renamed)

    # Copy Tpull_merge twice - base and delta. The intent is to keep rows
    # with created_at < x in base and rows with created_at > x in delta; the
    # selects are still disabled.
    Tbase = snap.TTable.New(Tpull_merge, "Base")
    Tdelta = snap.TTable.New(Tpull_merge, "Delta")

    #Tbase.SelectAtomicIntConst("created_at", x, snap.LTE)
    #Tdelta.SelectAtomicIntConst("created_at", x, snap.GTE)

    G = snap.ToNetwork(snap.PNEANet, Tbase, "userid1", "userid2", snap.aaFirst)
    t.show("graph", G)