Пример #1
0
import os
import resource
import sys
import time
import pdb

sys.path.append("../utils")
sys.path.append("../ringo-engine-python")
import ringo
import snap
import testutils

if __name__ == '__main__':
    # Benchmark script: load a table from <srcfile>, filter it in place and
    # report elapsed time / resource usage after each step.
    if len(sys.argv) <= 1:
        print("Usage: " + sys.argv[0] + " <srcfile>")
        sys.exit(1)
    input_path = sys.argv[1]

    # The engine is created before the timer and resource tracker, so its
    # construction is not part of the first measured step.
    engine = ringo.Ringo()
    timer = testutils.Timer()
    tracker = testutils.Resource()

    # Step 1: load the table.
    loaded = engine.Load(input_path)
    timer.show("load bin")
    tracker.show("__loadbin__")

    # Step 2: filter rows with Src < 10000; the return value is not used,
    # the checkpoint label describes this as an in-place selection.
    engine.Select(loaded, "Src < 10000")
    timer.show("selected < 10K in place")
    tracker.show("__selectedlt10Kinplace__")
Пример #2
0
import sys
import time
import pdb

sys.path.append("../utils")
sys.path.append("../ringo-engine-python")
import ringo
import snap
import testutils

if __name__ == '__main__':
    # Benchmark script: load a table from <srcfile>, select rows into a new
    # table and report elapsed time / resource usage after each step.
    if len(sys.argv) <= 1:
        print("Usage: " + sys.argv[0] + " <srcfile>")
        sys.exit(1)

    input_path = sys.argv[1]
    engine = ringo.Ringo()

    timer = testutils.Timer()
    tracker = testutils.Resource()

    # Step 1: load the table.
    loaded = engine.Load(input_path)
    timer.show("load bin")
    tracker.show("__loadbin__")

    # Step 2: select rows with Src < 10000. The third argument (False) is
    # what distinguishes this from the in-place variant; the result is
    # captured as a separate table.
    selected = engine.Select(loaded, "Src < 10000", False)

    # Open question left by the original author ("Ringo?"): should the new
    # table instead be built via snap.TTable.New(table.GetSchema(), context)?
    timer.show("selected < 10K new table")
    tracker.show("__selectedlt10Knewtable__")
Пример #3
0
        dstfile: destination file for saving the pagerank table"""
        sys.exit(1)

    srcfile = sys.argv[1]
    # NOTE(review): dstfile is read but never used in the visible part of the
    # script (the PageRank table is not saved here) -- confirm whether a save
    # step follows or is missing.
    dstfile = sys.argv[2]

    ringo = ringo.Ringo()
    t = testutils.Timer()
    r = testutils.Resource()

    # Schema of the posts TSV as (column name, column type) pairs.
    S = [("Id", "int"), ("OwnerUserId", "int"), ("AcceptedAnswerId", "int"), ("CreationDate", "string"), ("Score", "int"), ("Tag", "string")]
    table = ringo.LoadTableTSV(S, srcfile)
    t.show("load posts text")
    # Fixed: the receiver `r` was missing here, which was a syntax error.
    r.show("__loadpoststext__")

    # Keep only posts tagged 'python'; the result is captured as a separate
    # table (third argument False -- presumably InPlace=False).
    questions = ringo.Select(table, "Tag = 'python'", False)
    t.show("selected tag = 'python'")
    r.show("__selectedtagpython__")

    # Pair each selected question with the post whose Id equals the
    # question's AcceptedAnswerId.
    qa = ringo.Join(questions, table, "AcceptedAnswerId", "Id")
    # Fixed: report the join before building the graph. Previously ToGraph
    # ran before this checkpoint, so its cost was billed to "join" and the
    # "graph" checkpoint measured nothing.
    t.show("join")
    r.show("__join__")

    # Build a graph whose edges run between the two OwnerUserId columns of
    # the joined table.
    graph = ringo.ToGraph(qa, "OwnerUserId-2", "OwnerUserId-1")
    t.show("graph")
    r.show("__graph__")

    # NOTE(review): positional arguments are passed through unchanged; 0.85,
    # 1e-4 and 100 look like damping factor, tolerance and iteration cap --
    # confirm against the Ringo PageRank signature.
    prtable = ringo.PageRank(graph, 'PageRank', False, 0.85, 1e-4, 100)
    t.show("pagerank")
    r.show("__pagerank__")
Пример #4
0
      ("pullid", "int"), ("status", "string"), ("created_at", "int")]
# Load the pull-request table using schema S3; the path is looked up in
# file_cache under the TPULL key.
Tpull = ringo.LoadTableTSV(S3, file_cache[TPULL])
t.show("load pull")

# The fork and watch tables are both loaded with schema S2.
Tfork = ringo.LoadTableTSV(S2, file_cache[TFORK])
t.show("load fork")

Twatch = ringo.LoadTableTSV(S2, file_cache[TWATCH])
t.show("load watch")

# Users (u, v) who collaborated on the same repository -- a repository being
# identified by its (owner, name) pair -- are added as collaborators.
# TODO: better column renaming.

# Self-join the collaboration table on "owner", then keep only the rows whose
# repository names also match, so both sides refer to the same repository.
Tcollab_merge = ringo.SelfJoin(Tcollab, "owner")
ringo.Select(Tcollab_merge, "2_1.name = 2_2.name", True)
# Presumably stores the smaller of the two created_at values in a new
# "created_at" column -- confirm against the Ringo ColMin documentation.
ringo.ColMin(Tcollab_merge, "2_1.created_at", "2_2.created_at", "created_at")
# Reduce the table to the two user ids plus the combined timestamp, then give
# the user-id columns side-independent names.
ringo.Project(Tcollab_merge, ("2_1.userid", "2_2.userid", "created_at"))
ringo.Rename(Tcollab_merge, "2_1.userid", "userid1")
ringo.Rename(Tcollab_merge, "2_2.userid", "userid2")
t.show("merge collab", Tcollab_merge)

# Users (u, v) who worked on the same pull request of the same repository are
# added as (soft) collaborators.
# Same recipe as above, with an additional filter on matching pull-request ids.
Tpull_merge = ringo.SelfJoin(Tpull, "owner")
ringo.Select(Tpull_merge, "3_1.name = 3_2.name", True)
ringo.Select(Tpull_merge, "3_1.pullid = 3_2.pullid", True)
ringo.ColMin(Tpull_merge, "3_1.created_at", "3_2.created_at", "created_at")
ringo.Project(Tpull_merge, ("3_1.userid", "3_2.userid", "created_at"))
ringo.Rename(Tpull_merge, "3_1.userid", "userid1")
ringo.Rename(Tpull_merge, "3_2.userid", "userid2")
# Timer used for the per-step reports below; ENABLE_TIMER is defined outside
# this fragment.
t = testutils.Timer(ENABLE_TIMER)

# Load posts: one row per post with its author, answer id and creation date.
S = [('PostId', 'int'), ('UserId', 'int'), ('AnswerId', 'int'),
     ('CreationDate', 'string')]
t1 = ringo.LoadTableTSV(S, os.path.join(srcdir, POSTS_FILE))
t.show("load posts", t1)

# Load tags: (post id, tag) pairs. S is reused for the second schema.
S = [('PostId', 'int'), ('Tag', 'string')]
t2 = ringo.LoadTableTSV(S, os.path.join(srcdir, TAGS_FILE))
t.show("load tags", t2)

# Select the rows tagged python. No result is captured and t2 itself is
# reported afterwards, so the filter is presumably applied in place.
# NOTE(review): CompConstant=True presumably means "python" is compared as a
# constant value rather than as a column name -- confirm.
ringo.Select(t2, 'Tag = python', CompConstant=True)
t.show("select", t2)

# Join posts with the filtered tags on PostId.
t3 = ringo.Join(t1, t2, 'PostId', 'PostId')
t.show("join", t3)

# Join the result back onto the posts table, matching its AnswerId column
# against PostId.
t4 = ringo.Join(t3, t1, "1.AnswerId", "PostId")
t.show("join", t4)

# Build a graph between the two UserId columns of the doubly joined table and
# print its metadata.
graph = ringo.ToGraph(t4, "1_2.1.UserId", "1.UserId")
t.show("graph", graph)
ringo.ShowMetadata(graph)
Пример #6
0
        sys.exit(1)

    srcfile = sys.argv[1]
    # NOTE(review): dstfile is read but not used in the visible part of the
    # script -- confirm whether a save step follows.
    dstfile = sys.argv[2]

    ringo = ringo.Ringo()

    # Timer and resource tracker; each step below reports to both.
    t = testutils.Timer()
    r = testutils.Resource()

    # Schema of the posts TSV as (column name, column type) pairs.
    S = [("Id", "int"), ("OwnerUserId", "int"), ("AcceptedAnswerId", "int"), ("CreationDate", "string"), ("Score", "int"), ("Tag", "string")]
    table = ringo.LoadTableTSV(S, srcfile)
    t.show("load posts text")
    r.show("__loadpoststext__")

    # Posts tagged 'python', captured as a separate table (third argument
    # False -- presumably InPlace=False).
    questions = ringo.Select(table, "Tag = 'python'", False)
    t.show("selected tag = 'python'")
    r.show("__selectedtagpython__")

    # Questions: rows with a non-zero AcceptedAnswerId. No result is captured,
    # so this presumably filters `questions` in place.
    ringo.Select(questions, "AcceptedAnswerId != 0")
    t.show("select questions")
    r.show("__selectquestions__")

    # Answers: rows of the full table whose AcceptedAnswerId is zero; again no
    # result is captured.
    ringo.Select(table, "AcceptedAnswerId = 0")
    t.show("select answers")
    r.show("__selectanswers__")

    # Match each question's AcceptedAnswerId against answer Ids. The joined
    # table is not kept; only the cost of the join is reported.
    ringo.Join(questions, table, "AcceptedAnswerId", "Id")
    t.show("join")
    r.show("__join__")
Пример #7
0
     ('CreationDate', 'string')]
# Load the posts table with schema S (defined above this fragment).
POSTS = ringo.LoadTableTSV(S, os.path.join(srcdir, POSTS_FILE))
t.show("load posts", POSTS)

# Load tags: (post id, tag) pairs. S is reused for the second schema.
S = [('PostId', 'int'), ('Tag', 'string')]
T = ringo.LoadTableTSV(S, os.path.join(srcdir, TAGS_FILE))
t.show("load tags", T)

# Join posts with their tags on PostId.
P = ringo.Join(POSTS, T, "PostId", "PostId")
t.show("join", P)

# Select Java posts. The schema is printed first, showing the column names
# produced by the join. No result is captured from Select, so P is presumably
# filtered in place.
print ringo.GetSchema(P)
ringo.Select(P, '2.Tag = java', CompConstant=True)
t.show("select", P)

# Select questions: rows with a non-zero AnswerId, as a new table
# (InPlace=False).
Q = ringo.Select(P, '1.AnswerId != 0', InPlace=False, CompConstant=True)
t.show("select", Q)

# Select answers: rows whose AnswerId is zero, as a new table (InPlace=False).
A = ringo.Select(P, '1.AnswerId = 0', InPlace=False, CompConstant=True)
t.show("select", A)

# Disabled debugging / provenance helpers kept for reference:
#print ringo.DumpTableContent(Q,5)
#print ringo.DumpTableContent(A,5)
#ringo.GenerateProvenance(Q, '06-StackOverflow-paper-autogen-Q.py')
#ringo.GenerateProvenance(A, '06-StackOverflow-paper-autogen-A.py')
# Join (the join step itself lies beyond this fragment)
Пример #8
0
import sys
import ringo

# Build 'so_posts.tsv': every post paired with a tag, where accepted answers
# are given the tag of the question that accepted them.
#
# Both input paths are required. Exit with a usage message (status 1) rather
# than an IndexError traceback when they are missing.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <posts_file> <tags_file>")

posts_file = sys.argv[1]
tags_file = sys.argv[2]
ringo = ringo.Ringo()
# Schemas as (column name, column type) pairs.
S_posts = [('Id','int'), ('PostTypeId','int'), ('AcceptedAnswerId','int'), ('OwnerUserId','int'), ('Body','string')]
S_tags = [('Id','int'), ('Tag','string')]

# Load posts and drop rows without a positive owner id (no result is captured,
# so the filter is presumably applied in place).
posts = ringo.LoadTableTSV(S_posts, posts_file)
ringo.Select(posts, 'OwnerUserId > 0')
#print ringo.ringo.DumpTableContent(posts) # buggy...

# Tagged posts: join posts with the tag table on Id and keep the post columns
# plus Tag.
q_tags = ringo.LoadTableTSV(S_tags, tags_file)
QT = ringo.Join(posts, q_tags, "Id", "Id")
QT = ringo.Project(QT, ['Id', 'PostTypeId', 'AcceptedAnswerId', 'OwnerUserId', 'Body', 'Tag'])

# (AcceptedAnswerId, Tag) pairs in a separate table (third argument False --
# presumably InPlace=False), joined back onto posts so that each accepted
# answer picks up the tag of its question.
a_tags = ringo.Project(QT, ['AcceptedAnswerId', 'Tag'], False)
AT = ringo.Join(posts, a_tags, 'Id', 'AcceptedAnswerId')
AT = ringo.Project(AT, ['Id', 'PostTypeId', 'AcceptedAnswerId', 'OwnerUserId', 'Body', 'Tag'])

# Concatenate both tagged tables and write the result.
T = ringo.UnionAll(QT, AT)
ringo.SaveTableTSV(T, 'so_posts.tsv')
Пример #9
0
import ringo
import sys

# Rank users by PageRank on the "asker -> accepted answerer" graph built from
# Java-tagged posts, and write the scores to 'scores.tsv'.
#
# The input path is required. Exit with a usage message (status 1) rather than
# an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: " + sys.argv[0] + " <src_file>")

src_file = sys.argv[1]
# Schema as (column name, column type) pairs.
Schema = [('Id', 'int'), ('PostTypeId', 'int'), ('AcceptedAnswerId', 'int'),
          ('OwnerUserId', 'int'), ('Body', 'string'), ('Tag', 'string')]
ringo = ringo.Ringo()
# NOTE(review): '\t' and True are presumably the field separator and a
# "has header row" flag -- confirm against LoadTableTSV's signature.
P = ringo.LoadTableTSV(Schema, src_file, '\t', True)
# Drop the Body column; the result is not captured, so P is presumably
# projected in place.
ringo.Project(P,
              ['Id', 'PostTypeId', 'AcceptedAnswerId', 'OwnerUserId', 'Tag'])

# Java posts, then split by PostTypeId into Q (type 1) and A (type 2); each
# Select returns a separate table (third argument False).
JP = ringo.Select(P, "Tag = 'java'", False)
Q = ringo.Select(JP, 'PostTypeId = 1', False)
A = ringo.Select(JP, 'PostTypeId = 2', False)

# Pair each question with its accepted answer and connect the two owners.
QA = ringo.Join(Q, A, 'AcceptedAnswerId', 'Id')
G = ringo.ToGraph(QA, 'OwnerUserId-1', 'OwnerUserId-2')
PR_MAP = ringo.PageRank(G)  # a hash map object: node/user id -> PageRank score
PR = ringo.TableFromHashMap(PR_MAP, 'user', 'score')
PR = ringo.Order(PR, ['score'])
ringo.SaveTableTSV(PR, 'scores.tsv')
#ringo.SaveTableBinary(PR, 'scores')
# Emit a script that reproduces how G was derived.
ringo.GenerateProvenance(G, 'G.py')