Example #1
def store_revisions(self, page_url):
  """
  Retrieve all revisions of a given Wikipedia page.

  Parameters:
    - page_url: a Wikipedia page URL
  """

  # Page wraps the MediaWiki API; Dataset is the MongoDB-backed store
  # (Page, Dataset and mongodb_host are assumed to be imported at module level)
  p = Page()

  d = Dataset( "%s:27017" % (mongodb_host) )

  title = url2title(page_url)
  lang = url2lang(page_url)

  p.fetch_from_api_title(title, lang=lang)

  # revision records for the page; each record carries a "revid"
  revisions = p.get_all_editors()

  i = 0

  for revision in revisions:
    i += 1

    # ex: en/crimea/revision/999999
    key = "%s/%s/revision/%s" % (lang,title,revision["revid"])

    # fetch this single revision from the MediaWiki API
    # (rvstartid pins the revision, rvlimit caps the result at one)
    value = p.get_revisions(extra_params={ "rvstartid": revision["revid"], "rvlimit" : 1})

    # write it to the database
    d.write(key, value)
    self.update_state( state='PROGRESS',
      meta= { 'current': i, 'total': len(revisions)})
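
The self.update_state(state='PROGRESS', ...) call matches Celery's bound-task API, so store_revisions is presumably registered as a Celery task. A minimal sketch of how such a task could be declared and polled; the app name, broker and backend URLs are assumptions, not taken from the source:

from celery import Celery

app = Celery("wikipedia_tasks",
             broker="redis://localhost:6379/0",    # assumed broker
             backend="redis://localhost:6379/1")   # assumed result backend

@app.task(bind=True)
def store_revisions(self, page_url):
  pass  # body as in Example #1 above

# enqueue the task and poll the progress metadata reported via update_state
result = store_revisions.delay("https://en.wikipedia.org/wiki/Crimea")
print result.state   # e.g. 'PROGRESS'
print result.info    # e.g. {'current': 12, 'total': 240}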
Example #2
def dataset_blocks(url):
  print "blocks: %s" % (url)
  d = Dataset( "%s:27017" % (mongodb_host) )

  # load the stored document for this key
  page = d.read(url)

  # "*" holds the raw wikitext returned by the MediaWiki API;
  # mw() appears to wrap a wikitext parser
  txt = mw(page["dataset"][0]["*"])

  # split the text into blocks together with the page's section structure
  (blocks, structure) = txt.get_blocks()

  key = "%s/blocks" % (url)
  value = {  "structure" : structure, "blocks": blocks }

  d.write(key, value)

  return value
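
The examples share a flat key scheme in the Dataset store: "<lang>/<title>/revision/<revid>" for single revisions (Example #1), "<url>/blocks" here, and "<lang>/<title>/timeline" in Example #3. A small illustration of building and parsing such keys; the revision_key helper is hypothetical, not part of the project:

def revision_key(lang, title, revid):
  # hypothetical helper mirroring the key format built in Example #1
  return "%s/%s/revision/%s" % (lang, title, revid)

key = revision_key("en", "Crimea", 999999)
print key                 # en/Crimea/revision/999999
print key.split("/")[3]   # "999999", the same parsing used in Example #4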
Example #3
def dataset_timeline(url):
  print "timeline: %s" % (url)
  d = Dataset( "%s:27017" % (mongodb_host) )

  title = url2title(url)
  lang = url2lang(url)

  url = "%s/%s" % (lang, title)

  # match revision keys like "<lang>/<title>/revision/<revid>"
  regex_string = "%s/%s/revision/([0-9]*$)" % (lang, title)

  # fetch only the timestamp and revid of every stored revision of this page
  r = d.find({ "url" : { "$regex" : regex_string } }, { "dataset.timestamp" : 1, "dataset.revid" : 1 })

  timeline = []

  for result in r:
    i = { "timestamp": result["dataset"][0]["timestamp"], "revid": result["dataset"][0]["revid"] }
    timeline.append(i)

  # sort the timeline chronologically
  timeline = sorted( timeline, key=lambda rev: rev["timestamp"])

  print "start: %s" % (timeline[0])
  print "end: %s" % (timeline[-1])

  k = "%s/%s/timeline" % (lang, title)
  d.delete(k)
  d.write(k, timeline)

  print r.count()
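
Dataset is constructed from "<host>:27017" and its find() takes Mongo-style query and projection documents, so it is presumably a thin wrapper around a MongoDB collection. A rough sketch of the equivalent raw pymongo call; the database and collection names are assumptions:

from pymongo import MongoClient

client = MongoClient("%s:27017" % (mongodb_host))   # same host string as above
collection = client["wikipedia"]["pages"]           # assumed db/collection names

cursor = collection.find(
  { "url": { "$regex": "en/Crimea/revision/([0-9]*$)" } },
  { "dataset.timestamp": 1, "dataset.revid": 1 }
)
for doc in cursor:
  print doc["dataset"][0]["revid"]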
Example #4
import json

import synapseclient
from synapseclient import File

# Dataset and mongodb_host are assumed to come from the surrounding project

def export_synapse():
  # log in with locally configured/cached Synapse credentials
  syn = synapseclient.Synapse()
  syn.login()

  # the Synapse project that will receive the exported revisions
  project = syn.get("syn2483395")

  d = Dataset( "%s:27017" % (mongodb_host) )

  revisions = d.find({ "url" : { "$regex" : "en/Crimea/revision/([0-9]*)$" } }, { "url":1, "dataset": 1 })

  print "uploading %s files to SYNAPSE" % (revisions.count())

  # export the first 100 stored revisions
  for revision in revisions[0:100]:
    file_id = revision["url"].split("/")[3]

    print "revision: %s" % (revision["url"])

    # _id is a MongoDB ObjectId and is not JSON-serializable
    del revision["_id"]

    # write the revision to a temp file and close it, so the JSON is fully
    # flushed to disk before Synapse uploads it
    path = "/data/temp/%s.json" % (file_id)
    with open(path, 'w') as revision_file:
      json.dump(revision, revision_file)

    syn.store(File(path, parent=project))
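
A hypothetical follow-up, using the syn and project objects from inside export_synapse; getChildren is a documented helper in recent synapseclient releases and is not used in the source:

# list the files now attached to the Synapse project (illustration only)
for child in syn.getChildren(project):
  print "%s (%s)" % (child["name"], child["id"])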
Example #5
def store_last_revisions(db_url):
  d = Dataset( "%s:27017" % (mongodb_host) )

  # db_url is a timeline key like "en/Crimea/timeline"; strip the suffix to
  # recover the language and page title
  url = db_url.replace("/timeline", "")

  (lang, page) = url.split("/")

  p = Page()
  p.fetch_from_api_title(page, lang=lang)

  # the most recent revision currently on Wikipedia (the API returns newest first)
  last_rev = p.get_revisions(extra_params={ "rvlimit" : 1 })

  print "last revisions: %s" % (url.encode("utf8"))

  # the most recent revision already stored locally ("$slice": -1 keeps only
  # the last element of the "dataset" array)
  t = list(d.find({ "url": db_url }, { "url" : 1, "dataset" : { "$slice": -1 } }))

  # print t[0]

  # fetch every revision from the last stored one forward to the current head
  extra_params = {
    "rvstartid": t[0]["dataset"][0]["revid"],
    "rvendid": last_rev[0]["revid"],
    "rvdir": "newer"
  }

  print extra_params

  revs = p.get_revisions(extra_params=extra_params)

  print "%s new revisions since %s (%s)" % (len(revs), t[0]["dataset"][0]["timestamp"], t[0]["dataset"][0]["revid"])
  print "%s  ---->  %s" % (t[0]["dataset"][0]["timestamp"], last_rev[0]["timestamp"])

  # store each new revision under its own "<lang>/<title>/revision/<revid>" key
  for r in revs:
    key = "%s/%s/revision/%s" % (lang, page, r["revid"])
    value = [ r ]

    d.write(key, value)
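
A hypothetical invocation, using the "<lang>/<title>/timeline" key format written by Example #3; the concrete key is only an illustration:

# fetch and store every revision added since the local timeline was last built
store_last_revisions("en/Crimea/timeline")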