Пример #1
0
    def test_sample_db(self):
        """Load the ``docs`` table from ``sample.discodb`` and check its size.

        The database file is looked up in the directory containing this
        test module, and the load is expected to yield 150 results.
        """
        sample_path = os.path.join(os.path.dirname(__file__), "sample.discodb")
        server = DiscoDBServer(docs=sample_path)

        # Materialise the results so they can be counted.
        results = list(server.evaluate(LoadOp('docs')))

        eq_(len(results), 150)
Пример #2
0
    def test_discodb_server(self):
        """Check that loading the ``users`` table yields the expected rows.

        The server is built with ``self.db`` registered under the name
        ``users``.
        """
        server = DiscoDBServer(users=self.db)

        # return all records from the users table
        results = server.evaluate(LoadOp('users'))

        assert_sequence_equal(list(results), [
            ('John', 2),
            ('Bob', 1),
        ])
Пример #3
0
  def test_sample_db(self):
    """Load the ``docs`` table from ``sample.discodb`` and check its size.

    The database file is looked up in the directory containing this
    test module, and the load is expected to yield 150 results.
    """
    sample_path = os.path.join(os.path.dirname(__file__), "sample.discodb")
    server = DiscoDBServer(docs=sample_path)

    # Materialise the results so they can be counted.
    results = list(server.evaluate(LoadOp('docs')))

    eq_(len(results), 150)
Пример #4
0
  def test_discodb_server(self):
    """Check that loading the ``users`` table yields the expected rows.

    The server is built with ``self.db`` registered under the name
    ``users``.
    """
    server = DiscoDBServer(users=self.db)

    # return all records from the users table
    results = server.evaluate(LoadOp('users'))

    assert_sequence_equal(
      list(results),
      [
        ('John', 2),
        ('Bob', 1),
      ]
    )
Пример #5
0
def init(**tables):
  """
  Build and return a DataSet with the sample servers and views registered.

  All keyword arguments are forwarded unchanged to ``DiscoDBServer``. Two
  ``FileServer`` instances are added as well, both rooted at the ``data``
  directory next to this module: ``common_crawl`` (``sample.arc.gz``) and
  ``top_sites`` (``alexa-top1m-{date}.csv``). Finally the views ``top_10``,
  ``outbound_links``, ``scripts`` and ``servers`` are created.

  NOTE(review): the SQL views read from a relation named ``docs`` that is
  not registered here -- presumably it is expected among ``tables``;
  confirm against callers.
  """
  dataset = DataSet()
  dataset.add_server(DiscoDBServer(**tables))

  # Both file servers read from the same directory beside this module.
  data_dir = join(dirname(__file__), 'data')

  # A description ("Raw documents from http://commoncrawl.org") was left
  # commented out in the original configuration and is still not passed.
  common_crawl_config = {
    'root_dir': data_dir,
    'pattern': "sample.arc.gz",
    'decode': "application/x-arc",
  }
  dataset.add_server(FileServer(common_crawl=common_crawl_config))

  # A description ("Top Sites as reported by Alexa") was likewise left
  # commented out in the original configuration and is still not passed.
  top_sites_fields = [
    {'name': "date", 'type': "DATE"},
    {'name': "rank", 'type': "STRING"},
    {'name': "site", 'type': "STRING"},
  ]
  top_sites_config = {
    'root_dir': data_dir,
    'pattern': "alexa-top1m-{date}.csv",
    'decode': "auto",
    'schema': {'fields': top_sites_fields},
  }
  dataset.add_server(FileServer(top_sites=top_sites_config))

  # View over the first ten rows of top_sites.
  top_sites_query = dataset.frm('top_sites')
  top_sites_query.limit(10).create_view('top_10')

  # SQL-defined views, registered in this order.
  sql_views = (
    (
      'outbound_links',
      "select link_to, count() "
      "from flatten(docs, 'link_to') "
      "group by link_to order by count desc",
    ),
    (
      'scripts',
      "select scripts as script, count() "
      "from flatten(docs, 'scripts') "
      "group by script order by count desc",
    ),
    (
      'servers',
      "select headers_value as server_name, count() "
      "from flatten(docs, 'headers') "
      "where headers_name = 'Server' "
      "group by server_name order by count desc",
    ),
  )
  for view_name, query in sql_views:
    dataset.create_view(view_name, query)

  return dataset