Пример #1
0
  def __init__(self, name, root_dir, **options):
    """Configure the instance from a name, a root directory and options.

    Recognized options, each removed from ``options`` as it is read:
    ``pattern``, ``content_column``, ``filename_column``, ``decode``
    (defaults to ``"none"``) and ``schema`` (a ``Schema`` instance or
    keyword arguments for constructing one).

    ``self.path_schema`` is assigned only when a truthy ``pattern`` is
    supplied.

    Raises:
      ValueError: if any option other than the ones above was passed.
    """
    self.name = name

    # Store the root with a guaranteed trailing slash.
    self.root_dir = root_dir if root_dir.endswith('/') else root_dir + '/'

    self.pattern = options.pop('pattern', None)
    if self.pattern:
      # One STRING field per column yielded for the tokenized pattern.
      path_fields = [
        Field(name=column, type="STRING")
        for column in columns(tokenize_pattern(self.pattern))
      ]
      self.path_schema = Schema(path_fields)

    self.content_column = options.pop('content_column', None)
    self.filename_column = options.pop('filename_column', None)
    self.decode = options.pop('decode', "none")

    # A ready-made Schema is kept as is; any other truthy value is expanded
    # as keyword arguments; a falsy value is stored unchanged.
    schema = options.pop('schema', None)
    if not isinstance(schema, Schema) and schema:
      schema = Schema(**schema)
    self.schema = schema

    # Anything still left in ``options`` was not recognized above.
    if not options:
      return
    raise ValueError("Unrecognized options {}".format(options.keys()))
Пример #2
0
def input_stream(stream):
    """Expose the records of a gzip-compressed ARC stream as a Relation.

    ``stream`` is wrapped in ``GzipFile`` and read through ``warc.ARCFile``.
    Each record yields one row: the record's header values in schema order,
    followed by its payload decoded as ASCII with undecodable bytes ignored.
    """
    unzipped = GzipFile(fileobj=stream)
    arc = warc.ARCFile(fileobj=unzipped)

    # (name, type) pairs; 'payload' must stay last because every column
    # before it is looked up in the record header.
    column_types = (
        ('url', 'STRING'),
        ('checksum', 'STRING'),
        ('filename', 'STRING'),
        ('length', 'STRING'),
        ('location', 'STRING'),
        ('content_type', 'STRING'),
        ('offset', 'STRING'),
        ('date', 'DATETIME'),
        ('ip_address', 'STRING'),
        ('result_code', 'INTEGER'),
        ('payload', 'STRING'),
    )
    schema = Schema([
        dict(name=column, type=kind) for column, kind in column_types
    ])

    header_names = [field.name for field in schema.fields[:-1]]

    def to_row(record):
        # Header values first, then the decoded payload as the last column.
        return (
            [record.header[key] for key in header_names]
            + [record.payload.decode('ascii', 'ignore')]
        )

    return Relation(schema, (to_row(record) for record in arc))
Пример #3
0
def test_decode():
    # Writes a two-line CSV file, builds a one-column relation holding its
    # path, and checks that decode() in 'auto' mode on column 0 yields the
    # path followed by the values of the CSV data row.
    p = os.path.join(path, 'test.csv')

    # Use a context manager so the file is flushed and closed before it is
    # read back; a bare open(...).write(...) leaves that to the garbage
    # collector.
    with open(p, 'w') as csv_file:
        csv_file.write('field1,field2\n1,2\n')

    r = Relation(None, None, Schema([dict(type="STRING", name="path")]),
                 lambda ctx: iter(((p, ), )))

    assert_sequence_equal(list(decode({}, r, 0, 'auto')), [(p, '1', '2')])
Пример #4
0
def __test_projection():
    # Projects the mock 'employees' relation down to its employee_id column
    # and checks the resulting single-element rows.
    employees = mock_data_set().get_relation('employees')

    projected_schema = Schema([dict(name="employee_id", type="INTEGER")])
    op = relational_ops.ProjectOp(
        projected_schema,
        lambda row, ctx: row['employee_id'],
    )

    rows = employees.rows(None, None)
    projected = list(op(rows, None))

    assert_sequence_equal(projected, [(1234, ), (4567, ), (8901, )])
  def __init__(self, server, name, db, range=None):
    """Bind this object to a server, a name and a backing ``db``.

    The schema and the record count are taken from the first item stored
    under the ``'__schema__'`` and ``'__count__'`` keys of ``db``, each
    passed through ``_load_value``. ``self.range`` is assigned only when a
    truthy ``range`` is supplied.
    """
    self.server = server
    self.name = name

    # The loaded schema value is expanded as keyword arguments to Schema.
    schema_entries = list(db['__schema__'])
    self.schema = Schema(**_load_value(schema_entries[0]))
    self.db = db

    count_entries = list(db['__count__'])
    self.record_count = _load_value(count_entries[0])

    if range:
      self.range = range
Пример #6
0
def test_decode_csv():
    # schema_from() on CSV text is expected to produce one STRING field per
    # header column, in header order.
    csv_text = u"field1,field2,field3\nfoo,1,0\nbaz,2,0"

    actual = codecs.schema_from(StringIO(csv_text), mime_type='text/csv')

    expected = Schema([
        Field(name=column, type='STRING')
        for column in ('field1', 'field2', 'field3')
    ])

    eq_(actual, expected)
Пример #7
0
def test_extract_path():
    # Only the row whose path appears in the expected output below (the
    # /Music one) should come back, extended with the values captured by
    # the pattern's placeholders.
    pattern = "/Music/{artist}/{album}/{track}.{ext}"
    music_path = '/Music/Nirvana/Nevermind/Smells Like Teen Spirit.ogg'
    video_path = '/Videos/Electric Boogaloo.mp4'

    path_schema = Schema([dict(type="STRING", name="path")])
    # The first two arguments (adapter, name) are not needed for this test.
    r = Relation(
        None,
        None,
        path_schema,
        lambda ctx: iter(((music_path, ), (video_path, ))))

    expected = [
        (music_path, 'Nirvana', 'Nevermind', 'Smells Like Teen Spirit', 'ogg'),
    ]

    assert_sequence_equal(list(extract_path({}, r, pattern)), expected)
Пример #8
0
def test_decode_csv():
  # Checks both the schema inferred from a small CSV document and the rows
  # decoded from it.
  stream = StringIO("field1,field2,field3\nfoo,1,0\nbaz,2,0")

  schema = codecs.schema_from(stream, mime_type='text/csv')
  expected_schema = Schema([
    Field(name=column, type='STRING')
    for column in ('field1', 'field2', 'field3')
  ])
  eq_(schema, expected_schema)

  # NOTE(review): the same stream object is reused here; presumably
  # schema_from() leaves it positioned at the start -- confirm.
  relation = codecs.relation_from(stream, mime_type='text/csv')
  expected_rows = [
    ['foo', '1', '0'],
    ['baz', '2', '0'],
  ]
  assert_sequence_equal(list(relation), expected_rows)
def create(path, schema, records):
  """Return a ``DiscoDB`` built from ``records`` indexed under ``schema``.

  A ``dict`` schema is first expanded into ``Schema(**schema)``; any other
  value is passed to ``index`` unchanged. ``path`` is accepted but not used
  by this function.
  """
  schema_obj = Schema(**schema) if isinstance(schema, dict) else schema
  indexed = index(schema_obj, records)
  return DiscoDB(indexed)
Пример #10
0
import os
import tempfile
import shutil

from nose.tools import *
from . import compare

from splicer import Schema
from splicer.ast import *
from splicer.operations import query_zipper
from splicer.adapters.dir_adapter import DirAdapter

# Five-column schema: department and full_name are STRING; id, salary and
# manager_id are INTEGER.
TEST_SCHEMA = Schema(fields=[
    {'type': 'STRING', 'name': 'department'},
    {'type': 'INTEGER', 'name': 'id'},
    {'type': 'STRING', 'name': 'full_name'},
    {'type': 'INTEGER', 'name': 'salary'},
    {'type': 'INTEGER', 'name': 'manager_id'},
])


def setup_func():
    """Create a fresh temporary directory and publish it as the global ``path``."""
    global path
    scratch_dir = tempfile.mkdtemp()
    path = scratch_dir


def teardown_func():
    global path
    try:
        shutil.rmtree(path)
    finally:
]

# Schema mixing scalar columns, REPEATED string columns and two REPEATED
# RECORD columns ("headers" and "tags") with nested fields.
# NOTE(review): the nested fields spell their type as lowercase "string"
# while every other field uses uppercase names -- confirm that Schema
# treats the two spellings alike.
schema = Schema([
    {'name': "size_in_kilobytes", 'type': "INTEGER"},
    {'name': "host", 'type': "STRING"},
    {'name': "content_type", 'type': "STRING"},
    {'name': "scripts", 'type': "STRING", 'mode': "REPEATED"},
    {'name': "css", 'type': "STRING", 'mode': "REPEATED"},
    {'name': "link_to", 'type': "STRING", 'mode': "REPEATED"},
    {
        'name': "headers",
        'type': "RECORD",
        'mode': "REPEATED",
        'fields': [
            {'name': "name", 'type': "string"},
            {'name': "value", 'type': "string"},
        ],
    },
    {'name': "timestamp", 'type': "DATETIME"},
    {
        'name': "tags",
        'type': "RECORD",
        'mode': "REPEATED",
        'fields': [
            {'name': "name", 'type': "string"},
            {'name': "count", 'type': "INTEGER"},
        ],
    },
    {'name': "scheme", 'type': "STRING"},
])

# The __name__ of every entry in ``methods``, in order.
names = [method.__name__ for method in methods]
Пример #12
0
from nose.tools import *

from splicer import  Schema
from splicer.compilers.join import (
  nested_block_join,
  buffered,
  record_size,
  join_keys,
  join_keys_expr,
  hash_join
)
from splicer.ast import EqOp, And, Var, NumberConst


# Schema named "t1" with a single INTEGER column, x.
SCHEMA_1 = Schema(
  name="t1",
  fields=[
    {'name': 'x', 'type': "INTEGER"},
  ]
)

def t1(ctx=None):
  """Return a fresh iterator over the two one-column rows (1,) and (2,).

  ``ctx`` is accepted but not used.
  """
  rows = ((1,), (2,))
  return iter(rows)
  

# Schema named "t2" with two INTEGER columns, y and z.
SCHEMA_2 = Schema(
  name="t2",
  fields=[
    {'name': 'y', 'type': "INTEGER"},
    {'name': 'z', 'type': "INTEGER"},
  ]
)