def __init__(self, name, root_dir, **options):
    """Configure the adapter for files rooted at *root_dir*.

    Recognized options: ``pattern``, ``content_column``,
    ``filename_column``, ``decode`` and ``schema``; anything left over
    raises ``ValueError``.
    """
    self.name = name

    # Normalize the root so later path concatenation is unambiguous.
    if not root_dir.endswith('/'):
        root_dir += '/'
    self.root_dir = root_dir

    self.pattern = options.pop('pattern', None)
    if self.pattern:
        # Each column embedded in the path pattern becomes a STRING field.
        tokens = tokenize_pattern(self.pattern)
        self.path_schema = Schema(
            [Field(name=c, type="STRING") for c in columns(tokens)]
        )
    # NOTE(review): when no pattern is given, self.path_schema is never
    # assigned — confirm all readers guard on self.pattern first.

    self.content_column = options.pop('content_column', None)
    self.filename_column = options.pop('filename_column', None)
    self.decode = options.pop('decode', "none")

    schema = options.pop('schema', None)
    if isinstance(schema, Schema):
        self.schema = schema
    else:
        # A dict becomes a Schema; None (or any falsy value) passes through.
        self.schema = schema and Schema(**schema)

    if options:
        raise ValueError("Unrecognized options {}".format(options.keys()))
def input_stream(stream):
    """Wrap a gzipped ARC archive stream as a Relation of its records."""
    arc = warc.ARCFile(fileobj=GzipFile(fileobj=stream))

    schema = Schema([
        dict(name='url', type='STRING'),
        dict(name='checksum', type='STRING'),
        dict(name='filename', type='STRING'),
        dict(name='length', type='STRING'),
        dict(name='location', type='STRING'),
        dict(name='content_type', type='STRING'),
        dict(name='offset', type='STRING'),
        dict(name='date', type='DATETIME'),
        dict(name='ip_address', type='STRING'),
        dict(name='result_code', type='INTEGER'),
        dict(name='payload', type='STRING'),
    ])

    # Every field except the trailing 'payload' is read straight from the
    # ARC record header; the payload itself is decoded from the body.
    header_names = [field.name for field in schema.fields[:-1]]

    def build_row(doc):
        values = [doc.header[name] for name in header_names]
        values.append(doc.payload.decode('ascii', 'ignore'))
        return values

    return Relation(schema, (build_row(doc) for doc in arc))
def test_decode():
    """decode() with 'auto' should parse the referenced CSV file's rows."""
    p = os.path.join(path, 'test.csv')
    # Use a context manager so the data is flushed and the handle closed
    # before decode() re-opens the file (the original leaked the handle).
    with open(p, 'w') as f:
        f.write('field1,field2\n1,2\n')

    r = Relation(
        None, None,  # adapter and name are not needed for this test
        Schema([dict(type="STRING", name="path")]),
        lambda ctx: iter(((p, ), ))
    )

    # Each output row keeps the source path and appends the parsed fields.
    assert_sequence_equal(
        list(decode({}, r, 0, 'auto')),
        [(p, '1', '2')]
    )
def __test_projection():
    # Presumably disabled: the double-underscore prefix keeps the test
    # runner from collecting it — confirm before re-enabling.
    employees = mock_data_set().get_relation('employees')
    project = relational_ops.ProjectOp(
        Schema([dict(name="employee_id", type="INTEGER")]),
        lambda row, ctx: row['employee_id']
    )
    rows = employees.rows(None, None)
    assert_sequence_equal(
        list(project(rows, None)),
        [(1234, ), (4567, ), (8901, )]
    )
def __init__(self, server, name, db, range=None):
    """Bind this relation to *db*, reading its schema and row count.

    NOTE(review): the *range* parameter shadows the builtin of the same
    name; renaming it would break keyword callers, so it is kept as-is.
    """
    self.server = server
    self.name = name

    # Schema and record count are stored in the db under magic keys.
    raw_schema = _load_value(list(db['__schema__'])[0])
    self.schema = Schema(**raw_schema)

    self.db = db
    self.record_count = _load_value(list(db['__count__'])[0])

    # Matches prior behavior: the attribute is only set when a truthy
    # range is supplied; otherwise it does not exist on the instance.
    if range:
        self.range = range
def test_decode_csv():
    """schema_from() should infer three STRING fields from CSV headers."""
    stream = StringIO(u"field1,field2,field3\nfoo,1,0\nbaz,2,0")
    inferred = codecs.schema_from(stream, mime_type='text/csv')
    expected = Schema([
        Field(name='field1', type='STRING'),
        Field(name='field2', type='STRING'),
        Field(name='field3', type='STRING'),
    ])
    eq_(inferred, expected)
def test_extract_path():
    """Rows matching the pattern gain one column per placeholder; others drop."""
    source_rows = (
        ('/Music/Nirvana/Nevermind/Smells Like Teen Spirit.ogg', ),
        ('/Videos/Electric Boogaloo.mp4', ),
    )
    r = Relation(
        None, None,  # adapter, name; not needed for this test
        Schema([dict(type="STRING", name="path")]),
        lambda ctx: iter(source_rows)
    )
    assert_sequence_equal(
        list(extract_path({}, r, "/Music/{artist}/{album}/{track}.{ext}")),
        [(
            '/Music/Nirvana/Nevermind/Smells Like Teen Spirit.ogg',
            'Nirvana',
            'Nevermind',
            'Smells Like Teen Spirit',
            'ogg'
        )]
    )
def test_decode_csv():
    """Both schema inference and row decoding should work on one stream."""
    stream = StringIO("field1,field2,field3\nfoo,1,0\nbaz,2,0")

    inferred = codecs.schema_from(stream, mime_type='text/csv')
    eq_(inferred, Schema([
        Field(name='field1', type='STRING'),
        Field(name='field2', type='STRING'),
        Field(name='field3', type='STRING'),
    ]))

    relation = codecs.relation_from(stream, mime_type='text/csv')
    assert_sequence_equal(
        list(relation),
        [
            ['foo', '1', '0'],
            ['baz', '2', '0'],
        ]
    )
def create(path, schema, records):
    """Build a DiscoDB from *records* indexed under *schema*.

    NOTE(review): *path* is accepted but never used here — confirm
    whether callers expect the result to be persisted to it.
    """
    # Accept either a ready-made Schema or its dict form.
    if isinstance(schema, dict):
        schema = Schema(**schema)
    return DiscoDB(index(schema, records))
import os import tempfile import shutil from nose.tools import * from . import compare from splicer import Schema from splicer.ast import * from splicer.operations import query_zipper from splicer.adapters.dir_adapter import DirAdapter TEST_SCHEMA = Schema(fields=[ dict(type='STRING', name='department'), dict(type='INTEGER', name='id'), dict(type='STRING', name='full_name'), dict(type='INTEGER', name='salary'), dict(type='INTEGER', name='manager_id'), ]) def setup_func(): global path path = tempfile.mkdtemp() def teardown_func(): global path try: shutil.rmtree(path) finally:
] schema = Schema([ dict(name="size_in_kilobytes", type="INTEGER"), dict(name="host", type="STRING"), dict(name="content_type", type="STRING"), dict(name="scripts", type="STRING", mode="REPEATED"), dict(name="css", type="STRING", mode="REPEATED"), dict(name="link_to", type="STRING", mode="REPEATED"), dict( name="headers", type="RECORD", mode="REPEATED", fields=[ dict(name="name", type="string"), dict(name="value", type="string") ], ), dict(name="timestamp", type="DATETIME"), dict( name="tags", type="RECORD", mode="REPEATED", fields=[ dict(name="name", type="string"), dict(name="count", type="INTEGER") ], ), dict(name="scheme", type="STRING") ]) names = [m.__name__ for m in methods]
from nose.tools import *

from splicer import Schema
from splicer.ast import EqOp, And, Var, NumberConst
from splicer.compilers.join import (
    nested_block_join,
    buffered,
    record_size,
    join_keys,
    join_keys_expr,
    hash_join
)

# Single-column relation t1(x) with two fixture rows.
SCHEMA_1 = Schema(name="t1", fields=[dict(name='x', type="INTEGER")])


def t1(ctx=None):
    """Return an iterator over t1's fixture rows."""
    rows = ((1,), (2,))
    return iter(rows)


# Two-column relation t2(y, z); its rows are defined elsewhere.
SCHEMA_2 = Schema(
    name="t2",
    fields=[
        dict(name='y', type="INTEGER"),
        dict(name='z', type="INTEGER")
    ]
)