def test(self):
    orig = six.BytesIO()
    write(orig)
    orig.seek(0)

    # Read both documents back, capturing the schema automagic infers for each.
    reader = dr.Reader(orig, automagic=True)
    doc0 = reader.next()
    doc0_schema = reader.doc_schema
    doc1 = reader.next()
    doc1_schema = reader.doc_schema
    with self.assertRaises(StopIteration):
        reader.next()

    rewritten = six.BytesIO()

    # The first document is empty.
    writer = dr.Writer(rewritten, doc0_schema)
    doc = doc0
    self.assertTrue(hasattr(doc, 'tokens'))
    self.assertTrue(hasattr(doc, 'sents'))
    self.assertEqual(len(doc.tokens), 0)
    self.assertEqual(len(doc.sents), 0)
    self.assertEqual(doc.adjectives, [])
    writer.write(doc)

    # The second document holds one sentence of five tokens.
    writer = dr.Writer(rewritten, doc1_schema)
    doc = doc1
    self.assertTrue(hasattr(doc, 'tokens'))
    self.assertTrue(hasattr(doc, 'sents'))
    self.assertEqual(len(doc.tokens), 5)
    self.assertEqual(len(doc.sents), 1)
    self.assertEqual(doc.tokens[0].norm, 'The')
    self.assertEqual(doc.tokens[0].span, slice(0, 3))
    self.assertEqual(doc.tokens[1].norm, 'quick')
    self.assertEqual(doc.tokens[1].span, slice(4, 9))
    self.assertEqual(doc.tokens[2].norm, 'brown')
    self.assertEqual(doc.tokens[2].span, slice(11, 16))
    self.assertEqual(doc.tokens[3].norm, 'fox')
    self.assertEqual(doc.tokens[3].span, slice(17, 20))
    self.assertEqual(doc.tokens[4].norm, '.')
    self.assertEqual(doc.tokens[4].span, slice(20, 21))
    self.assertEqual(doc.sents[0].span, slice(0, 5))
    self.assertListEqual(doc.adjectives, doc.tokens[1:3])
    writer.write(doc)

    # Rewriting the documents must reproduce the original stream byte for byte.
    orig.seek(0)
    rewritten.seek(0)
    orig = orig.getvalue()
    rewritten = rewritten.getvalue()
    self.assertEqual(orig, rewritten)
def test(self):
    doc = Doc()
    # Sentence 1: 'The quick brown fox .'
    doc.tokens.create(span=slice(0, 3), raw='The')
    doc.tokens.create(span=slice(4, 9), raw='quick')
    doc.tokens.create(span=slice(11, 16), raw='brown')
    doc.tokens.create(span=slice(17, 20), raw='fox')
    doc.tokens.create(span=slice(20, 21), raw='.')
    doc.sents.create(span=slice(0, 5))
    # Sentence 2: 'The lazy cat too .'
    doc.tokens.create(span=slice(22, 25), raw='The')
    doc.tokens.create(span=slice(26, 30), raw='lazy')
    doc.tokens.create(span=slice(31, 34), raw='cat')
    doc.tokens.create(span=slice(35, 38), raw='too')
    doc.tokens.create(span=slice(38, 39), raw='.')
    doc.sents.create(span=slice(5, 10))

    # The expected docrep wire format, byte for byte.
    correct = six.BytesIO()
    correct.write(
        b'\x03'  # wire protocol version
        # klasses header: [__meta__, Sent, Token]
        b'\x93'
        b'\x92' b'\xa8__meta__' b'\x90'
        b'\x92' b'\xa4Sent' b'\x92' b'\x81\x00\xa6number' b'\x83\x00\xa4span\x01\x01\x02\xc0'
        b'\x92' b'\xa5Token' b'\x92' b'\x81\x00\xa3raw' b'\x82\x00\xa4span\x02\xc0'
        # stores header: sents (2 x Sent), tokens (10 x Token)
        b'\x92'
        b'\x93\xa5sents\x01\x02'
        b'\x93\xa6tokens\x02\x0a'
        # document instance: 1 byte, empty map
        b'\x01'
        b'\x80'
        # sents instances: 11 bytes, two spans
        b'\x0b'
        b'\x92'
        b'\x81\x01\x92\x00\x05'
        b'\x81\x01\x92\x05\x05'
        # tokens instances: 102 bytes, ten (raw, span) pairs
        b'\x66'
        b'\x9a'
        b'\x82\x00\xa3The\x01\x92\x00\x03'
        b'\x82\x00\xa5quick\x01\x92\x04\x05'
        b'\x82\x00\xa5brown\x01\x92\x0b\x05'
        b'\x82\x00\xa3fox\x01\x92\x11\x03'
        b'\x82\x00\xa1.\x01\x92\x14\x01'
        b'\x82\x00\xa3The\x01\x92\x16\x03'
        b'\x82\x00\xa4lazy\x01\x92\x1a\x04'
        b'\x82\x00\xa3cat\x01\x92\x1f\x03'
        b'\x82\x00\xa3too\x01\x92\x23\x03'
        b'\x82\x00\xa1.\x01\x92\x26\x01'
    )

    out = six.BytesIO()
    writer = dr.Writer(out, Doc)
    writer.write(doc)

    out = out.getvalue()
    correct = correct.getvalue()
    self.assertEqual(out, correct)
def build_locals(self):
    res = {'__name__': '__main__'}
    from schwa import dr
    reader, schema = self.get_reader_and_schema(self.args.in_file)
    res.update({'dr': dr, 'docs': reader})
    if self.args.out_file:
        res['write_doc'] = dr.Writer(self.args.out_file, schema).write
    return res
def __call__(self):
    # TODO: clean up!!
    evaluator = self.evaluator
    if isinstance(evaluator, KFoldsEvaluator):
        # avoid full deserialisation
        # TODO: make more generic
        reader = self.raw_stream_reader
        from drapps.util import RawDocWriter
        make_writer = RawDocWriter
    else:
        reader, schema = self.get_reader_and_schema()
        make_writer = lambda out: dr.Writer(out, schema)

    if self.args.make_dirs:
        # open() that creates any missing parent directories, mkdir -p style
        def fopen(path, mode):
            dirname = os.path.dirname(path)
            if not os.path.exists(dirname):
                cur = ''
                for part in dirname.split(os.path.sep):
                    cur += part
                    if part and not os.path.exists(cur):
                        os.mkdir(cur)
                    cur += os.path.sep
            return open(path, mode)
    else:
        fopen = open

    def new_writer(key):
        fold_num = len(writers)
        path = self.args.path_tpl.format(n=fold_num, key=key)
        if not self.args.overwrite and os.path.exists(path):
            print('Path {0} already exists. Use --overwrite to overwrite.'.format(path),
                  file=sys.stderr)
            sys.exit(1)
        print('Writing fold {k} to {path}'.format(k=fold_num, path=path),
              file=sys.stderr)
        return make_writer(fopen(path, 'wb'))

    if self.args.sparse:
        # open on demand and append, rather than holding writers open
        get_writer = lambda key: make_writer(fopen(self.args.path_tpl.format(key=key), 'ab'))
    else:
        writers = {}

        def get_writer(key):
            try:
                writer = writers[key]
            except KeyError:
                writer = writers[key] = new_writer(key)
            return writer

    for i, doc in enumerate(reader):
        val = evaluator(doc, i)
        for key in val if isinstance(val, list) else (val,):
            writer = get_writer(key)
            writer.write(doc)
def write_read(doc, out_schema, in_schema=None):
    if in_schema is None:
        in_schema = out_schema
    print('Writing {0}'.format(out_schema))
    f = six.BytesIO()
    dr.Writer(f, out_schema).write(doc)
    f.seek(0)
    print('Reading {0}'.format(in_schema))
    return dr.Reader(f, in_schema).next()
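# A minimal, hypothetical usage sketch for write_read above. The Token/Doc
# schema here is illustrative (not from this codebase); it assumes schwa.dr's
# declarative schema API (dr.Ann, dr.Field, dr.Store), which the snippets here
# rely on but do not show being defined.
def example_write_read():
    class Token(dr.Ann):
        norm = dr.Field()

    class Doc(dr.Doc):
        tokens = dr.Store(Token)

    doc = Doc()
    doc.tokens.create(norm='hello')
    # Round trip through an in-memory stream using the same schema both ways.
    copy = write_read(doc, Doc)
    assert copy.tokens[0].norm == 'hello'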
def test_exception_message(self):
    doc = DocToken()
    t = doc.tokens.create()
    t.raw = 'meow'

    stream = io.BytesIO()
    writer = dr.Writer(stream, DocToken)
    writer.write(doc)
    stream.seek(0)

    reader = dr.Reader(stream, DocTok)
    with self.assertRaisesRegexp(ReaderException, r"Store u?'tokens' points to annotation type u?'.*Tok' but the store on the stream points to a lazy type \(u?'Token'\)\."):
        doc = next(reader)
def __call__(self):
    # Serialise a single empty document once, then replicate its bytes
    # ndocs times rather than re-serialising each time.
    empty = io.BytesIO()
    writer = dr.Writer(empty, dr.Doc)
    writer.write(dr.Doc())
    empty = empty.getvalue()

    out = self.args.out_stream
    if six.PY3:
        out = out.buffer
    for _ in range(self.args.ndocs):
        out.write(empty)
def write(out):
    doc1 = Doc()

    doc2 = Doc()
    doc2.tokens.create(span=slice(0, 3), norm='The')
    doc2.tokens.create(span=slice(4, 9), norm='quick')
    doc2.tokens.create(span=slice(11, 16), norm='brown')
    doc2.tokens.create(span=slice(17, 20), norm='fox')
    doc2.tokens.create(span=slice(20, 21), norm='.')
    doc2.sents.create(span=slice(0, 5))
    doc2.adjectives = doc2.tokens[1:3]

    writer = dr.Writer(out, Doc)
    writer.write(doc1)
    writer.write(doc2)
def create_stream():
    stream = six.BytesIO()
    writer = dr.Writer(stream, Doc)

    d = Doc()
    for name in ('hello', 'world', '.'):
        d.xs.create(name=name)
    writer.write(d)

    d = Doc()
    for name in ('how', 'are', 'you', '?'):
        d.xs.create(name=name)
    writer.write(d)

    stream.seek(0)
    return stream
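# A small sketch of consuming the stream built by create_stream: dr.Reader is
# iterable and yields documents in write order, as the other snippets here do.
# Assumes the same Doc schema that create_stream serialises with.
def iter_names():
    reader = dr.Reader(create_stream(), Doc)
    for doc in reader:
        yield [x.name for x in doc.xs]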
def __call__(self):
    reader, schema = self.get_reader_and_schema()
    tmp_out = io.BytesIO()
    tmp_writer = dr.Writer(tmp_out, schema)
    evaluator = self.evaluator
    items = []
    for i, doc in enumerate(reader):
        # TODO: avoid re-serialising
        doc_key = evaluator(doc, i)
        tmp_writer.write(doc)
        doc_data = tmp_out.getvalue()
        # Reset the buffer before reuse; truncate(0) alone leaves the stream
        # position at the end, so the next write would null-pad the gap.
        tmp_out.seek(0)
        tmp_out.truncate(0)
        items.append((doc_key, doc_data))
    items.sort()
    for doc_key, doc_data in items:
        out = self.args.out_stream
        if six.PY3:
            out = out.buffer
        out.write(doc_data)
def __call__(self):
    WK_PAGES_EST = 4630000  # rough page-count estimate, used only for the ETA
    # docrep streams are binary, so the output file must be opened in 'wb' mode.
    with open(self.out_path, 'wb') as f:
        i = 0
        writer = dr.Writer(f, WikiDoc)
        try:
            log.info('Processing docs...')
            start_time = time()
            for i, doc in enumerate(self.iter_doc_reps()):
                if i == 10000 or (i % 100000 == 0 and i > 0):
                    dps = (i + 1) / float(time() - start_time)
                    eta = datetime.timedelta(seconds=int(WK_PAGES_EST / dps))
                    log.info('Processed %i documents... %.2f d/s (eta: %s)', i, dps, eta)
                writer.write(doc)
        except:
            log.error('Failed on doc: %i', i)
            raise
def serialise(doc, doc_klass):
    f = six.BytesIO()
    dr.Writer(f, doc_klass).write(doc)
    return f.getvalue()
def serialize(doc, schema):
    out = six.BytesIO()
    writer = dr.Writer(out, schema)
    writer.write(doc)
    return out.getvalue()
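# Hypothetical inverse of serialize above, sketched from the patterns in these
# snippets: wrap the bytes in a BytesIO and pull one document off a dr.Reader
# with next(), as the tests above do.
def deserialize(data, schema):
    return next(dr.Reader(six.BytesIO(data), schema))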