Exemplo n.º 1
0
    def file_write(self, fname, data):
        """
        Validate and append documents to the given avro file.

        Records failing schema validation are written, together with
        their error messages, into ``<dir>/bad/<basename>_bad.txt`` and
        excluded from the output; only valid records are appended.

        :param fname: target avro file name (must end with '.avro')
        :param data: a single record (dict) or an iterable of records
        :return: list of wmaids of the successfully written records
        :raises NotImplementedError: for compressed (non-.avro) file names
        :raises WriteError: on any failure while appending records
        """
        # perform input data validation; valid records are collected here
        good_data = []
        # write bad data records into output file under <dir>/bad;
        # fall back to /tmp/bad when fname carries no directory part,
        # otherwise os.path.dirname returns '' and we'd try to create '/bad'
        bdir = os.path.dirname(fname)
        bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
        if  not os.path.exists(bdir):
            os.makedirs(bdir)
        bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
        count = ecount = edocs = 0
        with open(bfname, 'a') as bstream:
            for rec in data:
                validator = RecordValidator()
                validator.run(self.schema_json, rec)
                if  validator.errors:
                    # dump the offending document followed by its errors
                    bstream.write(json.dumps(rec)+'\n')
                    for err in validator.errors:
                        msg = 'SCHEMA ERROR '
                        for key, val in err.items():
                            msg += '%s: %s ' % (key.upper(), json.dumps(val))
                        bstream.write(msg+'\n')
                    bstream.write('-------------\n')
                    ecount += len(validator.errors)
                    edocs += 1
                else:
                    good_data.append(rec)
                count += 1
        if  ecount:
            print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\
                    % (count, edocs, ecount, bfname))
        # use only good portion of the data
        data = good_data
        try:
            schema = self.schema
            wmaids = []
            # normalize a single document into a list of documents
            if  not hasattr(data, '__iter__') or isinstance(data, dict):
                data = [data]

            if  os.path.exists(fname):
                schema = None # we'll append to existing file
            mode = 'a+' if fname.endswith('.avro') else 'a'
            if  mode == 'a':
                # read-write append of compressed avro is not supported
                print("We're unable yet to implement read-write mode with compressed avro files")
                raise NotImplementedError
            rec = None # keep doc in case of failure
            with DataFileWriter(open_file(fname, mode), DatumWriter(), schema) as writer:
                for rec in data:
                    writer.append(rec)
                    writer.flush()
                    wmaid = rec.get('wmaid', wmaHash(rec))
                    wmaids.append(wmaid)
            return wmaids
        except Exception as exc:
            err = traceback.format_exc(limit=1).splitlines()[-1]
            line = ' '.join(str(exc).replace('\n', '').split())
            msg = 'Failure in %s storage, error=%s, exception=%s' \
                    % (self.stype, err, line)
            msg += ' Failed document: '
            msg += json.dumps(rec)
            raise WriteError(msg)
Exemplo n.º 2
0
 def _write(self, data):
     "Internal write API"
     # resolve the document id and its target file location
     doc_id = self.wmaid(data)
     avro_schema = self.schema
     target = file_name(self.hdir, doc_id)
     # single with-statement manages both the stream and the writer
     with open_file(target, "w") as ostream, \
             DataFileWriter(ostream, DatumWriter(), avro_schema) as writer:
         writer.append(data)
Exemplo n.º 3
0
 def _write(self, data):
     "Internal write API"
     # derive the record id, schema and destination path
     record_id = self.wmaid(data)
     out_schema = self.schema
     out_path = file_name(self.hdir, record_id)
     ostream = open_file(out_path, 'w')
     with ostream:
         # writer closes before the underlying stream does
         writer = DataFileWriter(ostream, DatumWriter(), out_schema)
         with writer:
             writer.append(data)
Exemplo n.º 4
0
 def _read(self, spec, fields=None):
     """
     Internal read API.

     When *spec* matches PAT_UID, load the corresponding gzipped JSON
     file, validate each record via ``self.check`` and return a list of
     records; otherwise return ``self.empty_data``. *fields* is accepted
     for API compatibility and is not used here.
     """
     if PAT_UID.match(str(spec)):  # requested to read concrete file
         fname = '%s/%s.gz' % (self.uri, spec)
         # close the stream explicitly instead of leaking the handle
         with open_file(fname) as istream:
             data = json.load(istream)
         if isinstance(data, list):
             for rec in data:
                 self.check(rec)
             return data
         self.check(data)
         return [data]
     return self.empty_data
Exemplo n.º 5
0
 def _read(self, spec, fields=None):
     """
     Internal read API.

     When *spec* matches PAT_UID, load the corresponding gzipped JSON
     file, validate each record via ``self.check`` and return a list of
     records; otherwise return ``self.empty_data``. *fields* is accepted
     for API compatibility and is not used here.
     """
     if  PAT_UID.match(str(spec)): # requested to read concrete file
         fname = '%s/%s.gz' % (self.uri, spec)
         # close the stream explicitly instead of leaking the handle
         with open_file(fname) as istream:
             data = json.load(istream)
         if  isinstance(data, list):
             for rec in data:
                 self.check(rec)
             return data
         self.check(data)
         return [data]
     return self.empty_data
Exemplo n.º 6
0
 def file_read(self, fname):
     """
     Read and return all documents from the given avro file.

     :param fname: avro file name to read
     :return: list of records found in the file
     :raises ReadError: on any failure while reading the file
     """
     try:
         # the original bound self.schema to an unused local; dropped.
         with DataFileReader(open_file(fname), DatumReader()) as reader:
             # materialize all records before the stream closes
             return list(reader)
     except Exception:
         err = traceback.format_exc(limit=1).splitlines()[-1]
         msg = "Failure in %s storage, error=%s" % (self.stype, err)
         raise ReadError(msg)
Exemplo n.º 7
0
 def file_read(self, fname):
     """
     Read and return all documents from the given avro file.

     :param fname: avro file name to read
     :return: list of records found in the file
     :raises ReadError: on any failure while reading the file
     """
     try:
         # the original bound self.schema to an unused local; dropped.
         with DataFileReader(open_file(fname), DatumReader()) as reader:
             # materialize all records before the stream closes
             return list(reader)
     except Exception:
         err = traceback.format_exc(limit=1).splitlines()[-1]
         msg = 'Failure in %s storage, error=%s' % (self.stype, err)
         raise ReadError(msg)
Exemplo n.º 8
0
 def _read(self, spec, fields=None):
     """
     Internal read API.

     When *spec* matches PAT_UID, read the corresponding avro file,
     validate every record via ``self.check`` and return the records as
     a list; otherwise return ``self.empty_data``. *fields* is accepted
     for API compatibility and is not used here.
     """
     if PAT_UID.match(str(spec)):  # requested to read concrete file
         out = []
         fname = file_name(self.hdir, spec)
         with open_file(fname) as istream:
             # use the reader as a context manager so it is closed
             # instead of being leaked, matching file_read above
             with DataFileReader(istream, DatumReader()) as reader:
                 for data in reader:
                     if isinstance(data, list):
                         for rec in data:
                             self.check(rec)
                         return data
                     self.check(data)
                     out.append(data)
         return out
     return self.empty_data
Exemplo n.º 9
0
 def _read(self, spec, fields=None):
     """
     Internal read API.

     When *spec* matches PAT_UID, read the corresponding avro file,
     validate every record via ``self.check`` and return the records as
     a list; otherwise return ``self.empty_data``. *fields* is accepted
     for API compatibility and is not used here.
     """
     if PAT_UID.match(str(spec)):  # requested to read concrete file
         out = []
         fname = file_name(self.hdir, spec)
         with open_file(fname) as istream:
             # use the reader as a context manager so it is closed
             # instead of being leaked, matching file_read above
             with DataFileReader(istream, DatumReader()) as reader:
                 for data in reader:
                     if isinstance(data, list):
                         for rec in data:
                             self.check(rec)
                         return data
                     self.check(data)
                     out.append(data)
         return out
     return self.empty_data
Exemplo n.º 10
0
 def _write(self, data):
     "Internal write API"
     # serialize first, then write the payload under <uri>/<wmaid>.gz
     uid = self.wmaid(data)
     path = '%s/%s.gz' % (self.uri, uid)
     payload = json.dumps(data)
     with open_file(path, 'w') as ostream:
         ostream.write(payload)
Exemplo n.º 11
0
 def _write(self, data):
     "Internal write API"
     # destination is <uri>/<wmaid>.gz for the given document
     out_file = '%s/%s.gz' % (self.uri, self.wmaid(data))
     with open_file(out_file, 'w') as ostream:
         ostream.write(json.dumps(data))
Exemplo n.º 12
0
    def file_write(self, fname, data):
        """
        Validate and append documents to the given avro file.

        Records failing schema validation are written, together with
        their error messages, into ``<dir>/bad/<basename>_bad.txt`` and
        excluded from the output; only valid records are appended.

        :param fname: target avro file name (must end with '.avro')
        :param data: a single record (dict) or an iterable of records
        :return: list of wmaids of the successfully written records
        :raises NotImplementedError: for compressed (non-.avro) file names
        :raises WriteError: on any failure while appending records
        """
        # perform input data validation; valid records are collected here
        good_data = []
        # write bad data records into output file under <dir>/bad;
        # fall back to /tmp/bad when fname carries no directory part
        bdir = os.path.dirname(fname)
        bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
        count = ecount = edocs = 0
        with open(bfname, 'a') as bstream:
            for rec in data:
                validator = RecordValidator()
                validator.run(self.schema_json, rec)
                if validator.errors:
                    # dump the offending document followed by its errors
                    bstream.write(json.dumps(rec) + '\n')
                    for err in validator.errors:
                        msg = 'SCHEMA ERROR '
                        for key, val in err.items():
                            msg += '%s: %s ' % (key.upper(), json.dumps(val))
                        bstream.write(msg + '\n')
                    bstream.write('-------------\n')
                    ecount += len(validator.errors)
                    edocs += 1
                else:
                    good_data.append(rec)
                count += 1
        if ecount:
            print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\
                    % (count, edocs, ecount, bfname))
        # use only good portion of the data
        data = good_data
        try:
            schema = self.schema
            wmaids = []
            # normalize a single document into a list of documents
            if not hasattr(data, '__iter__') or isinstance(data, dict):
                data = [data]

            if os.path.exists(fname):
                schema = None  # we'll append to existing file
            mode = 'a+' if fname.endswith('.avro') else 'a'
            if mode == 'a':
                # read-write append of compressed avro is not supported
                print(
                    "We're unable yet to implement read-write mode with compressed avro files"
                )
                raise NotImplementedError
            rec = None  # keep doc in case of failure
            with DataFileWriter(open_file(fname, mode), DatumWriter(),
                                schema) as writer:
                for rec in data:
                    writer.append(rec)
                    writer.flush()
                    wmaid = rec.get('wmaid', wmaHash(rec))
                    wmaids.append(wmaid)
            return wmaids
        except Exception as exc:
            err = traceback.format_exc(limit=1).splitlines()[-1]
            line = ' '.join(str(exc).replace('\n', '').split())
            msg = 'Failure in %s storage, error=%s, exception=%s' \
                    % (self.stype, err, line)
            msg += ' Failed document: '
            msg += json.dumps(rec)
            raise WriteError(msg)