def stream_items(thrift_data):
    '''
    Iterator over the StreamItems from a buffer of thrift data
    '''
    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    ## read stream-item instances until input buffer is exhausted
    while 1:
        ## instantiate a StreamItem instance from kba_thrift
        doc = StreamItem()
        try:
            ## read it from the thrift protocol instance
            doc.read(protocol)

            ## This has deserialized the data analogous to
            ## json.loads(line).  The StreamItem from the thrift
            ## format is the analog of the JSON stream-item; see
            ## http://trec-kba.org/schemas/v1.0/stream-item.json

            ## yield is the Python primitive for iteration
            yield doc
        except EOFError:
            break

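## A minimal usage sketch for the stream_items() generator above (not part of
## the original code).  It assumes the same imports the snippets in this file
## rely on -- StringIO, thrift's TTransport and TBinaryProtocol, and the
## kba_thrift StreamItem -- and a chunk file that has already been decrypted
## and decompressed; the function name and path argument are hypothetical.
def print_stream_ids(fpath):
    thrift_data = open(fpath).read()
    for doc in stream_items(thrift_data):
        ## doc is a deserialized StreamItem; stream_id is the
        ## '<epoch_ticks>-<doc_id>' key used by the handlers below
        print doc.stream_id
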
def parse_thift_data(self, thrift_dir):
    '''
    Parse the thrift data in a given directory, apply exact matching over
    the streaming documents
    '''
    for fname in os.listdir(thrift_dir):
        ## ignore other files, e.g. stats.json
        if fname.endswith('.gpg'): continue
        if fname.endswith('.xz'): continue

        ## verbose output
        #print 'Process %s' % fname

        ### reverse the steps from above:
        ## load the encrypted data
        fpath = os.path.join(thrift_dir, fname)
        thrift_data = open(fpath).read()
        assert len(thrift_data) > 0, "failed to load: %s" % fpath

        ## wrap it in a file obj, thrift transport, and thrift protocol
        transport = StringIO(thrift_data)
        transport.seek(0)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)

        ## iterate over all thrift items
        while 1:
            stream_item = StreamItem()
            try:
                stream_item.read(protocol)
            except EOFError:
                break

            ## process data
            stream_id = stream_item.stream_id
            if stream_id in self._missed_docs:
                for urlname in self._missed_docs[stream_id]:
                    id = self._missed_docs_db.llen(RedisDB.ret_item_list)
                    ret_item = {'id': id}
                    ret_item['file'] = fname
                    ret_item['query'] = urlname
                    ret_item['rating'] = self._missed_docs[stream_id][urlname]
                    ret_item['stream_id'] = stream_id
                    ret_item['stream_data'] = stream_item.body.cleansed
                    self._missed_docs_db.hmset(id, ret_item)
                    self._missed_docs_db.rpush(RedisDB.ret_item_list, id)
                    print 'Missed %s %s\n\n\n' %(urlname, stream_id)

            ## suppress the verbose output
            #print '%s' % stream_item.doc_id

        ## close that transport
        transport.close()

        # free memory
        thrift_data = None

def get(self, date, file, epoch, doc_id):
    date_dir = os.path.join(corpus_dir, date)
    target_id = '%s-%s' %(epoch, doc_id)

    if not os.path.isdir(date_dir):
        msg = 'directory %s can not be opened' % date_dir
        #raise tornado.web.HTTPError(404, log_message=msg)
        self.render("error.html", msg=msg)
        return

    doc = Doc()
    doc['title'] = 'Null'
    doc['body'] = 'Null'
    doc['anchor'] = 'Null'
    doc['date'] = date
    doc['file'] = file
    doc['time'] = datetime.datetime.utcfromtimestamp(float(epoch)).ctime()
    doc['id'] = target_id

    fpath = os.path.join(date_dir, file)
    thrift_data = open(fpath).read()
    if not len(thrift_data) > 0:
        msg = 'failed to load: %s' % fpath
        #raise tornado.web.HTTPError(404, log_message=msg)
        self.render("error.html", msg=msg)
        return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    ## iterate over all thrift items
    while 1:
        stream_item = StreamItem()
        try:
            stream_item.read(protocol)
            if stream_item.stream_id == target_id:
                found = True
                doc['title'] = stream_item.title.cleansed
                doc['body'] = stream_item.body.cleansed
                doc['anchor'] = stream_item.anchor.cleansed
                break
        except EOFError:
            break

    self.render("doc.html", title=doc_id, doc=doc)

def parse_thift_data(self, thrift_dir):
    '''
    Parse the thrift data in a given directory, apply exact matching over
    the streaming documents
    '''
    for fname in os.listdir(thrift_dir):
        ## ignore other files, e.g. stats.json
        if fname.endswith('.gpg'): continue
        if fname.endswith('.xz'): continue

        ## verbose output
        print 'Process %s' % fname

        ### reverse the steps from above:
        ## load the encrypted data
        fpath = os.path.join(thrift_dir, fname)
        thrift_data = open(fpath).read()
        assert len(thrift_data) > 0, "failed to load: %s" % fpath

        ## wrap it in a file obj, thrift transport, and thrift protocol
        transport = StringIO(thrift_data)
        transport.seek(0)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)

        ## iterate over all thrift items
        while 1:
            stream_item = StreamItem()
            try:
                stream_item.read(protocol)
            except EOFError:
                break

            ## process data
            self.process_stream_item(fname, stream_item.stream_id,
                                     stream_item.body.cleansed)

            ## suppress the verbose output
            #print '%s' % stream_item.doc_id

        ## close that transport
        transport.close()

        # free memory
        thrift_data = None

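## A hedged sketch (not part of the original code) of how the loop body in
## parse_thift_data() above could reuse the stream_items() generator from the
## first snippet instead of repeating the transport/protocol setup; it assumes
## the same self.process_stream_item(fname, stream_id, cleansed_body)
## signature used above.
def parse_thrift_dir(self, thrift_dir):
    for fname in os.listdir(thrift_dir):
        ## skip encrypted / compressed chunks, as above
        if fname.endswith('.gpg') or fname.endswith('.xz'):
            continue
        thrift_data = open(os.path.join(thrift_dir, fname)).read()
        for stream_item in stream_items(thrift_data):
            self.process_stream_item(fname, stream_item.stream_id,
                                     stream_item.body.cleansed)
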
def ProcessThriftFile(self, fpath):
    thrift_data = open(fpath).read()
    if not len(thrift_data) > 0:
        msg = 'failed to load: %s' % fpath
        print 'Error: %s' % (msg)
        return

    #print 'Processing %s' %( fpath )

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    docs = []
    ## iterate over all thrift items
    while 1:
        stream_item = StreamItem()
        try:
            stream_item.read(protocol)
            doc = Doc()
            doc.id = stream_item.stream_id
            doc.epoch = stream_item.stream_time.epoch_ticks
            doc.time = datetime.datetime.utcfromtimestamp(doc.epoch).ctime()
            doc.title = stream_item.title.cleansed
            doc.body = stream_item.body.cleansed
            doc.anchor = stream_item.anchor.cleansed
            self.SaveDoc(doc)
        except EOFError:
            break

    ## close that transport
    transport.close()

    # free memory
    thrift_data = None

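## A possible driver for ProcessThriftFile() above, mirroring the directory
## walk used by the other snippets; the method name ProcessThriftDir is an
## assumption, not part of the original class.
def ProcessThriftDir(self, thrift_dir):
    for fname in sorted(os.listdir(thrift_dir)):
        ## skip encrypted / compressed chunks, e.g. *.gpg and *.xz
        if fname.endswith('.gpg') or fname.endswith('.xz'):
            continue
        self.ProcessThriftFile(os.path.join(thrift_dir, fname))
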
def get(self, date, file):
    ## load the thrift data
    fpath = os.path.join(corpus_dir, date, file)
    thrift_data = open(fpath).read()
    if not len(thrift_data) > 0:
        msg = 'failed to load: %s' % fpath
        #raise tornado.web.HTTPError(404, log_message=msg)
        self.render("error.html", msg=msg)
        return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    docs = []
    ## iterate over all thrift items
    while 1:
        stream_item = StreamItem()
        try:
            stream_item.read(protocol)
            doc = Doc()
            doc.id = stream_item.stream_id
            doc.epoch = stream_item.stream_time.epoch_ticks
            doc.time = datetime.datetime.utcfromtimestamp(doc.epoch).ctime()
            docs.append(doc)
        except EOFError:
            break

    ## close that transport
    transport.close()

    # free memory
    thrift_data = None

    self.render("file-index.html", title=file, date=date, file=file, docs=docs)

def get(self, epoch, id):
    time = datetime.datetime.utcfromtimestamp(float(epoch))
    date = '%d-%.2d-%.2d-%.2d' %(time.year, time.month, time.day, time.hour)

    if 2011 == time.year:
        corpus_dir = './corpus/training'
    else:
        corpus_dir = './corpus/testing'

    date_dir = os.path.join(corpus_dir, date)
    target_id = '%s-%s' %(epoch, id)

    if not os.path.isdir(date_dir):
        msg = 'directory %s can not be opened' % date_dir
        #raise tornado.web.HTTPError(404, log_message=msg)
        self.set_status(404)
        self.render("error.html", msg=msg)
        return

    doc = Doc()
    doc['title'] = 'Null'
    doc['body'] = 'Null'
    doc['anchor'] = 'Null'
    doc['date'] = date
    doc['file'] = 'Null'
    doc['time'] = datetime.datetime.utcfromtimestamp(float(epoch)).ctime()
    doc['id'] = target_id

    #self.write('searching')
    #self.flush()

    for fname in os.listdir(date_dir):
        ## ignore other files
        if fname.endswith('.gpg'): continue
        if fname.endswith('.xz'): continue

        fpath = os.path.join(date_dir, fname)
        thrift_data = open(fpath).read()
        if not len(thrift_data) > 0:
            msg = 'failed to load: %s' % fpath
            #raise tornado.web.HTTPError(404, log_message=msg)
            self.render("error.html", msg=msg)
            return

        ## wrap it in a file obj, thrift transport, and thrift protocol
        transport = StringIO(thrift_data)
        transport.seek(0)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)

        found = False
        ## iterate over all thrift items
        while 1:
            stream_item = StreamItem()
            try:
                stream_item.read(protocol)
                if stream_item.stream_id == target_id:
                    found = True
                    doc['title'] = stream_item.title.cleansed
                    doc['body'] = stream_item.body.cleansed
                    doc['anchor'] = stream_item.anchor.cleansed
                    doc['file'] = fname
                    break
            except EOFError:
                break

        if found:
            break

    self.render("doc.html", title=target_id, doc=doc)

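## A small worked example of the epoch -> hourly-directory mapping used by
## get() above: epoch_ticks 1325376000 is 2012-01-01 00:00:00 UTC, so the
## handler looks under './corpus/testing/2012-01-01-00' and searches its
## chunk files for a stream_id of the form '1325376000-<doc_id>'.  The helper
## name below is hypothetical.
def date_dir_for_epoch(epoch):
    time = datetime.datetime.utcfromtimestamp(float(epoch))
    return '%d-%.2d-%.2d-%.2d' % (time.year, time.month, time.day, time.hour)

## date_dir_for_epoch('1325376000') == '2012-01-01-00'
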
def parse_thift_data(self, corpus_dir):
    '''
    Parse the thrift data, find the files in which the stream_items are
    '''
    for item in sorted(self._item_list, key=lambda item: item['id']):
        ## skip the existing items
        if self._eval_db.hexists(item['id'], 'id'):
            print 'Skipping %d' % item['id']
            continue

        target_id = item['stream_id']
        list = target_id.split('-')
        epoch = list[0]
        time = datetime.datetime.utcfromtimestamp(float(epoch))
        date = '%d-%.2d-%.2d-%.2d' % (time.year, time.month, time.day, time.hour)
        date_dir = os.path.join(corpus_dir, date)

        if not os.path.isdir(date_dir):
            print 'directory %s can not be opened' % date_dir
            continue

        found = False
        for fname in os.listdir(date_dir):
            ## ignore other files, e.g. stats.json
            if fname.endswith('.gpg'): continue
            if fname.endswith('.xz'): continue

            ### reverse the steps from above:
            ## load the encrypted data
            fpath = os.path.join(date_dir, fname)
            thrift_data = open(fpath).read()
            if not len(thrift_data) > 0:
                print "failed to load: %s" % fpath
                continue

            ## wrap it in a file obj, thrift transport, and thrift protocol
            transport = StringIO(thrift_data)
            transport.seek(0)
            transport = TTransport.TBufferedTransport(transport)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)

            ## iterate over all thrift items
            while 1:
                stream_item = StreamItem()
                try:
                    stream_item.read(protocol)
                except EOFError:
                    break

                if stream_item.stream_id == target_id:
                    self.process_stream_item(fname, item,
                                             stream_item.body.cleansed)
                    found = True
                    break

            ## close that transport
            transport.close()

            # free memory
            thrift_data = None

            if found:
                print 'Item %d processed' % item['id']
                break

        if not found:
            print 'Item %d (%s) can not be found in any file' % (
                item['id'], item['stream_id'])