def _get_uptox_items(x, queue):
    """Pull up to *x* pending files off *queue* and package them for a worker.

    Files whose paper already exists in the database are skipped (deleted and
    the watcher informed) and do NOT count toward the batch of x.

    Returns a Binary blob: zlib-compressed pickle of [(filename, bytes), ...].
    """
    work = []
    i = 0
    # 'and' short-circuits, so queue.empty() is not polled once the batch is
    # full (the original used bitwise '&', which always evaluated both sides).
    while i < x and not queue.empty():
        filename = queue.get()
        p = PaperParser()
        paper = p.paperExists(filename)
        if paper is not None:
            # duplicate: drop the file, tell the watcher, don't count it
            logger.info("Paper already exists for %s, skipping...", filename)
            os.unlink(filename)
            inform_watcher(logger, filename, exists=True, paperObj=paper)
            continue
        with open(filename, 'rb') as f:
            data = f.read()
        i += 1
        work.append((filename, data))
    return Binary(zlib.compress(cPickle.dumps(work)))
def paperExists(self, infile):
    """Return True if a paper with the same authors and title is already
    in the database.

    Side effect: when a duplicate is found, the watcher is informed with
    the existing paper object.
    """
    parser = PaperParser()
    # parser.paperExists returns the matching paper object, or None
    paper = parser.paperExists(infile)
    if paper is not None:
        inform_watcher(self.logger, infile, exists=True, paperObj=paper)
    return paper is not None
def run(self):
    """This is the main event loop for the result handler.

    Polls self.queue for results until self.running is cleared.  A result
    is either a PreprocessingException (failure path: log, notify the
    watcher, email an error report) or a work item handed to
    self._process_paper.  When the queue is empty, sleeps 1s and retries.
    """
    self.running = True
    while self.running:
        try:
            # files to be kept/moved for this result; rebuilt every iteration
            self.paper_files = []
            # non-blocking get; raises Empty when there is no work
            result = self.queue.get(block=False)
            paperObj = None
            if (isinstance(result, PreprocessingException)):
                #get exception information and dump to user
                self.logger.error("Error processing paper %s: %s",
                                  result.paper, result)
                self.paper_files.append((result.paper, 'keep'))
                if (result.pdf):
                    # a companion PDF was produced; schedule it to be moved
                    name, _ = os.path.splitext(
                        os.path.basename(result.paper))
                    pdf = os.path.join(self.watchdir, name + ".pdf")
                    self.paper_files.append((pdf, 'move'))
                inform_watcher(self.logger, result.paper, exception=result)
                try:
                    #send the error report
                    send_error_report(result, result.traceback, result.files)
                except Exception as e:
                    # emailing is best-effort; never kill the loop over it
                    self.logger.error("ERROR SENDING EMAIL: %s", e)
            else:
                try:
                    # successful preprocessing result: result is a tuple,
                    # result[0] being the source filename
                    paperObj = self._process_paper(result)
                except Exception as e:
                    _, _, exc_tb = sys.exc_info()
                    self.logger.error("Error processing paper %s: %s",
                                      result[0], e)
                    for line in traceback.format_tb(exc_tb):
                        self.logger.error(line)
                    inform_watcher(self.logger, result[0], exception=e)
                # runs whether processing succeeded or not; paperObj is
                # None on failure
                self.cleanupFiles(paperObj)
        except Empty:
            self.logger.debug("No work to do.. going back to sleep")
            time.sleep(1)
def handleProcessingException(self, result):
    """Method for handling processing exceptions.

    Logs the failure, informs the watcher, emails an error report
    (best-effort), and cleans up the offending input file.
    """
    #get exception information and dump to user
    self.logger.error("Error processing paper %s: %s", result.paper, result)
    inform_watcher(self.logger, result.paper, exception=result)
    # NOTE(review): removed leftover debug statement `print result.files`
    # (Python-2-only syntax and unwanted stdout noise)
    try:
        #send the error report; failures here must not propagate
        send_error_report(result, result.traceback, [result.paper])
    except Exception as e:
        self.logger.error("ERROR SENDING EMAIL: %s", e)
    self.cleanup(result.paper)
def store(self, result):
    """Once a document has been handled, store it in the database.

    On a PreprocessingException, delegates to handleProcessingException
    and returns None.  Otherwise stores and classifies the paper,
    registers its files, notifies the watcher, optionally tweets, and
    updates the running preprocessing-time statistics.

    Returns the stored paper object, or None on failure/duplicate.
    """
    if isinstance(result, PreprocessingException):
        self.handleProcessingException(result)
        # (removed leftover debug `print result.files` — Python-2-only
        # syntax and unwanted stdout noise)
    else:
        filename, outfile, timetaken = result
        if self.paperExists(outfile):
            inform_watcher(
                self.logger,
                filename,
                exception=PaperExistsException("Paper Already Exists"))
            self.cleanup(filename)
            return None
        #store the paper object in database
        paperObj = self.storePaperData(outfile)
        #add paper classification to database
        paperObj = self.classifyPaper(paperObj)
        filenames = [filename, outfile]
        basename = os.path.basename(filename)
        name, ext = os.path.splitext(basename)
        pdf = os.path.join(self.watchdir, name + ".pdf")
        if os.path.exists(pdf):
            filenames.append(pdf)
        #add the related files to the db
        self.savePaperFiles(filenames, paperObj)
        self.logger.info("Paper has been added successfully")
        try:
            inform_watcher(self.logger, filename, paperObj=paperObj)
        except Exception as e:
            self.logger.warn(
                "Failed to inform watcher about paper" + " success: %s", e)
        # `in` replaces the deprecated dict.has_key (works on Py2 and Py3)
        if 'TWITTER_ENABLED' in config and config['TWITTER_ENABLED']:
            try:
                tweet_paper(paperObj)
            except Exception as e:
                self.logger.warn("Could not tweet about paper %s", e)
        #finally update stats: incremental (Welford-style) running mean
        average = self.stats[0]
        total = self.stats[1] + 1
        if average == 0.0:
            self.stats = (timetaken, total)
        else:
            # the increment must be divided by the NEW sample count
            # (the original divided by the old count self.stats[1],
            # which over-weights each new sample)
            self.stats = (average + ((timetaken - average) / total), total)
        #save the preprocessing stats to disk
        save_pp_stats(self.stats, self.outdir)
        return paperObj