def main():
    """Entry point for the PDFrate query scheduler.

    Scans QUERY_DIR for ``*.json`` query files, selects the oldest query
    among those with the highest priority, waits a random amount of time,
    fetches (or submits for) a PDFrate report, optionally fetches metadata,
    writes the reply into REPLY_DIR and removes the processed query file.
    """
    sys.stdout.write('PDFrate Query Scheduler running! [{0}]\n'.format(
        datetime.datetime.now()))
    parser = ArgumentParser(description='PDFrate Query Scheduler')
    parser.parse_args()

    QUERY_DIR = config.get('pdfratequeryscheduler', 'query_dir')
    REPLY_DIR = config.get('pdfratequeryscheduler', 'reply_dir')

    queries = []
    max_priority = 0
    sys.stdout.write('Queries found: ')
    # Process all files in the QUERY_DIR
    for f in os.listdir(QUERY_DIR):
        f = os.path.join(QUERY_DIR, f)
        if not os.path.isfile(f) or os.path.splitext(f)[1] != '.json':
            continue
        try:
            # FIX: original used json.load(open(f, 'r')) and leaked the
            # file handle; 'with' guarantees it is closed.
            with open(f, 'r') as query_file:
                queries.append(json.load(query_file))
        except Exception as ex:
            # Best-effort: a malformed query file is reported and skipped.
            sys.stderr.write('Error reading query file \'{f}\': {ex}\n'.format(
                f=f, ex=ex))
            continue
        # Remember where the query came from and keep track of max priority
        queries[-1]['queryfile'] = f
        if queries[-1]['priority'] > max_priority:
            max_priority = queries[-1]['priority']

    # In case of no queries
    if not queries:
        sys.stdout.write("None\nExiting.\n")
        return
    else:
        sys.stdout.write('{}\n'.format(len(queries)))

    # Filter for max priority queries
    sys.stdout.write('Max priority: {0}\n'.format(max_priority))
    if max_priority != 0:
        queries = [q for q in queries if q['priority'] == max_priority]
    # The oldest one (smallest 'datetime') is processed next
    top_query = min(queries, key=itemgetter('datetime'))
    del queries
    sys.stdout.write('Next query: {0}\n'.format(top_query))

    # Submit query to PDFrate and save the reply.  The random sleep spreads
    # requests out over time.
    proxy = pdfrateproxy.PdfrateProxy()
    sleep_time = randint(
        0, int(config.get('pdfratequeryscheduler', 'sleep_time')))
    sys.stdout.write('Sleeping for {0} seconds...\n'.format(sleep_time))
    sleep(sleep_time)
    sys.stdout.write('Getting report...\n')
    reply = proxy.get_report(utility.file_sha256_hash(top_query['filename']))
    if reply['status'] == 'noreport':
        sys.stdout.write('No report, submitting file...\n')
        reply = proxy.submit_file(top_query['filename'])
    if top_query['get_metadata'] == True and reply['status'] == 'success':
        # Also get metadata.  The query file's stem is used as the file
        # hash -- presumably queries are named '<sha256>.json'; TODO confirm
        # against the query producer.
        file_hash = os.path.splitext(os.path.basename(
            top_query['queryfile']))[0]
        sys.stdout.write('Getting metadata...\n')
        metadata_reply = proxy.get_metadata(file_hash)
        reply['metadata'] = metadata_reply['metadata']
        reply['status'] = metadata_reply['status']

    reply_filename = os.path.join(
        REPLY_DIR, os.path.basename(top_query['queryfile']))
    reply['filename'] = top_query['filename']
    sys.stdout.write('Writing reply to disk...\n')
    # FIX: original used json.dump(reply, open(..., 'w+')) and leaked the
    # file handle; 'with' guarantees flush and close.
    with open(reply_filename, 'w+') as reply_file:
        json.dump(reply, reply_file)
    # Remove query file so it is not processed again
    sys.stdout.write('Removing query file...\n')
    os.remove(top_query['queryfile'])
    sys.stdout.write('Exiting.\n')
def _get_reply_for_file(self, filename):
    ''' Given a file, generates the path to the reply file for it. '''
    digest = utility.file_sha256_hash(filename)
    reply_name = '{0}.json'.format(digest)
    return os.path.join(self.reply_dir, reply_name)
def _get_reply_for_file(self, filename):
    ''' Given a file, generates the path to the reply file for it. '''
    return os.path.join(
        self.reply_dir,
        '{0}.json'.format(utility.file_sha256_hash(filename)))
def main():
    """Entry point for the PDFrate query scheduler.

    Collects pending ``*.json`` queries from QUERY_DIR, picks the oldest
    highest-priority one, sleeps a random interval, queries PDFrate for a
    report (submitting the file if none exists), optionally fetches
    metadata, saves the reply under REPLY_DIR and deletes the query file.
    """
    sys.stdout.write('PDFrate Query Scheduler running! [{0}]\n'.format(
        datetime.datetime.now()))
    parser = ArgumentParser(description='PDFrate Query Scheduler')
    parser.parse_args()

    QUERY_DIR = config.get('pdfratequeryscheduler', 'query_dir')
    REPLY_DIR = config.get('pdfratequeryscheduler', 'reply_dir')

    queries = []
    max_priority = 0
    sys.stdout.write('Queries found: ')
    # Process all files in the QUERY_DIR
    for f in os.listdir(QUERY_DIR):
        f = os.path.join(QUERY_DIR, f)
        if not os.path.isfile(f) or os.path.splitext(f)[1] != '.json':
            continue
        try:
            # FIX: original used json.load(open(f, 'r')) and leaked the
            # file handle; 'with' guarantees it is closed.
            with open(f, 'r') as query_file:
                queries.append(json.load(query_file))
        except Exception as ex:
            # Best-effort: a malformed query file is reported and skipped.
            sys.stderr.write('Error reading query file \'{f}\': {ex}\n'.format(
                f=f, ex=ex))
            continue
        # Remember where the query came from and keep track of max priority
        queries[-1]['queryfile'] = f
        if queries[-1]['priority'] > max_priority:
            max_priority = queries[-1]['priority']

    # In case of no queries
    if not queries:
        sys.stdout.write("None\nExiting.\n")
        return
    else:
        sys.stdout.write('{}\n'.format(len(queries)))

    # Filter for max priority queries
    sys.stdout.write('Max priority: {0}\n'.format(max_priority))
    if max_priority != 0:
        queries = [q for q in queries if q['priority'] == max_priority]
    # The oldest one (smallest 'datetime') is processed next
    top_query = min(queries, key=itemgetter('datetime'))
    del queries
    sys.stdout.write('Next query: {0}\n'.format(top_query))

    # Submit query to PDFrate and save the reply.  The random sleep spreads
    # requests out over time.
    proxy = pdfrateproxy.PdfrateProxy()
    sleep_time = randint(
        0, int(config.get('pdfratequeryscheduler', 'sleep_time')))
    sys.stdout.write('Sleeping for {0} seconds...\n'.format(sleep_time))
    sleep(sleep_time)
    sys.stdout.write('Getting report...\n')
    reply = proxy.get_report(utility.file_sha256_hash(top_query['filename']))
    if reply['status'] == 'noreport':
        sys.stdout.write('No report, submitting file...\n')
        reply = proxy.submit_file(top_query['filename'])
    if top_query['get_metadata'] == True and reply['status'] == 'success':
        # Also get metadata.  The query file's stem is used as the file
        # hash -- presumably queries are named '<sha256>.json'; TODO confirm
        # against the query producer.
        file_hash = os.path.splitext(os.path.basename(
            top_query['queryfile']))[0]
        sys.stdout.write('Getting metadata...\n')
        metadata_reply = proxy.get_metadata(file_hash)
        reply['metadata'] = metadata_reply['metadata']
        reply['status'] = metadata_reply['status']

    reply_filename = os.path.join(
        REPLY_DIR, os.path.basename(top_query['queryfile']))
    reply['filename'] = top_query['filename']
    sys.stdout.write('Writing reply to disk...\n')
    # FIX: original used json.dump(reply, open(..., 'w+')) and leaked the
    # file handle; 'with' guarantees flush and close.
    with open(reply_filename, 'w+') as reply_file:
        json.dump(reply, reply_file)
    # Remove query file so it is not processed again
    sys.stdout.write('Removing query file...\n')
    os.remove(top_query['queryfile'])
    sys.stdout.write('Exiting.\n')