def main(args):
    """
    Main function
    :param args: argparse dict
    :return: None
    """
    print "Start reindexing from {0} to {1} with batch size of {2} and {3} worker processes".format(
        args.source_index, args.destination_index, args.batch_size, args.processes)
    client = Elasticsearch()
    print "connected to elastic search at http://localhost:9200"
    docs = scan(client, index=args.source_index)
    count = 0
    queue = Queue(args.batch_size)  # don't fill up queue too much
    pool = Pool(args.processes, worker_main,
                (queue, args.source_index, args.destination_index, args.batch_size))
    for doc in docs:
        count += 1
        if count % args.batch_size == 0:
            print "put {0}".format(count)
        queue.put(doc, True)
    print "put {0}".format(count)
    # send stop messages
    for i in range(args.processes):
        queue.put(Stop, True)
    pool.close()
    pool.join()
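The worker_main consumer and the Stop sentinel are not shown above. A minimal sketch of what they could look like, assuming each pool worker drains the shared queue and bulk-indexes documents into the destination index; the bulk() batching and the "_source" handling are assumptions, not the original implementation:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

class Stop(object):
    """Sentinel telling a worker to exit."""

def worker_main(queue, source_index, destination_index, batch_size):
    client = Elasticsearch()
    batch = []
    while True:
        doc = queue.get(True)              # block until a document (or Stop) arrives
        if doc is Stop:
            break
        batch.append({"_index": destination_index,
                      "_type": doc.get("_type", "_doc"),
                      "_source": doc["_source"]})
        if len(batch) >= batch_size:
            bulk(client, batch)            # flush a full batch
            batch = []
    if batch:
        bulk(client, batch)                # flush the remainder before exiting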
def main():
    arg = parse_args()
    folder = arg.fold
    core = arg.core
    output = arg.out
    start = arg.start
    if start:
        start = start.replace('-', '') + '000000'
    task_queue = Queue()
    result_queue = Queue()
    task_count = create_task(folder, task_queue, start)
    print task_count
    for i in range(core):
        Process(target=worker, args=(task_queue, result_queue)).start()
    # send stop signal
    for i in range(core):
        task_queue.put('STOP')
    # print result
    out_files = {}
    for i in range(task_count):
        actions = result_queue.get()
        user = actions["user"]
        for day in actions["actions"]:
            if day not in out_files:
                out_files[day] = open(os.path.join(output, day), "w")
            out_files[day].write(json.dumps({"user": user, "actions": actions["actions"][day]}) + "\n")
    for day in out_files:
        out_files[day].flush()
        out_files[day].close()
def main(world_folder, replacement_file_name):
    global replacements
    world = nbt.world.WorldFolder(world_folder)
    logger = configure_logging()
    logger.info("Starting processing of %s", world_folder)
    if not isinstance(world, nbt.world.AnvilWorldFolder):
        logger.error("%s is not an Anvil world" % (world_folder))
        return 65  # EX_DATAERR
    if replacement_file_name is not None:
        logger.info("Using replacements file: %s", replacement_file_name)
        with open(replacement_file_name, 'r') as replacement_file:
            replacements = json.load(replacement_file)
    # get the list of region files; these are passed into the function that
    # processes each region
    region_files = world.get_regionfiles()
    # Parallel
    q = Queue()
    lp = threading.Thread(target=logger_thread, args=[q])
    lp.start()
    p = Pool(initializer=process_init, initargs=[q, replacements], maxtasksperchild=1)
    region_data = p.map(process_region, region_files)
    # map() has finished, so shut down the logging queue
    q.put(None)
    lp.join()
    # Not parallel:
    # region_data = map(process_region, region_files)
    # Write output data
    write_block_data(region_data, "output.txt")
    return 0
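The logger_thread helper is not shown. A minimal sketch, assuming it follows the standard logging-cookbook pattern of draining LogRecords from the queue until the None sentinel arrives:

import logging

def logger_thread(q):
    while True:
        record = q.get()
        if record is None:       # sentinel sent after p.map() returns
            break
        logger = logging.getLogger(record.name)
        logger.handle(record)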
def recoverPRNGState(cookie, timeMillisEstimate, PRNGMillisEstimate, IPAddr, serverPort, numWorkers, chunkSize):
    global PRNGMillisDelta
    global initalSeek
    q = Queue(0)
    i = 0
    if PRNGMillisDelta % chunkSize > 0:
        q.put((PRNGMillisEstimate + PRNGMillisDelta - PRNGMillisDelta % chunkSize,
               PRNGMillisEstimate + PRNGMillisDelta, initalSeek))
    for i in range(PRNGMillisEstimate, PRNGMillisEstimate + PRNGMillisDelta - PRNGMillisDelta % chunkSize, chunkSize):
        q.put((i, i + chunkSize, initalSeek))
    desc = []
    seedValue = Value('d', 0)
    # Start worker processes and assign work.
    for i in range(numWorkers):
        p = Process(target=recoverPRNGStateWorker,
                    args=(cookie, timeMillisEstimate, q, IPAddr, serverPort, seedValue))
        p.start()
        desc.append(p)
    # Wait for the worker processes to finish.
    for p in desc:
        p.join()
    return long(seedValue.value)
class Manager(Process):
    def __init__(self, wnum=3):
        Process.__init__(self)
        self.s2m = Queue()  # messages the Manager receives from workers and svr
        self.m2w = Queue()  # messages sent to workers
        self.works = [0] * wnum
        for i in range(wnum):
            self.works[i] = Worker(self.s2m, self.m2w)
            self.works[i].start()

    def stop(self):
        for w in self.works:
            self.m2w.put(None)
        # FIXME should call worker.Terminal?

    # Sample of the raw output lines handle_mid() has to parse:
    """
    Video Site: bilibili.com
    Title:      【BD‧1080P】【高分剧情】鸟人-飞鸟侠 2014【中文字幕】
    Type:       Flash video (video/x-flv)
    Size:       3410.85 MiB (3576536465 Bytes)
    Downloading 【BD‧1080P】【高分剧情】鸟人-飞鸟侠 2014【中文字幕】.flv ...
    0.7% ( 22.2/3410.9MB) [#
    """

    def run(self):
        # reset DB flags
        kuos = get_by_flag(WORK)
        for uo in kuos:
            set_flag(uo.mid, STOP)
        tuos = get_by_flag(WAIT)
        for uo in tuos:
            set_flag(uo.mid, STOP)
        while True:
            msg = self.s2m.get()
            # print("pid=%s, self.s2m.get=%s" % (os.getpid(), repr(msg)))
            who = msg.get("who")
            if who == "worker":
                self.handle_mid(msg["mid"], msg["dat"])
            elif who == "svr":
                # self.m2w.put(msg['mid'])
                self.m2w.put(pick_url(msg["mid"]))
            elif who == "error":
                sys.stderr.write(msg["dat"])  # FIXME
                sys.stderr.write("\n")
            else:
                sys.stderr.write("Unknown msg:\n")
                sys.stderr.write(repr(msg))
                sys.stderr.write("\n")

    def handle_mid(self, mid, dat):
        print(dat)
        if dat.startswith("Process "):
            dd = dat.split()
            act = dd[2].lower()
            print("mid=%s, act=%s" % (mid, act))
            set_flag(mid, act)
        elif dat.startswith("Downloading "):
            print("mid=[%s]" % mid)
            update_filename(mid, dat[12:-5])
def main():
    """Runs everything"""
    # clients
    hosts = ["localhost", "localhost"]
    NUMBER_OF_PROCESSES = len(hosts)

    # Create queues
    task_queue = Queue()
    done_queue = Queue()

    # Submit tasks
    for host in hosts:
        task_queue.put(host)

    # Start worker processes
    for i in range(NUMBER_OF_PROCESSES):
        Process(target=worker, args=(task_queue, done_queue)).start()

    # Get and print results
    print 'Unordered results:'
    for i in range(len(hosts)):
        print '\t', done_queue.get().query

    # Tell child processes to stop
    for i in range(NUMBER_OF_PROCESSES):
        task_queue.put('STOP')
        print "Stopping Process #%s" % i
def run(self):
    '''run multiple replicates'''
    if self.data['verbosity'] <= 1:
        iterations = range(self.data['replicates'])
    else:
        widgets = ['{0} : '.format(self.data['name']), Percentage(), ' ',
                   Bar('='), ' ', ETA()]
        pbar = ProgressBar(widgets=widgets, maxval=self.data['replicates'],
                           term_width=get_terminal_size()[0] - 5)
        iterations = pbar((i for i in range(self.data['replicates'])))
    nJobs = max(min(self.data['jobs'], self.data['replicates']), 1)
    workQueue = Queue()
    resQueue = Queue()
    # put all replicates + stop signals in the queue
    for replicate in range(self.data['replicates']):
        workQueue.put(replicate)
    for i in range(nJobs):
        workQueue.put(None)
    # spawn workers
    procs = [Process(target=self.calculate, args=(workQueue, resQueue))
             for j in range(nJobs)]
    for p in procs:
        p.start()
    # collect the results off the queue
    for i in iterations:
        try:
            self.__save(resQueue.get())
        except KeyboardInterrupt as e:
            raise ValueError("calculator terminated!")
    for p in procs:
        p.join()
    if self.failure_count.value():
        env.logger.info("{} invalid replicate(s)".format(self.failure_count.value()))
    self.data['replicates'] = self.data['replicates'] - self.failure_count.value()
    return {} if len(self.result) == 0 else dict(list(self.data.items()) + list(self.result.items()))
class Updater(Process):
    def __init__(self, maxsize=15):
        Process.__init__(self)
        # self.queue = Queue(maxsize)
        self.queue = Queue()
        self.queue_lock = Lock()
        self._exit = Event()

    def run(self):
        while not self._exit.is_set():
            # with self.queue_lock:
            self.queue.put(self.receive())
            # self.queue.put_nowait(self.receive())
            # if self.queue.full():
            #     try:
            #         self.queue.get_nowait()
            #     except:
            #         pass

    def stop(self):
        self._exit.set()
        # This leaves the process hanging on Windows
        # self.join(STOP_TIMEOUT)
        if self.is_alive():
            # TODO make a nicer warning
            print 'Terminating updater:', self
            self.terminate()

    def receive(self):
        raise NotImplementedError
def test_report_hash_added_after_send(self, fromConfig, fromOptions, getLogger):
    # Side effect for fromConfig
    def fake_virts(logger, config):
        new_fake_virt = Mock()
        new_fake_virt.config.name = config.name
        return new_fake_virt

    fromConfig.side_effect = fake_virts
    options = Mock()
    options.interval = 0
    options.oneshot = True
    options.print_ = False
    options.log_file = ''
    options.log_dir = ''
    virtwho = VirtWho(self.logger, options, config_dir="/nonexistant")

    def send(report):
        report.state = AbstractVirtReport.STATE_FINISHED
        return True

    virtwho.send = Mock(side_effect=send)
    queue = Queue()
    virtwho.queue = queue
    virtwho.retry_after = 1
    virtwho.configManager.addConfig(self.config)
    virtwho.configManager.addConfig(self.second_config)
    queue.put(self.fake_report)
    queue.put(self.fake_domain_list)
    virtwho.run()

    self.assertEquals(virtwho.send.call_count, 2)
    self.assertEqual(virtwho.last_reports_hash[self.config.name],
                     self.fake_report.hash)
    self.assertEqual(virtwho.last_reports_hash[self.second_config.name],
                     self.fake_domain_list.hash)
class UpDown:
    def __init__(self, down_workers=2, up_workers=2, db=None):
        self.down_workers_num = down_workers
        self.up_workers_num = up_workers
        self.db = db
        self.base_url = "http://eol.jsc.nasa.gov/SearchPhotos/"
        self.down_workers = []
        self.up_workers = []
        self.to_upload = []
        self.q = Queue()

    def down_worker(self, download_url, image_id):
        """
        Download images and update the database after the download completes.
        """
        down = ImageDownload(self.base_url + download_url)
        down.find_urls()
        if down.dl():
            self.db.update_image_downloaded(image_id, down.file_name)

    def up_worker(self, mission_id):
        """
        Check once a minute for images that are downloaded but not yet uploaded.
        """
        while True:
            self.to_upload = list(self.db.get_to_upload(mission_id))
            if len(self.to_upload) > 0:
                print "Found a file to upload!\n"
                self.q.put(self.to_upload)
            else:
                print "No files to upload found!\n"
            time.sleep(60)
def ParCalculate(systems, calc, cleanup=True, block=True, prefix="Calc_"):
    '''
    Run calculators in parallel for all systems.
    Calculators are executed in isolated processes and directories.
    The resulting objects are returned in the list (one per input system).
    '''
    if type(systems) != type([]):
        sysl = [systems]
    else:
        sysl = systems

    if block:
        iq = Queue(len(sysl) + 1)
        oq = Queue(len(sysl) + 1)

        # Create workers
        for s in sysl:
            __PCalcProc(iq, oq, calc, prefix=prefix, cleanup=cleanup).start()

        # Put jobs into the queue
        for n, s in enumerate(sysl):
            iq.put([n, s])
            # Protection against too quick insertion
            time.sleep(0.2)

        if verbose:
            print("Workers started:", len(sysl))

        # Collect the results
        res = []
        while len(res) < len(sysl):
            n, s = oq.get()
            res.append([n, s])
            # print("Got from oq:", n, s.get_volume(), s.get_pressure())
    else:
        # We do not need the multiprocessing complications for non-blocking
        # workers. We just run all in sequence.
        basedir = os.getcwd()
        res = []
        for n, s in enumerate(sysl):
            s.set_calculator(copy.deepcopy(calc))
            s.get_calculator().block = block
            place = tempfile.mkdtemp(prefix=prefix, dir=basedir)
            os.chdir(place)
            s.get_calculator().working_dir = place
            # print("Start at:", place)
            if hasattr(calc, 'name') and calc.name == 'Siesta':
                s.get_potential_energy()
            else:
                s.get_calculator().calculate(s)
            os.chdir(basedir)
            # print("Submitted", s.get_calculator().calc_finished(), os.getcwd())
            # Protection against too quick insertion
            time.sleep(0.2)
            res.append([n, s])
        if verbose:
            print("Workers started:", len(sysl))
    return [r for ns, s in enumerate(sysl) for nr, r in res if nr == ns]
class TaskQueue:
    N = 4
    symb = string.ascii_letters + string.digits

    def __init__(self):
        self.tasks = Queue()
        self.done = Queue()
        self.results = {}
        self.processes = []
        for i in range(TaskQueue.N):
            self.processes.append(Process(target=self.run_tasks))
            self.processes[-1].start()
        threading.Thread(target=self.collect_results).start()

    def add(self, f, args):
        id = ''.join(random.choice(TaskQueue.symb) for i in range(15))
        self.tasks.put((id, f, args))
        return id

    def get(self, id):
        return self.results.pop(id, '_NotFound_')

    def run_tasks(self):
        for id, func, args in iter(self.tasks.get, 'STOP'):
            result = func(*args)
            self.done.put((id, result))

    def collect_results(self):
        for id, r in iter(self.done.get, 'STOP'):
            self.results[id] = r
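Hypothetical usage of the TaskQueue above (assumes a Unix fork start method, so bound methods can serve as Process targets); results are polled until the background collector thread has stored them:

import operator
import time

tq = TaskQueue()
job_id = tq.add(operator.add, (2, 3))
result = tq.get(job_id)
while result == '_NotFound_':       # collector thread hasn't stored it yet
    time.sleep(0.1)
    result = tq.get(job_id)
print(result)                       # -> 5
for _ in range(TaskQueue.N):        # shut the worker processes down
    tq.tasks.put('STOP')
tq.done.put('STOP')                 # and the collector thread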
def solve(self, problems, **kwargs):
    if type(problems) not in [list, ndarray]:
        problems = [problems]
    assert issubclass(type(problems[0]), _Problem), (
        'ParalelSolver argument is not a _Problem subclass')
    qin = Queue()
    qout = Queue()
    for i, pb in enumerate(problems):
        qin.put((i, pb))
    slaves = []
    for i in range(self.n_jobs):
        slaves += [WorkerSolver(qin, qout, id_w=i, debug=self.debug, **self.param)]
        qin.put((None, None))
        slaves[-1].start()

    # Join loop
    N_iter = len(problems)
    self.solutions = [0] * N_iter
    self.scores = [0] * N_iter
    for i in range(N_iter):
        idp, z, s = qout.get()
        self.solutions[idp] = z
        self.scores[idp] = s
        log.progress(name='Solver', iteration=i + 1, i_max=N_iter)
    for s in slaves:
        s.join()
    self.problems = problems
    return self.solutions
class JobPool(object):
    """
    Pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=4):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_task, (self.message_queue,))
        atexit.register(self.clear)

    def add_analysis(self, analysis):
        """
        Add analysis to the pool.
        """
        analysis.set_started()
        self.message_queue.put(analysis)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
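The execute_task loop the pool workers run is not shown. A plausible sketch, assuming each worker blocks on the shared queue and processes analyses forever; analysis.execute() is an assumed method name, not the original API:

def execute_task(message_queue):
    while True:
        analysis = message_queue.get()   # block until an analysis is queued
        try:
            analysis.execute()           # assumed entry point
        except Exception:
            pass                         # a real worker would log the failure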
class TweetManager:
    def __init__(self):
        self.sdb = boto.connect_sdb(setting.AWS_KEY, setting.AWS_SECRET)
        self.__keywords__ = get_filter_keywords(self.sdb)
        self.__cores__ = cpu_count()
        self.tweets_queue = Queue()
        self.db_tweets = self.sdb.get_domain(setting.SDB_DOMAIN)
        self.__buffer__ = ""

    def connect_twitter(self):
        self.conn = pycurl.Curl()
        self.conn.setopt(pycurl.POSTFIELDS, urllib.urlencode(self.__keywords__))
        self.conn.setopt(pycurl.USERPWD, "%s:%s" % (setting.TWITTER_ID, setting.TWITTER_PASSWORD))
        self.conn.setopt(pycurl.URL, setting.JSON_STREAMING_URI)
        print 'starting tweet_producer process'
        self.conn.setopt(pycurl.WRITEFUNCTION, lambda data: self.tweet_producer(data))

    def tweet_producer(self, tweet):
        self.__buffer__ += tweet
        if tweet.endswith("\r\n") and self.__buffer__.strip():
            self.tweets_queue.put(self.__buffer__)
            self.__buffer__ = ""

    def start(self):
        self.connect_twitter()
        print 'starting %d tweet_consumer process(s)' % self.__cores__
        self.consumers = [Process(target=tweet_consumer,
                                  args=(i, self.tweets_queue, self.db_tweets,))
                          for i in xrange(self.__cores__)]
        for c in self.consumers:
            c.start()
        self.conn.perform()
def ProcessStuff(spp_list):
    print 'cpu_count() = %d\n' % multiprocessing.cpu_count()
    NUMBER_OF_PROCESSES = multiprocessing.cpu_count()
    TASKS = [(CallMaxent, (spp_list[i],)) for i in range(len(spp_list))]
    # TASKS2 = [(plus, (i, 8)) for i in range(10)]

    # Create queues
    task_queue = Queue()
    done_queue = Queue()

    # Submit tasks
    for task in TASKS:
        task_queue.put(task)

    # Start worker processes
    for i in range(NUMBER_OF_PROCESSES):
        Process(target=worker, args=(task_queue, done_queue)).start()

    # Get and print results
    print 'Unordered results:'
    for i in range(len(TASKS)):
        print '\t', done_queue.get()

    # Tell child processes to stop
    for i in range(NUMBER_OF_PROCESSES):
        task_queue.put('STOP')
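The worker referenced above is not shown; the (func, args) task tuples match the classic pattern from the multiprocessing documentation, so a minimal sketch under that assumption:

def worker(input, output):
    # consume (func, args) tasks until the 'STOP' sentinel arrives
    for func, args in iter(input.get, 'STOP'):
        result = func(*args)
        output.put(result)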
def main():
    q = Queue()
    number_of_processes = 4
    plist = []
    for i in range(number_of_processes):
        plist.append(Process(target=f, args=('file_in.txt', i, q, number_of_processes)))
    for p in plist:
        p.start()
    for p in plist:
        p.join()
    q.put(None)
    print 'all joined!'
    # Loop through all the elements in the queue and write to file
    with open("file_out.txt", "w") as file_output:
        while True:
            item = q.get()
            print item
            if item is None:
                break
            print >>file_output, item
    print 'Done'
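The worker f is not shown. A plausible sketch, assuming each of the four processes handles every fourth line of the input file (the interleaved split and the transform are hypothetical):

def f(filename, process_id, q, num_processes):
    with open(filename) as handle:
        for line_no, line in enumerate(handle):
            if line_no % num_processes == process_id:   # interleaved split
                q.put(line.strip().upper())             # placeholder transform

Note that joining the producers before draining the queue only works for modest outputs: a multiprocessing.Queue's feeder thread blocks process exit until its buffered items are consumed, so large results can deadlock this pattern.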
def parallel_work(jobs, nr_of_threads):
    """ Set up queues, start the processes and wait until the job is done """
    work_queue = Queue()
    result_queue = Queue()
    result = {}

    for job in jobs:
        work_queue.put(job)

    if nr_of_threads > len(jobs):
        nr_of_threads = len(jobs)

    for i in range(nr_of_threads):
        worker = Process(target=check_plugin, args=(work_queue, result_queue))
        worker.start()

    while len(result.keys()) < len(jobs):
        data = result_queue.get()
        if " | " in data[1]:
            (status, output) = data[1].split(" | ")
        else:
            status = "UNKNOWN"
            output = data[1]
        result[data[0]] = {"status": status, "output": output}
        # print "Host " + data[0] + " " + status

    return result
def _run_parallel(self, processes=2, progress_bar=False):
    """
    Run all matches in parallel

    Parameters
    ----------
    processes : int
        Number of worker processes to spawn
    progress_bar : bool
        Whether or not to update the tournament progress bar
    """
    # At first sight, it might seem simpler to use the multiprocessing Pool
    # class rather than Processes and Queues. However, Pool can only accept
    # target functions which can be pickled, and instance methods cannot.
    work_queue = Queue()
    done_queue = Queue()
    workers = self._n_workers(processes=processes)

    chunks = self.match_generator.build_match_chunks()
    for chunk in chunks:
        work_queue.put(chunk)

    self._start_workers(workers, work_queue, done_queue)
    self._process_done_queue(workers, done_queue, progress_bar=progress_bar)

    return True
class MultiSegmentWriter(IndexWriter):
    def __init__(self, index, procs=2, **writerargs):
        self.index = index
        self.lock = index.storage.lock(index.indexname + "_LOCK")
        self.tasks = []
        self.postingqueue = Queue()
        # self.resultqueue = Queue()

        names = [index._next_segment_name() for _ in xrange(procs)]

        self.tasks = [SegmentWritingTask(index.storage, index.indexname,
                                         segname, writerargs, self.postingqueue)
                      for segname in names]
        for task in self.tasks:
            task.start()

    def add_document(self, **args):
        self.postingqueue.put(args)

    def cancel(self):
        for task in self.tasks:
            task.cancel()
        self.lock.release()

    def commit(self):
        procs = len(self.tasks)
        # put one None sentinel per task so every writer stops pulling documents
        for _ in xrange(procs):
            self.postingqueue.put(None)
        for task in self.tasks:
            print "Joining", task
            task.join()
            self.index.segments.append(task.get_segment())
        self.index.commit()
        self.lock.release()
def get_citing_papers(**args):
    # create the queues
    tasks = Queue()
    results = Queue()
    # how many threads are there to be used
    if 'threads' in args:
        threads = args['threads']
    else:
        threads = cpu_count()
    bibcodes = args.get('bibcodes', [])
    # initialize the "harvesters" (each harvester gets the citations for a bibcode)
    harvesters = [MongoCitationListHarvester(tasks, results) for i in range(threads)]
    # start the harvesters
    for b in harvesters:
        b.start()
    # put the bibcodes in the tasks queue
    num_jobs = 0
    for bib in bibcodes:
        tasks.put(bib)
        num_jobs += 1
    # add some 'None' values at the end of the tasks list, to facilitate proper closure
    for i in range(threads):
        tasks.put(None)
    # gather all results into one citation dictionary
    cit_list = []
    while num_jobs:
        data = results.get()
        cit_list += data.get('citations', [])
        num_jobs -= 1
    return cit_list
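A minimal sketch of the harvester the snippet assumes: a Process subclass that consumes bibcodes until the None sentinel; get_citations stands in for the actual Mongo lookup and is hypothetical:

class MongoCitationListHarvester(Process):
    def __init__(self, tasks, results):
        Process.__init__(self)
        self.tasks = tasks
        self.results = results

    def run(self):
        while True:
            bibcode = self.tasks.get()
            if bibcode is None:          # sentinel: no more work
                break
            self.results.put({'citations': get_citations(bibcode)})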
def test():
    queue = Queue()
    proc = Process(target=doNothing, args=(queue,))
    proc.start()
    _logger.info("Started dummy process with PID %d", proc.pid)
    startCodeCheckerServerAttachedToPid(proc.pid)
    time.sleep(3)
    _logger.info("Allowing the dummy process to finish")
    queue.put(1)
    proc.join()
    if utils.isProcessRunning(proc.pid):
        _logger.warning("Dummy process %d was still running", proc.pid)
        proc.terminate()
        time.sleep(1)
        it.assertFalse(utils.isProcessRunning(proc.pid),
                       "Process %d is still running after terminating it!" % proc.pid)
    time.sleep(1)
    _logger.info("Server should have died by now")
    with it.assertRaises(requests.ConnectionError):
        requests.post(it._url + '/get_diagnose_info')
def start_combo(argv):
    queue = Queue(10)
    test_input = TestInputParser.get_test_input(argv)
    thread = Thread(target=combo, args=(queue, test_input))
    thread.start()
    time.sleep(24 * 60 * 60)
    queue.put("stop")
class YaraJobPool(object):
    """
    Yara pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=3):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_yara_task, (self.message_queue,))
        atexit.register(self.clear)

    def add_yara_task(self, yara_task):
        """
        Adds the yara task.
        """
        self.message_queue.put(yara_task)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
def start_load(argv):
    queue = Queue(10)
    test_input = TestInputParser.get_test_input(argv)
    load_info = {
        'server_info': [test_input.servers[0]],
        'memcached_info': {
            'bucket_name': "default",
            'bucket_port': "11210",
            'bucket_password': "",
        },
        'operation_info': {
            'operation_distribution': {'set': 10},
            'valuesize_distribution': {20: 30, 30: 5, 25: 5},
            'create_percent': 25,
            'threads': 6,
        },
        'limit_info': {
            'max_items': 0,
            'operation_count': 0,
            'time': time.time() + 24 * 60 * 60,
            'max_size': 0,
        },
    }
    thread = Thread(target=loadrunner, args=(queue, test_input.servers, load_info))
    thread.start()
    time.sleep(24 * 60 * 60)
    queue.put("stop")
def getFeatureMultiprocessing(subProcFunc, blwFile, outputFile, funcArgs,
                              keyword=['Vietnamese_by_catalog', 'ppVietnamese_by_catalog']):
    START_TIME = time.time()
    # getFreqWordsForFileFromDict(['data/ppVietnamese_by_catalog/Easy/ct24/ct24 (100).txt', 12.35, 3, 4], 'data/TanSoTu.txt')
    # getDataNFeatureFromFile('test_data.txt', 'output/test_Vietnamese_output_classifier.csv', 'test')
    # X3 = getDataNFeatureFromFile('Difficult_data.txt', 'output/vietnamesewn_Difficult_output.csv', 3)
    # X1 = getDataNFeatureFromFile('Easy_data.txt', 'output/vietnamesewn_Easy_output.csv', 1)
    # X2 = getDataNFeatureFromFile('Normal_data.txt', 'output/vietnamesewn_Normal_output.csv', 2)
    _tempfile = open(blwFile, 'r')
    temp = _tempfile.read().splitlines()
    _tempfile.close()
    filesQueue = Queue()
    RESULT_QUEUE = Queue()
    for i in range(1, len(temp)):
        temp[i] = temp[i].split(',')
        temp[i][0] = re.sub(keyword[0], keyword[1], temp[i][0])
        if not keyword[0] == '' and (not temp[i][0].find(keyword[-1]) > 0):
            print('[ERROR] processing ', temp[i][0])
            print('sub', keyword[0], keyword[-1], re.sub(keyword[0], keyword[-1], temp[i][0]))
            return
        filesQueue.put(temp[i])
    PROCESS_LOCK = Lock()
    myProcess = []
    for processID in range(MAX_PROCESS):
        myProcess.append(Process(target=getDataNFeatureFromFileForAProc,
                                 args=(PROCESS_LOCK, RESULT_QUEUE, filesQueue,
                                       subProcFunc, funcArgs)))
    myProcess.append(Process(target=writeOutResult, args=(RESULT_QUEUE, outputFile)))
    for _process in myProcess:
        _process.start()
    for _process in myProcess:
        _process.join()
    print('total runtime:', time.time() - START_TIME)
def start_backup(argv):
    queue = Queue(10)
    test_input = TestInputParser.get_test_input(argv)
    thread = Thread(target=backup, args=(queue, test_input.servers))
    thread.start()
    time.sleep(24 * 60 * 60)
    queue.put("stop")
class BackgroundProcess(object):
    """A background process that reads batches and stores them in a queue.

    The :meth:`main` method needs to be called in order to start reading
    batches into the queue. Note that this process will run infinitely;
    start it as a :attr:`~multiprocessing.Process.daemon` to make sure it
    will get killed when the main process exits.

    Parameters
    ----------
    data_stream : :class:`.DataStream` or :class:`Transformer`
        The data stream from which to read batches.
    max_batches : int
        The maximum number of batches to store in the queue. If reached,
        the process will block until a batch is popped from the queue.

    """
    def __init__(self, data_stream, max_batches):
        self.data_stream = data_stream
        self.batches = Queue(max_batches)
        self.run_background = True

    def main(self):
        while True:
            iterator = self.data_stream.get_epoch_iterator()
            for batch in iterator:
                self.batches.put(batch)
            self.batches.put(StopIteration)

    def get_next_data(self):
        return self.batches.get()
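Hypothetical usage of the class above, following its own docstring: run the reader as a daemon and pull batches in the parent until the end-of-epoch StopIteration sentinel shows up (data_stream and train_on are placeholders):

from multiprocessing import Process

background = BackgroundProcess(data_stream, max_batches=10)
process = Process(target=background.main)
process.daemon = True            # killed automatically when the parent exits
process.start()

while True:
    batch = background.get_next_data()
    if batch is StopIteration:   # end-of-epoch marker put by main()
        break
    train_on(batch)              # placeholder for real work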
def processFiles(patch_dir):
    root = os.getcwd()
    glbl.data_dirs = {}
    if root != patch_dir:
        working_path = root + "/" + patch_dir
    else:
        working_path = root

    for path, dirs, files in os.walk(working_path):
        if len(dirs) == 0:
            glbl.data_dirs[path] = ''

    # Multiprocessing Section
    #########################################
    Qids = glbl.data_dirs.keys()
    manager = Manager()      # creates a shared-memory manager object
    results = manager.dict() # managed dict, accessible across processes
    nextid = Queue()         # Queue object serving as a shared id generator across processes
    for qid in Qids:
        nextid.put(qid)      # load the ids to be tested into the Queue
    for x in range(0, multiprocessing.cpu_count()):  # create one process per logical CPU
        # each process runs processData, sharing the Queue and the managed dict
        p = Process(target=processData, args=(nextid, results))
        glbl.jobs.append(p)  # add the process to the list of running processes
        p.start()
    for j in glbl.jobs:
        j.join()             # join each process back to main, blocking until it finishes

    # write out results
    c = 1
    sets = results.keys()
    sets.sort()
    for x in sets:
        if results[x] != 'None':
            FINAL = open('result' + str(c) + '.txt', 'w')
            n = "\n************************************************************************************************\n"
            FINAL.write(n + "* " + x + ' *\n' + n + results[x] + "\n")
            FINAL.close()
            c += 1
def test_same_report_filtering(self, fromConfig, fromOptions, getLogger):
    def fake_virts(logger, config):
        new_fake_virt = Mock()
        new_fake_virt.config.name = config.name
        return new_fake_virt

    fromConfig.side_effect = fake_virts
    options = Mock()
    options.interval = 0
    options.oneshot = True
    options.print_ = False
    options.log_dir = ''
    options.log_file = ''
    virtwho = VirtWho(self.logger, options, config_dir="/nonexistant")
    queue = Queue()

    # Create another report with the same hash
    report2 = HostGuestAssociationReport(self.config, self.fake_report.association)
    self.assertEqual(self.fake_report.hash, report2.hash)

    def send(report):
        report.state = AbstractVirtReport.STATE_FINISHED
        # Put the second report when the first is done
        queue.put(report2)
        return True

    virtwho.send = Mock(side_effect=send)
    virtwho.queue = queue
    virtwho.retry_after = 1
    virtwho.configManager.addConfig(self.config)
    queue.put(self.fake_report)
    virtwho.run()

    self.assertEquals(virtwho.send.call_count, 1)
        '.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>'
        '.*?<span>(?P<comment_num>.*?)评价</span>', re.S)
    ret = com.finditer(s)
    for i in ret:
        print({
            "id": i.group("id"),
            "title": i.group("title"),
            "rating_num": i.group("rating_num"),
            "comment_num": i.group("comment_num"),
        })

if __name__ == '__main__':
    count = 0
    q = Queue()
    p_l = []
    for i in range(10):
        count += 25
        p = Process(
            target=producer,
            args=(q, 'https://movie.douban.com/top250?start=%s&filter=' % count))
        p.start()
        p_l.append(p)
    for i in range(5):
        c = Process(target=consumer, args=(q,))
        c.start()
    for i in p_l:
        i.join()
    for i in range(5):
        q.put(None)  # one stop sentinel per consumer process
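Plausible sketches of the producer/consumer pair used above (assumptions: the producer fetches a listing page with requests, and the consumer reuses the regex-parsing function whose tail is shown at the top of the snippet, here called parse):

import requests

def producer(q, url):
    response = requests.get(url)
    q.put(response.text)            # hand the raw HTML to a consumer

def consumer(q):
    for s in iter(q.get, None):     # run until the None sentinel
        parse(s)                    # assumed name of the regex-parsing function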
class EpollServer(TServer):
    def __init__(self, *args, **kwargs):
        super(EpollServer, self).__init__(*args)
        self._clients = {}
        self._worker_processes = {}
        event_queue_size = kwargs.get('event_queue_size', 100)
        self._worker_process_number = kwargs.get('worker_process_number', 1)
        self._tasks = Queue(event_queue_size)
        # Create a socket pair. After the Thrift server has processed a batch
        # of client requests, the write side sends the file descriptors of
        # those clients, joined by ',', to the read side; the read side then
        # delivers each pending response back to its Thrift client.
        self._read_side, self._write_side = socket.socketpair()
        self._stop_flag = Value('b', False)
        self._stop_read_flag = Value('b', False)
        self._epoll = select.epoll()
        # EPOLLIN marks the descriptor for read-readiness events
        self._epoll.register(self._read_side, select.EPOLLIN)
        self._harakiri = kwargs.get('harakiri', 5)
        manager = Manager()
        self._responses = manager.dict()
        self._connection_limiter = ConnectionLimiter(self._get_queue_size, event_queue_size)

    def _get_queue_size(self):
        try:
            return self._tasks.qsize()
        except NotImplementedError:
            return 0

    def _register_harakiri(self):
        signal.signal(signal.SIGALRM, self._do_harakiri)

    def _do_harakiri(self, signum, frame):
        raise Exception('Execution killed after %s seconds' % self._harakiri)

    @contextmanager
    def _harakiri_execute(self):
        signal.alarm(self._harakiri)
        try:
            yield
        finally:
            signal.alarm(0)

    def set_worker_process_number(self, num):
        self._worker_process_number = num

    def stop(self):
        self._stop_read_flag.value = True
        # one [None, None] sentinel per worker so each can drain and exit
        for put_count in range(self._worker_process_number):
            self._tasks.put([None, None])
        # close the queue and join its feeder thread
        self._tasks.close()
        self._tasks.join_thread()
        self._stop_flag.value = True
        # effectively closes the underlying socket
        self.serverTransport.close()

    def serve(self):
        self.serverTransport.listen()
        self.serverTransport.handle.setblocking(0)
        # register the Thrift transport's file descriptor with epoll
        self._epoll.register(self.serverTransport.handle.fileno(), select.EPOLLIN)
        self._stop_flag.value = False
        self._stop_read_flag.value = False
        # fork the worker processes
        for proc_no in range(self._worker_process_number):
            self._fork_worker_process(proc_no)
        # re-fork a worker if a child terminates unexpectedly
        signal.signal(signal.SIGCHLD, self._refork_worker_process)
        while not self._stop_flag.value:
            try:
                self.handle()
            except (SystemExit, KeyboardInterrupt):
                break

    def _fork_worker_process(self, proc_no=0):
        process = Process(target=self._start_worker_process, args=(proc_no,))
        process.start()
        self._worker_processes[proc_no] = process

    def _refork_worker_process(self, signum, frame):
        if not self._stop_flag.value:
            for proc_no, worker_process in self._worker_processes.iteritems():
                if not worker_process.is_alive():
                    self._fork_worker_process(proc_no)

    def _start_worker_process(self, proc_no):
        CommonUtil.set_proctitle('sub_process_%s' % proc_no)
        self._register_harakiri()
        # handle the process-termination signal
        signal.signal(signal.SIGTERM, self._terminate_handler)
        while True:
            fileno = None
            try:
                # take the client's data and file descriptor off the request queue
                message, fileno = self._tasks.get()
                itransport = TTransport.TMemoryBuffer(message)
                otransport = TTransport.TMemoryBuffer()
                iprot = self.inputProtocolFactory.getProtocol(itransport)
                oprot = self.outputProtocolFactory.getProtocol(otransport)
                if message is None:
                    break
                with self._harakiri_execute():
                    # let the Thrift processor handle the request
                    self.processor.process(iprot, oprot)
                # store the response for the connection identified by this descriptor
                self._responses[fileno] = (True, otransport.getvalue())
                # send the file descriptor, ',' terminated, to the read side
                self._write_side.sendall(str(fileno) + ',')
            except Exception:
                if fileno:
                    self._responses[fileno] = (False, b'')
                    self._write_side.sendall(str(fileno) + ',')

    def _terminate_handler(self, signum, frame):
        raise SystemExit()

    def handle(self):
        try:
            events = self._epoll.poll(1)
        except Exception as e:
            # interrupted slow system call
            if CommonUtil.get_exception_errno(e) == errno.EINTR:
                events = []
            else:
                raise
        for fileno, event in events:
            if fileno == self.serverTransport.handle.fileno() and not self._stop_read_flag.value:
                # accept a new Thrift client connection
                client = self.serverTransport.accept().handle
                self._clients[client.fileno()] = EpollConnection(client, self._epoll)
                # register the client's file descriptor with epoll
                self.register_epollin(client.fileno())
            elif event & select.EPOLLIN:
                if fileno == self._read_side.fileno():
                    msg = self._read_side.recv(1024)
                    for client_fileno in msg.split(',')[:-1]:
                        if client_fileno == '' or client_fileno is None:
                            continue
                        client_fileno = int(client_fileno)
                        connection = self._clients.get(client_fileno)
                        response = self._responses.get(client_fileno)
                        if connection and response:
                            connection.ready(*response)
                elif not self._stop_read_flag.value:
                    connection = self._clients.get(fileno)
                    if connection:
                        connection.read()
                        if connection.get_status() == ConnectionStatus.WAIT_PROCESS:
                            try:
                                if self._connection_limiter.try_acquire():
                                    self._tasks.put_nowait([connection.get_msg(),
                                                            connection.get_fileno()])
                                else:
                                    connection.reset()
                                    del self._clients[fileno]
                            except _Queue.Full:
                                connection.reset()
                                del self._clients[fileno]
                else:
                    connection = self._clients[fileno]
                    connection.reset()
                    del self._clients[fileno]
            elif event & select.EPOLLOUT:
                connection = self._clients.get(fileno)
                if connection:
                    connection.write()
            elif event & select.EPOLLHUP:
                connection = self._clients.get(fileno)
                if connection:
                    connection.close()
                    del self._clients[fileno]

    def register_epollin(self, fileno):
        self._epoll.register(fileno, select.EPOLLIN)

    def register_epollout(self, fileno):
        self._epoll.register(fileno, select.EPOLLOUT)
class RestoreVMsWindow(Ui_Restore, QWizard):

    __pyqtSignals__ = ("restore_progress(int)", "backup_progress(int)")

    def __init__(self, app, qvm_collection, blk_manager, parent=None):
        super(RestoreVMsWindow, self).__init__(parent)

        self.app = app
        self.qvm_collection = qvm_collection
        self.blk_manager = blk_manager

        self.restore_options = None
        self.vms_to_restore = None
        self.func_output = []
        self.feedback_queue = Queue()
        self.canceled = False
        self.tmpdir_to_remove = None
        self.error_detected = Event()

        self.excluded = {}

        self.vm = self.qvm_collection[0]
        assert self.vm != None

        self.setupUi(self)

        self.select_vms_widget = MultiSelectWidget(self)
        self.select_vms_layout.insertWidget(1, self.select_vms_widget)

        self.connect(self, SIGNAL("currentIdChanged(int)"), self.current_page_changed)
        self.connect(self, SIGNAL("restore_progress(QString)"), self.commit_text_edit.append)
        self.connect(self, SIGNAL("backup_progress(int)"), self.progress_bar.setValue)
        self.dir_line_edit.connect(self.dir_line_edit, SIGNAL("textChanged(QString)"),
                                   self.backup_location_changed)
        self.connect(self.verify_only, SIGNAL("stateChanged(int)"),
                     self.on_verify_only_toogled)

        self.select_dir_page.isComplete = self.has_selected_dir
        self.select_vms_page.isComplete = self.has_selected_vms
        self.confirm_page.isComplete = self.all_vms_good
        # FIXME
        # this causes isComplete() to run twice, I don't know why
        self.select_vms_page.connect(self.select_vms_widget,
                                     SIGNAL("selected_changed()"),
                                     SIGNAL("completeChanged()"))

        fill_appvms_list(self)
        self.__init_restore_options__()

    @pyqtSlot(name='on_select_path_button_clicked')
    def select_path_button_clicked(self):
        select_path_button_clicked(self, True)

    def on_ignore_missing_toggled(self, checked):
        self.restore_options['use-default-template'] = checked
        self.restore_options['use-default-netvm'] = checked

    def on_ignore_uname_mismatch_toggled(self, checked):
        self.restore_options['ignore-username-mismatch'] = checked

    def on_verify_only_toogled(self, checked):
        self.restore_options['verify-only'] = bool(checked)

    def cleanupPage(self, p_int):
        if self.page(p_int) is self.select_vms_page:
            self.vms_to_restore = None
        else:
            super(RestoreVMsWindow, self).cleanupPage(p_int)

    def __fill_vms_list__(self):
        if self.vms_to_restore is not None:
            return

        self.select_vms_widget.selected_list.clear()
        self.select_vms_widget.available_list.clear()

        self.target_appvm = None
        if self.appvm_combobox.currentIndex() != 0:
            # An existing appvm was chosen
            self.target_appvm = self.qvm_collection.get_vm_by_name(
                str(self.appvm_combobox.currentText()))

        try:
            self.vms_to_restore = backup.backup_restore_prepare(
                unicode(self.dir_line_edit.text()),
                unicode(self.passphrase_line_edit.text()),
                options=self.restore_options,
                host_collection=self.qvm_collection,
                encrypted=self.encryption_checkbox.isChecked(),
                appvm=self.target_appvm)

            for vmname in self.vms_to_restore:
                if vmname.startswith('$'):
                    # Internal info
                    continue
                self.select_vms_widget.available_list.addItem(vmname)
        except QubesException as ex:
            QMessageBox.warning(None, "Restore error!", str(ex))

    def __init_restore_options__(self):
        if not self.restore_options:
            self.restore_options = {}
            backup.backup_restore_set_defaults(self.restore_options)

        if 'use-default-template' in self.restore_options and \
                'use-default-netvm' in self.restore_options:
            val = self.restore_options['use-default-template'] and \
                self.restore_options['use-default-netvm']
            self.ignore_missing.setChecked(val)
        else:
            self.ignore_missing.setChecked(False)

        if 'ignore-username-mismatch' in self.restore_options:
            self.ignore_uname_mismatch.setChecked(
                self.restore_options['ignore-username-mismatch'])

    def gather_output(self, s):
        self.func_output.append(s)

    def restore_error_output(self, s):
        self.error_detected.set()
        self.feedback_queue.put((SIGNAL("restore_progress(QString)"),
                                 u'<font color="red">{0}</font>'.format(s)))

    def restore_output(self, s):
        self.feedback_queue.put((SIGNAL("restore_progress(QString)"),
                                 u'<font color="black">{0}</font>'.format(s)))

    def update_progress_bar(self, value):
        self.feedback_queue.put((SIGNAL("backup_progress(int)"), value))

    def __do_restore__(self, thread_monitor):
        err_msg = []
        self.qvm_collection.lock_db_for_writing()
        try:
            backup.backup_restore_do(self.vms_to_restore,
                                     self.qvm_collection,
                                     print_callback=self.restore_output,
                                     error_callback=self.restore_error_output,
                                     progress_callback=self.update_progress_bar)
        except backup.BackupCanceledError as ex:
            self.canceled = True
            self.tmpdir_to_remove = ex.tmpdir
            err_msg.append(unicode(ex))
        except Exception as ex:
            print "Exception:", ex
            err_msg.append(unicode(ex))
            err_msg.append("Partially restored files left in /var/tmp/restore_*, "
                           "investigate them and/or clean them up")

        self.qvm_collection.unlock_db()
        if self.canceled:
            self.emit(SIGNAL("restore_progress(QString)"),
                      '<b><font color="red">{0}</font></b>'.format("Restore aborted!"))
        elif len(err_msg) > 0 or self.error_detected.is_set():
            if len(err_msg) > 0:
                thread_monitor.set_error_msg('\n'.join(err_msg))
            self.emit(SIGNAL("restore_progress(QString)"),
                      '<b><font color="red">{0}</font></b>'.format("Finished with errors!"))
        else:
            self.emit(SIGNAL("restore_progress(QString)"),
                      '<font color="green">{0}</font>'.format("Finished successfully!"))

        thread_monitor.set_finished()

    def current_page_changed(self, id):
        old_sigchld_handler = signal.signal(signal.SIGCHLD, signal.SIG_DFL)
        if self.currentPage() is self.select_vms_page:
            self.__fill_vms_list__()

        elif self.currentPage() is self.confirm_page:
            for v in self.excluded:
                self.vms_to_restore[v] = self.excluded[v]
            self.excluded = {}
            for i in range(self.select_vms_widget.available_list.count()):
                vmname = self.select_vms_widget.available_list.item(i).text()
                self.excluded[str(vmname)] = self.vms_to_restore[str(vmname)]
                del self.vms_to_restore[str(vmname)]

            del self.func_output[:]
            self.vms_to_restore = backup.restore_info_verify(self.vms_to_restore,
                                                             self.qvm_collection)
            backup.backup_restore_print_summary(self.vms_to_restore,
                                                print_callback=self.gather_output)
            self.confirm_text_edit.setReadOnly(True)
            self.confirm_text_edit.setFontFamily("Monospace")
            self.confirm_text_edit.setText("\n".join(self.func_output))

            self.confirm_page.emit(SIGNAL("completeChanged()"))

        elif self.currentPage() is self.commit_page:
            self.button(self.FinishButton).setDisabled(True)
            self.showFileDialog.setEnabled(True)
            self.showFileDialog.setChecked(self.showFileDialog.isEnabled() and
                                           str(self.dir_line_edit.text()).count("media/") > 0)

            self.thread_monitor = ThreadMonitor()
            thread = threading.Thread(target=self.__do_restore__,
                                      args=(self.thread_monitor,))
            thread.daemon = True
            thread.start()

            while not self.thread_monitor.is_finished():
                self.app.processEvents()
                time.sleep(0.1)
                try:
                    for (signal_to_emit, data) in iter(self.feedback_queue.get_nowait, None):
                        self.emit(signal_to_emit, data)
                except Empty:
                    pass

            if not self.thread_monitor.success:
                if self.canceled:
                    if self.tmpdir_to_remove and \
                            QMessageBox.warning(None, "Restore aborted",
                                                "Do you want to remove temporary files "
                                                "from %s?" % self.tmpdir_to_remove,
                                                QMessageBox.Yes, QMessageBox.No) == \
                            QMessageBox.Yes:
                        shutil.rmtree(self.tmpdir_to_remove)
                else:
                    QMessageBox.warning(None, "Backup error!",
                                        "ERROR: {1}".format(self.vm.name,
                                                            self.thread_monitor.error_msg))

            if self.showFileDialog.isChecked():
                self.emit(SIGNAL("restore_progress(QString)"),
                          '<b><font color="black">{0}</font></b>'.format(
                              "Please unmount your backup volume and cancel "
                              "the file selection dialog."))
                if self.target_appvm:
                    self.target_appvm.run("QUBESRPC %s dom0" % "qubes.SelectDirectory")
                else:
                    file_dialog = QFileDialog()
                    file_dialog.setReadOnly(True)
                    file_dialog.getExistingDirectory(
                        self, "Detach backup device",
                        os.path.dirname(unicode(self.dir_line_edit.text())))
            self.progress_bar.setValue(100)
            self.button(self.FinishButton).setEnabled(True)
            self.button(self.CancelButton).setEnabled(False)
            self.showFileDialog.setEnabled(False)

        signal.signal(signal.SIGCHLD, old_sigchld_handler)

    def all_vms_good(self):
        for vminfo in self.vms_to_restore.values():
            if not vminfo.has_key('vm'):
                continue
            if not vminfo['good-to-go']:
                return False
        return True

    def reject(self):
        if self.currentPage() is self.commit_page:
            if backup.backup_cancel():
                self.emit(SIGNAL("restore_progress(QString)"),
                          '<font color="red">{0}</font>'.format("Aborting the operation..."))
                self.button(self.CancelButton).setDisabled(True)
        else:
            self.done(0)

    def has_selected_dir(self):
        backup_location = unicode(self.dir_line_edit.text())
        if not backup_location:
            return False
        if self.appvm_combobox.currentIndex() == 0:
            if os.path.isfile(backup_location) or \
                    os.path.isfile(os.path.join(backup_location, 'qubes.xml')):
                return True
        else:
            return True
        return False

    def has_selected_vms(self):
        return self.select_vms_widget.selected_list.count() > 0

    def backup_location_changed(self, new_dir=None):
        self.select_dir_page.emit(SIGNAL("completeChanged()"))
pool = Pool(args.num_workers, worker, (input_q, output_q))

if (args.stream):
    print('Reading from hls stream.')
    video_capture = HLSVideoStream(src=args.stream).start()
else:
    print('Reading from webcam.')
    video_capture = WebcamVideoStream(src=args.video_source,
                                      width=args.width,
                                      height=args.height).start()
fps = FPS().start()

while True:  # fps._numFrames < 120
    frame = video_capture.read()
    input_q.put(frame)

    t = time.time()

    output_rgb = cv2.cvtColor(output_q.get(), cv2.COLOR_RGB2BGR)
    cv2.imshow('Video', output_rgb)
    fps.update()

    print('[INFO] elapsed time: {:.2f}'.format(time.time() - t))

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

fps.stop()
print('[INFO] elapsed time (total): {:.2f}'.format(fps.elapsed()))
print('[INFO] approx. FPS: {:.2f}'.format(fps.fps()))
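The worker passed to the Pool is not shown. A minimal sketch, assuming each pool worker pulls BGR frames from input_q, runs them through some detection step, and pushes the annotated frames onto output_q; detect_objects is a placeholder for the actual model call:

import cv2

def worker(input_q, output_q):
    while True:
        frame = input_q.get()                       # BGR frame from the reader loop
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        output_q.put(detect_objects(frame_rgb))     # assumed detection helper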
class SafariBooks: LOGIN_URL = ORLY_BASE_URL + "/member/auth/login/" LOGIN_ENTRY_URL = SAFARI_BASE_URL + "/login/unified/?next=/home/" API_TEMPLATE = SAFARI_BASE_URL + "/api/v1/book/{0}/" HEADERS = { "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "accept-encoding": "gzip, deflate", "accept-language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7", "cache-control": "no-cache", "cookie": "", "pragma": "no-cache", "origin": SAFARI_BASE_URL, "referer": LOGIN_ENTRY_URL, "upgrade-insecure-requests": "1", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/60.0.3112.113 Safari/537.36" } BASE_01_HTML = "<!DOCTYPE html>\n" \ "<html lang=\"en\" xml:lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\"" \ " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"" \ " xsi:schemaLocation=\"http://www.w3.org/2002/06/xhtml2/" \ " http://www.w3.org/MarkUp/SCHEMA/xhtml2.xsd\"" \ " xmlns:epub=\"http://www.idpf.org/2007/ops\">\n" \ "<head>\n" \ "{0}\n" \ "<style type=\"text/css\">" \ "body{{margin:1em;}}" \ "#sbo-rt-content *{{text-indent:0pt!important;}}#sbo-rt-content .bq{{margin-right:1em!important;}}" KINDLE_HTML = "body{{background-color:transparent!important;}}" \ "#sbo-rt-content *{{word-wrap:break-word!important;" \ "word-break:break-word!important;}}#sbo-rt-content table,#sbo-rt-content pre" \ "{{overflow-x:unset!important;overflow:unset!important;" \ "overflow-y:unset!important;white-space:pre-wrap!important;}}" BASE_02_HTML = "</style>" \ "</head>\n" \ "<body>{1}</body>\n</html>" CONTAINER_XML = "<?xml version=\"1.0\"?>" \ "<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">" \ "<rootfiles>" \ "<rootfile full-path=\"OEBPS/content.opf\" media-type=\"application/oebps-package+xml\" />" \ "</rootfiles>" \ "</container>" # Format: ID, Title, Authors, Description, Subjects, Publisher, Rights, Date, CoverId, MANIFEST, SPINE, CoverUrl CONTENT_OPF = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" \ "<package xmlns=\"http://www.idpf.org/2007/opf\" unique-identifier=\"bookid\" version=\"2.0\" >\n" \ "<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\" " \ " xmlns:opf=\"http://www.idpf.org/2007/opf\">\n"\ "<dc:title>{1}</dc:title>\n" \ "{2}\n" \ "<dc:description>{3}</dc:description>\n" \ "{4}" \ "<dc:publisher>{5}</dc:publisher>\n" \ "<dc:rights>{6}</dc:rights>\n" \ "<dc:language>en-US</dc:language>\n" \ "<dc:date>{7}</dc:date>\n" \ "<dc:identifier id=\"bookid\">{0}</dc:identifier>\n" \ "<meta name=\"cover\" content=\"{8}\"/>\n" \ "</metadata>\n" \ "<manifest>\n" \ "<item id=\"ncx\" href=\"toc.ncx\" media-type=\"application/x-dtbncx+xml\" />\n" \ "{9}\n" \ "</manifest>\n" \ "<spine toc=\"ncx\">\n{10}</spine>\n" \ "<guide><reference href=\"{11}\" title=\"Cover\" type=\"cover\" /></guide>\n" \ "</package>" # Format: ID, Depth, Title, Author, NAVMAP TOC_NCX = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\" ?>\n" \ "<!DOCTYPE ncx PUBLIC \"-//NISO//DTD ncx 2005-1//EN\"" \ " \"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd\">\n" \ "<ncx xmlns=\"http://www.daisy.org/z3986/2005/ncx/\" version=\"2005-1\">\n" \ "<head>\n" \ "<meta content=\"ID:ISBN:{0}\" name=\"dtb:uid\"/>\n" \ "<meta content=\"{1}\" name=\"dtb:depth\"/>\n" \ "<meta content=\"0\" name=\"dtb:totalPageCount\"/>\n" \ "<meta content=\"0\" name=\"dtb:maxPageNumber\"/>\n" \ "</head>\n" \ "<docTitle><text>{2}</text></docTitle>\n" \ "<docAuthor><text>{3}</text></docAuthor>\n" \ "<navMap>{4}</navMap>\n" \ "</ncx>" def 
__init__(self, args): self.args = args self.display = Display("info_%s.log" % escape(args.bookid)) self.display.intro() self.cookies = {} self.jwt = {} if not args.cred: if not os.path.isfile(COOKIES_FILE): self.display.exit("Login: unable to find cookies file.\n" " Please use the --cred option to perform the login.") self.cookies = json.load(open(COOKIES_FILE)) else: self.display.info("Logging into Safari Books Online...", state=True) self.do_login(*args.cred) if not args.no_cookies: json.dump(self.cookies, open(COOKIES_FILE, "w")) self.book_id = args.bookid self.api_url = self.API_TEMPLATE.format(self.book_id) self.display.info("Retrieving book info...") self.book_info = self.get_book_info() self.display.book_info(self.book_info) self.display.info("Retrieving book chapters...") self.book_chapters = self.get_book_chapters() self.chapters_queue = self.book_chapters[:] if len(self.book_chapters) > sys.getrecursionlimit(): sys.setrecursionlimit(len(self.book_chapters)) self.book_title = self.book_info["title"] self.base_url = self.book_info["web_url"] self.clean_book_title = "".join(self.escape_dirname(self.book_title).split(",")[:2]) \ + " ({0})".format(self.book_id) books_dir = os.path.join(PATH, "Books") if not os.path.isdir(books_dir): os.mkdir(books_dir) self.BOOK_PATH = os.path.join(books_dir, self.clean_book_title) self.css_path = "" self.images_path = "" self.create_dirs() self.display.info("Output directory:\n %s" % self.BOOK_PATH) self.chapter_title = "" self.filename = "" self.css = [] self.images = [] self.display.info("Downloading book contents... (%s chapters)" % len(self.book_chapters), state=True) self.BASE_HTML = self.BASE_01_HTML + (self.KINDLE_HTML if not args.no_kindle else "") + self.BASE_02_HTML self.cover = False self.get() if not self.cover: self.cover = self.get_default_cover() cover_html = self.parse_html( html.fromstring("<div id=\"sbo-rt-content\"><img src=\"Images/{0}\"></div>".format(self.cover)), True ) self.book_chapters = [{ "filename": "default_cover.xhtml", "title": "Cover" }] + self.book_chapters self.filename = self.book_chapters[0]["filename"] self.save_page_html(cover_html) self.css_done_queue = Queue(0) if "win" not in sys.platform else WinQueue() self.display.info("Downloading book CSSs... (%s files)" % len(self.css), state=True) self.collect_css() self.images_done_queue = Queue(0) if "win" not in sys.platform else WinQueue() self.display.info("Downloading book images... 
(%s files)" % len(self.images), state=True) self.collect_images() self.display.info("Creating EPUB file...", state=True) self.create_epub() if not args.no_cookies: json.dump(self.cookies, open(COOKIES_FILE, "w")) self.display.done(os.path.join(self.BOOK_PATH, self.book_id + ".epub")) self.display.unregister() if not self.display.in_error and not args.log: os.remove(self.display.log_file) sys.exit(0) def return_cookies(self): return " ".join(["{0}={1};".format(k, v) for k, v in self.cookies.items()]) def return_headers(self, url): if ORLY_BASE_HOST in urlsplit(url).netloc: self.HEADERS["cookie"] = self.return_cookies() else: self.HEADERS["cookie"] = "" return self.HEADERS def update_cookies(self, jar): for cookie in jar: self.cookies.update({ cookie.name: cookie.value }) def requests_provider( self, url, post=False, data=None, perfom_redirect=True, update_cookies=True, update_referer=True, **kwargs ): try: response = getattr(requests, "post" if post else "get")( url, headers=self.return_headers(url), data=data, allow_redirects=False, **kwargs ) self.display.last_request = ( url, data, kwargs, response.status_code, "\n".join( ["\t{}: {}".format(*h) for h in response.headers.items()] ), response.text ) except (requests.ConnectionError, requests.ConnectTimeout, requests.RequestException) as request_exception: self.display.error(str(request_exception)) return 0 if update_cookies: self.update_cookies(response.cookies) if update_referer: # TODO Update Referer HTTP Header # TODO How about Origin? self.HEADERS["referer"] = response.request.url if response.is_redirect and perfom_redirect: return self.requests_provider(response.next.url, post, None, perfom_redirect, update_cookies, update_referer) # TODO How about **kwargs? return response @staticmethod def parse_cred(cred): if ":" not in cred: return False sep = cred.index(":") new_cred = ["", ""] new_cred[0] = cred[:sep].strip("'").strip('"') if "@" not in new_cred[0]: return False new_cred[1] = cred[sep + 1:] return new_cred def do_login(self, email, password): response = self.requests_provider(self.LOGIN_ENTRY_URL) if response == 0: self.display.exit("Login: unable to reach Safari Books Online. 
Try again...") redirect_uri = response.request.path_url[response.request.path_url.index("redirect_uri"):] # TODO try...catch redirect_uri = redirect_uri[:redirect_uri.index("&")] redirect_uri = "https://api.oreilly.com%2Fapi%2Fv1%2Fauth%2Fopenid%2Fauthorize%3F" + redirect_uri response = self.requests_provider( self.LOGIN_URL, post=True, json={ "email": email, "password": password, "redirect_uri": redirect_uri }, perfom_redirect=False ) if response == 0: self.display.exit("Login: unable to perform auth to Safari Books Online.\n Try again...") if response.status_code != 200: # TODO To be reviewed try: error_page = html.fromstring(response.text) errors_message = error_page.xpath("//ul[@class='errorlist']//li/text()") recaptcha = error_page.xpath("//div[@class='g-recaptcha']") messages = ([" `%s`" % error for error in errors_message if "password" in error or "email" in error] if len(errors_message) else []) +\ ([" `ReCaptcha required (wait or do logout from the website).`"] if len(recaptcha) else[]) self.display.exit("Login: unable to perform auth login to Safari Books Online.\n" + self.display.SH_YELLOW + "[*]" + self.display.SH_DEFAULT + " Details:\n" "%s" % "\n".join(messages if len(messages) else [" Unexpected error!"])) except (html.etree.ParseError, html.etree.ParserError) as parsing_error: self.display.error(parsing_error) self.display.exit( "Login: your login went wrong and it encountered in an error" " trying to parse the login details of Safari Books Online. Try again..." ) self.jwt = response.json() # TODO: save JWT Tokens and use the refresh_token to restore user session response = self.requests_provider(self.jwt["redirect_uri"]) if response == 0: self.display.exit("Login: unable to reach Safari Books Online. Try again...") def get_book_info(self): response = self.requests_provider(self.api_url) if response == 0: self.display.exit("API: unable to retrieve book info.") response = response.json() if not isinstance(response, dict) or len(response.keys()) == 1: self.display.exit(self.display.api_error(response)) if "last_chapter_read" in response: del response["last_chapter_read"] return response def get_book_chapters(self, page=1): response = self.requests_provider(urljoin(self.api_url, "chapter/?page=%s" % page)) if response == 0: self.display.exit("API: unable to retrieve book chapters.") response = response.json() if not isinstance(response, dict) or len(response.keys()) == 1: self.display.exit(self.display.api_error(response)) if "results" not in response or not len(response["results"]): self.display.exit("API: unable to retrieve book chapters.") if response["count"] > sys.getrecursionlimit(): sys.setrecursionlimit(response["count"]) result = [] result.extend([c for c in response["results"] if "cover" in c["filename"] or "cover" in c["title"]]) for c in result: del response["results"][response["results"].index(c)] result += response["results"] return result + (self.get_book_chapters(page + 1) if response["next"] else []) def get_default_cover(self): response = self.requests_provider(self.book_info["cover"], update_cookies=False, stream=True) if response == 0: self.display.error("Error trying to retrieve the cover: %s" % self.book_info["cover"]) return False file_ext = response.headers["Content-Type"].split("/")[-1] with open(os.path.join(self.images_path, "default_cover." + file_ext), 'wb') as i: for chunk in response.iter_content(1024): i.write(chunk) return "default_cover." 
+ file_ext def get_html(self, url): response = self.requests_provider(url) if response == 0 or response.status_code != 200: self.display.exit( "Crawler: error trying to retrieve this page: %s (%s)\n From: %s" % (self.filename, self.chapter_title, url) ) root = None try: root = html.fromstring(response.text, base_url=SAFARI_BASE_URL) except (html.etree.ParseError, html.etree.ParserError) as parsing_error: self.display.error(parsing_error) self.display.exit( "Crawler: error trying to parse this page: %s (%s)\n From: %s" % (self.filename, self.chapter_title, url) ) return root @staticmethod def url_is_absolute(url): return bool(urlparse(url).netloc) def link_replace(self, link): if link: if not self.url_is_absolute(link): if "cover" in link or "images" in link or "graphics" in link or \ link[-3:] in ["jpg", "peg", "png", "gif"]: link = urljoin(self.base_url, link) if link not in self.images: self.images.append(link) self.display.log("Crawler: found a new image at %s" % link) image = link.split("/")[-1] return "Images/" + image return link.replace(".html", ".xhtml") else: if self.book_id in link: return self.link_replace(link.split(self.book_id)[-1]) return link @staticmethod def get_cover(html_root): lowercase_ns = etree.FunctionNamespace(None) lowercase_ns["lower-case"] = lambda _, n: n[0].lower() if n and len(n) else "" images = html_root.xpath("//img[contains(lower-case(@id), 'cover') or contains(lower-case(@class), 'cover') or" "contains(lower-case(@name), 'cover') or contains(lower-case(@src), 'cover') or" "contains(lower-case(@alt), 'cover')]") if len(images): return images[0] divs = html_root.xpath("//div[contains(lower-case(@id), 'cover') or contains(lower-case(@class), 'cover') or" "contains(lower-case(@name), 'cover') or contains(lower-case(@src), 'cover')]//img") if len(divs): return divs[0] a = html_root.xpath("//a[contains(lower-case(@id), 'cover') or contains(lower-case(@class), 'cover') or" "contains(lower-case(@name), 'cover') or contains(lower-case(@src), 'cover')]//img") if len(a): return a[0] return None def parse_html(self, root, first_page=False): if random() > 0.8: if len(root.xpath("//div[@class='controls']/a/text()")): self.display.exit(self.display.api_error(" ")) book_content = root.xpath("//div[@id='sbo-rt-content']") if not len(book_content): self.display.exit( "Parser: book content's corrupted or not present: %s (%s)" % (self.filename, self.chapter_title) ) page_css = "" stylesheet_links = root.xpath("//link[@rel='stylesheet']") if len(stylesheet_links): stylesheet_count = 0 for s in stylesheet_links: css_url = urljoin("https:", s.attrib["href"]) if s.attrib["href"][:2] == "//" \ else urljoin(self.base_url, s.attrib["href"]) if css_url not in self.css: self.css.append(css_url) self.display.log("Crawler: found a new CSS at %s" % css_url) page_css += "<link href=\"Styles/Style{0:0>2}.css\" " \ "rel=\"stylesheet\" type=\"text/css\" />\n".format(stylesheet_count) stylesheet_count += 1 stylesheets = root.xpath("//style") if len(stylesheets): for css in stylesheets: if "data-template" in css.attrib and len(css.attrib["data-template"]): css.text = css.attrib["data-template"] del css.attrib["data-template"] try: page_css += html.tostring(css, method="xml", encoding='unicode') + "\n" except (html.etree.ParseError, html.etree.ParserError) as parsing_error: self.display.error(parsing_error) self.display.exit( "Parser: error trying to parse one CSS found in this page: %s (%s)" % (self.filename, self.chapter_title) ) # TODO: add all not covered tag for `link_replace` function 
svg_image_tags = root.xpath("//image") if len(svg_image_tags): for img in svg_image_tags: image_attr_href = [x for x in img.attrib.keys() if "href" in x] if len(image_attr_href): svg_url = img.attrib.get(image_attr_href[0]) svg_root = img.getparent().getparent() new_img = svg_root.makeelement("img") new_img.attrib.update({"src": svg_url}) svg_root.remove(img.getparent()) svg_root.append(new_img) book_content = book_content[0] book_content.rewrite_links(self.link_replace) xhtml = None try: if first_page: is_cover = self.get_cover(book_content) if is_cover is not None: page_css = "<style>" \ "body{display:table;position:absolute;margin:0!important;height:100%;width:100%;}" \ "#Cover{display:table-cell;vertical-align:middle;text-align:center;}" \ "img{height:90vh;margin-left:auto;margin-right:auto;}" \ "</style>" cover_html = html.fromstring("<div id=\"Cover\"></div>") cover_div = cover_html.xpath("//div")[0] cover_img = cover_div.makeelement("img") cover_img.attrib.update({"src": is_cover.attrib["src"]}) cover_div.append(cover_img) book_content = cover_html self.cover = is_cover.attrib["src"] xhtml = html.tostring(book_content, method="xml", encoding='unicode') except (html.etree.ParseError, html.etree.ParserError) as parsing_error: self.display.error(parsing_error) self.display.exit( "Parser: error trying to parse HTML of this page: %s (%s)" % (self.filename, self.chapter_title) ) return page_css, xhtml @staticmethod def escape_dirname(dirname, clean_space=False): if ":" in dirname: if dirname.index(":") > 15: dirname = dirname.split(":")[0] elif "win" in sys.platform: dirname = dirname.replace(":", ",") for ch in ['~', '#', '%', '&', '*', '{', '}', '\\', '<', '>', '?', '/', '`', '\'', '"', '|', '+']: if ch in dirname: dirname = dirname.replace(ch, "_") return dirname if not clean_space else dirname.replace(" ", "") def create_dirs(self): if os.path.isdir(self.BOOK_PATH): self.display.log("Book directory already exists: %s" % self.BOOK_PATH) else: os.makedirs(self.BOOK_PATH) oebps = os.path.join(self.BOOK_PATH, "OEBPS") if not os.path.isdir(oebps): self.display.book_ad_info = True os.makedirs(oebps) self.css_path = os.path.join(oebps, "Styles") if os.path.isdir(self.css_path): self.display.log("CSSs directory already exists: %s" % self.css_path) else: os.makedirs(self.css_path) self.display.css_ad_info.value = 1 self.images_path = os.path.join(oebps, "Images") if os.path.isdir(self.images_path): self.display.log("Images directory already exists: %s" % self.images_path) else: os.makedirs(self.images_path) self.display.images_ad_info.value = 1 def save_page_html(self, contents): self.filename = self.filename.replace(".html", ".xhtml") open(os.path.join(self.BOOK_PATH, "OEBPS", self.filename), "wb")\ .write(self.BASE_HTML.format(contents[0], contents[1]).encode("utf-8", 'xmlcharrefreplace')) self.display.log("Created: %s" % self.filename) def get(self): len_books = len(self.book_chapters) for _ in range(len_books): if not len(self.chapters_queue): return first_page = len_books == len(self.chapters_queue) next_chapter = self.chapters_queue.pop(0) self.chapter_title = next_chapter["title"] self.filename = next_chapter["filename"] if os.path.isfile(os.path.join(self.BOOK_PATH, "OEBPS", self.filename.replace(".html", ".xhtml"))): if not self.display.book_ad_info and \ next_chapter not in self.book_chapters[:self.book_chapters.index(next_chapter)]: self.display.info( "File `%s` already exists.\n" " If you want to download again all the book%s,\n" " please delete the `<BOOK NAME>/OEBPS/*.xhtml` 
                    self.display.info(
                        "File `%s` already exists.\n"
                        " If you want to download again all the book%s,\n"
                        " please delete the `<BOOK NAME>/OEBPS/*.xhtml` files and restart the program." % (
                            self.filename.replace(".html", ".xhtml"),
                            " (especially because you selected the `--no-kindle` option)" if self.args.no_kindle else ""
                        )
                    )
                    self.display.book_ad_info = 2

            else:
                self.save_page_html(self.parse_html(self.get_html(next_chapter["web_url"]), first_page))

            self.display.state(len_books, len_books - len(self.chapters_queue))

    def _thread_download_css(self, url):
        css_file = os.path.join(self.css_path, "Style{0:0>2}.css".format(self.css.index(url)))
        if os.path.isfile(css_file):
            if not self.display.css_ad_info.value and url not in self.css[:self.css.index(url)]:
                self.display.info("File `%s` already exists.\n"
                                  " If you want to download again all the CSSs,\n"
                                  " please delete the `<BOOK NAME>/OEBPS/*.xhtml` and `<BOOK NAME>/OEBPS/Styles/*`"
                                  " files and restart the program." % css_file)
                self.display.css_ad_info.value = 1

        else:
            response = self.requests_provider(url, update_cookies=False)
            if response == 0:
                self.display.error("Error trying to retrieve this CSS: %s\n From: %s" % (css_file, url))

            else:  # only write the file if the download actually succeeded
                with open(css_file, 'wb') as s:
                    s.write(response.content)

        self.css_done_queue.put(1)
        self.display.state(len(self.css), self.css_done_queue.qsize())

    def _thread_download_images(self, url):
        image_name = url.split("/")[-1]
        image_path = os.path.join(self.images_path, image_name)
        if os.path.isfile(image_path):
            if not self.display.images_ad_info.value and url not in self.images[:self.images.index(url)]:
                self.display.info("File `%s` already exists.\n"
                                  " If you want to download again all the images,\n"
                                  " please delete the `<BOOK NAME>/OEBPS/*.xhtml` and `<BOOK NAME>/OEBPS/Images/*`"
                                  " files and restart the program." % image_name)
                self.display.images_ad_info.value = 1

        else:
            response = self.requests_provider(urljoin(SAFARI_BASE_URL, url), update_cookies=False, stream=True)
            if response == 0:
                self.display.error("Error trying to retrieve this image: %s\n From: %s" % (image_name, url))

            else:  # only write the file if the download actually succeeded
                with open(image_path, 'wb') as img:
                    for chunk in response.iter_content(1024):
                        img.write(chunk)

        self.images_done_queue.put(1)
        self.display.state(len(self.images), self.images_done_queue.qsize())

    def _start_multiprocessing(self, operation, full_queue):
        if len(full_queue) > 5:
            for i in range(0, len(full_queue), 5):
                self._start_multiprocessing(operation, full_queue[i:i + 5])

        else:
            process_queue = [Process(target=operation, args=(arg,)) for arg in full_queue]
            for proc in process_queue:
                proc.start()

            for proc in process_queue:
                proc.join()

    def collect_css(self):
        self.display.state_status.value = -1

        if "win" in sys.platform:
            # TODO: fall back to sequential downloads on Windows
            for css_url in self.css:
                self._thread_download_css(css_url)

        else:
            self._start_multiprocessing(self._thread_download_css, self.css)

    def collect_images(self):
        if self.display.book_ad_info == 2:
            self.display.info("Some of the book contents were already downloaded.\n"
                              " If you want to be sure that all the images will be downloaded,\n"
                              " please delete the `<BOOK NAME>/OEBPS/*.xhtml` files and restart the program.")

        self.display.state_status.value = -1

        if "win" in sys.platform:
            # TODO: fall back to sequential downloads on Windows
            for image_url in self.images:
                self._thread_download_images(image_url)

        else:
            self._start_multiprocessing(self._thread_download_images, self.images)

    def create_content_opf(self):
        self.css = next(os.walk(self.css_path))[2]
        self.images = next(os.walk(self.images_path))[2]

        manifest = []
        spine = []
        for c in self.book_chapters:
            c["filename"] = c["filename"].replace(".html", ".xhtml")
            item_id = escape("".join(c["filename"].split(".")[:-1]))
media-type=\"application/xhtml+xml\" />".format( item_id, c["filename"] )) spine.append("<itemref idref=\"{0}\"/>".format(item_id)) for i in set(self.images): dot_split = i.split(".") head = "img_" + escape("".join(dot_split[:-1])) extension = dot_split[-1] manifest.append("<item id=\"{0}\" href=\"Images/{1}\" media-type=\"image/{2}\" />".format( head, i, "jpeg" if "jp" in extension else extension )) for i in range(len(self.css)): manifest.append("<item id=\"style_{0:0>2}\" href=\"Styles/Style{0:0>2}.css\" " "media-type=\"text/css\" />".format(i)) authors = "\n".join("<dc:creator opf:file-as=\"{0}\" opf:role=\"aut\">{0}</dc:creator>".format( escape(aut["name"]) ) for aut in self.book_info["authors"]) subjects = "\n".join("<dc:subject>{0}</dc:subject>".format(escape(sub["name"])) for sub in self.book_info["subjects"]) return self.CONTENT_OPF.format( (self.book_info["isbn"] if self.book_info["isbn"] else self.book_id), escape(self.book_title), authors, escape(self.book_info["description"]), subjects, ", ".join(escape(pub["name"]) for pub in self.book_info["publishers"]), escape(self.book_info["rights"]), self.book_info["issued"], self.cover, "\n".join(manifest), "\n".join(spine), self.book_chapters[0]["filename"].replace(".html", ".xhtml") ) @staticmethod def parse_toc(l, c=0, mx=0): r = "" for cc in l: c += 1 if int(cc["depth"]) > mx: mx = int(cc["depth"]) r += "<navPoint id=\"{0}\" playOrder=\"{1}\">" \ "<navLabel><text>{2}</text></navLabel>" \ "<content src=\"{3}\"/>".format( cc["fragment"] if len(cc["fragment"]) else cc["id"], c, escape(cc["label"]), cc["href"].replace(".html", ".xhtml").split("/")[-1] ) if cc["children"]: sr, c, mx = SafariBooks.parse_toc(cc["children"], c, mx) r += sr r += "</navPoint>\n" return r, c, mx def create_toc(self): response = self.requests_provider(urljoin(self.api_url, "toc/")) if response == 0: self.display.exit("API: unable to retrieve book chapters. " "Don't delete any files, just run again this program" " in order to complete the `.epub` creation!") response = response.json() if not isinstance(response, list) and len(response.keys()) == 1: self.display.exit( self.display.api_error(response) + " Don't delete any files, just run again this program" " in order to complete the `.epub` creation!" ) navmap, _, max_depth = self.parse_toc(response) return self.TOC_NCX.format( (self.book_info["isbn"] if self.book_info["isbn"] else self.book_id), max_depth, self.book_title, ", ".join(aut["name"] for aut in self.book_info["authors"]), navmap ) def create_epub(self): open(os.path.join(self.BOOK_PATH, "mimetype"), "w").write("application/epub+zip") meta_info = os.path.join(self.BOOK_PATH, "META-INF") if os.path.isdir(meta_info): self.display.log("META-INF directory already exists: %s" % meta_info) else: os.makedirs(meta_info) open(os.path.join(meta_info, "container.xml"), "wb").write( self.CONTAINER_XML.encode("utf-8", "xmlcharrefreplace") ) open(os.path.join(self.BOOK_PATH, "OEBPS", "content.opf"), "wb").write( self.create_content_opf().encode("utf-8", "xmlcharrefreplace") ) open(os.path.join(self.BOOK_PATH, "OEBPS", "toc.ncx"), "wb").write( self.create_toc().encode("utf-8", "xmlcharrefreplace") ) zip_file = os.path.join(PATH, "Books", self.book_id) if os.path.isfile(zip_file + ".zip"): os.remove(zip_file + ".zip") shutil.make_archive(zip_file, 'zip', self.BOOK_PATH) os.rename(zip_file + ".zip", os.path.join(self.BOOK_PATH, self.book_id) + ".epub")
    item['coverImage'] = html_element.xpath(
        '//div[@class="detail_left fn-left"]/img/@data-original')[0]

    # Append one JSON object per line; 'a' instead of 'w' so earlier items
    # are not overwritten on each call.
    with open('shijijiyua.json', 'a') as file:
        json_str = json.dumps(item, ensure_ascii=False) + '\n'
        file.write(json_str)


if __name__ == '__main__':
    # Create the task queue
    taskQueue = Queue()
    # Seed the starting tasks
    taskQueue.put(
        'http://date.jiayuan.com/eventslist_new.php?page=1&city_id=4201&shop_id=33'
    )
    taskQueue.put(
        'http://date.jiayuan.com/eventslist_new.php?page=1&city_id=31&shop_id=15'
    )
    taskQueue.put(
        'http://date.jiayuan.com/eventslist_new.php?page=1&city_id=3702&shop_id=42'
    )
    taskQueue.put(
        'http://date.jiayuan.com/eventslist_new.php?page=1&city_id=50&shop_id=5'
    )
    # Create the data queue
    dataQueue = Queue()
    # Create processes to crawl the tasks
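# --- Illustrative sketch (not from the original spider) ---
# The snippet above is cut off right after its queues are created. A minimal
# version of the task-queue/data-queue pattern it sets up: workers consume
# URLs until they see a 'STOP' sentinel and push results to the data queue.
# `crawl` and the URLs are placeholders, not part of the original code.
from multiprocessing import Process, Queue


def crawl(task_queue, data_queue):
    for url in iter(task_queue.get, 'STOP'):  # run until the sentinel arrives
        data_queue.put({'url': url})  # stand-in for real fetching/parsing


if __name__ == '__main__':
    task_queue, data_queue = Queue(), Queue()
    for n in range(4):
        task_queue.put('http://example.com/page/%d' % n)  # placeholder tasks

    workers = [Process(target=crawl, args=(task_queue, data_queue)) for _ in range(2)]
    for w in workers:
        w.start()
    for _ in workers:
        task_queue.put('STOP')  # one sentinel per worker
    for w in workers:
        w.join()

    while not data_queue.empty():
        print(data_queue.get())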
class Viewer(object):
    def __init__(self):
        self.image_queue = Queue()
        self.pose_queue = Queue()

        self.view_thread = Process(target=self.view)
        self.view_thread.start()

    def update_pose(self, pose):
        if pose is None:
            return
        self.pose_queue.put(pose.matrix())

    def update_image(self, image):
        if image is None:
            return
        elif image.ndim == 2:
            image = np.repeat(image[..., np.newaxis], 3, axis=2)
        self.image_queue.put(image)

    def view(self):
        pangolin.CreateWindowAndBind('Viewer', 1024, 768)
        gl.glEnable(gl.GL_DEPTH_TEST)
        gl.glEnable(gl.GL_BLEND)
        gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE_MINUS_SRC_ALPHA)

        viewpoint_x = 0
        viewpoint_y = -7
        viewpoint_z = -18
        viewpoint_f = 1000

        proj = pangolin.ProjectionMatrix(
            1024, 768, viewpoint_f, viewpoint_f, 512, 389, 0.1, 300)
        look_view = pangolin.ModelViewLookAt(
            viewpoint_x, viewpoint_y, viewpoint_z, 0, 0, 0, 0, -1, 0)

        # Camera Render Object (for view / scene browsing)
        scam = pangolin.OpenGlRenderState(proj, look_view)

        # Add named OpenGL viewport to window and provide 3D Handler
        dcam = pangolin.CreateDisplay()
        dcam.SetBounds(0.0, 1.0, 175 / 1024., 1.0, -1024 / 768.)
        dcam.SetHandler(pangolin.Handler3D(scam))

        # image
        width, height = 376, 240
        dimg = pangolin.Display('image')
        dimg.SetBounds(0, height / 768., 0.0, width / 1024., 1024 / 768.)
        dimg.SetLock(pangolin.Lock.LockLeft, pangolin.Lock.LockTop)

        texture = pangolin.GlTexture(width, height, gl.GL_RGB, False, 0, gl.GL_RGB, gl.GL_UNSIGNED_BYTE)
        image = np.ones((height, width, 3), 'uint8')

        # axis
        axis = pangolin.Renderable()
        axis.Add(pangolin.Axis())

        trajectory = DynamicArray()
        camera = None
        image = None

        while not pangolin.ShouldQuit():
            if not self.pose_queue.empty():
                while not self.pose_queue.empty():
                    pose = self.pose_queue.get()
                trajectory.append(pose[:3, 3])
                camera = pose.T

            if not self.image_queue.empty():
                while not self.image_queue.empty():
                    img = self.image_queue.get()
                img = img[::-1, :, ::-1]
                img = cv2.resize(img, (width, height))
                image = img.copy()

            gl.glClear(gl.GL_COLOR_BUFFER_BIT | gl.GL_DEPTH_BUFFER_BIT)
            gl.glClearColor(1.0, 1.0, 1.0, 1.0)
            dcam.Activate(scam)

            # draw axis
            axis.Render()

            # draw current camera
            if camera is not None:
                gl.glLineWidth(1)
                gl.glColor3f(0.0, 0.0, 1.0)
                pangolin.DrawCameras(np.array([camera]), 0.5)

            # show trajectory
            if len(trajectory) > 0:
                gl.glPointSize(2)
                gl.glColor3f(0.0, 0.0, 0.0)
                pangolin.DrawPoints(trajectory.array())

            # show image
            if image is not None:
                texture.Upload(image, gl.GL_RGB, gl.GL_UNSIGNED_BYTE)
                dimg.Activate()
                gl.glColor3f(1.0, 1.0, 1.0)
                texture.RenderToViewport()

            pangolin.FinishFrame()
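# --- Illustrative sketch (not from the original project) ---
# How the Viewer above is meant to be driven: the constructor spawns the
# render Process, and the main process feeds it poses and images through the
# two queues. `IdentityPose` is a placeholder for whatever pose type exposes
# a 4x4 .matrix() (e.g. a g2o isometry); real use would pass SLAM estimates.
import numpy as np


class IdentityPose(object):
    def matrix(self):
        return np.eye(4)  # constant pose, just to exercise the queue


if __name__ == '__main__':
    viewer = Viewer()
    for _ in range(100):
        viewer.update_pose(IdentityPose())
        # grayscale frame; update_image repeats it to 3 channels before queueing
        viewer.update_image(np.zeros((240, 376), dtype=np.uint8))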