# Shared imports these helpers rely on. ProcessQueue and MultiProcessPool are
# assumed to be aliases for the multiprocessing Queue and Pool, and call() is
# the module's own API helper, defined elsewhere in this file.
import time
import logging
import shelve

from multiprocessing import Pool as MultiProcessPool
from multiprocessing import Queue as ProcessQueue

try:
    from queue import Empty    # Python 3
except ImportError:
    from Queue import Empty    # Python 2


def search_parallel(username, password, client_matter, q, num_workers=15):
    '''Run a search and download every page of results in parallel by
    launching many processes.

        q:              The search query to run.
        num_workers:    How many parallel processes to start.
    '''
    login_token = call(call="login", method="POST",
                       username=username, password=password)['login_token']
    first_page = call(call="search", method="GET", q=q,
                      login_token=login_token, client_matter=client_matter)
    num_first_page = len(first_page['search_results'])
    num_results = first_page['count']

    # The main thread removes items from searchqueue and puts them into this list.
    results = [None] * num_results
    results[:num_first_page] = first_page['search_results']
    logging.info("Downloading %d results, already got first %d" %
                 (num_results, num_first_page))

    # Put all of the remaining search ranges into the download queue.
    dlqueue = ProcessQueue()
    NUM_AT_ONCE = 20
    for i in range(num_first_page, num_results, NUM_AT_ONCE):
        limit = min(num_results, i + NUM_AT_ONCE) - i
        logging.info("Added: %d --> %d" % (i, i + limit))
        dlqueue.put((i, limit))

    # The worker processes will put their results into the searchqueue.
    searchqueue = ProcessQueue()

    # Start up the parallel processes.
    pool = MultiProcessPool(
        processes=num_workers,
        initializer=_search_worker,
        initargs=[username, password, client_matter, q, dlqueue, searchqueue])

    try:
        # Continue until every result slot has been filled.
        while True:
            # Poll frequently: with many workers, pages arrive continuously.
            time.sleep(2.0 / num_workers)
            got = 0
            try:
                # get_nowait raises Empty once the queue is drained.
                while True:
                    item = searchqueue.get_nowait()
                    start, end = item['offset'], item['offset'] + item['limit']
                    results[start:end] = item['result']['search_results']
                    logging.info("Downloaded: %d --> %d (of %d total)" %
                                 (start, end, num_results))
                    got += 1
            except Empty:
                left = sum(1 for r in results if r is None)
                if left <= 0:
                    break
                logging.info("Got %d this pass, %d results total. Waiting for %d more." %
                             (got, len(results), left))
                continue
            except Exception as e:
                logging.info("Main thread loop exception: %s" % e)
                break
    except KeyboardInterrupt as e:
        logging.info("Main thread exception: %s" % e)

    # Shut everything down whether we finished cleanly or were interrupted.
    dlqueue.close()
    searchqueue.close()
    pool.close()
    pool.terminate()

    # Return what we have even if there was an exception.
    for i, r in enumerate(results):
        if not r:
            logging.warning("Missing result %d" % (i + 1))
    return {
        'search_results': results,
        'count': num_results,
    }
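
# The pool initializer `_search_worker` referenced above is defined elsewhere
# in the module; in this design the initializer does all of the work, draining
# dlqueue and pushing pages onto searchqueue before it returns. A minimal,
# hypothetical sketch follows. The argument order mirrors the initargs list,
# and the item format matches what the main loop reads ('offset', 'limit',
# 'result'); the offset/limit keywords passed to call() are an assumption, not
# a confirmed API.
def _search_worker(username, password, client_matter, q, dlqueue, searchqueue):
    '''Hypothetical worker sketch: log in once, then pull (offset, limit)
    ranges from dlqueue and push each page of search results onto searchqueue.'''
    login_token = call(call="login", method="POST",
                       username=username, password=password)['login_token']
    while True:
        try:
            offset, limit = dlqueue.get_nowait()
        except Empty:
            # No more ranges to fetch: this worker is done.
            break
        try:
            result = call(call="search", method="GET", q=q,
                          offset=offset, limit=limit,
                          login_token=login_token, client_matter=client_matter)
            searchqueue.put({'offset': offset, 'limit': limit, 'result': result})
        except Exception as e:
            logging.error("Could not fetch results %s --> %s: %s",
                          offset, offset + limit, e)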
def getdocket_parallel(username, password, client_matter, docket_list,
                       cached=False, num_workers=15, save_progress=None,
                       _async=False):
    '''Download a list of dockets in parallel by launching many processes.

        docket_list:    A list of (court, docket) tuples.
        num_workers:    How many parallel processes to start.
        cached:         Get cached dockets instead of fresh ones from the court.
        save_progress:  Path to a temporary file used to save work in case we crash.
        _async:         If True, return a generator function that yields dockets
                        as they finish downloading, instead of a list.
    '''
    if save_progress is not None:
        if _async:
            raise NotImplementedError("Cannot save progress and async.")
        save_progress = shelve.open(save_progress, 'c')

    def get_key(court, docket):
        return ("(%s),(%s)" % (court, docket)).encode('ascii', 'ignore')

    dockets = []

    def deb(msg, *args, **kwargs):
        msg = "getdocket_parallel %s-%s: %s" % (username, client_matter, msg)
        logging.info(msg, *args, **kwargs)

    # Put all of the (court, docket) tuples into a processing queue.
    dlqueue = ProcessQueue()
    for c_vals in docket_list:
        c_vals = list(c_vals)
        if len(c_vals) < 2:
            raise Exception("Expecting a list of at least two with court, "
                            "docket, instead got: %s" % c_vals)
        court, docket = c_vals[:2]
        k = get_key(court, docket)
        if save_progress is not None and save_progress.get(k) and \
                save_progress[k]['result']['success']:
            # Already downloaded successfully on a previous run: add to results.
            dockets.append(save_progress[k])
        else:
            # Add it to the download queue.
            dlqueue.put((court, docket))

    # The worker processes will put their results into the docketqueue.
    # The main thread removes them from docketqueue and puts them into a list.
    docketqueue = ProcessQueue()

    # Start up the parallel processes.
    pool = MultiProcessPool(
        processes=num_workers,
        initializer=_dl_worker,
        initargs=[username, password, client_matter, cached, dlqueue, docketqueue])

    def iterator(sleep_time=1.0):
        '''A generator that yields every requested docket as it is downloaded.'''
        # Continue until we have seen as many dockets as were requested.
        got, iters, total = 0, 0, len(docket_list)
        while True:
            iters += 1
            try:
                time.sleep(sleep_time)
                # get_nowait raises Empty once the queue is drained, which
                # breaks out of this inner loop.
                while True:
                    yield docketqueue.get_nowait()
                    got += 1
            except Empty:
                left = total - got
                if left <= 0:
                    deb("Finished iterating %s" % total)
                    break
                if iters % 5 == 0:
                    deb("Did %d/%d, %d left.", got, total, left)
                continue
            except KeyboardInterrupt as e:
                deb("Main thread interrupt: %s" % e)
                break
            except Exception as e:
                deb("Main thread loop exception: %s" % e)
                break
        dlqueue.close()
        docketqueue.close()
        pool.close()
        pool.terminate()

    if _async:
        return iterator

    for new_i, new_docket in enumerate(iterator()):
        dockets.append(new_docket)
        # Only save successful downloads.
        if save_progress is not None and new_docket['result']['success']:
            k = get_key(new_docket['court'], new_docket['docket'])
            save_progress[k] = new_docket
        # Periodically flush the shelve so a crash loses at most ~20 dockets.
        if save_progress is not None and new_i % 20 == 0:
            deb("sync dbase len=%d, added=%d", len(save_progress), new_i + 1)
            save_progress.sync()

    # Return what we have even if there was an exception.
    if save_progress is not None:
        save_progress.sync()
        save_progress.close()
    return dockets
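
# Likewise, `_dl_worker` is defined elsewhere in the module. The sketch below
# is an assumption-laden illustration: the court/docket/cached keywords passed
# to call() are guesses at the getdocket API, but the dict placed on
# docketqueue matches exactly what the main thread reads back ('court',
# 'docket', and a 'result' carrying a 'success' flag). Pushing a failure record
# instead of dropping the docket keeps the main loop's countdown accurate.
def _dl_worker(username, password, client_matter, cached, dlqueue, docketqueue):
    '''Hypothetical worker sketch: log in once, then pull (court, docket) pairs
    from dlqueue and push each downloaded docket onto docketqueue.'''
    login_token = call(call="login", method="POST",
                       username=username, password=password)['login_token']
    while True:
        try:
            court, docket = dlqueue.get_nowait()
        except Empty:
            # Nothing left to download: this worker is done.
            break
        try:
            result = call(call="getdocket", method="GET", court=court,
                          docket=docket, cached=cached,
                          login_token=login_token, client_matter=client_matter)
        except Exception as e:
            result = {'success': False, 'error': str(e)}
        docketqueue.put({'court': court, 'docket': docket, 'result': result})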
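
# A short usage sketch. The credentials, matter name, courts, docket numbers,
# query string, and shelve filename below are placeholders, not real values;
# the keys read out of each result ('result', 'success', 'court', 'docket',
# 'search_results', 'count') are the ones the functions above return.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Download a handful of dockets, resuming from a shelve file if we crashed.
    dockets = getdocket_parallel(
        username="user@example.com", password="not-a-real-password",
        client_matter="demo-matter",
        docket_list=[("cand", "3:16-cv-01234"), ("nysd", "1:15-cv-05678")],
        cached=True, num_workers=4,
        save_progress="getdocket_progress.db")
    for d in dockets:
        print("%s / %s -> success=%s" % (
            d.get('court'), d.get('docket'), d['result'].get('success')))

    # Run a search and report how many of the expected results came back.
    found = search_parallel(
        username="user@example.com", password="not-a-real-password",
        client_matter="demo-matter", q="example search terms", num_workers=4)
    print("Got %d of %d search results" % (
        sum(1 for r in found['search_results'] if r), found['count']))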