Example #1
def search_parallel(username, password, client_matter, q, num_workers=15):
    '''
	Run a search and download all pages of results in parallel by
	launching many processes.

	q:					The search query to run
	num_workers:		How many parallel processes to start
	'''
    login_token = call(call="login",
                       method="POST",
                       username=username,
                       password=password)['login_token']
    first_page = call(call="search",
                      method="GET",
                      q=q,
                      login_token=login_token,
                      client_matter=client_matter)

    num_first_page = len(first_page['search_results'])

    num_results = first_page['count']
    # The main thread removes them from searchqueue and puts them into a list.
    results = [None] * num_results
    results[:num_first_page] = first_page['search_results']
    logging.info("Downloading %s Results, already got first %d" %
                 (num_results, num_first_page))

    # Put all of the search ranges into the download queue
    dlqueue = ProcessQueue()
    NUM_AT_ONCE = 20
    for i in range(num_first_page, num_results, NUM_AT_ONCE):
        limit = min(num_results, i + NUM_AT_ONCE) - i
        logging.info("Added: %s --> %s" % (i, i + limit))
        dlqueue.put((i, limit))

    # The processes will put their results into the searchqueue
    searchqueue = ProcessQueue()
    # Start up the parallel processes
    pool = MultiProcessPool(
        processes=num_workers,
        initializer=_search_worker,
        initargs=[username, password, client_matter, q, dlqueue, searchqueue])
    try:
        # Continue until the processing queue is empty.
        while True:
            # Poll frequently; the sleep is spread across the workers.
            time.sleep(2.0 / num_workers)
            got = 0
            try:
                item = searchqueue.get_nowait()
                start, end = item['offset'], item['offset'] + item['limit']
                results[start:end] = item['result']['search_results']
                logging.info("Downloaded: %s --> %s (of %d total)" %
                             (start, end, num_results))
                got += 1
            except Empty:
                left = results.count(None)
                if left <= 0:
                    break
                logging.info("Got %d of %d results. Waiting for %d more." %
                             (num_results - left, num_results, left))
                continue
            except Exception as e:
                logging.info("Main thread loop exception: %s" % e)
                break

    except KeyboardInterrupt as e:
        logging.info("Main thread exception: %s" % e)
        dlqueue.close()
        searchqueue.close()
        pool.close()
        pool.terminate()
        # Return what we have even if there was an exception.
        return results

    for i, r in enumerate(results):
        if not r:
            print("Missing Result %s" % (i + 1))

    return {
        'search_results': results,
        'count': num_results,
    }
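
The pool initializer _search_worker is not shown in this example. Based on how the main thread consumes items with 'offset', 'limit', and 'result' keys from searchqueue, a minimal sketch of such a worker might look like the following; the per-worker login and the offset/limit arguments to the search call are assumptions for illustration, not part of the original code.

def _search_worker(username, password, client_matter, q, dlqueue, searchqueue):
    # Hypothetical worker: log in once per process, then drain (offset, limit)
    # ranges from dlqueue and push result dicts onto searchqueue in the shape
    # search_parallel expects when it slices into `results`.
    login_token = call(call="login", method="POST",
                       username=username, password=password)['login_token']
    while True:
        try:
            offset, limit = dlqueue.get_nowait()
        except Empty:
            break
        result = call(call="search", method="GET", q=q,
                      offset=offset, limit=limit,
                      login_token=login_token,
                      client_matter=client_matter)
        searchqueue.put({'offset': offset, 'limit': limit, 'result': result})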
Example #2
def getdocket_parallel(username,
                       password,
                       client_matter,
                       docket_list,
                       cached=False,
                       num_workers=15,
                       save_progress=None,
                       _async=False):
    '''
	Download a list of dockets in parallel by launching many processes.

	docket_list:		A list of (court, docket) tuples
	num_workers:		How many parallel processes to start
	cached:				Get cached dockets instead of fresh ones from the court
	save_progress:		Use a temporary file to save work in case we crash.
	_async:				If True, return an iterator that yields dockets asynchronously.
	'''
    if save_progress != None:
        if _async == True:
            raise NotImplementedError("Cannot save progress and async.")
        save_progress = shelve.open(save_progress, 'c')

    def get_key(court, docket):
        return ("(%s),(%s)" % (court, docket)).encode('ascii', 'ignore')

    dockets = []

    def deb(msg, *args, **kwargs):
        msg = "getdocket_parallel %s-%s: %s" % (username, client_matter, msg)
        logging.info(msg, *args, **kwargs)

    # Put all of the tuples into a processing queue
    dlqueue = ProcessQueue()
    for c_vals in docket_list:
        c_vals = list(c_vals)
        if len(c_vals) < 2:
            raise Exception(
                "Expecting a list of at least two with court, "
                "docket, instead got: %s" % (c_vals,))
        court, docket = c_vals[:2]
        k = get_key(court, docket)
        if save_progress != None and save_progress.get(k) and \
          save_progress[k]['result']['success']:
            # Add to the results
            dockets.append(save_progress[k])
        else:
            # Add it to the download queue
            dlqueue.put((court, docket))

    # The processes will put their results into the docketqueue
    docketqueue = ProcessQueue()
    # The main thread removes them from docketqueue and puts them into a list.

    # Start up the parallel processes
    pool = MultiProcessPool(processes=num_workers,
                            initializer=_dl_worker,
                            initargs=[
                                username, password, client_matter, cached,
                                dlqueue, docketqueue
                            ])

    def iterator(sleep_time=1.0):
        '''An iterator that goes through all of the given dockets.'''
        # Continue until the processing queue is empty
        got, iters, total = 0, 0, len(docket_list)
        while True:
            # Dockets take roughly 15 seconds to download, so poll periodically.
            iters += 1
            try:
                time.sleep(sleep_time)
                # get_nowait will raise Empty and break out of the inner loop
                while True:
                    yield docketqueue.get_nowait()
                    got += 1
            except Empty:
                left = total - got
                if left <= 0:
                    deb("Finished iterating %s" % total)
                    break
                if iters % 5 == 0:
                    deb("Did %d/%d, %d left.", got, total, left)
                continue
            except KeyboardInterrupt as e:
                deb("Main thread interrupt: %s" % e)
                break
            except Exception as e:
                deb("Main thread loop exception: %s" % e)
                break

        dlqueue.close()
        docketqueue.close()
        pool.close()
        pool.terminate()

    if _async:
        return iterator

    for new_i, new_docket in enumerate(iterator()):
        dockets.append(new_docket)
        # Only save if successful
        if save_progress != None and new_docket['result']['success']:
            # Save our progress
            k = get_key(new_docket['court'], new_docket['docket'])
            save_progress[k] = new_docket
        elif save_progress != None and new_i % 20 == 0:
            deb("sync dbase len=%d, added=%d ", len(save_progress), 'got')
            save_progress.sync()

    if save_progress != None:
        save_progress.sync()
        save_progress.close()
    return dockets
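
A brief usage sketch for this version; the credentials, client matter, and court/docket values below are placeholders, not taken from the original example.

# Blocking form: returns a list of docket dicts, resuming from a shelve file.
dockets = getdocket_parallel("user@example.com", "secret", "matter-001",
                             [("nysd", "1:20-cv-01234")],
                             cached=True, save_progress="progress.db")

# _async form: returns an iterator factory; each yielded item is a dict with
# 'court', 'docket', and 'result' keys, as produced by the download workers.
get_results = getdocket_parallel("user@example.com", "secret", "matter-001",
                                 [("nysd", "1:20-cv-01234")], _async=True)
for item in get_results():
    print(item['court'], item['docket'], item['result']['success'])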
Example #3
def getdocket_parallel(username,
                       password,
                       client_matter,
                       docket_list,
                       cached=False,
                       num_workers=15,
                       save_progress=None):
    '''
	Download a list of dockets in parallel by launching many processes.
	
	docket_list:		A list of (court, docket) tuples
	num_workers:		How many parallel processes to start
	cached:				Get cached dockets instead of fresh ones from the court
	save_progress:		Use a temporary file to save work in case we crash.
	'''
    if save_progress != None:
        save_progress = shelve.open(save_progress, 'c')

    def get_key(court, docket):
        return ("(%s),(%s)" % (court, docket)).encode('ascii', 'ignore')

    dockets = []

    # Put all of the tuples into a processing queue
    dlqueue = ProcessQueue()
    for court, docket in docket_list:
        k = get_key(court, docket)
        if save_progress != None and save_progress.get(k) and \
          save_progress[k]['result']['success']:
            # Add to the results
            dockets.append(save_progress[k])
        else:
            # Add it to the download queue
            dlqueue.put((court, docket))

    # The processes will put their results into the docketqueue
    docketqueue = ProcessQueue()
    # The main thread removes them from docketqueue and puts them into a list.

    # Start up the parallel processes
    pool = MultiProcessPool(processes=num_workers,
                            initializer=_dl_worker,
                            initargs=[
                                username, password, client_matter, cached,
                                dlqueue, docketqueue
                            ])

    try:
        # Continue until the processing queue is empty
        got = 0
        while True:
            # Dockets take roughly 15 seconds to download, so poll periodically.
            time.sleep(1.0)
            try:
                # get_nowait will raise Empty and break out of the inner loop
                while True:
                    new_docket = docketqueue.get_nowait()
                    dockets.append(new_docket)
                    # Only save if successful
                    if save_progress != None and new_docket['result'][
                            'success']:
                        # Save our progress
                        k = get_key(new_docket['court'], new_docket['docket'])
                        save_progress[k] = new_docket
                    got += 1
            except Empty:
                if save_progress != None:
                    print("Syncing dbase (len=%d), dockets=%d " %
                          (len(save_progress), len(dockets)))
                    save_progress.sync()
                left = len(docket_list) - len(dockets)
                if left <= 0:
                    break
                logging.info("Got %d, %d total dockets. Waiting again." %
                             (got, len(dockets)))
                continue
            except Exception as e:
                logging.info("Main thread loop exception: %s" % e)
                break

    except KeyboardInterrupt as e:
        logging.info("Main thread exception: %s" % e)
        dlqueue.close()
        docketqueue.close()
        pool.close()
        pool.terminate()
        # Return what we have even if there was an exception.

    if save_progress != None:
        save_progress.sync()
        save_progress.close()
    return dockets
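
As with _search_worker, the _dl_worker initializer is not included in these examples. Below is a hedged sketch, inferred from the fields the main thread reads off docketqueue ('court', 'docket', and a 'result' dict with a 'success' flag); the exact API call used to fetch a docket is an assumption.

def _dl_worker(username, password, client_matter, cached, dlqueue, docketqueue):
    # Hypothetical worker: log in once per process, then pull (court, docket)
    # tuples off dlqueue and push dicts shaped the way getdocket_parallel
    # expects onto docketqueue.
    login_token = call(call="login", method="POST",
                       username=username, password=password)['login_token']
    while True:
        try:
            court, docket = dlqueue.get_nowait()
        except Empty:
            break
        try:
            result = call(call="getdocket", method="GET",
                          court=court, docket=docket, cached=cached,
                          login_token=login_token,
                          client_matter=client_matter)
        except Exception as e:
            result = {'success': False, 'error': str(e)}
        docketqueue.put({'court': court, 'docket': docket, 'result': result})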