# NOTE(review): this top-level `__init__` is a token-for-token duplicate of
# Manager.__init__ below and references `self` outside any class -- it looks
# like a stray extraction/copy-paste artifact. Confirm and remove.
def __init__(self, occ, city, state, numthreads=8, js=True):
    # Yellow-Pages scraper seeded with the search terms.
    self.scraper = YPScraper(occ, city, state)
    # Populated later by the YP scrape.
    self.companies = []
    # Whether to render JavaScript when scraping -- TODO confirm consumer.
    self.js = js
    # Work queue of (index, url) pairs; UniqueQueue presumably de-duplicates.
    self.to_process_queue = UniqueQueue()
    # Worker threads push (idx, url, page) results here.
    self.results_queue = Queue()
    # Fixed-size worker pool sharing the two queues.
    self.threads = [DomainScrapeThread(self.to_process_queue, self.results_queue)
                    for _ in range(numthreads)]
class Manager(object):
    """Manage the YP scrape and the pool of DomainScrapeThread workers.

    This is the only interface a user should need to use directly.  A
    previously saved run can be restored with ``Manager.load_state``.

    :param occ: occupation / search term passed to YPScraper
    :param city: city passed to YPScraper
    :param state: state passed to YPScraper
    :param numthreads: size of the DomainScrapeThread worker pool
    :param js: whether pages should be scraped with JavaScript rendering
               (consumed by the scraping layer -- TODO confirm)
    """

    def __init__(self, occ, city, state, numthreads=8, js=True):
        # Keep the search parameters: __repr__ formats them, and the
        # original code never assigned them, so repr() always raised
        # AttributeError.  Storing them is backward-compatible.
        self.occ = occ
        self.city = city
        self.state = state
        self.scraper = YPScraper(occ, city, state)
        self.companies = []
        self.js = js
        # (index, url) work items; UniqueQueue presumably de-duplicates urls.
        self.to_process_queue = UniqueQueue()
        # Workers push (idx, url, page) tuples here.
        self.results_queue = Queue()
        self.threads = [DomainScrapeThread(self.to_process_queue, self.results_queue)
                        for _ in range(numthreads)]

    def __repr__(self):
        return "<Manager Object [%s, %s, %s]>" % (self.occ, self.city, self.state)

    def __str__(self):
        # Join is O(n); the original built the string with quadratic `+=`.
        return ''.join(str(c) for c in self.companies)

    def __len__(self):
        return len(self.companies)

    def scrape_companies(self):
        """Run the full scrape, then tear every worker thread down.

        Proxy call around ``_scrape_comp_call`` that guarantees all
        threads get properly torn down to avoid extra running processes.
        """
        self._scrape_comp_call()
        for i, t in enumerate(self.threads):
            # Drop our reference before teardown so the pool list never
            # points at a dead thread.
            self.threads[i] = None
            t.teardown()

    def _scrape_comp_call(self):
        """Main 'manager' routine.

        Seeds the work queue from the YP scrape, starts every worker,
        waits for them all to finish, then folds the collected
        (idx, url, page) results back onto the matching companies.
        """
        self._init_with_yp()
        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()
        # Safe to read .queue directly: all workers have joined, so no
        # concurrent access remains.
        for idx, url, page in self.results_queue.queue:
            self.companies[idx].add_page(url, page)

    def _init_with_yp(self):
        """Run the YP search and enqueue (index, url) for each company
        that has a base url."""
        self.companies = self.scraper.scrape()
        for i, c in enumerate(self.companies):
            if c.url:
                self.to_process_queue.put((i, c.url))

    def view_all_pages(self):
        """Debugging method: interactively page through every scraped
        company and its pages (blocks on input() between items)."""
        for c in self.companies:
            if c.url:
                pprint(c.name)
                input()
                for p in c.pages:
                    pprint(p)
                    input()

    def save_state(self, dump_file='application/tmp/state.txt'):
        """Dump the entire manager instance to a shelve file.

        :param dump_file: the file to save this instance to
        """
        logging.info("dumping manager (self) to "+dump_file)
        d = shelve.open(dump_file)
        try:
            d['manager'] = self
        finally:
            d.close()

    @staticmethod
    def load_state(load_file='application/tmp/state.txt'):
        """Return a manager object from a previously saved execution.

        :param load_file: the file to load from
        """
        logging.info("retrieving manager from "+load_file)
        d = shelve.open(load_file)
        try:
            return d['manager']
        finally:
            # The original leaked the shelve handle; always close it.
            d.close()

    def define_expected(self):
        """Open the base url for each company and accept user-defined
        outcome; used to build a training set for a scikit-learn
        pipeline.  Not yet implemented."""
        pass

    def fit_pipeline(self):
        # `pipeline` is a module-level name defined elsewhere in the
        # file -- presumably a scikit-learn Pipeline; confirm.
        pipeline.fit(self.companies)