def main():
    """Scrape chain/store data, then parse stores, prices and promos.

    Command-line flags:
        --processes/-p      number of worker processes (default 1)
        --no-download/-nd   skip the initial data download
        --parse-chains/-c   re-scrape the chain list from the government site
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--processes', '-p',
        help='run data scraping and parsing in X parallel processes',
        default=1, type=int)
    arg_parser.add_argument(
        '--no-download', '-nd',
        help="don't download data at start (assumes data already downloaded)",
        default=False, action='store_true')
    arg_parser.add_argument(
        '--parse-chains', '-c',
        help="parse chains login data from the government webpage",
        default=False, action='store_true')
    args = arg_parser.parse_args()

    start = time.time()
    db = SessionController()

    # Fix: run the pool as a context manager so its worker processes are
    # reliably shut down; the original never closed/joined the pool.
    with Pool(processes=args.processes) as p:
        # 1) get all chains (and subchains)
        if args.parse_chains:
            gov = web_scraper.GovDataScraper(db)
            gov.parse_chains_to_db()
        chains = list(db.query(Chain))

        # 2) download all data before starting
        if not args.no_download:
            s = time.time()
            print('Downloading all chains data')
            p.map(download_chain_data, chains)
            print('data download: {}'.format(time.time() - s))

        # 3) parse all chain stores
        s = time.time()
        print('parsing all chains stores')
        p.map(parse_chain_stores, chains)
        print('stores parsing: {}'.format(time.time() - s))

        # 4) parse stores daily prices and promos
        for chain in chains:
            s = time.time()
            print('parsing prices for chain {}'.format(chain.name))
            stores = list(db.query(Store).filter(Store.chain_id == chain.id))
            # starmap pairs the chain with each of its stores.
            p.starmap(parse_chain_prices, zip(repeat(chain), stores))
            print('chain parsing ended: {}'.format(time.time() - s))

    print('total time: {}'.format(time.time() - start))
def main():
    # Debug entry point: parse promos for the stores of one hard-coded chain.
    db = SessionController(db_logging=False)
    for chain in db.query(Chain):
        # Skip every chain except the hard-coded target.
        if chain.name != 'סופר דוש':
            continue
        parser = ChainXmlParser(chain, db)
        for store in db.query(Store).filter(Store.chain_id == chain.id):
            parser.parse_store_promos(store)
        # NOTE(review): original formatting was lost — the break is placed so
        # the outer loop stops after the target chain has been processed;
        # confirm it was not intended inside the store loop (first store only).
        break
def main():
    """Smoke-test the scraper factory for every chain in the DB.

    Prints each chain's name followed by its stores/prices/promos XML
    locations (page 1 for the paged feeds).
    """
    # Fix: the original wrapped this body in
    # `try: ... except BaseException as e: raise e`, which handles nothing
    # and only obscures the traceback — removed as a no-op.
    db = SessionController()
    for chain in db.query(Chain):
        scraper = db_chain_factory(chain)
        print(chain.name)
        print(scraper.get_stores_xml())
        print(scraper.get_prices_xml(1))
        print(scraper.get_promos_xml(1))
def __init__(self, city, db=None, logger=None):
    """Collect the items sold in every store of *city*.

    Args:
        city: city name used to look up the stores.
        db: optional SessionController; a new one is created when omitted.
        logger: optional logger; defaults to this module's logger.
    """
    logging.basicConfig(level=logging.INFO)
    logger = logger or logging.getLogger(__name__)
    self.db = db or SessionController()
    # Fix: pass the resolved session (self.db), not the raw `db` argument —
    # `db` is None whenever the caller relies on the default.
    self.parser = xml_parser.ChainXmlParser(self.db)
    self.city = city
    logger.info('getting city stores')
    self.stores = self.get_city_stores()
    logger.info(self.stores)
    self.basket = Basket()
    # Maps item -> list of stores that stock it.
    self.stores_items = {}
    for store in self.stores:
        logger.info('getting store {} items'.format(store))
        items = self.get_store_items(store)
        if not items:
            # Store prices were never parsed; parse them and retry once.
            self.parser.parse_store_prices(store.chain, store)
            items = self.get_store_items(store)
        for item in items:
            # setdefault replaces the original try/except-KeyError append.
            self.stores_items.setdefault(item, []).append(store)
def __init__(self, db_chain, db=None):
    """Attach this parser to *db_chain*; creates a session when *db* is not given."""
    self.chain = db_chain
    self.page_size = 100000  # rows fetched per query page
    self.db = db if db else SessionController()
def __init__(self, db=None):
    """Create the scraper, defaulting to a fresh DB session when *db* is not given."""
    self.db = db if db else SessionController()
    # Government page listing the chains' price-transparency data.
    self.chain_table_url = "http://www.economy.gov.il/Trade/ConsumerProtection/Pages/PriceTransparencyRegulations.aspx"