def parse(job, config):
    """Run the site-specific parser for one job and report the outcome.

    The host portion of ``job['url']`` (the text between the first ``//``
    and the next ``/``) selects a callable from ``PARSERS``. That callable is
    invoked with a newly created Chrome driver and the URL.

    Args:
        job: Mapping that is indexed with ``'id'``, ``'source_id'`` and
            ``'url'``.
        config: Object providing the ``JS_FAILED`` and ``JS_FINISHED``
            status values.

    Returns:
        A dict with the keys ``'id'``, ``'source_id'``, ``'status'``,
        ``'content'`` and ``'message'``. On success ``'status'`` is
        ``config.JS_FINISHED`` and ``'content'`` is the parser's return
        value. If an ``Exception`` is raised while resolving or running the
        parser, ``'status'`` remains ``config.JS_FAILED`` and ``'message'``
        holds the error text followed by the formatted traceback.

    Note:
        ``job['id']``, ``job['source_id']`` and ``config.JS_FAILED`` are read
        before the ``try`` block, so errors from those lookups propagate to
        the caller.
    """
    browser = None
    outcome = {
        'id': job['id'],
        'source_id': job['source_id'],
        'status': config.JS_FAILED,
        'content': '',
        'message': '',
    }
    try:
        target = job['url']
        # Text after the first '//' up to the following '/'.
        host = target.split('//', 1)[1].split('/', 1)[0]
        if host not in PARSERS:
            raise Exception('No parser for %s.' % target)
        # The driver is only started once a matching parser is known.
        browser = util.create_chrome_driver()
        outcome['content'] = PARSERS[host](browser, target)
        outcome['status'] = config.JS_FINISHED
    except Exception as err:
        outcome['message'] = '%s\n%s' % (err, traceback.format_exc())
    finally:
        # Shut the driver down whether or not parsing succeeded.
        if browser:
            browser.quit()
    return outcome
def parse(jobs):
    """Parse a batch of jobs using one shared Chrome driver.

    For each job, the host portion of ``job['url']`` (the text between the
    first ``//`` and the next ``/``) selects a callable from ``parsers``,
    which is invoked with the shared driver and the URL.

    Args:
        jobs: Iterable of mappings indexed with ``'id'``, ``'source_id'``
            and ``'url'``.

    Returns:
        A list with one dict per job, in input order. Every dict has
        ``'id'``, ``'source_id'``, ``'message'`` and ``'status'``. On success
        ``'status'`` is ``js_finished`` and ``'content'`` holds the parser's
        return value. On failure ``'status'`` is ``js_failed``, ``'message'``
        holds the error text followed by the formatted traceback, and no
        ``'content'`` key is set.

    Note:
        ``job['id']`` and ``job['source_id']`` are read outside the per-job
        ``try``, so errors from those lookups propagate to the caller. The
        driver is still shut down in that case.
    """
    result = []
    driver = util.create_chrome_driver()
    try:
        for job in jobs:
            entry = {
                'id': job['id'],
                'source_id': job['source_id'],
                'message': '',
            }
            result.append(entry)
            try:
                url = job['url']
                prefix = url.split('//')[1].split('/')[0]
                if prefix not in parsers:
                    raise Exception('Parser not found for %s' % url)
                content = parsers[prefix](driver, url)
                entry['status'] = js_finished
                entry['content'] = content
            except Exception as e:
                # A failing job is recorded and the batch continues.
                entry['status'] = js_failed
                entry['message'] = '%s\n%s' % (e, traceback.format_exc())
    finally:
        # Always release the browser, including when an error escapes the
        # per-job handler (e.g. a job without 'id', or KeyboardInterrupt).
        driver.quit()
    return result
# coding: utf-8
import sys

# '../' is resolved against the current working directory; it is added so
# that the ``util`` import below can be found there.
sys.path.append('../')
import util

# Presumably the URL hosts this parser is registered for — confirm against
# the dispatcher that consumes this list.
prefixes = ['www.miumiu.com']


def parse(driver, url):
    """Collect the product links found on a listing page.

    Loads ``url`` in ``driver`` and reads the ``href`` attribute of every
    element matching ``div.product > div > a``.

    Args:
        driver: Browser driver object; must provide ``get(url)`` and be
            accepted by ``util.find_elements_by_css_selector``.
        url: Address of the page to load.

    Returns:
        The stripped ``href`` values joined with ``';'`` in the order the
        elements were returned; an empty string when nothing matches.
    """
    driver.get(url)
    elements = util.find_elements_by_css_selector(
        driver, 'div.product > div > a')
    return ';'.join(
        element.get_attribute('href').strip() for element in elements)


if __name__ == '__main__':
    # Read the argument before starting Chrome so a missing URL fails fast
    # without leaving a browser process behind.
    target_url = sys.argv[1]
    driver = util.create_chrome_driver()
    try:
        print(parse(driver, target_url))
    finally:
        # Shut the browser down even if parsing raised.
        driver.quit()