def __init__(self):
    """Initialize the spider and route error-level log output to a file."""
    CrawlSpider.__init__(self)
    # Only ERROR and above are recorded, timestamped, in the news-info log.
    configure_logging({
        'LOG_FORMAT': '%(asctime)s %(levelname)s: %(message)s',
        'LOG_FILE': 'logs/news_info_errors.log',
        'LOG_LEVEL': logging.ERROR,
    })
def __init__(self, start_date, end_date):
    """Set up crawling of wickerparkbucktown.com over the given date window."""
    CrawlSpider.__init__(self)
    # The site renders dates like "January 01, 2020", hence %B %d, %Y.
    SpiderBase.__init__(
        self,
        'http://www.wickerparkbucktown.com/',
        start_date,
        end_date,
        date_format='%B %d, %Y',
    )
def __init__(self, **kwargs):
    """Initialize spider state from command-line keyword arguments.

    :param kwargs: command line input
    :return: None
    """
    CrawlSpider.__init__(self)
    # Crawl window supplied by the user.
    self.startDate = kwargs['startDate']
    self.endDate = kwargs['endDate']
    print('startDate: ', self.startDate)
    print('self.endDate: ', self.endDate)
    # Output locations for raw data and error bookkeeping.
    self.article_folder = "../../Output/ArticleRawData/"
    self.comments_folder = "../../Output/CommentsRawData/"
    self.error_articles_file = "../../Output/error_article_ids.txt"
    self.error_comments_file = "../../Output/error_comments_article_ids.txt"
    self.empty_comments_file = "../../Output/empty_comment_article_ids.txt"
    # Headers emulate an AJAX form post from a browser.
    self.headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # CHANGE THE USER AUTHENTICATION HERE
    self.payload = {'username': '******', 'password': '******'}
    # This API key is for public use
    self.apikey = '2_oNjjtSC8Qc250slf83cZSd4sbCzOF4cCiqGIBF8__5dWzOJY_MLAoZvds76cHeQD'
    self.categoryID = 'Production'
    self.ids_seen = set()
def __init__(self, url=None, *args, **kwargs):
    """Target a single site: seed the crawl with *url* and restrict domains."""
    CrawlSpider.__init__(self, *args, **kwargs)
    self.url = url
    self.start_urls = ['%s' % url]
    # Limit crawling to the netloc of the seed URL.
    self.allowed_domains = ['%s' % get_netloc(url)]
    # Unknown URLs map to the sentinel string 'none_value'.
    self.urls_ = defaultdict(lambda: 'none_value')
    self.asyn_capute()
def __init__(self, start_date, end_date):
    """Configure crawling of chicagohistory.org for the given date range."""
    CrawlSpider.__init__(self)
    # Pages show dates as "01 January 2020"; requests use compact YYYYMMDD.
    SpiderBase.__init__(
        self,
        'https://www.chicagohistory.org/',
        start_date,
        end_date,
        date_format='%d %B %Y',
        request_date_format='%Y%m%d',
    )
def __init__(self):
    """Prepare shared containers and hooks, then start the crawl spider."""
    # General-purpose data dict / list.
    self.generalData_dict = {}
    self.generalData_list = []
    self.setup_hooks()     # install hooks
    self.setup_domains()   # register domain names
    # Initialize CrawlSpider last — the original author notes this ordering matters.
    CrawlSpider.__init__(self)
def __init__(self):
    """Configure the gsmarena.com spider: identity, seed pages, counters."""
    self.domain = "www.gsmarena.com"
    self.name = "gsmarena"
    self.custom_settings = {}
    self.allowed_domains = ["www.gsmarena.com"]
    CrawlSpider.__init__(self)
    # Seeds: site root plus the maker index page.
    self.start_urls = [
        "http://www.gsmarena.com/",
        "http://www.gsmarena.com/makers.php3",
    ]
    # Crawl bookkeeping.
    self.count = 0
    self.deny = ""
    self.crawl_limt = 0  # attribute name kept as-is ("limt") for compatibility
    self.real_count = 0
def __init__(self):
    """Start a headless Chrome session and open the MongoDB collections."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Headless Chrome with image loading disabled to speed up pages.
    opts = webdriver.ChromeOptions()
    opts.add_argument('--headless')
    opts.add_argument('--no-sandbox')
    opts.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    self.browser = webdriver.Chrome(chrome_options=opts)
    self.browser.set_page_load_timeout(120)
    # Collections for scraped comments and discovered links.
    self.connection = MongoClient(MONGO_CONNECTION_STRING, 27017)
    self.comments = self.connection.esmio.items
    self.links = self.connection.esmio.links
def __init__(self, **kwargs):
    """Set up request headers and the output path for collected base URLs.

    :param kwargs: command line input
    :return: None
    """
    CrawlSpider.__init__(self)
    # Headers emulate an AJAX request from a regular browser.
    self.headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    self.output_path = "../../Sample_Resources/Online_Resources/sample_base_urls.txt"
    self.ids_seen = set()
def __init__(self):
    """Configure the gsmarena.com spider with seed pages and batch state."""
    self.domain = "www.gsmarena.com"
    self.name = "gsmarena"
    self.custom_settings = {}
    self.allowed_domains = ["www.gsmarena.com"]
    CrawlSpider.__init__(self)
    # Seeds: maker index plus two brand listing pages.
    self.start_urls = [
        "http://www.gsmarena.com/makers.php3",
        "http://www.gsmarena.com/acer-phones-59.php",
        "http://www.gsmarena.com/alcatel-phones-5.php",
    ]
    # Crawl bookkeeping.
    self.count = 0
    self.deny = ""
    self.crawl_limt = 0  # attribute name kept as-is ("limt") for compatibility
    self.real_count = 0
    # Products are flushed in batches of this size.
    self.batch_size = 300
    self.mobile_product = []
def __init__(self, **kwargs):
    """Read user arguments and initialize crawl configuration.

    :param kwargs: command line input (outDir, startYear, endYear)
    """
    CrawlSpider.__init__(self)
    self.outDir = kwargs['outDir']
    self.startYear = kwargs['startYear']
    self.endYear = kwargs['endYear']
    print('startYear: ', self.startYear)
    print('self.endYear: ', self.endYear)
    print('self.outDir: ', self.outDir)
    # Headers emulate an AJAX form post from a browser.
    self.headers = {
        'User-Agent': 'Mozilla/5.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
    }
    self.payload = {'username': '******', 'password': '******'}
    self.apikey = '[API Key for Gigya]'
    self.categoryID = 'Production'
def __init__(self, rule, worksheet, logging):
    """Bind a ranking rule and worksheet to this spider and launch Firefox."""
    CrawlSpider.__init__(self)
    # Any WebDriver-compatible browser would work; Firefox is used here.
    self.browser = webdriver.Firefox()
    self.logging = logging
    self.rule = rule
    self.name = self.rule["ranking_name"]
    self.logging.info("==============================")
    self.logging.info("self.rule[start_urls]: %s" % self.rule["start_urls"])
    self.start_urls = self.rule["start_urls"]
    # next_page and flag are optional lists in the rule; default to ["NONE"].
    self.next_page = self.rule["next_page"] if ("next_page" in self.rule) else ["NONE"]
    self.logging.info("#### self.next_page %s" % self.next_page)
    self.flag = self.rule["flag"] if ("flag" in self.rule) else ["NONE"]
    self.logging.info("#### self.flag %s" % self.flag)
    self.worksheet = worksheet
    self.logging.info("Finish the __init__ method ... ")
def __init__(self, rule, worksheet, logging):
    """Store the ranking rule, worksheet and logger; start a Firefox driver."""
    CrawlSpider.__init__(self)
    # Browser used for rendering; any WebDriver backend is acceptable.
    self.browser = webdriver.Firefox()
    self.logging = logging
    self.rule = rule
    self.name = self.rule["ranking_name"]
    self.logging.info("==============================")
    self.logging.info("self.rule[start_urls]: %s" % self.rule["start_urls"])
    self.start_urls = self.rule["start_urls"]
    # Optional rule entries fall back to the sentinel list ["NONE"].
    self.next_page = self.rule.get("next_page", ["NONE"])
    self.logging.info("#### self.next_page %s" % self.next_page)
    self.flag = self.rule.get("flag", ["NONE"])
    self.logging.info("#### self.flag %s" % self.flag)
    self.worksheet = worksheet
    self.logging.info("Finish the __init__ method ... ")
def __init__(self):
    """Create a fresh SQLite database, archiving any existing db file first.

    The previous database file (if any) is moved aside under a timestamped
    name, then all tables from ``conf.SQLITE['tables']`` are created.
    """
    CrawlSpider.__init__(self)
    dbfile = '%s/%s' % (conf.PROJECT_PATH['data'], conf.SQLITE['file'])
    if os.path.exists(dbfile):
        # Keep the old database around under a timestamped name.
        moveto = '%s.%d' % (dbfile, int(time.time()))
        shutil.move(dbfile, moveto)
        print('old db file %s is moved to %s.' % (dbfile, moveto))
    # Bug fix: the original wrapped the path building and the connect call
    # inside the try as well, so any failure before `conn` was assigned made
    # the `finally` raise UnboundLocalError and mask the real error.  Connect
    # first, then guard only the work that needs the cleanup.
    conn = sqlite3.connect(dbfile)
    try:
        cursor = conn.cursor()
        for table in conf.SQLITE['tables']:
            cursor.execute(table['sql'])
        conn.commit()
        print('db initialization complete!')
    finally:
        conn.close()
def __init__(self, rule, worksheet, logging):
    """Initialize from a ranking rule, log the configuration, open Firefox."""
    CrawlSpider.__init__(self)
    # Rendering browser; any WebDriver backend would do.
    self.browser = webdriver.Firefox()
    self.logging = logging
    self.rule = rule
    self.name = self.rule["ranking_name"]
    self.logging.info("==============================")
    self.logging.info("self.rule[start_urls]: %s" % self.rule["start_urls"])
    self.start_urls = self.rule["start_urls"]
    # Optional rule entries default to the sentinel list ["NONE"].
    self.next_page = self.rule["next_page"] if ("next_page" in self.rule) else ["NONE"]
    self.logging.info("#### self.next_page %s" % self.next_page)
    self.flag = self.rule["flag"] if ("flag" in self.rule) else ["NONE"]
    self.logging.info("#### self.flag %s" % self.flag)
    self.worksheet = worksheet
    self.logging.info("Finish the __init__ method ... ")
    logging.info("***********************************")
    logging.info("***********************************")
def __init__(self):
    """Connect to MongoDB and keep a handle to the configured collection."""
    CrawlSpider.__init__(self)
    client = pymongo.MongoClient(
        settings['MONGODB_HOST'],
        settings['MONGODB_PORT'],
    )
    database = client[settings['MONGODB_DB']]
    self.collection = database[settings['MONGODB_COLLECTION']]
def __init__(self, *args, **kwargs):
    """Initialize the spider and attach the shared proxy pool."""
    CrawlSpider.__init__(self, *args, **kwargs)
    # Proxies are rotated from this module-level list.
    self.proxy_pool = proxy_list
def __init__(self):
    """Create the spider and launch a Firefox browser for page rendering."""
    CrawlSpider.__init__(self)
    # Any WebDriver browser works; Firefox is the choice here.
    self.browser = webdriver.Firefox()
def __init__(self, *a, **kw):
    """Init BaseSpider with storage configuration."""
    CrawlSpider.__init__(self, *a, **kw)
    # The storage backend is resolved from this spider's source name.
    self.source_name = self.get_source_name()
    self.storage = get_storage(self.source_name)
def __init__(self, *arg, **karg):
    """Load the product template for the named spider, then init CrawlSpider."""
    self.name = karg['name']
    self.init_yaml('scrapy_service/templates/product.yaml', self.name)
    # NOTE(review): keyword args are not forwarded to CrawlSpider — presumably
    # 'name' is the only expected kwarg; confirm against callers.
    CrawlSpider.__init__(self, *arg)
def __init__(self, *arg, **karg):
    """Initialize from the fixed 'lazada_sitemap' product template."""
    self.init_yaml('scrapy_service/templates/product.yaml', 'lazada_sitemap')
    # Keyword args are accepted but intentionally not forwarded.
    CrawlSpider.__init__(self, *arg)
def __init__(self):
    """Plain initializer: defer everything to CrawlSpider."""
    CrawlSpider.__init__(self)
def __init__(self):
    """Create the spider with an empty error list and a Firefox driver."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.driver = webdriver.Firefox()
def __init__(self, *args, **kwargs):
    # Configure this mining spider entirely from optional keyword arguments;
    # anything not supplied falls back to class-level defaults.
    CrawlSpider.__init__(self)
    # Job / site identity and iteration bookkeeping.
    if 'mining_job_id' in kwargs:
        self.mining_job_id = kwargs['mining_job_id']
    if 'site_id' in kwargs:
        self.site_id = kwargs['site_id']
    if 'preview' in kwargs:
        # Presence of the flag enables preview mode; its value is ignored.
        self.preview = 1
    if 'iteration' in kwargs:
        self.iteration = kwargs['iteration']
    if 'management_node' in kwargs:
        self.management_node = kwargs['management_node']
    # Credentials and proxy for the target site.
    if 'username' in kwargs:
        self.username = kwargs['username']
    if 'password' in kwargs:
        self.password = kwargs['password']
    if 'proxy' in kwargs:
        self.proxy = kwargs['proxy']
    if 'robots_obey' in kwargs:
        # Override robots.txt behaviour at cmdline priority.
        settings.set('ROBOTSTXT_OBEY', int(kwargs['robots_obey']), priority='cmdline')
    if 'url' in kwargs:
        self.start_urls.append(kwargs['url'] + self.url_fragmentanchor)
    if 'extract' in kwargs:
        self.extract = kwargs['extract']
    if 'maxjobs' in kwargs:
        self.maxjobs = int(kwargs['maxjobs'])
    if 'protocol' in kwargs:
        self.protocol = kwargs['protocol']
    if 'maximum_try' in kwargs:
        self.maximum_try = kwargs['maximum_try']
    if 'on_demand' in kwargs:
        self.on_demand = kwargs['on_demand']
    if 'debug_id' in kwargs:
        self.debug_id = kwargs['debug_id']
    if 'stale_limit_seconds' in kwargs:
        self.stale_limit = int(kwargs['stale_limit_seconds'])
    if 'subspider_detector' in kwargs:
        # Sub-spider detection mode swaps in its own set of required fields.
        self.subspider_detector = True
        self.required_fields = self.subspider_detect_fields
    # Sending max items to be scraped.
    if 'max_items_count' in kwargs:
        self.max_items_count = int(kwargs['max_items_count'])
        # set spider_valid_cutoff, default 80 percent of max_items_count
        spider_valid_cutoff = kwargs.get("valid_cutoff")
        if spider_valid_cutoff:
            self.spider_valid_cutoff = int(spider_valid_cutoff)
        else:
            self.spider_valid_cutoff = int(self.max_items_count * 0.8)
        # this will reduce extra requstes after a close_spider call
        settings.overrides['CONCURRENT_REQUESTS'] = 1
    self.debug = int(kwargs.get('debug', '0'))
    if 'download_delay' in kwargs or hasattr(self, 'download_delay'):
        # A positive delay also turns on autothrottling.
        download_delay = float(kwargs.get('download_delay', getattr(self, 'download_delay', 0)))
        settings.set('DOWNLOAD_DELAY', download_delay, priority='cmdline')
        if download_delay > 0:
            settings.set('AUTOTHROTTLE_ENABLED', True, priority='cmdline')
    if self.allowed_domain_bynetloc:
        # set list of domain allowed to crawl
        # NOTE(review): assumes 'url' was supplied whenever this flag is set.
        self.allowed_domains.append(urlparse.urlparse(kwargs['url']).netloc)
    # Default per-job field extractors; evaluated against each response.
    self.default_job_field_getters.update({
        'url': lambda self, response, item: response.url,
        'date': lambda self, response, item: datetime.now().strftime('%Y/%m/%d'),
        'language': lambda self, response, item: self.language if hasattr(self, 'language') else None
    })
    # Optional auto-extraction features, toggled by class-level flags.
    if self.extract_logo:
        self.default_job_field_getters.update({'autoextracted_logo_urls': self.get_logos})
    if self.extract_email:
        self.default_job_field_getters.update({'autoextracted_emails': self.get_emails})
    if self.extract_salary:
        self.default_job_field_getters.update({'autoextracted_salaries': self.get_salaries})
    if self.extract_website:
        self.default_job_field_getters.update({'autoextracted_company_websites': self.get_websites})
    self.default_fields = self.default_job_field_getters.keys()
    # Wrap parse_job_wrapper with validation of the required fields.
    self.validate_parse_job_wrapper = validate(fields_to_check=self.required_fields)(type(self).parse_job_wrapper)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def __init__(self):
    """Start a Selenium RC session against the local server."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Selenium RC server on localhost:4444 driving Chrome.
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
    self.selenium.start()
def __init__(self):
    """Initialize the spider and open a Selenium RC browser session."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Legacy Selenium RC: local server, Chrome profile, placeholder base URL.
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
    self.selenium.start()
def __init__(self, *a, **kw):
    """Initialize the spider with empty dedupe sets for URLs and item ids."""
    CrawlSpider.__init__(self, *a, **kw)
    self.crawledurl = set()
    self.itemIds = set()
def __init__(self):
    """Initialize both parent classes explicitly, CrawlSpider first."""
    CrawlSpider.__init__(self)
    BaseCrawler.__init__(self)
def __init__(self):
    """Create the spider and a browser driver with a 20s page-load timeout."""
    # self.name is expected to be a class attribute of this spider.
    CrawlSpider.__init__(self, self.name)
    self.driver = create_bs_driver()
    self.driver.set_page_load_timeout(20)
    self.num = ''
def __init__(self, xpath_dict=None, files=None):
    """Store the XPath configuration and optionally start crawling from files.

    :param xpath_dict: mapping of field names to XPath expressions; a fresh
        empty dict is created per instance when omitted.
    :param files: optional URL source; when truthy, crawling starts from it.
    """
    CrawlSpider.__init__(self)
    # Bug fix: the original default was a mutable ``{}`` shared by every
    # instance, so mutations of self.xpath_dict leaked across spiders.
    self.xpath_dict = {} if xpath_dict is None else xpath_dict
    self.from_url_file = files
    if self.from_url_file:
        self.crawl_from_files()
def __init__(self, *arg, **karg):
    """Configure this spider from the product YAML template named in kwargs."""
    self.name = karg['name']
    self.init_yaml('scrapy_service/templates/product.yaml', self.name)
    # Only positional args are forwarded; 'name' has already been consumed.
    CrawlSpider.__init__(self, *arg)
def __init__(self):
    """Initialize the spider and a Firefox WebDriver (stored as `selenium`)."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Attribute name is historical; this is a WebDriver, not Selenium RC.
    self.selenium = webdriver.Firefox()
def __init__(self):
    """Launch Firefox via geckodriver and open the dispensary landing page."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Hard-coded geckodriver path from the developer's machine.
    self.selenium = webdriver.Firefox(
        executable_path="/Users/theodoreshih/Downloads/geckodriver"
    )
    self.selenium.get('https://www.greenrush.com/dispensary/cannabis-express')
def __init__(self):
    """Initialize the spider and start a Firefox browser for rendering."""
    CrawlSpider.__init__(self)
    # Any WebDriver-backed browser would work here.
    self.browser = webdriver.Firefox()
def __init__(self, *args, **kwargs):
    """Forward all positional and keyword arguments to CrawlSpider."""
    CrawlSpider.__init__(self, *args, **kwargs)
def __init__(self, **kwargs):
    """Pass-through initializer: hand every keyword argument to CrawlSpider."""
    CrawlSpider.__init__(self, **kwargs)
def __init__(self, *args, **kwargs):
    """Initialize both bases, forwarding the arguments to SpiderBase.

    Bug fix: the original called ``SpiderBase.__init__(*args, **kwargs)``
    without ``self``, so the first positional argument was bound as the
    instance and SpiderBase never initialized this object (or raised a
    TypeError when no args were given).
    """
    CrawlSpider.__init__(self)
    SpiderBase.__init__(self, *args, **kwargs)
def __init__(self):
    """Set up the spider with an error list and a Firefox WebDriver."""
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.driver = webdriver.Firefox()
def __init__(self, **kwargs):
    """Initialize the LRMI base first, then CrawlSpider, sharing kwargs."""
    LrmiBase.__init__(self, **kwargs)
    CrawlSpider.__init__(self, **kwargs)
def __init__(self, *args, **kwargs):
    """Delegate initialization to BaseSpider and then CS with the same args."""
    BaseSpider.__init__(self, *args, **kwargs)
    CS.__init__(self, *args, **kwargs)