def __init__(self, keywordList, extkey=None, se='baidu', pages=2,
             MONGODB_SERVER='127.0.0.1', MONGODB_PORT=27017,
             MONGODB_DB='BGY_ZC_META', *args, **kwargs):
    """Configure the spider and queue search-result URLs for every keyword.

    Args:
        keywordList: Comma-separated keywords. Entries that are empty after
            stripping whitespace are skipped.
        extkey: Optional extra term; when given, it is stripped and appended
            (separated by one space) to each keyword before searching.
        se: Search engine name. Its lower-cased form indexes the selector
            tables; the value as given is passed to ``searResultPages``.
        pages: Page count passed to ``searResultPages`` after ``int()``.
        MONGODB_SERVER: Stored in ``settings['MONGODB_SERVER']``.
        MONGODB_PORT: Stored in ``settings['MONGODB_PORT']`` after ``int()``.
        MONGODB_DB: Stored in ``settings['MONGODB_DB']``. A name containing
            ``'WX'`` or ``'WB'`` also sets ``settings['DOWNLOAD_DELAY']`` to 5.
        *args: Forwarded to the parent initializer.
        **kwargs: Forwarded to the parent initializer.

    Raises:
        ValueError: If ``pages`` or ``MONGODB_PORT`` cannot be converted by
            ``int()``.
    """
    super(keywordSpider, self).__init__(*args, **kwargs)
    self.keywordList = keywordList.lower()
    self.searchEngine = se.lower()
    # A lookup failure here (KeyError, if the tables are plain dicts) means
    # the requested engine has no selector configured.
    self.selector = SearchEngineResultSelectors[self.searchEngine]
    self.dateSelector = SearchEngineResultDateSelectors[self.searchEngine]

    # NOTE(review): `settings` is defined outside this block; these writes
    # presumably affect every spider sharing that object -- confirm.
    if 'WX' in MONGODB_DB or 'WB' in MONGODB_DB:
        settings['DOWNLOAD_DELAY'] = 5
    settings['MONGODB_SERVER'] = MONGODB_SERVER
    settings['MONGODB_PORT'] = int(MONGODB_PORT)
    settings['MONGODB_DB'] = MONGODB_DB

    # Loop-invariant values are computed once instead of per keyword.
    extra_term = None if extkey is None else extkey.strip()
    page_count = int(pages)

    # The queries are built from the keywords as given; only the stored
    # self.keywordList is lower-cased.
    for raw_key in keywordList.split(','):
        key = raw_key.strip()
        if not key:
            continue
        if extra_term is not None:
            key = key + ' ' + extra_term
        self.start_urls.extend(searResultPages(key, se, page_count))
def __init__(self, keyword, se='bing', pages=50, *args, **kwargs):
    """Configure the spider and queue only the first search-result page.

    Args:
        keyword: Search term. Its lower-cased form is stored on the spider;
            the value as given is passed to ``searResultPages``.
        se: Search engine name. Its lower-cased form indexes
            ``SearchEngineResultSelectors``.
        pages: Page count passed to ``searResultPages`` after ``int()``.
        *args: Forwarded to the parent initializer.
        **kwargs: Forwarded to the parent initializer.

    Raises:
        ValueError: If ``pages`` cannot be converted by ``int()``.
    """
    super(keywordSpider, self).__init__(*args, **kwargs)
    self.keyword = keyword.lower()
    self.searchEngine = se.lower()
    self.selector = SearchEngineResultSelectors[self.searchEngine]

    pageUrls = searResultPages(keyword, se, int(pages))
    # The builtin next() works on both Python 2.6+ and Python 3 iterators,
    # unlike the Python 2-only ``.next()`` method. The default keeps an
    # empty page sequence from leaking StopIteration out of the constructor.
    first_url = next(pageUrls, None)
    if first_url is not None:
        # NOTE(review): only the first page is queued here; any further
        # pages are presumably requested elsewhere in the spider -- confirm.
        self.start_urls.append(first_url)
def __init__(self, keyword, se='bing', pages=50, *args, **kwargs):
    """Configure the spider and queue every search-result page URL.

    Args:
        keyword: Search term. Its lower-cased form is stored on the spider;
            the value as given is passed to ``searResultPages``.
        se: Search engine name. Its lower-cased form is stored and used to
            index ``SearchEngineResultSelectors``.
        pages: Page count passed to ``searResultPages`` after ``int()``.
        *args: Forwarded to the parent initializer.
        **kwargs: Forwarded to the parent initializer.
    """
    super(keywordSpider, self).__init__(*args, **kwargs)

    self.keyword = keyword.lower()
    engine_name = se.lower()
    self.searchEngine = engine_name
    self.selector = SearchEngineResultSelectors[engine_name]

    page_count = int(pages)
    for page_url in searResultPages(keyword, se, page_count):
        # Each URL is echoed to stdout before it is queued.
        print(page_url)
        self.start_urls.append(page_url)
def __init__(self, keyword, se='amazon', pages=2, *args, **kwargs):
    """Start a Chrome webdriver, then queue every search-result page URL.

    Args:
        keyword: Search term. Its lower-cased form is stored on the spider;
            the value as given is passed to ``searResultPages``.
        se: Search engine name. Its lower-cased form is stored and used to
            index ``SearchEngineResultSelectors``.
        pages: Page count passed to ``searResultPages`` after ``int()``.
        *args: Forwarded to the parent initializer.
        **kwargs: Forwarded to the parent initializer.
    """
    # The driver is created before the parent initializer runs.
    # NOTE(review): the chromedriver path is hard-coded, and nothing in this
    # block closes the driver -- confirm it is shut down elsewhere.
    self.driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    super(keywordSpider, self).__init__(*args, **kwargs)

    self.keyword = keyword.lower()
    self.searchEngine = se.lower()
    self.selector = SearchEngineResultSelectors[self.searchEngine]

    url_source = searResultPages(keyword, se, int(pages))
    queue_url = self.start_urls.append
    for result_url in url_source:
        # Each URL is echoed to stdout before it is queued.
        print(result_url)
        queue_url(result_url)