def _search(self, retries=0):
    payload = {}
    try:
        payload['q'] = self.query.encode('utf8')  # query to look up
    except Exception:
        payload['q'] = self.query                 # already encoded
    payload['gl'] = self.country               # query from country
    payload['hl'] = self.language              # user query language
    payload['lr'] = 'lang_%s' % self.language  # restrict language pages
    payload['pws'] = 0                         # no personalized results
    payload['gws_rd'] = 'cr'                   # country select
    payload['client'] = 'firefox'              # old API --> JSON result

    pool = Urllib3PoolFactory.getProxyPool()
    time.sleep(random.uniform(0.0, 0.2))
    try:
        r = pool.request('GET', self.googleHost, fields=payload,
                         headers={"User-Agent": UserAgent.old,
                                  "Accept": "text/html"})
    except Exception as ex:
        app_logger.error(u"_requestError %s" % ex)
        raise

    # Response shape:
    # ['repelente', [['repelente<b> mosquitos</b>', 0, [131]],
    #                ['repelente<b> para gatos</b>', 0, [131]], ...], ...]
    results = []
    data = json.loads(r.data)
    if len(data) > 1:
        for itemsData in data[1]:
            # related_query = BeautifulSoup(itemsData[0], "lxml").getText()
            related_query = itemsData
            if related_query != self.query:
                results.append(related_query)
    if not data and not results:
        ProxyManager.invalidateProxy()
        if retries > 0:
            print('Retrying (%s)... %s' % (retries, self.query))
            return self._search(retries - 1)
    return results
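# A minimal standalone sketch of the suggest call that _search() wraps,
# without the proxy pool. Assumptions (not taken from the original code):
# the 'http://suggestqueries.google.com/complete/search' host and the exact
# response shape; client=firefox is what makes Google answer with JSON.
import json
import urllib3

def suggest_sketch(query, language='es'):
    http = urllib3.PoolManager()
    r = http.request('GET',
                     'http://suggestqueries.google.com/complete/search',
                     fields={'q': query, 'client': 'firefox',
                             'hl': language})
    data = json.loads(r.data)  # [query, [suggestion, ...], ...]
    return [s for s in data[1] if s != query] if len(data) > 1 else []

# e.g. suggest_sketch('repelente') -> ['repelente mosquitos', ...]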
class Taskmanager(object):

    def __init__(self, logging_name):
        self.crawl_pool = Pool(size=CRWAL_POOL_SIZE)
        self.logger = get_logger(logging_name)
        self.page_queue = Queue()
        self.info_queue = Queue()
        self.parm_queue = Queue()
        self.proxy_manager = ProxyManager("{}/1.txt".format(project_dir),
                                          self.logger)
        self.proxys = self.proxy_manager.get_proxy()
        self.count = 0

    def run(self):
        pass

    def reload_proxies(self):
        self.proxy_manager.reload_proxies()

    def _feed_page_queue(self, base_url):
        pass

    def _page_loop(self):
        # Consume page URLs and spawn crawlers to extract item links.
        while True:
            page_url = self.page_queue.get(block=True)
            gevent.sleep(2)
            self.crawl_pool.spawn(self._feed_info_queue, page_url)

    def _feed_info_queue(self, url):
        pass

    def _item_loop(self):
        # Consume item URLs and spawn crawlers to fetch their details.
        while True:
            item_url = self.info_queue.get(block=True)
            gevent.sleep(2)
            self.crawl_pool.spawn(self._crawl_info, item_url)

    def _crawl_info(self, item_url):
        pass

    def _db_save_loop(self):
        # Drain parsed parameters and persist them via SqlHelper.
        while True:
            parm = self.parm_queue.get(block=True)
            gevent.sleep(0.1)
            self.count += 1
            S = SqlHelper(logger=self.logger)
            self.crawl_pool.spawn(S.insert_scholar, **parm)
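# How the loops might be wired together: the original run() is just `pass`.
# A minimal sketch, assuming the intended design is one greenlet per loop
# (gevent.spawn and gevent.joinall are real gevent APIs; the wiring itself
# is a guess, not the author's code):
#
#   def run(self):
#       gevent.joinall([
#           gevent.spawn(self._page_loop),
#           gevent.spawn(self._item_loop),
#           gevent.spawn(self._db_save_loop),
#       ])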
def getProxyPool():
    # Variant that returns a requests.Session routed through the next
    # proxy in rotation.
    from utils.proxy_manager import ProxyManager
    import requests

    urllib3.disable_warnings()  # @UndefinedVariable
    nextProxy = ProxyManager.getNextProxy()
    if nextProxy.proxy_basic_auth:
        proxy_url = 'http://%s@%s:%s' % (nextProxy.proxy_basic_auth,
                                         nextProxy.host, nextProxy.port)
    else:
        proxy_url = 'http://%s:%s' % (nextProxy.host, nextProxy.port)
    proxies = {
        'http': proxy_url,
        'https': proxy_url,
    }
    session = requests.Session()
    session.proxies = proxies
    session.max_redirects = 2
    return session
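# Usage sketch for the Session variant (the URL is illustrative only):
#
#   session = getProxyPool()
#   resp = session.get('https://httpbin.org/ip', timeout=10)
#
# Note that max_redirects=2 makes the session raise
# requests.exceptions.TooManyRedirects on longer redirect chains.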
def _feed_journal_loop(self, subject, subject_url):
    self.logging = get_logger(name=subject)
    self.proxy_manager = ProxyManager("{}/1.txt".format(project_dir),
                                      self.logging)
    self.logging.info("Processing journal {}".format(subject_url))
    html_source = fetch(subject_url, requests_session=self.requests_session)
    # Pull every journal link off the subject page and seed the queue.
    journal_item = iter(extract(
        "//li[@class='browseimpBrowseRow']/ul/li/span/a/@href",
        html_source, multi=True))
    try:
        while True:
            self.journal_queue.put_nowait(
                'http://www.sciencedirect.com{}'.format(next(journal_item)))
    except StopIteration:
        self.logging.info("Journal_Queue got {} seeds".format(
            self.journal_queue._qsize()))
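# extract() is not defined in these snippets. A minimal sketch of what it
# likely does, assuming lxml (the helper's body and its use of lxml are
# assumptions; only the name and call signature come from the code above):
import lxml.html

def extract(xpath, html_source, multi=False):
    # Parse the HTML and evaluate the XPath expression against it.
    tree = lxml.html.fromstring(html_source)
    matches = tree.xpath(xpath)
    return matches if multi else (matches[0] if matches else None)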
def getProxyPool():
    # Variant that returns a urllib3.ProxyManager; judging by the
    # pool.request('GET', ..., fields=...) call, this is the pool used by
    # the JSON-based _search() above.
    from utils.proxy_manager import ProxyManager

    urllib3.disable_warnings()  # @UndefinedVariable
    nextProxy = ProxyManager.getNextProxy()
    if nextProxy.proxy_basic_auth:
        headers = urllib3.make_headers(
            proxy_basic_auth=nextProxy.proxy_basic_auth)
    else:
        headers = None
    proxy_url = 'http://%s:%s' % (nextProxy.host, nextProxy.port)
    # Bounded connect/read/redirect retries with a small backoff.
    proxy = urllib3.ProxyManager(
        proxy_url,
        proxy_headers=headers,
        retries=Retry(total=None, connect=2, read=2, redirect=2,
                      backoff_factor=0.1))
    return proxy
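# Usage sketch, mirroring the call pattern in the JSON-based _search()
# (the URL is illustrative only):
#
#   pool = getProxyPool()
#   r = pool.request('GET', 'http://www.google.com/complete/search',
#                    fields={'q': 'test', 'client': 'firefox'})
#   print(r.data)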
def _search(self, start, visible=0):
    payload = {}
    try:
        payload['q'] = self.query.encode('utf8')  # query to look up
    except Exception:
        payload['q'] = self.query                 # already encoded
    payload['start'] = start                      # start point
    payload['gl'] = self.country                  # query from country
    payload['hl'] = self.language                 # user query language
    payload['lr'] = 'lang_%s' % self.language     # restrict language pages
    payload['num'] = GoogleSeleniumPlus.PAGE_LIMIT
    payload['safe'] = 'off'
    params = urllib.urlencode(payload)

    results = []
    display = Display(visible=visible, size=(800, 600))
    display.start()
    try:
        proxyInfo = ProxyManager.getNextProxy()
        myProxy = '%s:%s' % (proxyInfo.host, proxyInfo.port)
        proxy = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': myProxy,
            'ftpProxy': myProxy,
            'sslProxy': myProxy,
            'noProxy': ''  # set this value as desired
        })
        browser = webdriver.Firefox(proxy=proxy)
        browser.set_page_load_timeout(30)
        try:
            browser.implicitly_wait(10)
            browser.get('%s#%s' % (self.googleHost, params))
            app_logger.info(u"%s" % browser.current_url)
            # Collect the result links from the SERP.
            h3List = browser.find_elements_by_xpath("//h3[@class='r']")
            for h3 in h3List:
                link = h3.find_element_by_tag_name('a')
                results.append(link.get_attribute("href"))
            # Fake human typing: delete and retype the tail of the query.
            box = browser.find_element_by_id('lst-ib')
            partialQuery = ' '.join(self.query.split()[1:])
            for _letter in partialQuery:
                box.send_keys(Keys.BACKSPACE)
                randomSleep(0.03, 0.05)
            typeQuery(box, partialQuery)
            randomSleep(0.05, 0.25)
            print('-' * 80)
        finally:
            browser.close()
    finally:
        display.stop()
    if not results:
        ProxyManager.invalidateProxy()
    return results
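# randomSleep() and typeQuery() are used but never defined in these
# snippets. Plausible reconstructions, assuming they jitter delays and
# type character by character to mimic a human (names and call sites are
# from the code above; the bodies are guesses):
import time
import random

def randomSleep(low, high):
    # Sleep for a random duration in [low, high] seconds.
    time.sleep(random.uniform(low, high))

def typeQuery(box, text):
    # Send the query one keystroke at a time with human-like pauses.
    for letter in text:
        box.send_keys(letter)
        randomSleep(0.03, 0.05)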
def _search(self, start, retries=0, visible=0):
    display = None
    results = []
    try:
        display = Display(visible=visible, size=(800, 600))
        display.start()
    except Exception:
        pass  # no X server available; run without a virtual display
    try:
        proxyInfo = ProxyManager.getNextProxy()
        myProxy = '%s:%s' % (proxyInfo.host, proxyInfo.port)
        proxy = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': myProxy,
            'ftpProxy': myProxy,
            'sslProxy': myProxy,
            'noProxy': ''  # set this value as desired
        })
        browser = webdriver.Firefox(proxy=proxy)
        browser.set_page_load_timeout(30)
        try:
            browser.implicitly_wait(10)
            browser.get('%s' % (self.googleHost,))
            box = browser.find_element_by_id('lst-ib')
            app_logger.info(u"%s" % self.googleHost)
            typeQuery(box, self.query)
            # Suggestion markup:
            # <p class="_e4b"><a href="..">elefantes marinos <b>videos</b></a></p>
            paragraphList = browser.find_elements_by_xpath('//*[@class="_e4b"]')
            for p in paragraphList:
                link = p.find_element_by_tag_name('a')
                results.append(link.text)
            # Fake human typing: delete and retype the tail of the query.
            partialQuery = ' '.join(self.query.split()[1:])
            for _letter in partialQuery:
                box.send_keys(Keys.BACKSPACE)
                randomSleep(0.03, 0.05)
            typeQuery(box, partialQuery)
            randomSleep(0.05, 0.25)
        finally:
            browser.close()
    finally:
        if display:
            display.stop()
    if not results:
        ProxyManager.invalidateProxy()
        if retries > 0:
            print('Retrying... %s' % self.query)
            return self._search(start, retries=retries - 1, visible=visible)
    return results
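# Unlike the JSON-based _search(), this variant scrapes the rendered
# autocomplete dropdown (the "_e4b" nodes) after typing the query, so it
# returns suggestion text rather than result URLs. A caller would
# presumably invoke it with a retry budget, e.g.:
#
#   suggestions = scraper._search(0, retries=3)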
def _search(self, start):
    payload = {}
    try:
        payload['q'] = self.query.encode('utf8')  # query to look up
    except Exception:
        payload['q'] = self.query                 # already encoded
    payload['start'] = start                      # start point
    payload['gl'] = self.country                  # query from country
    payload['hl'] = self.language                 # user query language
    payload['lr'] = 'lang_%s' % self.language     # restrict language pages
    payload['num'] = GoogleSelenium.PAGE_LIMIT
    payload['safe'] = 'off'
    params = urllib.urlencode(payload)

    results = []
    display = Display(visible=0, size=(800, 600))
    try:
        display.start()
        proxyInfo = ProxyManager.getNextProxy()
        myProxy = '%s:%s' % (proxyInfo.host, proxyInfo.port)
        proxy = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': myProxy,
            'ftpProxy': myProxy,
            'sslProxy': myProxy,
            'noProxy': ''  # set this value as desired
        })
        driver = webdriver.Firefox(proxy=proxy)
        try:
            # If anything fails here, check that the IP is registered
            # with buyproxies.com.
            driver.implicitly_wait(10)
            driver.get('%s?%s' % (self.googleHost, params))
            app_error_logger.info(u"%s" % driver.current_url)
            # Collect the result links from the SERP.
            h3List = driver.find_elements_by_xpath("//h3[@class='r']")
            for h3 in h3List:
                link = h3.find_element_by_tag_name('a')
                results.append(link.get_attribute("href"))
        finally:
            driver.close()
    finally:
        display.stop()
    if not results:
        ProxyManager.invalidateProxy()
    return results
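# All three Selenium variants take a `start` offset, so a caller likely
# pages through results in PAGE_LIMIT steps. A hedged driver sketch (the
# loop, page count, and de-duplication are assumptions):
#
#   all_links = []
#   for page in range(3):
#       links = searcher._search(start=page * GoogleSelenium.PAGE_LIMIT)
#       all_links.extend(l for l in links if l not in all_links)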