def main():
    proxy = Proxy()
    proxy = proxy.get_proxy()
    db_build()
    url = 'https://www.rusprofile.ru/codes/89220'
    url_2 = 'https://www.rusprofile.ru/codes/429110'
    list_url = [url, url_2]
    address = get_page_address(list_url, proxy)
    print(address)
    for i in address:
        html_company = get_html(i, proxy)
        get_page_data(html_company)
    con.commit()
    con.close()
class Scraper:
    def __init__(self, url_base, custom_headers=None):
        self.url_base = url_base
        self.custom_headers = custom_headers
        self.proxy = Proxy()
        self.user_agent = UserAgent()

    @staticmethod
    def make_url(url, *res, **params):
        for r in res:
            url = '{}/{}'.format(url, r)
        if params:
            url = '{}?{}'.format(url, urlencode(params))
        return url

    def set_proxy(self, session):
        """
        Configure the session to use one of the proxy candidates, retrying
        until a proxy successfully serves a test request to
        https://httpbin.org/ip.
        """
        proxy = self.proxy.get_proxy()
        while True:
            session.proxies = {
                'https': 'https://{}:{}'.format(proxy['IP Address'], proxy['Port'])
            }
            try:
                return session.get('https://httpbin.org/ip').json()
            except Exception:
                proxy = self.proxy.get_proxy()

    def crawl(self, *url_path, **url_params):
        session = requests.Session()
        if self.custom_headers:
            session.headers = self.custom_headers
        url_crawl = self.make_url(self.url_base, *url_path, **url_params)
        while True:
            try:
                session.headers = {'User-Agent': self.user_agent.random}
                self.set_proxy(session)
                response = session.get(url_crawl)
                response.raise_for_status()
                return response.text
            except (requests.exceptions.HTTPError,
                    requests.exceptions.ProxyError,
                    requests.exceptions.SSLError):
                pass
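# A short usage sketch for the Scraper class above. The base URL and query
# parameter are illustrative only; it assumes the names the class body already
# relies on (requests, urlencode, the Proxy helper, and a UserAgent class
# exposing a .random attribute) are importable in this module.
scraper = Scraper('https://httpbin.org', custom_headers={'Accept': 'text/html'})
# Fetches https://httpbin.org/anything?page=1 through a verified proxy,
# retrying with a fresh proxy and User-Agent on HTTP, proxy, or SSL errors.
html = scraper.crawl('anything', page=1)
print(html[:200])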
def readweb():
    print u'get jobdict.....'
    configmap = {}
    jobnamesfile = 'dict/jobnames.pkl'
    if os.path.isfile(jobnamesfile):
        configmap = pickle.load(open(jobnamesfile))
        return configmap
    p = Proxy()
    while True:
        proxies = p.getproxies()
        try:
            r = requests.get(url='http://www.lagou.com/', proxies=proxies, timeout=60)
            break
        except Exception, e:
            p.nextip()
            logging.debug(str(e))
def get_company_description(fetchallist):
    db = MySQLdb.connect(dbadd, user, password, database, use_unicode=True, charset="utf8")
    cursor = db.cursor()
    p = Proxy()
    for id in fetchallist:
        while True:
            try:
                values = get_company_info_byid(id[0], p)
                values.append(id[0])
                cursor.execute(
                    'update company set companyUrl = %s,description = %s,fullName = %s,'
                    'shortName = %s,detailPosition = %s,industryField = %s,companySize = %s,'
                    'city = %s,financeStage = %s,profile = %s where companyId = %s',
                    values)
                db.commit()
                print u"update:", id[0]
                break
            except Exception, e:
                logging.debug(str(e))
                p.nextip()
def scrapy(jobname):
    # print 'crawling ' + jobname + '.....'
    p = Proxy()
    db = MySQLdb.connect(dbadd, user, password, database, use_unicode=True, charset="utf8")
    cursor = db.cursor()
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    headers = {'content-type': 'application/json;charset=UTF-8'}
    while True:
        proxies = p.getproxies()
        try:
            req = requests.post(req_url,
                                params={'first': 'false', 'pn': 1, 'kd': jobname},
                                headers=headers, timeout=60, proxies=proxies,
                                allow_redirects=False)
            totalCount = req.json()['content']['positionResult']['totalCount']
            pageSize = req.json()['content']['positionResult']['pageSize']
            # Python 2 integer division floors here, so a partial last page is dropped
            maxpagenum = totalCount / pageSize
            break
        except Exception, e:
            p.nextip()
            logging.debug(str(e))
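# The three lagou.com snippets above (readweb, get_company_description and
# scrapy) all call a Proxy helper through getproxies() and nextip(). Below is a
# minimal hypothetical sketch of such a rotating-proxy helper; the candidate
# addresses and the rotation strategy are assumptions, not the real class.
class Proxy(object):
    def __init__(self):
        # Placeholder candidates; a real implementation would load these
        # from a proxy provider or a local list.
        self.candidates = ['203.0.113.1:8080', '198.51.100.7:3128']
        self.index = 0

    def getproxies(self):
        # Return a requests-style proxies dict for the current candidate.
        ip = self.candidates[self.index]
        return {'http': 'http://' + ip, 'https': 'https://' + ip}

    def nextip(self):
        # Rotate to the next candidate after a failed request.
        self.index = (self.index + 1) % len(self.candidates)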
def start(self):
    self.is_lang = False
    self.delete_msg()
    self.startButton.config(state=DISABLED)
    site = self.siteBox.get()
    asin = self.asinEntry.get()
    page = self.pageEntry.get()
    if not asin:
        self.write_msg('asin is empty, please enter an asin')
        self.startButton.config(state=NORMAL)
        return
    if not page:
        self.write_msg('page number is empty, please enter a page number')
        self.startButton.config(state=NORMAL)
        return
    try:
        page = int(page)
    except Exception as e:
        print(e)
        self.write_msg('Error: the page number is not a number')
        self.startButton.config(state=NORMAL)
        return
    self.write_msg('Starting task..., site--{}, Asin--{}'.format(site, asin))
    if not self.is_proxies.get():
        self.write_msg('Not using a proxy')
        proxies = None
        session = None
    else:
        self.write_msg('Using a proxy, preparing proxies')
        try:
            session, proxies = Proxy(self).get_proxies(site)
            if not proxies or type(proxies) != dict:
                self.write_msg('Failed to obtain proxies, reason: {}'.format(
                    proxies['msg'] if proxies and 'msg' in proxies else 'none'))
                self.startButton.config(state=NORMAL)
                return
        except Exception as e:
            print(e)
            self.write_msg('Error, reason: {}'.format(e))
            self.startButton.config(state=NORMAL)
            return
    # Initialize the request helper
    self.requests = AmazonRequests(site, asin, page, session, proxies)
    self.csv = JsonCsv(asin)
    t = threading.Thread(target=self.start_download)
    t.setDaemon(True)
    t.start()
def get_catalog_info(url):
    """
    :param url: the started url
    :return: the next url
    """
    while True:
        # Get a proxy address from the pool
        proxy = rd.lpop("catalog_proxies")
        if proxy is None:
            Proxy("catalog_proxies")
            proxy = rd.lpop("catalog_proxies")
        print proxy
        # If the request fails, keep resending it until it succeeds
        try:
            r = requests.get(url, headers=headers, timeout=5, proxies={'http': proxy})
            r.encoding = 'gb2312'
            html = etree.HTML(r.text)
            # Get the link of the next page
            next_page_link = html.xpath("//div[@class='controlbar']/span[2]/a")[0].get('href')
        # This may be a ConnectionError, ReadTimeout, etc.
        except Exception:
            catalog_logger.info("requests error: %s", url)
        else:
            break
    for tr in html.xpath("//table[@class='cytable']//tr[position()>1]"):
        try:
            # info fields, in order: author, title, tag, style, progress,
            # word count, points, publish time
            #
            # Split on runs of whitespace; the whole string starts and ends
            # with whitespace, so the first and last items after the split
            # are '' and need to be dropped.
            # Note: re.UNICODE is passed positionally here, so it acts as
            # maxsplit (32), not as a flag.
            info = re.split(r'\s{4,}', tr.xpath("string(.)"), re.UNICODE)[1:][:-1]
            # Some rows turned out to be misaligned during testing; discard them.
            # Use the value of the rel attribute of the <a> tag as abstract and
            # tag ('标签' is the "tags" label in the scraped HTML).
            abstract, tag = tr.xpath(".//a[@rel]")[0].get("rel").strip().split(u"<br />标签:")
            # Some columns may be empty during testing; discard those rows too.
            # xpath indices start at 1.
            author_link = tr.xpath(".//td[1]/a")[0].get("href")
            author_link = urlparse.urljoin(r.url, author_link)
            author_link_query = urlparse.urlparse(author_link).query
            author_id = urlparse.parse_qs(author_link_query)['authorid'][0]
            novel_link = tr.xpath(".//td[2]/a")[0].get("href")
            novel_link = urlparse.urljoin(r.url, novel_link)
            novel_link_query = urlparse.urlparse(novel_link).query
            novel_id = urlparse.parse_qs(novel_link_query)['novelid'][0]
        except Exception:
            pass
        else:
            #print info[1]
            catalog_logger.info(info[1])
            # TODO: some fields may still not match; discard those rows.
            try:
                catalog = OrderedDict([
                    ("novel", info[1]),
                    ("novel_id", int(novel_id)),
                    ("novel_link", novel_link),
                    ("author", info[0]),
                    ("author_id", int(author_id)),
                    ("author_link", author_link),
                    ("tag", tag.strip() or u'无'),  # strip() again to avoid whitespace
                    ("abstract", abstract or u'无'),
                    ("style", info[3]),
                    ("process", info[4]),
                    ("word_count", int(info[5])),
                    ("point", int(info[6])),
                    ("publish_time", info[7]),
                    ("status", 'WAITING'),  # crawl status
                    ("create_time", datetime.datetime.now()),
                ])
            except Exception:
                pass
            else:
                insert_catalog(catalog)
    if next_page_link is not None:
        next_page_link = urlparse.urljoin(r.url, next_page_link)
        print next_page_link
        catalog_logger.info(next_page_link)
        return next_page_link
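# get_catalog_info above and the get_target / parse_target functions below all
# repeat the same Redis proxy-pool pattern: lpop a proxy from a named list and,
# if the list is empty, call Proxy(pool_name) to refill it. A minimal sketch of
# that pattern factored into a helper; the rd Redis client shown here and the
# refill behaviour of Proxy are assumptions taken from the surrounding snippets.
import redis

rd = redis.StrictRedis()  # assumed connection, mirroring the rd used above

def pop_proxy(pool_name):
    # Pop one proxy address from the named Redis list, refilling the pool
    # via Proxy(pool_name) when it is empty.
    proxy = rd.lpop(pool_name)
    if proxy is None:
        Proxy(pool_name)  # assumed to push fresh proxies into the list
        proxy = rd.lpop(pool_name)
    return proxy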
def get_target(queue, log):
    # MongoDB
    client = MongoClient()
    db = client.jingjiang
    catalog_col = db.catalog
    while True:
        catalog = catalog_col.find_one({"status": "WAITING"}, sort=[("create_time", 1)])
        # Finish the producer and pass the stop signal on to the consumers
        if catalog is None:
            for i in xrange(3):
                queue.put("target:-1")
            queue.close()
            queue.join_thread()
            os._exit(1)
        novel_id = catalog["novel_id"]
        print 'producer start', os.getpid(), novel_id
        while True:
            # Get a proxy address from the pool
            proxy = rd.lpop("producer_proxies")
            if proxy is None:
                Proxy("producer_proxies")
                proxy = rd.lpop("producer_proxies")
            # If the request fails, keep resending it until it succeeds
            try:
                r = requests.get(catalog["novel_link"], headers=headers, timeout=5,
                                 proxies={'http': proxy})
                r.encoding = 'gb2312'
                html = etree.HTML(r.text)
                trs = html.xpath("//tr[contains(@itemprop, 'chapter')]")
            except Exception:
                log.info("requests error: %s", catalog["novel_link"])
            else:
                break
        for tr in trs:
            try:
                # info fields, in order: chapter, title, abstract, word count, update date
                # Split on runs of whitespace; the whole string starts and ends
                # with whitespace, so the first and last items after the split
                # are '' and need to be dropped.
                info = re.split(r'\s{4,}', tr.xpath("string(.)"), re.UNICODE)[1:][:-1]
                # Drop the "*latest update" marker
                if u'\xa0*\u6700\u65b0\u66f4\u65b0' in info:
                    info = info[:-1]
                # The abstract may be split by line breaks; merge them
                if len(info) > 5:
                    info[2:len(info) - 2] = [''.join(info[2:len(info) - 2])]
                # The abstract may be missing
                elif len(info) == 4:
                    info.insert(2, '无')
                # Click counts are not needed for now; the API is as follows
                # (xxx is the novelid value):
                # r = requests.get('http://s8.static.jjwxc.net/getnovelclick.php?novelid=xxx')
                # chapter_link may be blocked and therefore missing
                chapter_link = tr.xpath(".//a[@itemprop='url']")[0].get('href')
                #print chapter_link
                log.info(chapter_link)
            except Exception:
                if 'target' in locals():
                    del target
                # Delete the message queue in Redis
                rd.delete('target:%s' % novel_id)
                # Mark the task as SUSPENDED
                suspend_task(catalog_col, novel_id)
                break
            else:
                target = {
                    "chapter_id": info[0],
                    "title": info[1],
                    "abstract": info[2],
                    "word_count": int(info[3]),
                    "publish_time": info[4],
                    "chapter_link": chapter_link,
                }
                # Write to Redis
                target = pickle.dumps(target)
                rpush_to_redis(target, novel_id)
        # Put the task key on the queue
        if 'target' in locals():
            key = 'target:%s' % novel_id
            queue.put(key)
            # Mark the task as QUEUEING
            queue_task(catalog_col, novel_id)
        # Delete identical documents from the catalog collection (deduplicate)
        delete_same_catalog(catalog_col, novel_id)
def parse_target(queue, lock, log):
    # MongoDB
    novel_col, catalog_col = connect_to_MongoDB()
    while True:
        chapters = []
        task = queue.get()
        print 'customer start', os.getpid()
        novel_id = int(task.split(':')[1])
        # Stop the consumer process
        if novel_id == -1:
            os._exit(-1)
        # Mark the task as PROCESSING
        process_task(catalog_col, novel_id)
        while True:
            ptarget = rd.lpop(task)
            if ptarget is None:
                break
            target = pickle.loads(ptarget)
            log.info(target["chapter_link"])
            while True:
                # Get a proxy address from the pool
                proxy = rd.lpop("customer_proxies")
                # Only allow one consumer to refill the proxy pool
                if rd.llen("customer_proxies") <= 10:
                    if lock.acquire(block=False):
                        Proxy("customer_proxies")
                        lock.release()
                if proxy is None:
                    time.sleep(0.5)
                    continue
                # If the request fails, keep resending it until it succeeds
                try:
                    r = requests.get(target["chapter_link"], headers=headers,
                                     timeout=5, proxies={'http': proxy})
                except Exception:
                    log.info('requests error: %s', target["chapter_link"])
                else:
                    break
            r.encoding = 'gb2312'
            html = etree.HTML(r.text)
            # Some pages may require login or be locked.
            try:
                novel_text = html.xpath("//div[@class='noveltext']")[0]
            except IndexError:
                # Suspend the task
                suspend_task(catalog_col, novel_id)
                del chapters
                # Delete the data in Redis
                rd.delete(task)
                break
            else:
                novel_text = etree.tostring(novel_text, encoding="unicode", method="html")
                # Strip the unrelated content before the chapter body
                novel_text = re.split(
                    r'<div style="clear:both;"></div>(\s*<div class="readsmall".*?</div>)?',
                    novel_text)[2]
                # Strip the unrelated content after the chapter body
                novel_text = re.split(r'<div id="favoriteshow_3".*</div>', novel_text)[0]
                # Strip the interfering <font>...</font><br> parts
                paras = re.split(r'<font.*?<br>', novel_text)
                paras = [para.strip().replace("<br>", "\r\n") for para in paras if para]
                content = '\r\n'.join(paras)
                chapters.append(OrderedDict([
                    ("chapter_id", target["chapter_id"]),
                    ("chapter_link", target["chapter_link"]),
                    ("title", target["title"]),
                    ("abstract", target["abstract"]),
                    ("word_count", target["word_count"]),
                    ("publish_time", target["publish_time"]),
                    ("content", content),
                ]))
        # Mark the task as FINISHED
        #if chapters:
        if 'chapters' in locals():
            novel_title = finish_task(catalog_col, novel_id)
            novel = OrderedDict([
                ("novel", novel_title),
                ("novel_id", novel_id),
                ("chapters", chapters),
                ("create_time", datetime.datetime.now()),
            ])
            insert_novel(novel_col, novel)
        print 'customer end %d, pid: %d' % (novel_id, os.getpid())
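# A hedged sketch of how get_target and parse_target above might be wired
# together with multiprocessing. The use of three consumer processes matches
# the three "target:-1" stop signals the producer emits; the logger name and
# the rest of the setup are assumptions.
import logging
import multiprocessing

if __name__ == '__main__':
    log = logging.getLogger('jingjiang')
    queue = multiprocessing.Queue()
    lock = multiprocessing.Lock()
    producer = multiprocessing.Process(target=get_target, args=(queue, log))
    consumers = [multiprocessing.Process(target=parse_target, args=(queue, lock, log))
                 for _ in range(3)]
    producer.start()
    for c in consumers:
        c.start()
    producer.join()
    for c in consumers:
        c.join()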
def test_proxy_10_attempts():
    proxy = Proxy()
    proxy.set_number_of_attempts(10)
    assert proxy.get()
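# For context, a minimal sketch of a Proxy interface that the test above would
# exercise. The attribute names, the candidate list and the verification URL
# are assumptions, not the actual implementation under test.
import requests

class Proxy:
    def __init__(self):
        self.attempts = 3
        # Placeholder candidates; a real implementation would fetch a live list.
        self.candidates = ['203.0.113.1:8080', '198.51.100.7:3128']

    def set_number_of_attempts(self, attempts):
        self.attempts = attempts

    def get(self):
        # Try up to `attempts` candidates and return the first one that
        # answers a test request; None if all of them fail.
        for candidate in self.candidates[:self.attempts]:
            try:
                requests.get('https://httpbin.org/ip',
                             proxies={'https': 'https://' + candidate},
                             timeout=5)
                return candidate
            except requests.RequestException:
                continue
        return None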