def get_category(url):
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        cate_temp = html.xpath(
            '/html/body/div[3]/div[1]/article/section[1]/dl/dd[1]/span/text()')
        cate = cate_temp[0] if cate_temp else ''
        return cate
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        status_str = html.xpath(
            '//*[@id="step-box"]/ul/li[1]/span/span/text()')
        status = status_str[0] if status_str else '定标及以后'
        pub_date = html.xpath(
            '//*[@id="step-box"]/ul/li[1]/div/span[2]/text()')
        pub_time = pub_date[0] if pub_date else None
        return [status, pub_time]
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_links(url):
    link_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@class="xiangmu_item"]')
        for order in orders:
            link = 'http://www.51waibao.net/' + order.xpath(
                './div[1]/div[1]/a/@href')[0]
            link_list.append(link)
        return link_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_id(url):
    try:
        # dict.update() returns None, so passing its result as headers would
        # send the request with no headers at all; merge the dicts instead.
        response = requests.get(
            url, headers={**oschina_headers, 'User-Agent': get_ua()})
        if response.status_code == 200:
            data = response.json()
            datas = data['data']['data']
            id_list = [(d['id'], d['type']) for d in datas]
            return id_list
        else:
            return 19, response.status_code
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        info = html.xpath('//*[@id="form1"]/div[6]/div[3]')[0]
        wid = info.xpath('./div[1]/div[1]/ul/li[1]/text()')[0].split(
            'waibao')[1]
        cate = info.xpath('./div[1]/div[1]/ul/li[2]/text()')[0][6:]
        status = info.xpath('./div[1]/div[1]/ul/li[6]/text()')[0]
        pub_time = info.xpath('./div[1]/div[1]/ul/li[7]/text()')[0][6:]
        desc_list = info.xpath('./div[2]/div[2]//text()')
        desc = '\n'.join([dl.strip() for dl in desc_list])
        return [wid, cate, status, pub_time, desc]
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_id(url):
    try:
        # Merge headers explicitly; dict.update() returns None.
        response = requests.get(
            url, headers={**oschina_headers, 'User-Agent': get_ua()})
        if response.status_code == 200:
            data = response.json()
            try:
                datas = data['data']['data']
                id_list = [d['id'] for d in datas]
                return id_list
            except Exception as e:
                return None, e.args[0]
        else:
            return None, response.status_code
    except Exception as e:
        return None, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
        for order in orders:
            info = {}
            link = order.xpath('./div[1]/div[2]/a/@href')[0]
            desc = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return None, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@id="r-list-wrapper"]/div[2]/div')
        for order in orders:
            info = {}
            link = 'http://www.rrkf.com' + order.xpath(
                './div[1]/div/h4/a/@href')[0]
            desc = order.xpath('./div[1]/div/p/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@class="xiangmu_item"]')
        for order in orders:
            info = {}
            link = 'http://www.51waibao.net/' + order.xpath(
                './div[1]/div[1]/a/@href')[0]
            desc = order.xpath('./div[2]/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return None, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        orders = re.findall(
            r'<div class="job">(.*?)<div class="clearfix"></div>',
            text, re.S | re.M)
        for order in orders:
            info = {}
            link = 'http://www.shixian.com' + re.search(
                r'<a target="_blank" href="(.+?)">', order).groups()[0]
            desc_str = re.search(
                r'<p class="describe text-inline-limit">(.*?)</p>',
                order, re.S | re.M).groups()[0]
            desc = emoji_regex.sub('[Emoji]', desc_str)
            start_time = re.search(
                r'.*?(\d{4}-\d{2}-\d{2}).*?', order, re.S | re.M).groups()[0]
            info['link'] = str(link)
            info['desc'] = desc.strip()
            info['start_time'] = start_time + ' 23:59:59'
            info_list.append(info)
        return info_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
        for order in orders:
            info = {}
            link = str(order.xpath('./a/@href')[0])
            desc_str = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
            desc = emoji_regex.sub('[Emoji]', desc_str)
            status = order.xpath('./div[2]/a/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info['status'] = status
            info_list.append(info)
        return info_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
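# `emoji_regex` is used in the two functions above but is not defined in these
# snippets. Below is a minimal sketch of how it could be built; the project's
# real definition may differ (for example, it might come from the `emoji`
# package imported elsewhere), so treat this as an assumption only.
import re

emoji_regex = re.compile(
    '['
    '\U0001F300-\U0001F5FF'   # symbols & pictographs
    '\U0001F600-\U0001F64F'   # emoticons
    '\U0001F680-\U0001F6FF'   # transport & map symbols
    '\U00002700-\U000027BF'   # dingbats
    ']+',
    flags=re.UNICODE,
)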
class RocketpunchDetailSpider(Spider):
    """
    Collects the detail pages in the data store whose details have not yet
    been fetched and fills them in. Stops when there are no more pages to
    crawl. Yields items such as company, job, tags, and techs.
    """
    name = "rocketpunch_jobs"
    hello_url = "https://www.rocketpunch.com/jobs"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "USER_AGENT": get_ua(0),  # get first user_agent string
        "DEFAULT_REQUEST_HEADERS": {
            "dnt": "1",  # do not track me
            "accept": "*/*",  # accept all types
            "accept-language": "ko-KR,ko;q=0.9",
        },
        "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue",
        "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue",
        "ITEM_PIPELINES": {
            "rocketpunch.PrintData": 300,
        },
        "FEEDS": {
            "detail_output.json": {
                "format": "json",
                "indent": 2,
                "encoding": "utf8",
                "fields": None,
            }
        },
    }
    request_urls = []

    def start_requests(self):
        pass

    def parse(self, response):
        pass
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@class="job"]')
        for order in orders:
            info = {}
            link = 'http://www.shixian.com' + order.xpath('./div[1]/a/@href')[0]
            desc = order.xpath('./div[1]/a/p/text()')[0]
            # release_time = order.xpath('./div[1]/div/div/span/text()')[0]
            # if '1 天前发布' in release_time or '小时' in release_time:
            #     info['link'] = link
            #     info['desc'] = desc.strip()
            #     info_list.append(info)
            # else:
            #     continue
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return None, e.args[0]
def expdb_parser(eid):
    """
    Parse a single exploit entry and return one record.
    :param eid: exploit-db id
    :return exp_values:
        :type: dict
        :value: {'EDB-ID:': '37074', 'CVE:': '2015-4039;2015-4038',
                 'Author:': 'Panagiotis Vagenas', 'Type:': 'webapps',
                 'Platform:': 'PHP', 'Date:': '2015-05-21'}
    """
    exp_values = dict()
    url = 'https://www.exploit-db.com/exploits/' + str(eid)
    r = requests.get(url, headers={'User-Agent': get_ua()}, timeout=10)
    html = r.content
    if r.status_code == 200 and b'404 Page Not Found' not in html:
        soup = BeautifulSoup(html, 'html.parser')
        for div in soup.find_all('div', class_='col-6 text-center'):
            exp_value = list(div.stripped_strings)
            if len(exp_value) >= 2:
                exp_values[exp_value[0]] = ";".join(exp_value[1:])
            else:
                exp_values[exp_value[0]] = ''
        return exp_values
    else:
        return False
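# A minimal usage sketch for expdb_parser. The exploit id 37074 is taken from
# the docstring example; network access and the names used above (requests,
# BeautifulSoup, get_ua) are assumed to be available in this module.
if __name__ == '__main__':
    meta = expdb_parser(37074)
    if meta:
        print(meta.get('CVE:'), meta.get('Platform:'), meta.get('Date:'))
    else:
        print('exploit page not found or request failed')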
import random
import re
import time

import emoji
import requests
from lxml import etree

from conf import book_name_list
from db import *
from utils import get_ua, get_proxy, delete_proxy, request_url, request_url_list

base_url = 'https://book.douban.com'
url = 'https://book.douban.com/tag/%E4%B8%9C%E9%87%8E%E5%9C%AD%E5%90%BE'
headers = {"User-Agent": get_ua()}
api_url = 'http://127.0.0.1:5010/get/'

if __name__ == '__main__':
    for tag in book_name_list:
        url = f'https://book.douban.com/tag/{tag}'
        while url:
            # response = requests.get(url, headers=headers)
            response = request_url_list(url)
            response = response.text
            html = etree.HTML(response)
            pic_list = html.xpath('//li[@class="subject-item"]')
            next_url = html.xpath('//span[@class="next"]/a/@href')
            if pic_list:
                for item in pic_list:
                    time.sleep(3)
                    book_url = item.xpath('./div[@class="pic"]/a/@href')[0]
import urllib2
import urllib
import urlparse
import re
import HTMLParser
import xbmc
import xbmcgui
import os
import math
import socket
from operator import itemgetter

from addon.common.net import Net
from addon.common.addon import Addon
from db_utils import DB_Connection
import utils  # provides get_ua() and log(); missing from the original imports

USER_AGENT = utils.get_ua()
_1CH = Addon('plugin.video.1channel')
ADDON_PATH = _1CH.get_path()
ICON_PATH = os.path.join(ADDON_PATH, 'icon.png')
MAX_RETRIES = 2
TEMP_ERRORS = [500, 502, 503, 504]


class PW_Error(Exception):
    pass


class MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        utils.log(
            'Using Custom Redirect: |%s|%s|%s|%s|%s|' %
            (req.header_items(), code, msg, headers, newurl), xbmc.LOGDEBUG)
import urllib2
import urllib
import urlparse
import re
import HTMLParser
import xbmc
import xbmcgui
import os
import math
import socket
from operator import itemgetter

from addon.common.net import Net
from addon.common.addon import Addon
from db_utils import DB_Connection
import utils  # provides get_ua() and log(); missing from the original imports

USER_AGENT = utils.get_ua()
_1CH = Addon('plugin.video.1channel_bp')
ADDON_PATH = _1CH.get_path()
ICON_PATH = os.path.join(ADDON_PATH, 'icon.png')
MAX_RETRIES = 2
TEMP_ERRORS = [500, 502, 503, 504]


class PW_Error(Exception):
    pass


class MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        utils.log('Using Custom Redirect: |%s|%s|%s|%s|%s|' %
                  (req.header_items(), code, msg, headers, newurl),
                  xbmc.LOGDEBUG)
        request = urllib2.HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)
        if request:
            host = request.get_host()
def process_request(self, request, spider):
    request.headers["User-Agent"] = get_ua()
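# If this process_request method lives on a Scrapy downloader middleware class,
# it would typically be enabled in the project settings. A sketch, assuming a
# hypothetical module path `myproject.middlewares.RandomUserAgentMiddleware`:
#
# settings.py
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.RandomUserAgentMiddleware": 543,
}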
def fetch(url):
    headers = {'User-Agent': get_ua()}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf8'
    return r.text
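# A short usage sketch pairing fetch() with the lxml parsing used elsewhere in
# these snippets. The URL is a placeholder, not one used by the project.
from lxml import etree

html = etree.HTML(fetch('http://example.com'))
title = html.xpath('//title/text()')
print(title[0] if title else '')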
class RocketpunchPageSpider(Spider):
    """
    first step : https://www.rocketpunch.com/jobs
        use the response headers to set the request headers used from here on
    second step : https://www.rocketpunch.com/api/jobs/template?page=&q=
        visit the first page, determine the total number of pages, then
        generate all page requests at once
    """
    name = "rocketpunch_jobs"
    hello_url = "https://www.rocketpunch.com/jobs"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "USER_AGENT": get_ua(0),  # get first user_agent string
        "DEFAULT_REQUEST_HEADERS": {
            "dnt": "1",  # do not track me
            "accept": "*/*",  # accept all types
            "accept-language": "ko-KR,ko;q=0.9",
        },
        "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue",
        "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue",
        "ITEM_PIPELINES": {
            "rocketpunch.PrintData": 300,
        },
        # "FEEDS": {
        #     "output.json": {
        #         "format": "json",
        #         "indent": 2,
        #         "encoding": "utf8",
        #         "fields": None,
        #     }
        # },
    }

    def start_requests(self):
        """
        Let's say hi to Rocketpunch! If this action succeeds, we can get csrf tokens.
        """
        yield Request(url=self.hello_url, callback=self.hello_parser)

    def hello_parser(self, response):
        yield Request(
            url="https://www.rocketpunch.com/api/jobs/template?page=&q=",
            callback=self.first_page_parser,
            meta={"page_number": 1},
        )

    def first_page_parser(self, response):
        data = Selector(text=response.json()["data"]["template"],
                        base_url=response.url)
        end_page_number = int(
            data.css("div.ui.pagination div.disabled.item + a::text").get())
        self.logger.info(f"generating 1 to {end_page_number} pages requests")
        yield self.page_parser(response)
        # Note: only pages 2-3 are requested here; end_page_number is logged
        # but not used to bound the range.
        for page_number in range(2, 4):
            yield Request(
                url=f"https://www.rocketpunch.com/api/jobs/template?page={page_number}&q=",
                headers={"Referer": "https://www.rocketpunch.com/jobs"},
                callback=self.page_parser,
                meta={"page_number": page_number},
            )

    def page_parser(self, response):
        """
        Page DOM:
        #company-list > div.company            ** list of companies
            attr data-company-id               unique company id
            div.content
                > div.company-name
                    > a attr[href]             company detail page url
                      h4.name::text            company name
                div.description::text          company description
                div.meta::text                 company job fields
            div.company-jobs-detail            ** list of job postings
                a.job-title.link attr[href]    job detail page url
                a.job-title.link::text         job title
                span.job-stat-info::text       salary, required experience, etc.
                > div.job-dates span ~3        deadline, [other conditions], last modified
        """
        self.logger.debug("=" * 50)
        self.logger.debug(response.request.headers.to_unicode_dict())
        self.logger.debug(response.meta)
        self.logger.debug("=" * 50)
        text = response.json()["data"]["template"]
        selector = Selector(text=text)
        company_list = []
        for company in selector.css("#company-list > div.company"):
            _company_name = company.css("div.content>div.company-name")[0]
            _a = _company_name.css("a[target='_blank']")[0]
            _job_details = company.css(
                "div.company-jobs-detail>div.job-detail")
            company_id = company.attrib["data-company_id"]
            company_href = _a.attrib["href"]
            company_name = "".join(
                _a.css(".header.name>strong::text,small::text").getall())
            company_description = company.css(
                "div.description::text").get().strip()
            company_meta_info = company.css(
                "div.nowrap.meta::text").get().strip()
            job_details = []
            for job in _job_details:
                job_href = job.css("a.job-title::attr(href)").get()
                # TODO: extract the job id with \/.*?\/(\d*?)\/
                job_title = job.css("a.job-title::text").get()
                job_stat_info = job.css("span.job-stat-info::text").get()
                _dates = tuple(
                    filter(
                        lambda x: x != "",
                        [
                            text.strip() for text in job.css(
                                "div.job-dates>span::text").getall()
                        ],
                    ))
                job_date_until = _dates[0]
                job_date_modified = _dates[-1]
                job_date_etc = _dates[1] if len(_dates) == 3 else ""
                job_details.append({
                    "job_href": job_href,
                    "job_title": job_title,
                    "job_stat_info": job_stat_info,
                    "job_date_until": job_date_until,
                    "job_date_modified": job_date_modified,
                    "job_date_etc": job_date_etc,
                })
            company_list.append({
                "company_id": company_id,
                "company_href": company_href,
                "company_name": company_name,
                "company_description": company_description,
                "company_meta_info": company_meta_info,
                "job_details": job_details,
            })
        return {
            "page": response.meta["page_number"],
            "company_list": company_list,
        }
def __init__(self, sites, proxies=None):
    self.sites = sites
    self.proxies = proxies
    self.headers = {'User-Agent': get_ua()}
    self.queue = queue.Queue()
    self.scheduling()
def __init__(self, queue, proxies=None):
    super().__init__()
    self.queue = queue
    self.headers = {'User-Agent': get_ua()}
    self.proxies = proxies