Example #1
def get_category(url):
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        cate_temp = html.xpath('/html/body/div[3]/div[1]/article/section[1]/dl/dd[1]/span/text()')
        cate = cate_temp[0] if cate_temp else ''
        return cate
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_detail(url):
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        status_str = html.xpath(
            '//*[@id="step-box"]/ul/li[1]/span/span/text()')
        status = status_str[0] if status_str else '定标及以后'
        pub_date = html.xpath(
            '//*[@id="step-box"]/ul/li[1]/div/span[2]/text()')
        pub_time = pub_date[0] if pub_date else None
        return [status, pub_time]
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
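Every example on this page relies on a get_ua() helper that supplies a User-Agent string but is never shown in the excerpts. A minimal sketch of what such a helper might look like, assuming it simply draws from a hard-coded list (the list contents and the optional index parameter are guesses based on how the examples call it):

import random

# Hypothetical helper, not the original implementation.
_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
]

def get_ua(index=None):
    # Some examples call get_ua(0) to pin the first entry, so an optional index is accepted.
    if index is not None:
        return _USER_AGENTS[index]
    return random.choice(_USER_AGENTS)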
Example #3
def get_links(url):
    link_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@class="xiangmu_item"]')
        for order in orders:
            link = 'http://www.51waibao.net/' + order.xpath(
                './div[1]/div[1]/a/@href')[0]
            link_list.append(link)
        return link_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
Example #4
def get_id(url):
    try:
        # dict.update() returns None, so passing its result as headers sends no headers at all;
        # merge the shared oschina_headers with a fresh User-Agent instead.
        headers = {**oschina_headers, 'User-Agent': get_ua()}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            datas = data['data']['data']
            id_list = [(d['id'], d['type']) for d in datas]
            return id_list
        else:
            return 19, response.status_code
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
Example #5
def get_detail(url):
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        info = html.xpath('//*[@id="form1"]/div[6]/div[3]')[0]
        wid = info.xpath('./div[1]/div[1]/ul/li[1]/text()')[0].split(
            'waibao')[1]
        cate = info.xpath('./div[1]/div[1]/ul/li[2]/text()')[0][6:]
        status = info.xpath('./div[1]/div[1]/ul/li[6]/text()')[0]
        pub_time = info.xpath('./div[1]/div[1]/ul/li[7]/text()')[0][6:]
        desc_list = info.xpath('./div[2]/div[2]//text()')
        desc = '\n'.join([dl.strip() for dl in desc_list])
        return [wid, cate, status, pub_time, desc]
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
Example #6
def get_id(url):
    try:
        # dict.update() returns None; merge the headers instead of passing the update() result.
        response = requests.get(url, headers={**oschina_headers, 'User-Agent': get_ua()})
        if response.status_code == 200:
            data = response.json()
            try:
                datas = data['data']['data']
                id_list = [d['id'] for d in datas]
                return id_list
            except Exception as e:
                return None, e.args[0]
        else:
            return None, response.status_code
    except Exception as e:
        return None, e.args[0]
Example #7
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
        for order in orders:
            info = {}
            link = order.xpath('./div[1]/div[2]/a/@href')[0]
            desc = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return None, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@id="r-list-wrapper"]/div[2]/div')
        for order in orders:
            info = {}
            link = 'http://www.rrkf.com' + order.xpath(
                './div[1]/div/h4/a/@href')[0]
            desc = order.xpath('./div[1]/div/p/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent': get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@class="xiangmu_item"]')
        for order in orders:
            info = {}
            link = 'http://www.51waibao.net/' + order.xpath(
                './div[1]/div[1]/a/@href')[0]
            desc = order.xpath('./div[2]/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return None, e.args[0]
Example #10
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent':get_ua()}).text
        orders = re.findall(r'<div class="job">(.*?)<div class="clearfix"></div>', text, re.S | re.M)
        for order in orders:
            info = {}
            link = 'http://www.shixian.com' + re.search(r'<a target="_blank" href="(.+?)">', order).groups()[0]
            desc_str = re.search(r'<p class="describe text-inline-limit">(.*?)</p>', order, re.S | re.M).groups()[0]
            desc = emoji_regex.sub('[Emoji]', desc_str)
            start_time = re.search(r'.*?(\d{4}-\d{2}-\d{2}).*?', order, re.S | re.M).groups()[0]
            info['link'] = str(link)
            info['desc'] = desc.strip()
            info['start_time'] = start_time + ' 23:59:59'
            info_list.append(info)
        return info_list

    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
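Examples 10 and 11 also reference an emoji_regex object that the excerpts never define; from emoji_regex.sub('[Emoji]', ...) it is evidently a pre-compiled pattern used to replace emoji with a placeholder. A minimal sketch under that assumption (the code-point ranges are illustrative, not the original definition):

import re

# Hypothetical definition: a compiled pattern over common emoji code-point ranges.
emoji_regex = re.compile(
    '[\U0001F300-\U0001FAFF'   # misc symbols, pictographs, emoticons, transport, etc.
    '\u2600-\u27BF'            # miscellaneous symbols and dingbats
    '\uFE0F]'                  # variation selector-16
)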
Example #11
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent':get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@id="db_adapt_id"]/div[position()>2]')
        for order in orders:
            info = {}
            link = str(order.xpath('./a/@href')[0])
            desc_str = order.xpath('./div[1]/div[1]/div/a/p/text()')[0]
            desc = emoji_regex.sub('[Emoji]', desc_str)
            status = order.xpath('./div[2]/a/text()')[0]
            info['link'] = link
            info['desc'] = desc.strip()
            info['status'] = status
            info_list.append(info)
        return info_list
    except Exception as e:
        return e.__traceback__.tb_lineno, e.args[0]
Example #12
class RocketpunchDetailSpider(Spider):
    """
    Collects the detail pages in the data store whose detail content has not yet been checked and fills them in.
    Terminates once there are no more pages left to crawl.
    Returns items such as company, job, tags, and techs.
    """

    name = "rocketpunch_jobs"
    hello_url = "https://www.rocketpunch.com/jobs"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "USER_AGENT": get_ua(0),  # get first user_agent string
        "DEFAULT_REQUEST_HEADERS": {
            "dnt": "1",  # do not track me
            "accept": "*/*",  # accept all types
            "accept-language": "ko-KR,ko;q=0.9",
        },
        "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue",
        "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue",
        "ITEM_PIPELINES": {
            "rocketpunch.PrintData": 300,
        },
        "FEEDS": {
            "detail_output.json": {
                "format": "json",
                "indent": 2,
                "encoding": "utf8",
                "fields": None,
            }
        },
    }
    request_urls = []

    def start_requests(self):
        pass

    def parse(self, response):
        pass
Example #13
def get_info(url):
    info_list = []
    try:
        text = requests.get(url, headers={'User-Agent':get_ua()}).text
        html = etree.HTML(text)
        orders = html.xpath('//*[@class="job"]')
        for order in orders:
            info = {}
            link = 'http://www.shixian.com' + order.xpath('./div[1]/a/@href')[0]
            desc = order.xpath('./div[1]/a/p/text()')[0]
            # release_time = order.xpath('./div[1]/div/div/span/text()')[0]
            # if '1 天前发布' in release_time or '小时' in release_time:
            #     info['link'] = link
            #     info['desc'] = desc.strip()
            #     info_list.append(info)
            # else:
            #     continue
            info['link'] = link
            info['desc'] = desc.strip()
            info_list.append(info)
        return info_list
    except Exception as e:
        return None, e.args[0]
Example #14
def expdb_parser(eid):
    """
    Parse a single exploit entry and return one record.
    :param eid: Exploit-DB entry id
    :return exp_values:
        :type: dict
        :value: {'EDB-ID:': '37074', 'CVE:': '2015-4039;2015-4038', 'Author:': 'Panagiotis Vagenas', 'Type:': 'webapps', 'Platform:': 'PHP', 'Date:': '2015-05-21'}
    exp_values = dict()
    url = 'https://www.exploit-db.com/exploits/' + str(eid)
    r = requests.get(url, headers={'User-Agent': get_ua()}, timeout=10)
    html = r.content
    if r.status_code == 200 and b'404 Page Not Found' not in html:
        soup = BeautifulSoup(html, 'html.parser')
        for div in soup.find_all('div', class_='col-6 text-center'):
            exp_value = list(div.stripped_strings)
            if len(exp_value) >= 2:
                exp_values[exp_value[0]] = ";".join(exp_value[1:])
            else:
                exp_values[exp_value[0]] = ''

        return exp_values
    else:
        return False
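The docstring above already shows the shape of the returned dict; a short usage sketch reusing the same entry id:

# Usage sketch, reusing the id from the docstring example.
exp = expdb_parser(37074)
if exp:
    print(exp.get('CVE:'))          # e.g. '2015-4039;2015-4038'
else:
    print('entry not found or request failed')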
Example #15
import random
import re
import time
from db import *
import emoji

from conf import book_name_list
import requests
from utils import get_ua, get_proxy, delete_proxy, request_url, request_url_list
from lxml import etree

base_url = 'https://book.douban.com'
url = 'https://book.douban.com/tag/%E4%B8%9C%E9%87%8E%E5%9C%AD%E5%90%BE'
headers = {"User-Agent": get_ua()}
api_url = 'http://127.0.0.1:5010/get/'

if __name__ == '__main__':
    for tag in book_name_list:
        url = f'https://book.douban.com/tag/{tag}'
        while url:
            # response = requests.get(url, headers=headers)
            response = request_url_list(url)
            response = response.text
            html = etree.HTML(response)
            pic_list = html.xpath('//li[@class="subject-item"]')

            next_url = html.xpath('//span[@class="next"]/a/@href')
            if pic_list:
                for item in pic_list:
                    time.sleep(3)
                    book_url = item.xpath('./div[@class="pic"]/a/@href')[0]
Example #16
import urllib2
import urllib
import urlparse
import re
import HTMLParser
import xbmc
import xbmcgui
import os
import math
import socket
from operator import itemgetter
from addon.common.net import Net
from addon.common.addon import Addon
from db_utils import DB_Connection
import utils  # used below for utils.get_ua() and utils.log()

USER_AGENT = utils.get_ua()
_1CH = Addon('plugin.video.1channel')
ADDON_PATH = _1CH.get_path()
ICON_PATH = os.path.join(ADDON_PATH, 'icon.png')
MAX_RETRIES = 2
TEMP_ERRORS = [500, 502, 503, 504]


class PW_Error(Exception):
    pass


class MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        utils.log(
            'Using Custom Redirect: |%s|%s|%s|%s|%s|' %
Example #17
import urllib2
import urllib
import urlparse
import re
import HTMLParser
import xbmc
import xbmcgui
import os
import math
import socket
from operator import itemgetter
from addon.common.net import Net
from addon.common.addon import Addon
from db_utils import DB_Connection
import utils  # used below for utils.get_ua() and utils.log()

USER_AGENT = utils.get_ua()
_1CH = Addon('plugin.video.1channel_bp')
ADDON_PATH = _1CH.get_path()
ICON_PATH = os.path.join(ADDON_PATH, 'icon.png')
MAX_RETRIES = 2
TEMP_ERRORS = [500, 502, 503, 504]

class PW_Error(Exception):
    pass

class MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        utils.log('Using Custom Redirect: |%s|%s|%s|%s|%s|' % (req.header_items(), code, msg, headers, newurl), xbmc.LOGDEBUG)
        request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
        if request:
            host = request.get_host()
Example #18
    def process_request(self, request, spider):
        request.headers["User-Agent"] = get_ua()
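This process_request hook is the shape of a Scrapy downloader middleware that stamps a random User-Agent onto every outgoing request. For it to run it must be registered in the project settings; the sketch below is illustrative only, with a hypothetical module path and class name:

# settings.py (sketch; "myproject.middlewares.RandomUserAgentMiddleware" is an assumed path)
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.RandomUserAgentMiddleware": 400,
}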
Example #19
def fetch(url):
    headers = {'User-Agent': get_ua()}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf8'
    return r.text
Example #20
class RocketpunchPageSpider(Spider):
    """
    first step : https://www.rocketpunch.com/jobs
        sets the request headers to be used later, based on the response headers
    second step : https://www.rocketpunch.com/api/jobs/template?page=&q=
        visits the first page, determines the total number of pages, then generates all remaining requests at once
    """

    name = "rocketpunch_jobs"
    hello_url = "https://www.rocketpunch.com/jobs"
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "USER_AGENT": get_ua(0),  # get first user_agent string
        "DEFAULT_REQUEST_HEADERS": {
            "dnt": "1",  # do not track me
            "accept": "*/*",  # accept all types
            "accept-language": "ko-KR,ko;q=0.9",
        },
        "SCHEDULER_DISK_QUEUE": "scrapy.squeues.PickleFifoDiskQueue",
        "SCHEDULER_MEMORY_QUEUE": "scrapy.squeues.FifoMemoryQueue",
        "ITEM_PIPELINES": {
            "rocketpunch.PrintData": 300,
        },
        # "FEEDS": {
        #     "output.json": {
        #         "format": "json",
        #         "indent": 2,
        #         "encoding": "utf8",
        #         "fields": None,
        #     }
        # },
    }

    def start_requests(self):
        """
        Let's say hi to Rocketpunch! If this action succeeds, we can get csrf tokens.
        """
        yield Request(url=self.hello_url, callback=self.hello_parser)

    def hello_parser(self, response):
        yield Request(
            url="https://www.rocketpunch.com/api/jobs/template?page=&q=",
            callback=self.first_page_parser,
            meta={"page_number": 1},
        )

    def first_page_parser(self, response):
        data = Selector(text=response.json()["data"]["template"],
                        base_url=response.url)
        end_page_number = int(
            data.css("div.ui.pagination div.disabled.item + a::text").get())
        self.logger.info(f"generating 1 to {end_page_number} pages requests")
        yield self.page_parser(response)
        for page_number in range(2, 4):  # NOTE: capped at page 3 here; use range(2, end_page_number + 1) to request every page
            yield Request(
                url=
                f"https://www.rocketpunch.com/api/jobs/template?page={page_number}&q=",
                headers={"Referer": "https://www.rocketpunch.com/jobs"},
                callback=self.page_parser,
                meta={"page_number": page_number},
            )

    def page_parser(self, response):
        """
        Page DOM
        #company-list > div.company             ** list of companies
        get attr data-company-id                unique company id
            div.content                         -----------------------------
                >div.company-name               -----------------------------
                    >a attr[href]               company detail page URL
                    h4.name::text               company name
                div.description :: text         company description
                div.meta :: text                company job/business fields
                div.company-jobs-detail         ** list of job postings
                    a.job-title.link attr[href] job detail page URL
                    a.job-title.link :: text    job title
                    span.job-stat-info :: text  salary, required experience, etc.
                    >div.job-dates
                        span ~3                 deadline, [other terms], modified date
        """
        self.logger.debug("=" * 50)
        self.logger.debug(response.request.headers.to_unicode_dict())
        self.logger.debug(response.meta)
        self.logger.debug("=" * 50)
        text = response.json()["data"]["template"]
        selector = Selector(text=text)
        company_list = []
        for company in selector.css("#company-list > div.company"):
            _company_name = company.css("div.content>div.company-name")[0]
            _a = _company_name.css("a[target='_blank']")[0]
            _job_details = company.css(
                "div.company-jobs-detail>div.job-detail")
            company_id = company.attrib["data-company_id"]
            company_href = _a.attrib["href"]
            company_name = "".join(
                _a.css(".header.name>strong::text,small::text").getall())
            company_description = company.css(
                "div.description::text").get().strip()
            company_meta_info = company.css(
                "div.nowrap.meta::text").get().strip()
            job_details = []
            for job in _job_details:
                job_href = job.css("a.job-title::attr(href)").get()
                # TODO: extract the job id with \/.*?\/(\d*?)\/
                job_title = job.css("a.job-title::text").get()
                job_stat_info = job.css("span.job-stat-info::text").get()
                _dates = tuple(
                    filter(
                        lambda x: x != "",
                        [
                            text.strip() for text in job.css(
                                "div.job-dates>span::text").getall()
                        ],
                    ))
                job_date_until = _dates[0]
                job_date_modified = _dates[-1]
                job_date_etc = _dates[1] if len(_dates) == 3 else ""

                job_details.append({
                    "job_href": job_href,
                    "job_title": job_title,
                    "job_stat_info": job_stat_info,
                    "job_date_until": job_date_until,
                    "job_date_modified": job_date_modified,
                    "job_date_etc": job_date_etc,
                })
            company_list.append({
                "company_id": company_id,
                "company_href": company_href,
                "company_name": company_name,
                "company_description": company_description,
                "company_meta_info": company_meta_info,
                "job_details": job_details,
            })

        return {
            "page": response.meta["page_number"],
            "company_list": company_list
        }
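Both Rocketpunch spiders above are ordinary Scrapy Spider subclasses, so besides the scrapy CLI they can be driven programmatically. A minimal sketch, assuming the classes are importable from a module named rocketpunch (the import path is an assumption about the project layout):

# Sketch only; "rocketpunch" as an import path is assumed for illustration.
from scrapy.crawler import CrawlerProcess
from rocketpunch import RocketpunchPageSpider

process = CrawlerProcess()
process.crawl(RocketpunchPageSpider)   # the spider's custom_settings still apply
process.start()                        # blocks until the crawl finishes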
Example #21
    def __init__(self, sites, proxies=None):
        self.sites = sites
        self.proxies = proxies
        self.headers = {'User-Agent': get_ua()}
        self.queue = queue.Queue()
        self.scheduling()
Example #22
    def __init__(self, queue, proxies=None):
        super().__init__()
        self.queue = queue
        self.headers = {'User-Agent': get_ua()}
        self.proxies = proxies