# Example no. 1 (score: 0)
def parse_link(url, mongo_table):
    """Scrape job-listing pages 1-30 under *url* and store each posting.

    For every page, downloads ``{url}{page}/?filterOption=3``, extracts the
    fields configured by ``config.position()`` and saves one document per
    posting into *mongo_table* via ``save_database``.

    Args:
        url: Base listing URL, expected to end with '/' so the page number
             can be appended directly (see the commented example URL below).
        mongo_table: Target collection handle passed through to save_database.
    """
    # url = 'https://www.lagou.com/zhaopin/Python/'
    for page in range(1, 31):
        link = '{}{}/?filterOption=3'.format(url, str(page))
        resp = requests.get(link, headers=config.head())
        if resp.status_code == 404:
            # Fix: the original only did `pass` here and then fell through to
            # use `soup`, which was never assigned -> NameError. Skip the page.
            continue

        soup = BeautifulSoup(resp.text, 'lxml')
        pos_link = config.position()  # CSS selectors for each field
        sel = list(map(soup.select, pos_link))  # one result list per selector

        # zip(*sel) transposes: one tuple of tags per job posting.
        for position, add, release_time, money, need, company, tag, welfare in zip(*sel):
            # Renamed from `data` to avoid shadowing the iterator variable
            # the original reused for both the zip object and the dict.
            record = {
                'position': position.get_text(),
                'add': add.get_text(),
                'release_time': release_time.get_text(),
                'money': money.get_text(),
                # text is '\n<exp>\n<edu>\n...'; index 2 picks the needed field
                'need': need.get_text().split('\n')[2],
                'company': company.get_text(),
                'tag': tag.get_text().replace('\n', '-'),
                'welfare': welfare.get_text()
            }
            save_database(record, mongo_table)
# Example no. 2 (score: 0)
def parse_link(url):
    """Scrape one content page and print the extracted job fields.

    Downloads *url*, pulls the fields selected by ``config.position()`` and
    prints a list per posting. Always releases the module-level
    ``thread_lock``, even on error or 404.

    Args:
        url: Absolute URL of the listing page to scrape.
    """
    # headers = config.random_header()
    try:
        headers = config.head()
        resp = requests.get(url, headers=headers)
        time.sleep(10)  # throttle to avoid the site's anti-crawler checks
        if resp.status_code == 404:
            # Fix: the original fell through with `soup` undefined -> NameError
            # (which also leaked the lock). Bail out; finally releases the lock.
            return

        soup = BeautifulSoup(resp.text, 'lxml')
        pos_link = config.position()  # CSS selectors for each field
        sel = list(map(soup.select, pos_link))  # one result list per selector
        # zip(*sel) transposes: one tuple of tags per job posting.
        for position, region, release_time, money, need, company, tag, welfare, industry in zip(
                *sel):
            position = position.get_text()
            # region text looks like 'city·area'
            city = region.get_text().split('·')[0]
            area = region.get_text().split('·')[1]
            release_time = release_time.get_text()
            money = money.get_text()
            # text is '\n<exp>\n<edu>\n...'; index 2 picks the needed field
            need = need.get_text().split('\n')[2]
            company = company.get_text()
            tag = tag.get_text().replace('\n', '-')
            welfare = welfare.get_text()
            industry = industry.get_text().replace('\n', '').replace(' ', '')
            info = [
                position, city, area, release_time, money, need, company, tag,
                welfare, industry
            ]
            print(info)
    finally:
        # Fix: release was unguarded; any exception above would leave the
        # lock held forever and deadlock the other threads.
        thread_lock.release()
# Example no. 3 (score: 0)
def save_img(url, name):
    """Download a protocol-relative image URL into ./img/<name>.

    Args:
        url: Protocol-relative URL (starts with '//'); 'http:' is prepended.
        name: File name to save under the ./img directory.
    """
    url = "http:{}".format(url)
    img_path = os.path.join(os.getcwd(), "img")
    # Fix: isdir-then-mkdir is a TOCTOU race; makedirs with exist_ok=True
    # is atomic with respect to "already exists".
    os.makedirs(img_path, exist_ok=True)
    r = requests.get(url, headers=config.head(config.img_temp))
    with open(os.path.join(img_path, name), 'wb') as f:
        f.write(r.content)
        print(name, "  done")
    time.sleep(3)  # throttle between image downloads
# Example no. 4 (score: 0)
def test(page):
    """Fetch one JSON page of the picture list and download new images.

    Builds the API URL from ``config.urldemo`` with *page* and a
    timestamp-derived token, then saves every original picture that is not
    already present under ./img.

    Args:
        page: Page number substituted into config.urldemo.
    """
    # NOTE(review): truncating the epoch and appending '558' reproduces the
    # site's expected timestamp parameter format -- confirm against config.
    t = str(int(time.time()))[:-2] + '558'
    head = config.head()
    url = config.urldemo.format(str(page), t)
    r = requests.get(url, headers=head)
    j = json.loads(r.text)
    for i in j["data"]["pic_list"]:
        try:
            or_picurl = i['original_pic']
            name = str(i['mid']) + '_' + i['sub_name'].replace(
                ":", '') + '.' + or_picurl[-3:]
        except (KeyError, TypeError):
            # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            # Only skip entries missing the expected keys or with bad values.
            continue
        if os.path.isfile(os.path.join(os.getcwd(), "img", name)):
            print("{} 已存在 跳过爬取".format(name))
            continue
        save_img(or_picurl, name)
# Example no. 5 (score: 0)
def parse_link(url):
    """Scrape one content page and insert each job posting into the DB.

    Downloads *url*, extracts the fields selected by ``config.position()``
    and persists one row per posting via ``db.insert_position``.

    Args:
        url: Absolute URL of the listing page to scrape.
    """
    # resp = requests.get(url, headers=headers, proxies=proxies)
    # headers = config.random_header()
    headers = config.head()
    print(headers)
    resp = requests.get(url, headers=headers)

    if resp.status_code == 404:
        # Fix: the original only did `pass` and then fell through to use
        # `soup`, which was never assigned -> NameError. Bail out instead.
        return

    soup = BeautifulSoup(resp.text, 'lxml')
    pos_link = config.position()  # CSS selectors for each field
    sel = list(map(soup.select, pos_link))  # one result list per selector
    # zip(*sel) transposes: one tuple of tags per job posting.
    for position, region, release_time, money, need, company, tag, welfare, industry in zip(
            *sel):

        position = position.get_text()
        # region text looks like 'city·area'
        city = region.get_text().split('·')[0]
        area = region.get_text().split('·')[1]
        release_time = release_time.get_text()
        money = money.get_text()
        # text is '\n<exp>\n<edu>\n...'; index 2 picks the needed field
        need = need.get_text().split('\n')[2]
        company = company.get_text()
        tag = tag.get_text().replace('\n', '-')
        welfare = welfare.get_text()
        industry = industry.get_text().replace('\n', '').replace(' ', '')

        db.insert_position(position, city, area, release_time, money, need,
                           company, tag, welfare, industry)