Example #1
    def run(self):
        headers = my_user_agent.get_user_agent()
        # Keep pulling URLs until url_queue is empty, then stop
        while not self.url_queue.empty():
            response = requests.get(self.url_queue.get(), headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)
            # print(response.text)
            print("*" * 60)
            print(response.url)
            print("*" * 60)
Example #2
def get_video(file_name, url):
    '''
    Fetch the downloadable URL of the video and save it to disk.
    :param file_name: output file name
    :param url: download link
    :return:
    '''
    # Issue the request with requests; stream=True keeps the whole body out of memory
    response = requests.get(url, stream=True, headers=get_user_agent())
    if response.status_code == 200:
        # Write the received video data to disk chunk by chunk
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        print("Download finished")
    else:
        print("Video not found...")
import requests
from my_user_agent import get_user_agent
from selenium import webdriver
import time
from lxml import etree
import json
from queue import Queue
from gevent import monkey
import gevent
import random

# Needed when there are blocking (time-consuming) operations
monkey.patch_all()  # swap the blocking calls used in this program for gevent's cooperative versions
# Xigua Video food channel
headers = get_user_agent()
file_name = input("Enter the file name to save: ")
path = "C:\\Users\\Administrator\\Desktop\\youtube\\"
print(path)
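
With monkey.patch_all() in effect, the blocking requests call inside get_video yields to other greenlets, so several downloads can run concurrently in a single thread. A minimal sketch under that assumption; the video URLs are placeholders:

video_urls = [
    "https://example.com/video/1.mp4",  # placeholder
    "https://example.com/video/2.mp4",  # placeholder
]
# Spawn one greenlet per download and wait for all of them to finish
jobs = [
    gevent.spawn(get_video, path + "%s_%d.mp4" % (file_name, i), url)
    for i, url in enumerate(video_urls)
]
gevent.joinall(jobs)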


def get_source():
    # `browser` is the module-level selenium webdriver instance created elsewhere in this file
    browser.implicitly_wait(10)
    for i in range(3):
        # Scroll to the bottom of the page (simulates dragging the scrollbar)
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage"
        )
        time.sleep(1)

    source = browser.page_source
    return source
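
get_source relies on a browser object that is not shown in this excerpt; a minimal sketch of how it might be created and its output parsed, assuming Chrome and a placeholder URL (the XPath is hypothetical, the real one is not shown):

browser = webdriver.Chrome()  # assumes chromedriver is on PATH
browser.get("https://www.ixigua.com/")  # placeholder channel URL

# Hand the scrolled page source to lxml for XPath extraction
html = etree.HTML(get_source())
video_links = html.xpath('//a/@href')  # hypothetical XPath
browser.quit()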
Example #3

import os
import re
from queue import Queue
from threading import Thread
import requests
import my_user_agent
import json
# from gevent import monkey
import gevent
import random

# Needed when there are blocking (time-consuming) operations
# monkey.patch_all()  # swap the blocking calls used in this program for gevent's cooperative versions

path = "C:\\Users\\Administrator\\Desktop\\youtube\\youku\\"
headers = my_user_agent.get_user_agent()


# Crawler thread class (consumes JSON URLs from a queue)
class CrawlInfo(Thread):
    def __init__(self, url_q, info_q):
        Thread.__init__(self)
        self.url_q = url_q
        self.info_q = info_q

    def run(self):

        # params = {
        #     'page_size': 10,
        #     'next_offset': str(num),
        #     'tag': '今日热门',  # i.e. "today's hot"
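
The excerpt cuts off mid-way through run(); a minimal sketch of how the method might continue, mirroring the queue-draining pattern from Example #1. The JSON handling is an assumption, and the real request params are only partially visible above:

    def run(self):
        headers = my_user_agent.get_user_agent()
        # Drain url_q until it is empty, handing each JSON payload on to info_q
        while not self.url_q.empty():
            response = requests.get(self.url_q.get(), headers=headers)
            if response.status_code == 200:
                self.info_q.put(json.loads(response.text))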