# Shared imports for the scraping examples below
import copy
import re

import requests
from bs4 import BeautifulSoup

# Project-specific helper modules referenced by these examples
import contentDeal
import reviewDeal


def get_html_link(link_list):
    # Create the record dict
    data = {
        'title': '',
        'url': '',
        'review': '',
        'content': '',
        'time': '',
        'type': ''
    }
    dataList = []
    url_list = []  # store every visited URL to avoid requesting it twice
    # Iterate over all sub-links
    for i in link_list:
        # Skip links that have already been visited
        flag = 0
        for url in url_list:
            if url == str(i):
                flag = 1
                break
        if flag == 1:
            continue
        url_list.append(i)
        # Plain GET request
        html_link = requests.get(i)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        title = soup.title.string
        url = i
        review = ""
        content_div = soup.select(".entry-content")[0]
        content_list = content_div.select("p")
        # Also collect the h1-h5 headings inside the article body
        for k in range(1, 6):
            appended_string = content_div.select("h" + str(k))
            for ap in appended_string:
                content_list.append(ap)
        content = ""
        for j in content_list:
            content = content + str(j)
        # Clean the content: replace characters in the string and strip image markup from the body
        content = contentDeal.deal_content(content)
        # Extract the publish time from the article:published_time meta tag
        time = re.search(
            r"<meta property=\"article:published_time\" content=\"(.*?)\" />",
            html_link.text).group(1)
        type = "christmas"
        # Fill in the dict
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        # Append to the list
        dataList.append(data)
        # Copy the dict so the entry just appended is not overwritten next iteration
        data = copy.copy(data)
    return dataList
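
# contentDeal.deal_content is used above but its source is not shown here. The
# following is only a minimal sketch, assuming its job is what the comment says
# (replace characters and strip image markup from the body); the real project
# helper may do more. The name deal_content_sketch is hypothetical.
def deal_content_sketch(content):
    # Drop <img ...> tags so no image markup survives in the stored body
    content = re.sub(r"<img[^>]*>", "", content)
    # Normalize a couple of characters that commonly break downstream storage
    content = content.replace("\xa0", " ").replace("\u200b", "")
    return content.strip()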


def get_html_link(link_list):
    # Create the record dict
    data = {
        'title': '',
        'url': '',
        'review': '',
        'content': '',
        'time': '',
        'type': ''
    }
    dataList = []
    url_list = []  # store every visited URL to avoid requesting it twice
    # Iterate over all sub-links
    for i in link_list:
        # Skip links that have already been visited
        flag = 0
        for url in url_list:
            if url == str(i):
                flag = 1
                break
        if flag == 1:
            continue
        url_list.append(i)
        # Plain GET request
        html_link = requests.get(i)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        title = soup.title.string
        url = i
        review = ""
        content_list = soup.select(".article-content-main")[0].select("p")
        content = ""
        for j in content_list:
            content = content + str(j)
        # Clean the content: replace characters in the string and strip image markup from the body
        content = contentDeal.deal_content(content)
        # Look up the publish time via the <time> tag
        time = soup.select("time")[0]
        time = re.search(r"<time datetime=\"(.*)\">", str(time)).group(1)
        type = "science-and-technology"
        # Fill in the dict
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        # Append to the list
        dataList.append(data)
        # Copy the dict so the entry just appended is not overwritten next iteration
        data = copy.copy(data)
    return dataList


def get_html_link(link_list):
    # Create the record dict
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []
    url_list = []  # store every visited URL to avoid requesting it twice
    # Iterate over all sub-links
    for i in link_list :
        # Skip links that have already been visited
        flag = 0
        for url in url_list :
            if url == str(i) :
                flag = 1
                break
        if flag == 1 :
            continue
        url_list.append(i)
        # Plain GET request
        html_link = requests.get(i)
        html_link.encoding='utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        # Remove embedded JavaScript code
        [s.extract() for s in soup('script')]
        # Strip the trailing forum name from the title
        title = soup.title.string.replace(" - Finland Forum", "")
        url = i
        # Extract the forum post content
        content = soup.select(".content")
        if content is None :
            print(i)
            print("Content is empty, skipping this site")
            continue
        if len(content) == 0 :
            print(str(url) + "\t no content on this site, skipping")
            continue
        content = content[0].text
        # Clean the content: replace characters in the string and strip image markup from the body
        content = contentDeal.deal_content(content)
        # Extract the forum replies
        reviews = soup.select(".content")
        review = ""
        if len(reviews) > 2 :
            for rev in reviews[2:] :
                review = review + "<p>" + reviewDeal.deal_review(rev.text) + "<p>"
        # Look up the post time via the .author element
        time = ""
        if len(soup.select('.author')) == 0 :
            print(str(url) + "\ttime not available for this site")
        else :
            time = soup.select('.author')[0].get_text()[-26:-1]
        type = "forum"
        # Fill in the dict
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        # Append to the list
        dataList.append(data)
        # Copy the dict so the entry just appended is not overwritten next iteration
        data = copy.copy(data)

        # Check how many pages the topic has
        page_total = re.search(r"<strong>(\d+)</strong></span></a>", html_link.text)
        if page_total is not None :
            page_total = int(page_total.group(1))
        else :
            page_total = 1
        # Only crawl further if the topic has more than one page
        if page_total == 1 :
            continue
        # Build the base URL for the follow-up pages
        # Starting from page 2, visit every remaining page
        page_url = soup.find_all('a', {'class' : 'button', 'role' : 'button'})[1].get('href')
        page_url = re.search(r".(/.*)", page_url).group(1).replace("amp;", "")
        page_url = page_url[0:-2]
        for j in range(2, page_total+1) :
            url = page_url + str((j - 1) * 15)
            inner_after(dataList, data, "https://www.finlandforum.org" + url)
            # Copy the dict so the entry just appended is not overwritten next iteration
            data = copy.copy(data)
    return dataList
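
# reviewDeal.deal_review and inner_after are referenced above without their
# source. Below is a minimal, assumed sketch of both: deal_review is treated as
# a plain whitespace cleaner, and inner_after as "fetch one follow-up page of
# the topic and append its posts as the review field of a new record". The
# _sketch names are hypothetical; the real helpers may differ.
def deal_review_sketch(text):
    return " ".join(text.split())


def inner_after_sketch(dataList, data, page_url):
    html_link = requests.get(page_url)
    html_link.encoding = 'utf-8'
    soup = BeautifulSoup(html_link.text, 'lxml')
    review = ""
    for rev in soup.select(".content"):
        review = review + "<p>" + deal_review_sketch(rev.text) + "<p>"
    # Record this page under its own URL; the caller copies the dict afterwards
    data['url'] = page_url
    data['review'] = review
    dataList.append(data)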


def get_html_link(link_list):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Create the record dict
    data = {
        'title': '',
        'url': '',
        'review': '',
        'content': '',
        'time': '',
        'type': ''
    }
    dataList = []
    url_list = []  # store every visited URL to avoid requesting it twice
    # Iterate over all sub-links
    for i in link_list:
        # Skip links that have already been visited
        flag = 0
        for url in url_list:
            if url == str(i):
                flag = 1
                break
        if flag == 1:
            continue
        url_list.append(i)
        # Plain GET request
        html_link = requests.get(i, headers=headers, verify=False)
        html_link.encoding = 'utf-8'
        soup = BeautifulSoup(html_link.text, 'lxml')
        title = soup.title.string
        url = i
        review = ""
        content_div = soup.select(".entry-content")[0]
        content_list = content_div.select("p")
        # Also collect the h1-h5 headings inside the article body
        for k in range(1, 6):
            appended_string = content_div.select("h" + str(k))
            for ap in appended_string:
                content_list.append(ap)
        content = ""
        for j in content_list:
            content = content + str(j)
        # Clean the content: replace characters in the string and strip image markup from the body
        content = contentDeal.deal_content(content)
        # Extract the publish time from the article:published_time meta tag
        time = re.search(
            r"<meta property=\"article:published_time\" content=\"(.*?)\" />",
            html_link.text).group(1)
        type = "life"
        # Fill in the dict
        data['title'] = title
        data['url'] = url
        data['review'] = review
        data['content'] = content
        data['time'] = time
        data['type'] = type
        # Append to the list
        dataList.append(data)
        # Copy the dict so the entry just appended is not overwritten next iteration
        data = copy.copy(data)
    return dataList
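
# The requests above pass verify=False, so urllib3 emits an
# InsecureRequestWarning on every call. If that noise is unwanted, it can be
# silenced explicitly (optional):
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)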


def visit_single_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Create the record dict
    data = {'title': '', 'url': '', 'review': '', 'content': '', 'time': '', 'type': ''}
    dataList = []
    # Plain GET request
    html_link = requests.get(url, verify=False, headers=headers)
    html_link.encoding='utf-8'
    soup = BeautifulSoup(html_link.text, 'lxml')
    # Remove embedded JavaScript code
    [s.extract() for s in soup('script')]
    # Strip the trailing forum name from the title
    title = soup.title.string.replace(" - Finland Forum", "")
    # Extract the forum post content
    content = soup.select(".content")
    if content is None :
        print(url)
        print("Content is empty, skipping this site")
        return dataList
    if len(content) == 0 :
        print(str(url) + "\t no content on this site")
        return dataList
    content = content[0].text
    # Clean the content: replace characters in the string and strip image markup from the body
    content = contentDeal.deal_content(content)
    # Extract the forum replies
    reviews = soup.select(".content")
    review = ""
    if len(reviews) > 2 :
        for rev in reviews[2:] :
            review = review + "<p>" + reviewDeal.deal_review(rev.text) + "<p>"
    # Look up the post time via the .author element
    time = ""
    if len(soup.select('.author')) == 0 :
        print(str(url) + "\ttime not available for this site")
    else :
        time = soup.select('.author')[0].get_text()[-26:-1]
    type = "forum"
    # Fill in the dict
    data['title'] = title
    data['url'] = url
    data['review'] = review
    data['content'] = content
    data['time'] = time
    data['type'] = type
    # Append to the list
    dataList.append(data)
    # Copy the dict so the entry just appended is not overwritten next iteration
    data = copy.copy(data)

    # Check how many pages the topic has
    page_total = re.search(r"<strong>(\d+)</strong></span></a>", html_link.text)
    if page_total is not None :
        page_total = int(page_total.group(1))
    else :
        page_total = 1
    # Only crawl further if the topic has more than one page
    if page_total == 1 :
        return dataList
    # Build the base URL for the follow-up pages
    # Starting from page 2, visit every remaining page
    page_url = soup.find_all('a', {'class' : 'button', 'role' : 'button'})[1].get('href')
    page_url = re.search(r".(/.*)", page_url).group(1).replace("amp;", "")
    page_url = page_url[0:-2]
    for j in range(2, page_total+1) :
        url = page_url + str((j - 1) * 15)
        inner_after(dataList, data, "https://www.finlandforum.org" + url)
        # Copy the dict so the entry just appended is not overwritten next iteration
        data = copy.copy(data)
    return dataList
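
# A minimal driver sketch showing how these scrapers might be invoked. Both URLs
# below are placeholders (assumptions), and because the get_html_link variants
# above all share one name, only the definition loaded last would actually run.
if __name__ == '__main__':
    records = get_html_link(['https://example.com/some-article/'])
    records += visit_single_html('https://www.finlandforum.org/viewtopic.php?t=12345')
    for record in records:
        print(record['time'], record['title'], record['url'])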