def parse(self, response):
    # Scrapy callback: collect video-page links from v.qq.com, then fetch and clean each page.
    item = V_qqItem()
    urls = []
    url_list = response.xpath('//*[@id="mod_main_nav"]/div/div/a/@href').extract()
    for url in url_list:
        # Relative links start with '/'; prepend the site root.
        if url.startswith('/'):
            urls.append(f"http://v.qq.com{url}")
    url_list2 = response.xpath('//*[@class="mod_row_box"]/div[2]/div/div/a/@href').extract()
    urls = urls + url_list2
    item['urls'] = urls
    for i in range(len(item['urls'])):
        try:
            res = requests.get(item['urls'][i], headers=HEADERS)
            if res.ok:
                res.encoding = 'utf-8'
                html = res.text
                ex = Extractor(threshold=3)
                content = ex.filter_tags(html)
                data = clean_content(ex.getText(content))
                with open(f"E:/c++/毕业设计开发日志/06.文本数据集/娱乐/视频/{i}.txt", 'w', encoding="utf-8") as txtfile:
                    txtfile.write(data)
                print(f"Page {i} crawled")
                time.sleep(2)
        except Exception as e:
            print(f"Error on article {i}, link {item['urls'][i]}, reason: {e}")
    print(f"Finished crawling {len(item['urls'])} video pages in total")
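# parse() above is a Scrapy callback and assumes a V_qqItem with a 'urls' field
# plus module-level imports that are not shown. A minimal sketch of that
# scaffolding; the spider class name, its 'name' attribute and start_urls are
# assumptions, not taken from the project.
import time

import requests
import scrapy

from Settings import HEADERS
from data_cleaning.Extractor import Extractor
from data_cleaning.content_clean import clean_content


class V_qqItem(scrapy.Item):
    # Only the 'urls' field is used by parse().
    urls = scrapy.Field()


class VqqSpider(scrapy.Spider):
    name = 'v_qq'                        # assumed spider name
    start_urls = ['https://v.qq.com/']   # assumed entry page

    # def parse(self, response): ...  (defined above)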
def parse_url():
    # Read a browser-history file, extract readable text from each URL,
    # and write the cleaned content to a single result file.
    urls = []
    data = []
    length_ok = 0
    with open(r"C:\Users\叫乌鸦的少年怪\Desktop\历史记录文件.txt", 'r', encoding='utf-8') as files:
        urls += ["http://" + url.strip() for url in files.readlines()]
    length_all = len(urls)
    j = 0
    for url in urls:
        try:
            ex = Extractor(threshold=30)
            html = ex.getHtml(url)
            content = ex.getText(ex.filter_tags(html))
            content = clean_content(content)
            length_ok += 1
            if content != "This page has no content to extract ":
                j += 1
                # Collapse the cleaned text onto a single line.
                data.append("".join(content.splitlines()))
        except ConnectionError as err:
            print(err)
            # print("request failed")
            # TODO: record the failing host
        except Exception as e:
            print(e)
            continue
    with open(r"C:\Users\叫乌鸦的少年怪\Desktop\content.txt", 'w+', encoding='utf-8') as rlt_txt:
        for single_data in data:
            rlt_txt.write(single_data + '\n')
    print(f"Successfully fetched: {length_ok}")
    print(f"Total: {length_all}")
    print(f"Success ratio: {length_ok / length_all}")
def parse_urls(url_list: list):
    # Fetch each URL, clean the extracted text, and write it to a test file.
    j = 0
    for i in range(len(url_list)):
        try:
            extractor = Extractor(threshold=30)
            html = extractor.getHtml(url_list[i])
            content = extractor.filter_tags(html)
            data = clean_content(extractor.getText(content))
            if data != "This page has no content to extract ":
                j += 1
                # Each iteration overwrites the same test file.
                with open('E:/c++/毕业设计开发日志/06.文本数据集/数据清洗模块测试.txt', 'w+', encoding='utf-8') as txtfile:
                    txtfile.write(data)
                print(f"Article {i + 1} processed")
        except Exception as e:
            print(e)
    print(f"Fetched {len(url_list)} articles in total")
    print(f"Successfully processed {j} articles")
import json

from requests import get

from Settings import HEADERS
from data_cleaning.Extractor import Extractor
from data_cleaning.content_clean import clean_content

if __name__ == '__main__':
    # Pull five batches of recommended Douyu rooms, then extract and clean the
    # text of each room page.
    urls = []
    for i in range(5):
        response = get(
            "https://www.douyu.com/japi/weblist/apinc/rec/list?uid=8b6321ddbef037034b351cab00081501&num=20",
            headers=HEADERS)
        data_json = json.loads(response.text)
        data_url = data_json['data']
        for data in data_url:
            urls.append(f"https://douyu.com/{data['roomId']}")
    print(f"Collected {len(urls)} rooms in total")
    try:
        for i in range(len(urls)):
            ex = Extractor(threshold=20)
            html = get(urls[i], headers=HEADERS).text
            content = ex.filter_tags(html)
            data = clean_content(ex.getText(content))
            with open(f'E:/c++/毕业设计开发日志/06.文本数据集/娱乐/直播/{i}.txt', 'w', encoding='utf-8') as txtfile:
                txtfile.write(data)
            print(f"Room {i} processed")
        print(f"Finished processing {len(urls)} rooms in total")
    except Exception as e:
        print(e)
def single_content_analyse(self, corpus):
    # Load a single document, clean it, and run it through the k-means analyser.
    content = get_txt_to_single(corpus)
    cut_content = clean_content(content)
    result = self.keams_single_analyse(cut_content)
    print(result)
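# get_txt_to_single() is used above but not shown here. A plausible minimal
# version, assuming it simply reads a corpus text file into one string; this is
# an assumption about its behaviour, not the project's actual implementation.
def get_txt_to_single(corpus_path):
    with open(corpus_path, 'r', encoding='utf-8') as f:
        return f.read()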
from data_cleaning.Extractor import Extractor
from data_cleaning.content_clean import clean_content

# Quick manual test of the extraction and cleaning pipeline on a single blog post.
cx = Extractor(threshold=90)
html = cx.getHtml("https://blog.csdn.net/Winterto1990/article/details/51220307")
content = cx.filter_tags(html)
# print(content)
s = cx.getText(content)
data = clean_content(s)
print(data)

# TODO: add proxy support to the crawler;
#       add text output;
#       build the text corpus!!
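# The TODO above mentions adding proxy support to the crawler. A minimal sketch
# of one way to do that with the requests library; the proxy address below is a
# placeholder, and routing requests through routed_get() is an assumption, not
# how the project currently works.
import requests

PROXIES = {
    "http": "http://127.0.0.1:7890",   # placeholder proxy address
    "https": "http://127.0.0.1:7890",
}


def routed_get(url, headers=None, timeout=10):
    # Same as requests.get, but routed through the configured proxies.
    return requests.get(url, headers=headers, proxies=PROXIES, timeout=timeout)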