import copy
import json
import re
import time
import traceback

import requests
from lxml import etree

# db_cof, db_obj, logger_cmd, common and get_hearders are project-level
# config/helper objects assumed to be imported from the surrounding package.


def save_url():
    """Migrate IndexUrl.json into Index_new.json, one JSON document per line."""
    count = 0
    # raw strings keep Windows paths free of backslash-escape surprises
    with open(r"C:\savedb\youtube\Index_new.json", "w", encoding="UTF-8") as out_file, \
            open(r"C:\savedb\youtube\IndexUrl.json", encoding="UTF-8") as f:
        for line in f:
            data = json.loads(line)
            index_url = copy.deepcopy(db_cof.IndexUrl)
            index_url["_id"] = md5(str(data["url"]))
            index_url["url"] = data["url"]
            out_file.write(json.dumps(index_url) + "\n")
            count += 1
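# The md5 helper is not defined in this section; below is a minimal sketch of
# what the project helper is assumed to do (a hex digest of the url, used as
# the Mongo _id). Swap in the real project import if it differs.
import hashlib


def md5(text):
    """Hex MD5 digest of `text`, stable enough to serve as a document _id."""
    return hashlib.md5(str(text).encode("utf-8")).hexdigest()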
def save_manage(self, is_country, manage=None):
    """
    :param is_country: whether the channel homepage carries country info
    :param manage: the data to save (dict)
    :return:
    """
    md5_url = md5(manage["url"])
    if is_country:
        logger_cmd.debug("Saving to the IndexUrl collection...")
        # create or update the record in the initial-url collection
        index_url = db_obj.get_one(db_cof.IndexUrl_coll, {"_id": md5_url})
        if not index_url:
            index_url = copy.deepcopy(db_cof.IndexUrl)
        index_url["isRequest"] = 1
        index_url["_id"] = md5_url
        db_obj.save(db_cof.IndexUrl_coll, index_url)
        logger_cmd.debug("IndexUrl collection saved.")

        logger_cmd.debug("Saving to the Comment collection...")
        # look up by _id: the md5 is stored there, while "url" holds the plain url
        index_comment = db_obj.get_one(db_cof.Comment_coll, {"_id": md5_url})
        if not index_comment:
            index_comment = copy.deepcopy(db_cof.Comment)
            index_comment["author"] = manage["author"]
            index_comment["url"] = manage["url"]
            index_comment["subscribers"] = manage["subscribers"]
            index_comment["country"] = manage["country"]
            index_comment["dataTime"] = manage["data_time"]
            index_comment["sort"] = manage["sort"]
            index_comment["_id"] = md5_url
            db_obj.save(db_cof.Comment_coll, index_comment)
        logger_cmd.debug("Comment collection saved.")

        logger_cmd.debug("Saving to the CurrentReq collection...")
        CurrentReq_comment = copy.deepcopy(db_cof.CurrentReq)
        CurrentReq_comment["url"] = manage["url"]
        db_obj.save(db_cof.Current_coll, CurrentReq_comment)
        logger_cmd.debug("CurrentReq collection saved.")
    else:
        logger_cmd.debug("Saving to the IndexUrl collection...")
        # look up by _id, not "url": the url field never holds the md5 value
        index_url = db_obj.get_one(db_cof.IndexUrl_coll, {"_id": md5_url})
        if not index_url:
            index_url = copy.deepcopy(db_cof.IndexUrl)
            index_url["url"] = manage["url"]
            index_url["isDelete"] = 1
            index_url["isRequest"] = 1
            index_url["_id"] = md5_url
            db_obj.save(db_cof.IndexUrl_coll, index_url)
        logger_cmd.debug("IndexUrl collection saved.")
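# Hypothetical call, assuming `manage` carries the fields scraped from a
# channel's /about page (all values below are invented for illustration):
#
#     spider.save_manage(
#         is_country=True,
#         manage={
#             "url": "/channel/UCxxxx",
#             "author": "Some Channel",
#             "subscribers": 12000.0,
#             "country": "United States",
#             "data_time": "2019-01-01 00:00:00",
#             "sort": 1546300800,
#         },
#     )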
def save_comment():
    """Migrate Comment.json into Comment_new.json, one JSON document per line."""
    count = 0
    with open(r"C:\savedb\youtube\Comment_new.json", "w", encoding="UTF-8") as out_file, \
            open(r"C:\savedb\youtube\Comment.json", encoding="UTF-8") as f:
        for line in f:
            data = json.loads(line)
            # keep only the channel path between the host and "/about"
            url = re.findall(r"https://www.youtube.com(.*?)/about", data["url"])[0]
            # offset by count so successive records keep a stable ordering
            sort = int(time.time()) + count
            comment_doc = copy.deepcopy(db_cof.Comment)
            comment_doc["_id"] = md5(str(data["url"]))
            comment_doc["author"] = data["author"]
            comment_doc["url"] = url
            comment_doc["subscribers"] = data["subscribers"]
            comment_doc["country"] = data["country"]
            comment_doc["sort"] = sort
            comment_doc["dataTime"] = data["data_time"]
            out_file.write(json.dumps(comment_doc) + "\n")
            count += 1
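# Worked example of the migration above (values invented): an input line such as
#     {"url": "https://www.youtube.com/channel/UCxxxx/about", "author": "A",
#      "subscribers": 1200, "country": "US", "data_time": "2019-01-01 00:00:00"}
# is written back with "url" shortened to "/channel/UCxxxx", "_id" set to the
# md5 of the full url, and "sort" set to a strictly increasing timestamp.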
def save_url(self, index_urls, key, page):
    logger_cmd.debug("Saving to the IndexUrl collection...")
    for url in index_urls:
        md5_url = md5(url)
        url_data = db_obj.get_one(db_cof.IndexUrl_coll, {"_id": md5_url})
        if not url_data:
            logger_cmd.debug("First time saving this channel url...")
            indexurl_comment = copy.deepcopy(db_cof.IndexUrl)
            indexurl_comment["url"] = url
            # record the keyword together with its hit count
            indexurl_comment["keyWord"] = {key: 1}
            indexurl_comment["_id"] = md5_url
            logger_cmd.debug(indexurl_comment)
        else:
            indexurl_comment = url_data
            keyword = indexurl_comment.get("keyWord", {})
            # increment the count for this keyword, or initialise it to 1
            if keyword.get(key):
                keyword[key] = int(keyword[key]) + 1
            else:
                keyword[key] = 1
            indexurl_comment["keyWord"] = keyword
            indexurl_comment["_id"] = md5_url
        db_obj.save(db_cof.IndexUrl_coll, indexurl_comment)
    logger_cmd.debug("IndexUrl collection saved.")

    logger_cmd.debug("Saving to the KeyDb collection...")
    keydb_data = db_obj.get_one(db_cof.KeyDb_coll, {"key": key})
    if not keydb_data:
        keydb_comment = copy.deepcopy(db_cof.KeyDb)
        keydb_comment["key"] = key
        keydb_comment["page"] = [page]
    else:
        keydb_comment = keydb_data
        keydb_comment["page"].append(page)
    db_obj.save(db_cof.KeyDb_coll, keydb_comment)
    logger_cmd.debug("KeyDb collection saved.")
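# Illustration of the keyWord counter that save_url() maintains per url
# (hypothetical keys): the count for each keyword grows by one per sighting.
#
#     first sighting via key "cats"      -> {"keyWord": {"cats": 1}}
#     second sighting via key "cats"     -> {"keyWord": {"cats": 2}}
#     sighting via another key "kitten"  -> {"keyWord": {"cats": 2, "kitten": 1}}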
def get_initurl(self, key, page):
    """
    :param key: search keyword
    :param page: page number
    :return: True once the fault-tolerance limit is exceeded, to stop the page loop
    """
    try:
        base_url = 'https://www.youtube.com/results?search_query='
        url_key = key.replace(' ', '+')
        run_url = base_url + url_key + "&page=" + str(page)
        response = requests.get(run_url, headers=get_hearders(),
                                timeout=common.timeout, verify=False)
        # let requests pick the most likely encoding
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        init_url = html.xpath("//div[@class='yt-lockup-byline ']/a/@href")
        # only proceed if this page differs from the previous one
        if init_url and self.Before_url != init_url:
            self.logger_cmd.debug("urls found for keyword %s, page %s: %s" % (key, page, init_url))
            self.Before_url = init_url
            # save the initial urls
            for url in init_url:
                md5_url = md5(url)
                url_data = self.db_obj.get_one(db_cof.InitUrl_coll, {"_id": md5_url})
                if not url_data:
                    initurl_comment = copy.deepcopy(db_cof.InitUrl)
                    initurl_comment["url"] = url
                    # record the keyword together with its hit count
                    initurl_comment["keyWord"] = {key: 1}
                    initurl_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)
                else:
                    initurl_comment = url_data
                    keyword = initurl_comment.get("keyWord", {})
                    # increment the count for this keyword, or initialise it to 1
                    if keyword.get(key):
                        keyword[key] = int(keyword[key]) + 1
                    else:
                        keyword[key] = 1
                    initurl_comment["keyWord"] = keyword
                    initurl_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)
            # save the current keyword and page number
            keydb_data = self.db_obj.get_one(db_cof.KeyDb_coll, {"key": key})
            if keydb_data is None:
                keydb_comment = copy.deepcopy(db_cof.KeyDb)
                keydb_comment["key"] = key
                keydb_comment["page"] = [page]
                self.db_obj.save(db_cof.KeyDb_coll, keydb_comment)
            else:
                keydb_data["page"].append(page)
                self.db_obj.save(db_cof.KeyDb_coll, keydb_data)
        else:
            self.logger_cmd.debug("keyword %s: failed lookup #%s for page %s, url: %s"
                                  % (key, self.InitCount, page, run_url))
            self.InitCount += 1
            if self.InitCount > common.Fault_Tolerance:
                # give up on this keyword's page loop
                return True
    except Exception:
        time.sleep(common.response_eorr_time)
        self.logger_response.error('search request failed: ' + run_url)
        self.logger_cmd.debug("traceback for the initial-url request:")
        traceback.print_exc()
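# Worked example of the search url built above: key = "cute cats", page = 2
# yields
#     https://www.youtube.com/results?search_query=cute+cats&page=2
# Only spaces are replaced; characters like '&' or '#' in a key would need
# urllib.parse.quote_plus instead.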
def filer(self, url):
    country, author, subscribers = "", "", 0
    md5_url = md5(url)
    try:
        base_url = "https://www.youtube.com" + url + "/about"
        # fetch the channel's about page
        response = self.res_obj.get_index(self, base_url)
        # extract the relevant channel details
        try:
            # the country must be present; pages without one are flagged below
            is_country = re.findall(
                r'<span class="country-inline">\s+(.*?)\s+</span>', response.text)
            if is_country:
                country = is_country[0].strip()
                self.logger_cmd.debug('country on the current page: ' + str(country))
                # author name
                is_author = re.findall(
                    r"<title>\s+(.*?)\s+-.*?YouTube.*?</title>", response.text)
                if is_author:
                    author = is_author[0].strip()
                else:
                    self.logger_cmd.debug('no author found on the current page: ' + str(author))
                # subscriber count
                is_init_num = re.findall(r'subscribers.*?>(.*?)</span>', response.text)
                if is_init_num:
                    init_num = is_init_num[0]
                    # digits with optional dots/commas, e.g. "1.2K" or "1,234"
                    base_num = re.findall(r'([\d.,]+)', init_num)[0].replace(',', '')
                    if 'K' in init_num:
                        multiple = 10**3
                    elif 'M' in init_num:
                        multiple = 10**6
                    else:
                        multiple = 1
                    subscribers = float(base_num) * multiple
                    self.logger_cmd.debug('subscriber count on the current page: ' + str(subscribers))
                else:
                    self.logger_cmd.debug('no subscriber count found on the current page: ' + str(subscribers))
                # current time and timestamp
                data_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                sort = int(time.time())
                self.logger_cmd.debug('saving author info...')
                # create or update the record in the initial-url collection
                initurl_comment = self.db_obj.get_one(db_cof.InitUrl_coll, {"_id": md5_url})
                if not initurl_comment:
                    initurl_comment = copy.deepcopy(db_cof.InitUrl)
                initurl_comment["country"] = country
                initurl_comment["subscribers"] = subscribers
                initurl_comment["isRequest"] = 1
                initurl_comment["sort"] = sort
                initurl_comment["_id"] = md5_url
                self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)
                # save the details to the Comment collection (keyed by md5 _id)
                db_comment = self.db_obj.get_one(db_cof.Comment_coll, {"_id": md5_url})
                if not db_comment:
                    db_comment = copy.deepcopy(db_cof.Comment)
                    db_comment["author"] = author
                    db_comment["url"] = base_url
                    db_comment["subscribers"] = subscribers
                    db_comment["country"] = country
                    db_comment["dataTime"] = data_time
                    db_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.Comment_coll, db_comment)
                # record the url currently being requested
                CurrentReq_comment = copy.deepcopy(db_cof.CurrentReq)
                CurrentReq_comment["sort"] = sort
                self.db_obj.save(db_cof.Current_coll, CurrentReq_comment)
                self.logger_cmd.debug('author info saved.')
            else:
                # no country found: register the url as deleted so it is skipped later
                initUrl_comment = self.db_obj.get_one(
                    db_cof.InitUrl_coll, {"_id": md5_url})
                if not initUrl_comment:
                    initUrl_comment = copy.deepcopy(db_cof.InitUrl)
                    initUrl_comment["url"] = url
                    initUrl_comment["isDelete"] = 1
                    initUrl_comment["isRequest"] = 1
                    initUrl_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.InitUrl_coll, initUrl_comment)
                self.logger_cmd.debug("no country info for this author's page: " + str(is_country))
        except Exception:
            self.logger_cmd.debug("error while parsing the current page")
            traceback.print_exc()
    except Exception:
        self.logger_cmd.debug("exception for url: %s" % base_url)
        traceback.print_exc()
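# Standalone sketch of the subscriber-count normalisation used in filer(),
# assuming counts render as "1.2K" / "3M" / "987". The function name and the
# test strings are ours, not part of the project:


def parse_subscribers(text):
    """Convert a display count such as '1.2K subscribers' to a float."""
    base = re.findall(r"([\d.,]+)", text)[0].replace(",", "")
    if "K" in text:
        return float(base) * 10**3
    if "M" in text:
        return float(base) * 10**6
    return float(base)


assert parse_subscribers("1.2K subscribers") == 1200.0
assert parse_subscribers("3M subscribers") == 3000000.0
assert parse_subscribers("987 subscribers") == 987.0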