Example 1
def save_url():
    count = 0
    # Rewrite every record from IndexUrl.json into Index_new.json with an md5-based _id.
    with open(r"C:\savedb\youtube\Index_new.json", "w", encoding="UTF-8") as out_file, \
            open(r"C:\savedb\youtube\IndexUrl.json", encoding="UTF-8") as f:
        for line in f:
            data = json.loads(line)
            index_url = copy.deepcopy(db_cof.IndexUrl)
            index_url["_id"] = md5(str(data["url"]))
            index_url["url"] = data["url"]
            out_file.write(json.dumps(index_url) + "\n")
            count += 1
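All of the examples on this page call a project-level md5(...) helper (alongside module imports such as json and copy, and the db_cof templates) that is not shown here. A minimal sketch of what such a helper plausibly looks like, assuming it wraps the standard library's hashlib and returns a hex digest used as the document _id:

import hashlib

def md5(value):
    # Hypothetical sketch: hash the url string and return a stable
    # 32-character hex digest, suitable for use as a MongoDB _id.
    return hashlib.md5(str(value).encode("utf-8")).hexdigest()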
Example 2
    def save_manage(self, is_country, manage=None):
        """
        :param is_country: whether the current channel page has country info
        :param manage: the data to save (dict)
        :return:
        """
        md5_url = md5(manage["url"])
        if is_country:
            logger_cmd.debug("Saving to the IndexUrl collection...")
            # Update the record in the initial-url database.
            index_url = db_obj.get_one(db_cof.IndexUrl_coll, {"_id": md5_url})
            if not index_url:
                index_url = copy.deepcopy(db_cof.IndexUrl)
            index_url["isRequest"] = 1
            index_url["_id"] = md5_url
            db_obj.save(db_cof.IndexUrl_coll, index_url)
            logger_cmd.debug("IndexUrl collection saved.")

            logger_cmd.debug("Saving to the Comment collection...")
            # Look the record up by its md5 _id (the url field holds the raw url).
            index_comment = db_obj.get_one(db_cof.Comment_coll,
                                           {"_id": md5_url})
            if not index_comment:
                index_comment = copy.deepcopy(db_cof.Comment)
            index_comment["author"] = manage["author"]
            index_comment["url"] = manage["url"]
            index_comment["subscribers"] = manage["subscribers"]
            index_comment["country"] = manage["country"]
            index_comment["dataTime"] = manage["data_time"]
            index_comment["sort"] = manage["sort"]
            index_comment["_id"] = md5_url
            db_obj.save(db_cof.Comment_coll, index_comment)
            logger_cmd.debug("Comment collection saved.")

            logger_cmd.debug("Saving to the CurrentReq collection...")
            current_req = copy.deepcopy(db_cof.CurrentReq)
            current_req["url"] = manage["url"]
            db_obj.save(db_cof.Current_coll, current_req)
            logger_cmd.debug("CurrentReq collection saved.")
        else:
            logger_cmd.debug("Saving to the IndexUrl collection...")
            index_url = db_obj.get_one(db_cof.IndexUrl_coll, {"_id": md5_url})
            if not index_url:
                index_url = copy.deepcopy(db_cof.IndexUrl)
            index_url["url"] = manage["url"]
            index_url["isDelete"] = 1
            index_url["isRequest"] = 1
            index_url["_id"] = md5_url
            db_obj.save(db_cof.IndexUrl_coll, index_url)
            logger_cmd.debug("IndexUrl collection saved.")
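The db_obj.get_one/db_obj.save calls suggest a thin wrapper around a MongoDB driver. A minimal sketch of such a wrapper using pymongo, assuming get_one maps to find_one and save upserts by _id; the class name MongoWrapper and the connection details are illustrative, not from the source:

from pymongo import MongoClient

class MongoWrapper:
    # Hypothetical wrapper; the real db_obj is not shown on this page.
    def __init__(self, uri="mongodb://localhost:27017", db_name="youtube"):
        self.db = MongoClient(uri)[db_name]

    def get_one(self, coll_name, query):
        # Return the first matching document, or None.
        return self.db[coll_name].find_one(query)

    def save(self, coll_name, doc):
        # Upsert by _id so repeated saves update the same document.
        self.db[coll_name].replace_one({"_id": doc["_id"]}, doc, upsert=True)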
Example 3
def save_comment():
    count = 0
    with open(r"C:\savedb\youtube\Comment_new.json", "w", encoding="UTF-8") as out_file, \
            open(r"C:\savedb\youtube\Comment.json", encoding="UTF-8") as f:
        for line in f:
            data = json.loads(line)
            # Keep only the channel path between the domain and "/about".
            url = re.findall("https://www.youtube.com(.*?)/about",
                             data['url'])[0]
            # Monotonically increasing sort key based on the current timestamp.
            sort = int(time.time()) + count
            comment = copy.deepcopy(db_cof.Comment)
            comment["_id"] = md5(str(data["url"]))
            comment["author"] = data["author"]
            comment["url"] = url
            comment["subscribers"] = data["subscribers"]
            comment["country"] = data["country"]
            comment["sort"] = sort
            comment["dataTime"] = data["data_time"]
            out_file.write(json.dumps(comment) + "\n")
            count += 1
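The examples all deep-copy template dicts from a db_cof module (IndexUrl, InitUrl, Comment, CurrentReq, KeyDb) that is not shown on this page. Judging from the fields the examples assign, the templates are plausibly plain dicts with default values, along the lines of this hypothetical reconstruction (field defaults are guesses, limited to the fields the examples actually use):

# db_cof.py (hypothetical reconstruction)
IndexUrl_coll = "IndexUrl"
InitUrl_coll = "InitUrl"
Comment_coll = "Comment"
Current_coll = "CurrentReq"
KeyDb_coll = "KeyDb"

IndexUrl = {"_id": "", "url": "", "keyWord": {}, "isRequest": 0, "isDelete": 0}
InitUrl = {"_id": "", "url": "", "keyWord": {}, "isRequest": 0, "isDelete": 0,
           "country": "", "subscribers": 0, "sort": 0}
Comment = {"_id": "", "author": "", "url": "", "subscribers": 0,
           "country": "", "dataTime": "", "sort": 0}
CurrentReq = {"url": "", "sort": 0}
KeyDb = {"key": "", "page": []}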
Example 4
    def save_url(self, index_urls, key, page):

        logger_cmd.debug("Saving to the IndexUrl collection...")
        for url in index_urls:
            md5_url = md5(url)
            url_data = db_obj.get_one(db_cof.IndexUrl_coll, {"_id": md5_url})
            if not url_data:
                logger_cmd.debug("Saving this channel url for the first time...")
                indexurl_comment = copy.deepcopy(db_cof.IndexUrl)
                indexurl_comment["url"] = url
                # Record the keyword and start its hit counter at 1.
                indexurl_comment["keyWord"] = {key: 1}
                indexurl_comment["_id"] = md5_url
                logger_cmd.debug(str(indexurl_comment))
            else:
                indexurl_comment = url_data
                keyword = indexurl_comment.get("keyWord", {})
                # Increment the counter for this keyword, creating it if absent.
                keyword[key] = int(keyword.get(key, 0)) + 1
                indexurl_comment["keyWord"] = keyword
                indexurl_comment["_id"] = md5_url

            db_obj.save(db_cof.IndexUrl_coll, indexurl_comment)
        logger_cmd.debug("IndexUrl collection saved.")

        logger_cmd.debug("Saving to the KeyDb collection...")
        keydb_data = db_obj.get_one(db_cof.KeyDb_coll, {"key": key})
        if not keydb_data:
            keydb_comment = copy.deepcopy(db_cof.KeyDb)
            keydb_comment["key"] = key
            keydb_comment["page"] = [page]
        else:
            keydb_comment = keydb_data
            keydb_comment["page"].append(page)
        db_obj.save(db_cof.KeyDb_coll, keydb_comment)
        logger_cmd.debug("KeyDb collection saved.")
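The keyword bookkeeping in save_url amounts to a per-url hit counter keyed by search term. A small self-contained demonstration of the same accumulation logic, with no database involved:

def count_keyword(doc, key):
    # Same accumulation rule as save_url: create the counter at 1,
    # then increment on every later sighting of the same keyword.
    keyword = doc.get("keyWord", {})
    keyword[key] = int(keyword.get(key, 0)) + 1
    doc["keyWord"] = keyword
    return doc

doc = {}
count_keyword(doc, "music")
count_keyword(doc, "music")
count_keyword(doc, "news")
print(doc)  # {'keyWord': {'music': 2, 'news': 1}}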
Example 5
    def get_initurl(self, key, page):
        """
        :param key: search keyword
        :param page: page number
        :return:
        """
        try:
            base_url = 'https://www.youtube.com/results?search_query='
            url_key = key.replace(' ', '+')
            run_url = base_url + url_key + "&page=" + str(page)

            response = requests.get(run_url,
                                    headers=get_hearders(),
                                    timeout=common.timeout,
                                    verify=False)

            # Decode using the detected encoding.
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            init_url = html.xpath("//div[@class='yt-lockup-byline ']/a/@href")

            # Stop paging when the result set repeats; otherwise save the urls.
            if init_url and self.Before_url != init_url:
                self.logger_cmd.debug("Keyword %s, page %s, urls found: " % (key, page) +
                                      str(init_url))
                self.Before_url = init_url

                # Save the initial urls.
                for url in init_url:
                    md5_url = md5(url)
                    url_data = self.db_obj.get_one(db_cof.InitUrl_coll,
                                                   {"_id": md5_url})
                    if not url_data:
                        initurl_comment = copy.deepcopy(db_cof.InitUrl)
                        initurl_comment["url"] = url
                        # Record the keyword and start its hit counter at 1.
                        initurl_comment["keyWord"] = {key: 1}
                        initurl_comment["_id"] = md5_url
                        self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)
                    else:
                        initurl_comment = url_data
                        keyword = initurl_comment.get("keyWord", {})
                        # Increment the counter for this keyword, creating it if absent.
                        keyword[key] = int(keyword.get(key, 0)) + 1
                        initurl_comment["keyWord"] = keyword
                        initurl_comment["_id"] = md5_url
                        self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)

                # Save the current keyword and page to the database. --------------------
                keydb_comment = copy.deepcopy(db_cof.KeyDb)
                keydb_data = self.db_obj.get_one(db_cof.KeyDb_coll,
                                                 {"key": key})

                if keydb_data is None:
                    keydb_comment["key"] = key
                    keydb_comment["page"] = [page]
                    self.db_obj.save(db_cof.KeyDb_coll, keydb_comment)
                else:
                    keydb_data["page"].append(page)
                    self.db_obj.save(db_cof.KeyDb_coll, keydb_data)

            else:
                self.logger_cmd.debug("Keyword %s, failed attempt %s, page %s, request url: " %
                                      (key, self.InitCount, page) +
                                      str(run_url))
                self.InitCount += 1
                if self.InitCount > common.Fault_Tolerance:
                    # Exit the page-search loop.
                    return True
        except Exception:
            time.sleep(common.response_eorr_time)
            self.logger_response.error('Search request failed: ' + run_url)
            self.logger_cmd.debug("Traceback for the initial-url request: -----------------")
            traceback.print_exc()
            self.logger_cmd.debug("End of traceback for the initial-url request: -----------------")
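One hardening note on the query construction in get_initurl: key.replace(' ', '+') only handles spaces, so a keyword containing '&', '#', or non-ASCII characters would corrupt the query string. The standard library's urllib.parse.quote_plus covers all of these; a sketch of the safer construction, reusing the variable names from the example above:

from urllib.parse import quote_plus

base_url = 'https://www.youtube.com/results?search_query='
key = 'rock & roll'
run_url = base_url + quote_plus(key) + "&page=" + str(1)
print(run_url)  # https://www.youtube.com/results?search_query=rock+%26+roll&page=1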
Example 6
    def filer(self, url):
        country, author, subscribers = "", "", 0
        md5_url = md5(url)

        try:
            base_url = "https://www.youtube.com" + url + "/about"
            # Request the channel's About page.
            response = self.res_obj.get_index(self, base_url)

            # Extract the channel details.
            try:
                # Check for a country tag; channels without one are flagged below.
                is_country = re.findall(
                    r'<span class="country-inline">\s+(.*?)\s+</span>',
                    response.text)
                if is_country:
                    country = is_country[0].strip()
                    self.logger_cmd.debug('Country on this page: ' + str(country))

                    # Extract the author name.
                    is_author = re.findall(
                        r"<title>\s+(.*?)\s+-.*?YouTube.*?</title>",
                        response.text)
                    if is_author:
                        author = is_author[0].strip()
                    else:
                        self.logger_cmd.debug('Could not find the author on this page: ' +
                                              str(author))

                    # Extract the subscriber count, expanding K/M suffixes.
                    is_init_num = re.findall(r'subscribers.*?>(.*?)</span>',
                                             response.text)
                    if is_init_num:
                        init_num = is_init_num[0]
                        base_num = re.findall(r'(\d+.*?\d+).*', init_num)[0]
                        if 'K' in init_num:
                            multiple = 10**3
                        elif 'M' in init_num:
                            multiple = 10**6
                        else:
                            multiple = 1
                        subscribers = float(base_num) * multiple
                        self.logger_cmd.debug('Subscriber count on this page: ' +
                                              str(subscribers))
                    else:
                        self.logger_cmd.debug('Could not find the subscriber count on this page: ' +
                                              str(subscribers))

                    # Current time string.
                    data_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                              time.localtime())
                    # Current timestamp.
                    sort = int(time.time())

                    # Database saves -----------------------------------------
                    self.logger_cmd.debug('Saving author details --------------')
                    # Update the record in the initial-url collection.
                    initurl_comment = self.db_obj.get_one(db_cof.InitUrl_coll,
                                                          {"_id": md5_url})
                    if not initurl_comment:
                        initurl_comment = copy.deepcopy(db_cof.InitUrl)
                    initurl_comment["country"] = country
                    initurl_comment["subscribers"] = subscribers
                    initurl_comment["isRequest"] = 1
                    initurl_comment["sort"] = sort
                    initurl_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)

                    # Save the details to the info collection.
                    db_comment = self.db_obj.get_one(db_cof.Comment_coll,
                                                     {"_id": md5_url})
                    if not db_comment:
                        db_comment = copy.deepcopy(db_cof.Comment)

                    db_comment["author"] = author
                    db_comment["url"] = base_url
                    db_comment["subscribers"] = subscribers
                    db_comment["country"] = country
                    db_comment["dataTime"] = data_time
                    db_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.Comment_coll, db_comment)

                    # Record the currently requested url.
                    current_req = copy.deepcopy(db_cof.CurrentReq)
                    current_req["sort"] = sort
                    self.db_obj.save(db_cof.Current_coll, current_req)
                    self.logger_cmd.debug('Author details saved --------------')

                else:
                    # No country found: complete the record and flag it in the database.
                    initurl_comment = self.db_obj.get_one(
                        db_cof.InitUrl_coll, {"_id": md5_url})
                    if not initurl_comment:
                        initurl_comment = copy.deepcopy(db_cof.InitUrl)
                    initurl_comment["url"] = url
                    initurl_comment["isDelete"] = 1
                    initurl_comment["isRequest"] = 1
                    initurl_comment["_id"] = md5_url
                    self.db_obj.save(db_cof.InitUrl_coll, initurl_comment)
                    self.logger_cmd.debug("No country info on this author's page: " +
                                          str(is_country))

            except Exception:
                self.logger_cmd.debug("Error while handling this page")
                traceback.print_exc()

        except Exception:
            self.logger_cmd.debug("Exception for current url: " + base_url)
            traceback.print_exc()
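The K/M suffix handling in filer is self-contained enough to pull out and test in isolation. A sketch of the same expansion logic as a standalone function; note it swaps in a more tolerant number regex than the example's r'(\d+.*?\d+).*', which requires at least two digits and so would not match a bare count like "5K":

import re

def parse_subscribers(text):
    # Same expansion rule as filer: 'K' -> thousands, 'M' -> millions.
    match = re.search(r'(\d+(?:\.\d+)?)', text)
    if not match:
        return 0
    multiple = 10**3 if 'K' in text else 10**6 if 'M' in text else 1
    return float(match.group(1)) * multiple

print(parse_subscribers("1.5K subscribers"))  # 1500.0
print(parse_subscribers("2M subscribers"))    # 2000000.0
print(parse_subscribers("523 subscribers"))   # 523.0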