Example No. 1
 def __init__(self):
     self._ha = HttpAccess()
     cookies = '_RF1=101.204.79.78; _RSG=7t4K9DysapAAy3T6IzZvP9; _RDG=28c3a46e16bd9527e206056f639f93f12d; _RGUID=ace4dbc3-4950-4dc7-9679-8fd486743f0a; ASP.NET_SessionSvc=MTAuOC4xODkuNTV8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTU0NzYzNTY5NDYxNA; bdshare_firstime=1550397871920; MKT_Pagesource=PC; _ga=GA1.2.1090470229.1550397875; _gid=GA1.2.111071048.1550397875; _bfa=1.1550397832747.3uetxn.1.1550397832747.1550397832747.1.4; _bfs=1.4; gad_city=be2e953e1ae09d16d9cc90a550611388; __zpspc=9.1.1550397884.1550397884.1%234%7C%7C%7C%7C%7C%23; _jzqco=%7C%7C%7C%7C1550397884384%7C1.1018365145.1550397884256.1550397884256.1550397884256.1550397884256.1550397884256.0.0.0.1.1; _bfi=p1%3D290510%26p2%3D290546%26v1%3D4%26v2%3D3; appFloatCnt=3'
     self._ha._managedCookie.add_cookies('ctrip.com', cookies)
     self.page_url = queue.Queue()
     self.que_dealing = []
     # current directory
     self.filepath = Path(__file__).parents[0]
Example No. 2
    def _get_cms_ver(self, host: str, path: str, rgx: re.Pattern):
        ver: str = None
        try:
            ha = HttpAccess()
            # access home page to get cookie
            url = host
            if not url.startswith("http"):
                url = "http://" + host.strip("/")
            self._logger.debug("Get CMS ver home: {}".format(url))
            ha.getstring(
                url,
                headers="""
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
            Accept-Encoding: gzip, deflate
            Accept-Language: en-US,en;q=0.9
            Cache-Control: no-cache
            Pragma: no-cache
            Proxy-Connection: keep-alive
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36""",
                timeout=10,
            )

            # access version page
            url = host.strip("/") + "/" + path.lstrip("/")
            if not url.startswith("http"):
                url = "http://" + host.strip("/") + "/" + path.lstrip("/")
            self._logger.debug("Get CMS ver subpath: {}".format(url))
            html = ha.getstring(
                url,
                headers="""
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
            Accept-Encoding: gzip, deflate
            Accept-Language: en-US,en;q=0.9
            Cache-Control: no-cache
            Pragma: no-cache
            Proxy-Connection: keep-alive
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36""",
                timeout=10,
            )

            if html is None or html == "":
                return ver

            # <version>(.+)</version>
            m: re.Match = rgx.search(html)
            if m is None:
                return ver

            ver = m.group(1)

        except Exception as e:
            self._logger.error("Get joomla version faile: {} {}".format(host, e.args))
        return ver
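
A call sketch for the method above, assuming `self` is an instance of the class that defines `_get_cms_ver` and that `re` is already imported; the host, manifest path, and version regex are illustrative Joomla-style values, not taken from the original code.

# Hypothetical call: probe a Joomla-style manifest for its <version> tag.
ver = self._get_cms_ver(
    "example.com",
    "administrator/manifests/files/joomla.xml",
    re.compile(r"<version>(.+?)</version>", re.S),
)
if ver is not None:
    self._logger.info("Detected CMS version: {}".format(ver))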
Example No. 3
 def __init__(self):
     # self.s = requests.session()
     self.s = HttpAccess()
     self.tieba_keyword = '四川'
     self.tiezi_keyword = ['四川', '德阳']
     start_cookie = 'TIEBA_USERTYPE=8f42a94301cb125114b88e7c; wise_device=0; BAIDUID=CB7173B0D9165F60AF77E8ACE3C20897:FG=1; bdshare_firstime=1551248833930; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1551248834; BDUSS=BBdHZRVnhYfnB3aGRKdUViVW9-QXFCUkVJVFUyNWdyUVRMUUpOeWxaU1oyWjFjQUFBQUFBJCQAAAAAAAAAAAEAAAA23WE5yq7UwnNlcHRlbWJlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJlMdlyZTHZcQV; STOKEN=621f6ba3aa1e26cbad20ecfe531ea78659a0ec1878489146ad833b226ce9e2fa; TIEBAUID=f986682cc736e76dfd7f2ee8; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1551258762'
     self.s._managedCookie.add_cookies('tieba.baidu.com', start_cookie)
     self.content_url_queue = queue.Queue()
     self.dealing_queue = []
     # current directory
     self.filepath = Path(__file__).parents[0]
     self.tiebahost = 'https://tieba.baidu.com'
Example No. 4
    def __init__(self):
        ProxySpiderbase.__init__(self, False)

        self._logger: MsLogger = MsLogManager.get_logger(
            self.__class__.__name__)

        # Slow down: interval=1 means one request per second
        self._ha: HttpAccess = HttpAccess(interval=1)
Example No. 5
    def __init__(self):
        ProxySpiderbase.__init__(self, False)

        self._logger: MsLogger = MsLogManager.get_logger(
            self.__class__.__name__)

        self._reproxy = re.compile(r"([\d.]+?):(\d+)", re.S)

        # HTTP accessor used to validate HTTP proxies; interval is the gap in seconds between two HTTP requests
        self._ha: HttpAccess = HttpAccess(interval=1)
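
The validation logic itself is not part of this snippet; a generic sketch of checking one `ip:port` candidate, using the `requests` library rather than the project's HttpAccess and an arbitrary probe URL, might look like this.

import requests

# Generic proxy check (assumption: not the project's validator; the probe URL and
# timeout are arbitrary choices).
def check_http_proxy(ip: str, port: str, timeout: float = 5.0) -> bool:
    proxy = "http://{}:{}".format(ip, port)
    try:
        resp = requests.get(
            "http://httpbin.org/ip",
            proxies={"http": proxy, "https": proxy},
            timeout=timeout,
        )
        return resp.status_code == 200
    except requests.RequestException:
        return False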
Example No. 6
    def __init__(self):
        ProxySpiderbase.__init__(self, False)

        self._logger: MsLogger = MsLogManager.get_logger(self.__class__.__name__)

        self._reproxy = re.compile(r'"([\d.]+?):(\d+)"', re.S)

        # HTTP accessor used to validate HTTP proxies; interval is the gap in seconds between two HTTP requests
        self._ha: HttpAccess = HttpAccess(interval=1)
        # API key for the paid Qiyun proxy service
        self.key = 'dd0b192e8199af0b47faf005aac4483b1efff860'
Example No. 7
 def _get_accesstoken(cls):
     """
     First log in with the account to obtain the token
     :return:
     """
     # res = False
     _user = zoomeyeconf.get('username')
     _password = zoomeyeconf.get('password')
     try:
         url = "https://api.zoomeye.org/user/login"
         postdata = f'{{"username": "******","password": "******"}}'
         ha = HttpAccess()
         html = ha.getstring(url, postdata)
         if html is None:
             raise Exception("Get accesstoken failed")
         js = json.loads(html)
         accesstoken = js["access_token"]
         cls._accesstoken = accesstoken
         res = True
     except Exception as ex:
         res = False
     return res
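
A follow-up sketch of how the cached token might be used on a later request; the search endpoint and the "Authorization: JWT <token>" header follow ZoomEye's public documentation and are not part of the original snippet, and `getstring` is assumed to accept a single header line in the same string form used above.

# Hypothetical follow-up call (assumption: runs in another classmethod of the same
# class after _get_accesstoken() has returned True).
ha = HttpAccess()
html = ha.getstring(
    "https://api.zoomeye.org/host/search?query=port:80&page=1",
    headers="Authorization: JWT {}".format(cls._accesstoken),
)
if html:
    results = json.loads(html)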
Example No. 8
    def __init__(self, token: str):
        assert isinstance(token, str)
        self._token: str = token
        self._header = self._header % self._token

        self._is_logined: bool = False
        self._login_locker = threading.RLock()

        self._logger: MsLogger = MsLogManager.get_logger("GitAPIv4")
        self._ha: HttpAccess = HttpAccess()

        self._user_name: str = None
        self._user_login: str = None
        self._user_id: str = None
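
The class-level `_header` template that `self._header % self._token` fills in is not shown in this snippet; assuming the GitHub GraphQL v4 convention of a bearer token, a plausible shape is:

# Assumed class attribute (not shown above); "%s" is replaced with the token in __init__.
_header = """
Accept: application/json
Authorization: bearer %s"""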
Example No. 9
 def __init__(self):
     ScoutPlugBase.__init__(self)
     self._dbip = DbipMmdb()
     self._rdap_apis = {
         # arin: Canada, the United States, and some Caribbean islands
         "arin": "https://rdap.arin.net/registry/ip/",
         # apnic: Asia/Pacific region
         "apnic": "https://rdap.apnic.net/history/ip/",
         # afrinic: Africa
         "afrinic": "https://rdap.afrinic.net/rdap/ip/",
         # ripe: Europe, the Middle East, and Central Asia
         "ripe":
             "https://stat.ripe.net/data/whois/data.json?resource=192.0.20/23",
     }
     self._ha: HttpAccess = HttpAccess()
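
A lookup sketch for the registry table above, assuming `self` is an instance of this plugin and that `getstring` can be called with just a URL; the IP address is illustrative.

import json

# Hypothetical RDAP lookup against the arin endpoint registered above.
ip = "8.8.8.8"
body = self._ha.getstring(self._rdap_apis["arin"] + ip)
if body:
    rdap = json.loads(body)
    # typical RDAP fields: name, startAddress, endAddress
    print(rdap.get("name"), rdap.get("startAddress"), rdap.get("endAddress"))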
Example No. 10
    def __init__(self, task: IscoutTask):
        self.task = task
        self.tmppath = clienttaskconfig.tmppath
        self.outpath = clienttaskconfig.outputpath
        self._ha = HttpAccess()
        # plugin name
        self._name = type(self).__name__
        self._logger: MsLogger = MsLogManager.get_logger(
            f"{self._name}_{self.task.taskid}")
        self._sqlfunc = DbManager

        # maximum number of output records
        self.max_output = 10000
        # added a reason field, which must correspond to the strike weapon
        self.dtools = dtools
        # added data statistics, modified by judy 2020/08/10
        self.output_count = 0
        # log suffix, created by judy 2020/08/12
        self._log_suffix = 'prg_log'
Example No. 11
    def __init__(self):
        ProxySpiderbase.__init__(self, False)

        self._logger: MsLogger = MsLogManager.get_logger(
            self.__class__.__name__)

        self._reproxy = re.compile(r'([\d.]+?):(\d+)<br>', re.S)
        # Slow down: one request per second
        self._ha: HttpAccess = HttpAccess(interval=1)
        self._ha.getstring("http://www.89ip.cn",
                           headers='''
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
            Accept-Encoding: gzip, deflate
            Accept-Language: en,zh-CN;q=0.9,zh;q=0.8
            Cache-Control: no-cache
            Connection: keep-alive
            Host: www.89ip.cn
            Pragma: no-cache
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'''
                           )
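
An extraction sketch, assuming `self` is an instance of the spider above; the listing URL is an illustrative 89ip.cn address expected to return `ip:port<br>` lines that `_reproxy` matches.

# Hypothetical listing fetch followed by regex extraction of ip:port candidates.
html = self._ha.getstring("http://www.89ip.cn/tqdl.html?num=60")
if html:
    for ip, port in self._reproxy.findall(html):
        print("candidate proxy: {}:{}".format(ip, port))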
Example No. 12
    def __init__(self):
        ScoutPlugBase.__init__(self)

        self._ha: HttpAccess = HttpAccess()
        self._host: str = 'securitytrails.com'
Example No. 13
class WebTechRecognizer(ScoutPlugBase):
    """Recognizer for web technologies"""

    __inst = None
    __initialed: bool = False
    __initlocker = threading.RLock()
    # directory of this file
    file = Path(__file__).parents[0]

    # __appfi = os.path.abspath(os.path.join("./resource/tools/apps.json"))
    __appfi: Path = file / 'apps.json'
    __ha: HttpAccess = HttpAccess()
    _webtechs: dict = {}

    def __init__(self, task: IscoutTask):
        ScoutPlugBase.__init__(self)
        self.task = task
        self.__init()

    def __init(self):
        """check if the resource apps.json is exists, 
        otherwise download it."""
        if WebTechRecognizer.__initialed:
            return
        with WebTechRecognizer.__initlocker:
            if WebTechRecognizer.__initialed:
                return

            # download the file if it does not exist
            if not WebTechRecognizer.__appfi.exists():
                self.__download_appfi()
            # if the file exists, check for an update; re-download if it is older than 7 days
            self.__update_appfi()
            # initialize the json file and load it into memory
            if not self.__init_json():
                raise Exception("Init web_tech_recognizer json failed.")

            WebTechRecognizer.__initialed = True

    def __download_appfi(self):
        """download app.json"""
        url: str = 'https://raw.githubusercontent.com/AliasIO/Wappalyzer/master/src/apps.json'
        respio: ResponseIO = WebTechRecognizer.__ha.get_response_stream(url)
        with WebTechRecognizer.__appfi.open(mode='wb') as fs:
            respio.readinto(fs)

    def __update_appfi(self):
        """
        If the file exists, check whether it is older than 7 days;
        if so, delete it and download it again.
        Uses unix time directly.
        :return:
        """
        file_time = int(WebTechRecognizer.__appfi.stat().st_mtime)
        now_time = int(datetime.now(pytz.timezone('Asia/Shanghai')).timestamp())
        if now_time - file_time > 7 * 24 * 60 * 60:
            # delete the file first
            WebTechRecognizer.__appfi.unlink()
            # download it again
            self.__download_appfi()

    def __init_json(self) -> bool:
        """init app.json"""
        sj = None
        with open(WebTechRecognizer.__appfi, mode='r', encoding='utf-8') as fs:
            sj = json.load(fs)

        if "apps" not in sj:
            raise Exception('Key "apps" not found in app.json')
        if "categories" not in sj:
            raise Exception('Key "categories" not found in app.json')

        for c, v in sj["categories"].items():
            WebTech.set_cats(c, v["name"])

        for name, source in sj["apps"].items():
            webtec: WebTech = WebTech(name, source)
            WebTechRecognizer._webtechs[name] = webtec

        return True

    #######################################
    # match
    def __judge_two_str(self, sstr, dstr):
        """
        Check whether two strings are similar (one contains the other, or they are equal).
        :param sstr:
        :param dstr:
        :return:
        """
        res = False
        if sstr in dstr or dstr in sstr or sstr == dstr:
            res = True
        return res

    def __url_match(self, url: str):
        """
        Match components by url. Many checks were already done upstream, so the url is available.
        Currently the lookup seems to require one full loop,
        and the url field in the json appears to be the component's own url.
        :param url:
        :return:
        """
        try:
            for k, v in WebTechRecognizer._webtechs.items():
                webtech: WebTech = v
                if webtech._url is not None and self.__judge_two_str(url, webtech._url):
                    for cat in webtech._cats:
                        name = WebTech.get_cat_name(cat)
                        if name is not None:
                            self._logger.debug(f"Url match a component,name:{name}")
                            yield name
                    # only match once for now; the situation is special, so do it this way for the time being
                    break
        except:
            self._logger.error(f"Url match error, err:{traceback.format_exc()}")
        finally:
            self._logger.info("Complete use url to match component.")

    def __rheader_match(self, rheader: dict):
        """
        Match against the returned response headers.
        :param rheader:
        :return:
        """
        try:
            get = False
            for k, v in WebTechRecognizer._webtechs.items():
                webtech: WebTech = v
                # if the fingerprint entry has headers, compare them
                if len(webtech._headers) != 0:
                    # iterate over the incoming headers and compare
                    for sk, sv in rheader.items():
                        # check whether the information matches
                        if sk in webtech._headers:
                            # these regexes seem to always contain 'version', but the actual value may not, so check first
                            match = False
                            wh = webtech._headers[sk]
                            if 'version' in sv:
                                re_header = re.compile(wh)
                                match = re_header.search(sv)
                            else:
                                # two cases are possible here, e.g. '.+?\;version:\d+'
                                if 'version' in wh:
                                    try:
                                        re_header = re.compile(wh.split('version')[0][:-2])
                                    except:
                                        self._logger.debug(f"Can't split version, server:{wh}")
                                else:
                                    re_header = re.compile(wh)
                                match = re_header.search(sv)
                            if match:
                                self._logger.debug(f"Header match component, name:{k}")
                                for cat in webtech._cats:
                                    category = WebTech.get_cat_name(cat)
                                    if category is not None:
                                        self._logger.info(f"Rheader match a component,name:{category}")
                                        yield (k, v, category)
                                get = True
                                break
                # once a match is found there is no need to keep going; one site is unlikely to use that many technologies
                # if more data is needed later, remove this break and keep collecting
                if get:
                    break
        except:
            self._logger.error(f"Rheader match error, err:{traceback.format_exc()}")
        finally:
            self._logger.info("Complete use rheader to match component.")

    def __html_match(self, html: str):
        """
        Match against the returned html.
        :param html:
        :return:
        """
        try:
            for k, v in WebTechRecognizer._webtechs.items():
                webtech: WebTech = v
                if webtech._html is not None:
                    # a single html entry is itself a regular expression
                    match = False
                    if isinstance(webtech._html, str):
                        re_html = re.compile(webtech._html)
                        match = re_html.search(html)
                    else:
                        # multiple html entries mean multiple regular expressions
                        for he in webtech._html:
                            re_html = re.compile(he)
                            match = re_html.search(html)
                            if match:
                                break
                    if match:
                        self._logger.debug(f"Html match component, name:{k}")
                        for cat in webtech._cats:
                            category = WebTech.get_cat_name(cat)
                            if category is not None:
                                self._logger.info(f"Html match a component, name:{category}")
                                yield (k, v, category)
                        # only match once for now
                        break
        except:
            self._logger.error(f"Html match error, err:{traceback.format_exc()}")
        finally:
            self._logger.info("Complete use html to match component.")

    def _match(self, url: str, respheader: dict, html: str):
        """
        Match the corresponding components from the given data.
        Yields (name, webtech, category) tuples.
        :param url:
        :param respheader:
        :param html:
        :return:
        """
        # many checks are done here so the conditions cannot fail, then fetch the corresponding data
        # only the component's English name is available here, so yield the data directly and wrap it at the caller
        # if url is not None and url != '':
        #     for data in self.__url_match(url):
        #         yield data
        self._logger.info(f"Start to find component, url:{url}")
        if respheader is not None and len(respheader) > 0:
            for data in self.__rheader_match(respheader):
                yield data
        if html is not None and html != '':
            for data in self.__html_match(html):
                yield data

    def get_match_res(self, level, url):
        """
        Fetch the given url and match components from its response.
        :param level:
        :param url:
        :return:
        """
        try:
            res = requests.get(url)
            # url = res.url
            rheaders = dict(res.headers)
            html = res.text
            match_iter = self._match(url, rheaders, html)
            for k, v, category in match_iter:
                com = Component(self.task, level, k)
                com.category = category
                com.url = v._website
                yield com
        except:
            self._logger.error(f"Match component error, err:{traceback.format_exc()}")
Example No. 14
    def __init__(self, servicetype: str) -> None:
        LogicalGrabberBase.__init__(self, servicetype)

        self._ha: HttpAccess = HttpAccess()
Example No. 15
class BaiDuTieBa(object):

    def __init__(self):
        # self.s = requests.session()
        self.s = HttpAccess()
        self.tieba_keyword = '四川'
        self.tiezi_keyword = ['四川', '德阳']
        start_cookie = 'TIEBA_USERTYPE=8f42a94301cb125114b88e7c; wise_device=0; BAIDUID=CB7173B0D9165F60AF77E8ACE3C20897:FG=1; bdshare_firstime=1551248833930; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1551248834; BDUSS=BBdHZRVnhYfnB3aGRKdUViVW9-QXFCUkVJVFUyNWdyUVRMUUpOeWxaU1oyWjFjQUFBQUFBJCQAAAAAAAAAAAEAAAA23WE5yq7UwnNlcHRlbWJlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJlMdlyZTHZcQV; STOKEN=621f6ba3aa1e26cbad20ecfe531ea78659a0ec1878489146ad833b226ce9e2fa; TIEBAUID=f986682cc736e76dfd7f2ee8; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1551258762'
        self.s._managedCookie.add_cookies('tieba.baidu.com', start_cookie)
        self.content_url_queue = queue.Queue()
        self.dealing_queue = []
        # current directory
        self.filepath = Path(__file__).parents[0]
        self.tiebahost = 'https://tieba.baidu.com'

    def out_formate(self, s: str) -> str:
        try:
            return base64.b64encode(s.encode()).decode('utf-8')
        except Exception as ex:
            s = repr(s)
            return base64.b64encode(s.encode()).decode('utf-8')

    # def update_cookie(self, res: requests.Response, headers):
    #     """
    #     Update the cookie and refer_url,
    #     which is equivalent to refreshing the cookie
    #     :return:
    #     """
    #     if res is not None:
    #         cookiedict = res.cookies.get_dict()
    #         cookie_string = ';'.join([str(x) + '=' + str(y) for x, y in cookiedict.items()])
    #         self.start_cookie += cookie_string
    #         headers['Cookie'] += self.start_cookie
    #     return headers

    def get_start_url(self):
        return f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn=0'

    def judge_key_world_in_title(self, title):
        res = False
        try:
            for el in self.tiezi_keyword:
                if el in title:
                    res = True
        except:
            res = False
        return res

    def get_download_links(self):
        """
        Get the links that need to be downloaded.
        :return:
        """
        # http://tieba.baidu.com/f?kw=%E5%9B%9B%E5%B7%9D&ie=utf-8&pn=0
        # the url page offset increases by 50 each time
        # start fetching from the first page
        next_page = True
        nextpagenum = 0
        # last page
        last_page = None
        next_url = self.get_start_url()
        re_title = re.compile(
            '<a rel="noreferrer" href="(.+?)" title="(.+?)" target="_blank" class="j_th_tit ">.+?</a>')
        re_next_page = re.compile(r'pn=(\d+)')
        while next_page:
            try:
                response = self.s.getstring(next_url, headers='''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: no-cache
Host: tieba.baidu.com
Pragma: no-cache
Proxy-Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36''')
                # update cookie
                # headers = self.update_cookie(response, headers)
                all_find = re_title.findall(response)
                if len(all_find) > 0:
                    for el_title in all_find:
                        is_key_in = self.judge_key_world_in_title(el_title[1])
                        if is_key_in:
                            if not el_title[0].startswith('http://'):
                                content_url = el_title[0]
                                self.content_url_queue.put((content_url, el_title[1]))
                else:
                    print(f"没有获取到此页面,{next_url}")
                    nextpagenum += 50
                    next_url = f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn={nextpagenum}'
                    print(f'now page num:{nextpagenum-50}, next_url:{next_url}')
                    continue
                next_page_all = re_next_page.findall(response)
                # only used once
                if last_page is None:
                    last_page = next_page_all[-1]
                # if int(next_page_all[-2]) < int(last_page):
                if nextpagenum < int(last_page):
                    # nextpagenum = next_page_all[-2]
                    nextpagenum += 50
                    next_url = f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn={nextpagenum}'
                    print(f'next url:{next_url}')
                else:
                    next_page = False
                    break
            except Exception as err:
                nextpagenum += 50
                next_url = f'http://tieba.baidu.com/f?kw={self.tieba_keyword}&ie=utf-8&pn={nextpagenum}'
                print(f'error page:{nextpagenum - 50}, now go next_url:{next_url},error:{err}')
                continue

    def get_reply(self, tid, pn, refer, title) -> iter:
        url = "https://tieba.baidu.com/p/totalComment"

        querystring = {"tid": tid, "fid": "1", "pn": pn}

        response = self.s.getstring(url, headers='''
Accept: application/json, text/javascript, */*; q=0.01
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: no-cache
Connection: keep-alive
Host: tieba.baidu.com
Pragma: no-cache
Referer: {}
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36
X-Requested-With: XMLHttpRequest'''.format(refer), params=querystring)
        # update the cookie as well, although it is not used
        # headers = self.update_cookie(response, headers)

        data = json.loads(response)
        if data.get('errno') != 0 or data.get('errmsg') != 'success':
            return []
        c_list = data.get('data').get('comment_list')
        if c_list is None:
            return []
        if len(c_list) == 0:
            return []
        for key, value in c_list.items():
            for el in value.get('comment_info'):
                try:
                    write_line = {}
                    s_id = el.get('comment_id')
                    post_id = el.get('post_id')
                    author = el.get('username')
                    content = el.get('content')
                    if content is None or content == '':
                        continue
                    g_time = el.get('now_time')  # unixtime
                    g_d_time = str(datetime.datetime.fromtimestamp(g_time))
                    write_line['id'] = s_id
                    write_line['replyid'] = post_id
                    write_line['author'] = self.out_formate(author)
                    write_line['title'] = self.out_formate(title)
                    write_line['stars'] = None
                    write_line['content'] = self.out_formate(content)
                    write_line['resources'] = None
                    write_line['createtime'] = g_d_time
                    write_line['updatetime'] = None
                    write_line['likes'] = None
                    yield write_line
                except Exception as err:
                    print(f"获取当前页的评论出错,err:{err}")
                    continue

    def get_content_info(self):
        re_next_page = re.compile('<a href="(.+?)">下一页</a>')
        no_data_times = 0
        with threading.Lock():
            csvfile = open(self.tieba_keyword + '.csv', 'a', newline='')
            fieldnames = ['id', 'replyid', 'author', 'title', 'stars', 'content', 'resources', 'createtime', 'updatetime',
                          'likes']

            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            # writer.writeheader()

        while True:
            if no_data_times > 50:
                print("no data to crawel")
                break

            if self.content_url_queue.empty():
                time.sleep(3)
                no_data_times += 1
                continue

            url_info = self.content_url_queue.get()

            url = url_info[0]
            title = url_info[1]
            has_next = True
            next_url = None
            # info needed to fetch the comments
            pn = 1
            tid = re.search(r'\d+', url).group()

            while has_next:
                # first request to the url
                try:
                    if next_url is None:
                        next_url = self.tiebahost + url
                    response = self.s.getstring(next_url, headers='''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: no-cache
Host: tieba.baidu.com
Pragma: no-cache
Proxy-Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36''')
                    # update cookie
                    # headers = self.update_cookie(response, headers)

                    soup = BeautifulSoup(response, 'lxml')
                    get_div_info = soup.find('div', attrs={'class': 'p_postlist'})
                    all_content_divs = get_div_info.contents
                    for content_div in all_content_divs:
                        try:
                            write_line = {}
                            data_info = content_div.get('data-field')
                            if data_info is None:
                                continue
                            data = json.loads(data_info)
                            floorid = data.get('content').get('post_id')
                            author = data.get('author').get('user_name')

                            re_get_time = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', str(content_div))
                            if re_get_time:
                                get_time = re_get_time.group() + ':00'
                                write_line['createtime'] = get_time

                            # text content and images
                            content_info = content_div.find('div', attrs={'class': re.compile('.+?j_d_post_content.+?')})
                            content = content_info.text.strip()
                            imgs_info = content_info.find_all('img', attrs={'class': 'BDE_Image'})
                            if not content and len(imgs_info) == 0:
                                continue
                            if len(imgs_info) != 0:
                                resources = []
                                for img_info in imgs_info:
                                    img = img_info.get('src')
                                    resources.append(img)
                                write_line['resources'] = self.out_formate(json.dumps(resources))
                            # write the data
                            write_line['id'] = floorid
                            write_line['replyid'] = None
                            write_line['author'] = self.out_formate(author)
                            write_line['title'] = self.out_formate(title)
                            write_line['stars'] = None
                            write_line['content'] = self.out_formate(content)
                            write_line['updatetime'] = None
                            write_line['likes'] = None
                            with threading.Lock():
                                writer.writerow(write_line)
                            print(f'Write a line:{write_line}')
                        except Exception as err:
                            print(f"获取某个楼层出错,err:{err}")
                            continue

                    # fetch the comments
                    for comm in self.get_reply(tid, pn, url, title):
                        with threading.Lock():
                            writer.writerow(comm)
                        print(f'Write a commit:{comm}')
                    # fetch the next page
                    nextpage = re_next_page.search(response)
                    if nextpage:
                        next_url = self.tiebahost + nextpage.group(1)
                        pn += 1
                    else:
                        has_next = False
                except Exception as err:
                    print(f"获取这页的url出错:{url}, err:{err}")
                    has_next = False

            self.content_url_queue.task_done()
        print("complex")
        csvfile.close()

    def start(self):
        csvfile = open(self.tieba_keyword + '.csv', 'a', newline='')
        fieldnames = ['id', 'replyid', 'author', 'title', 'stars', 'content', 'resources', 'createtime', 'updatetime',
                      'likes']

        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        csvfile.close()

        thread1 = threading.Thread(target=self.get_download_links, name="get_download_link")
        thread1.start()
        for i in range(3):
            threads = threading.Thread(target=self.get_content_info, name="get_comments")
            threads.start()
Example No. 16
class DeYang(object):
    imagesnumbel = 0

    def __init__(self):
        self._ha = HttpAccess()
        cookies = '_RF1=101.204.79.78; _RSG=7t4K9DysapAAy3T6IzZvP9; _RDG=28c3a46e16bd9527e206056f639f93f12d; _RGUID=ace4dbc3-4950-4dc7-9679-8fd486743f0a; ASP.NET_SessionSvc=MTAuOC4xODkuNTV8OTA5MHxqaW5xaWFvfGRlZmF1bHR8MTU0NzYzNTY5NDYxNA; bdshare_firstime=1550397871920; MKT_Pagesource=PC; _ga=GA1.2.1090470229.1550397875; _gid=GA1.2.111071048.1550397875; _bfa=1.1550397832747.3uetxn.1.1550397832747.1550397832747.1.4; _bfs=1.4; gad_city=be2e953e1ae09d16d9cc90a550611388; __zpspc=9.1.1550397884.1550397884.1%234%7C%7C%7C%7C%7C%23; _jzqco=%7C%7C%7C%7C1550397884384%7C1.1018365145.1550397884256.1550397884256.1550397884256.1550397884256.1550397884256.0.0.0.1.1; _bfi=p1%3D290510%26p2%3D290546%26v1%3D4%26v2%3D3; appFloatCnt=3'
        self._ha._managedCookie.add_cookies('ctrip.com', cookies)
        self.page_url = queue.Queue()
        self.que_dealing = []
        # current directory
        self.filepath = Path(__file__).parents[0]

    def out_formate(self, s: str) -> str:
        try:
            return base64.b64encode(s.encode()).decode('utf-8')
        except Exception as ex:
            s = repr(s)
            return base64.b64encode(s.encode()).decode('utf-8')

    def get_ctrip_link(self):
        re_name = re.compile(
            '<a target="_blank" href="(/.+?)" title="(.+?)">.+?</a>')
        for n in range(5):
            url = f"http://you.ctrip.com/sight/deyang462/s0-p{n + 1}.html"
            html_2 = self._ha.getstring(url,
                                        headers='''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: no-cache
Host: you.ctrip.com
Pragma: no-cache
Proxy-Connection: keep-alive
Referer: http://you.ctrip.com/
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'''
                                        )

            name_info = re_name.findall(html_2)

            for name_one in name_info:
                self.page_url.put(name_one)
            print("所有要下载的链接均已获得")
        return

    def get_content_info(self, poid, did, dname, pageall, rid, dirloc: Path):
        url = "http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView"
        for page in range(int(pageall)):
            payload = "poiID={}&districtId={}&districtEName={}&" \
                      "pagenow={}&order=3.0&star=0.0&tourist=0.0" \
                      "&resourceId={}&resourcetype=2".format(poid, did, dname, page+1, rid)
            page_html = self._ha.getstring(url,
                                           payload,
                                           headers='''
Accept: */*
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: no-cache
Content-Length: 125
Content-Type: application/x-www-form-urlencoded
Host: you.ctrip.com
Origin: http://you.ctrip.com
Pragma: no-cache
Proxy-Connection: keep-alive
Referer: http://you.ctrip.com/
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36
X-Requested-With: XMLHttpRequest
            ''')

            soup = BeautifulSoup(page_html, 'lxml')
            all_username = soup.find_all('div', attrs={"class": "userimg"})
            comments_divs = soup.find_all('div',
                                          attrs={"class": "comment_ctrip"})
            all_ul = comments_divs[0].find_all('ul')
            for i in range(len(all_ul)):
                try:
                    line = {}
                    name = all_username[i].get_text(strip=True)
                    line['author'] = self.out_formate(name)
                    all_lis = all_ul[i].find_all('li')
                    stars_info = all_lis[0].get_text()
                    stars = re.findall(r'\d', stars_info)
                    line['title'] = None
                    if len(stars) > 0:
                        get_starts = stars[-1]
                        line['stars'] = get_starts + '/5'
                    else:
                        line['stars'] = None
                    des = all_lis[1].get_text(strip=True)
                    line['content'] = self.out_formate(des)
                    if len(all_lis) == 4:
                        all_pics = []
                        # has pictures
                        all_a = all_lis[2].find_all('a')
                        for a_one in all_a:
                            with threading.Lock():
                                jpg_url = a_one.get('href')
                                # download the picture
                                jpg_locname = str(DeYang.imagesnumbel) + '.jpg'
                                img = requests.get(jpg_url)
                                jpg_loc: Path = dirloc / jpg_locname
                                with jpg_loc.open('ab') as f:
                                    f.write(img.content)
                                    f.close()
                                print(f"download complete:{jpg_locname}")
                                all_pics.append(jpg_locname)
                                DeYang.imagesnumbel += 1
                        line['pictures'] = json.dumps(all_pics)
                    else:
                        line['pictures'] = None
                    others_info = all_lis[-1]
                    useful_info = others_info.get_text(strip=True)
                    useful = re.findall(r'\((\d+)\)', useful_info)
                    if len(useful) > 0:
                        useful_res = useful[-1]
                    else:
                        useful_res = None
                    time = others_info.find(
                        'span', attrs={
                            "class": "time_line"
                        }).get_text(strip=True) + ' 00:00:00'
                    line['createtime'] = None
                    line['updatetime'] = None
                    line['time'] = time
                    line['replyto'] = None
                    line['likes'] = useful_res
                    yield line
                except Exception as ex:
                    print(f'Failed to parse a row: {ex}')
                    continue

    def get_content(self):
        re_content_pages = re.compile(r'<b class="numpage">(\d+)</b>')
        re_poid = re.compile(
            '<a href="/dianping/edit/(\d+).html" class="b_orange_m">')
        while True:
            # take one url ('url', 'name') from the queue
            try:
                url_info = self.page_url.get()
                if url_info in self.que_dealing:
                    print("这个url正在下载或者已经下载完成,跳过")
                    continue
                # add it to the in-progress list
                self.que_dealing.append(url_info)
                url = url_info[0]
                dirname = url_info[1]
                dir_loc = self.filepath / dirname
                dir_loc.mkdir(exist_ok=True)
                csvfilename = dir_loc / (dirname + '.csv')
                csvfile = open(str(csvfilename), 'w', newline='')
                fieldnames = [
                    'author', 'title', 'stars', 'content', 'pictures',
                    'createtime', 'updatetime', 'time', 'replyto', 'likes'
                ]

                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

                infos = url.split('/')
                dname = re.findall('[a-zA-Z]+', infos[2])[0]
                did = re.findall(r'\d+', infos[2])[0]
                rid = re.findall(r'\d+', infos[-1])[0]
                url = "http://you.ctrip.com" + url
                # proxy = self.get_proxy()
                # response = requests.get(url, headers=self.get_headers(url))
                response = self._ha.getstring(url,
                                              headers='''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cache-Control: no-cache
Host: you.ctrip.com
Pragma: no-cache
Proxy-Connection: keep-alive
Referer: http://you.ctrip.com/
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'''
                                              )
                pages = re_content_pages.findall(response)
                if len(pages) == 0:
                    pages = 1
                else:
                    pages = pages[0]
                poid = re_poid.findall(response)[0]
                getline = self.get_content_info(poid, did, dname, pages, rid,
                                                dir_loc)
                for a_line in getline:
                    print(f"获取到一行数据:{a_line}")
                    try:
                        writer.writerow(a_line)
                    except Exception as err:
                        print(f"Write line error:{err}\nline:{a_line}")
                        continue
                self.page_url.task_done()
                # break once all tasks are done
                if self.page_url.empty():
                    break
            except Exception as err:
                print(f"获取一个url出错,url:{url}, name:{dirname}, error:{err}")
                continue
                # self.delete_proxy(proxy)
        print("complte")
        return

    def start(self):
        thread1 = threading.Thread(target=self.get_ctrip_link,
                                   name="get_start_links")
        thread1.start()
        for i in range(10):
            threads = threading.Thread(target=self.get_content,
                                       name="writeinfo")
            threads.start()
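
A driver sketch for the class above; `start()` only spawns the worker threads and never joins them, so a long-lived main thread (or added `join()` calls) is assumed to keep the process alive.

# Hypothetical entry point for the scraper defined above.
if __name__ == '__main__':
    dy = DeYang()
    dy.start()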