Exemplo n.º 1
0
def get_page(url, need_login=True):
    """
    Fetch ``url`` and return the response text, retrying on failures.

    :param url: url to be crawled
    :param need_login: if True, the request is sent with cookies taken from
        the cookies pool; if False, it is sent without cookies
    :return: the page text on success; '' when the cookies pool is empty or
        when ``max_retries`` attempts have all failed (network error or
        empty body). The HTTP status code is not inspected.
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning(
                    'no cookies in cookies pool, please find out the reason')
                send_email()
                # Ask the parent process to shut down: nothing that needs a
                # login can be crawled without cookies.
                os.kill(os.getppid(), signal.SIGTERM)
                # The signal targets the parent, so this process keeps
                # running; bail out instead of indexing into None below.
                return ''
        try:
            # NOTE: verify=False disables TLS certificate verification.
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text

            if page:
                # Round-trip through utf-8 to drop characters that cannot be
                # encoded.
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                # An empty body counts as a failed attempt; without this the
                # loop would retry forever with no delay.
                count += 1
                continue
            # slow down to avoid being banned
            time.sleep(interal)

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning(
                'excepitons happens when crawling {},specific infos are {}'.
                format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            # Reached only when the try body finished without `continue`.
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {},check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Exemplo n.º 2
0
def get_page(url, user_verify=True, need_login=True):
    """
    Fetch ``url`` and return the response text, retrying on failures and
    rotating away from accounts that look banned or locked.

    :param url: url to be crawled
    :param user_verify: if it's ajax url, the value is False, else True; when
        True the response is additionally checked for ban / lock / 404 /
        incomplete-page markers
    :param need_login: if True, the request is sent with cookies taken from
        the cookies pool; if False, it is sent without cookies
    :return: the page text on success; '' when the cookies pool is empty,
        the page looks like a 404, or ``max_retries`` attempts have all
        failed. The HTTP status code is not inspected.
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                # Ask the parent process to shut down: nothing that needs a
                # login can be crawled without cookies.
                os.kill(os.getppid(), signal.SIGTERM)
                # The signal targets the parent, so this process keeps
                # running; bail out instead of indexing into None below.
                return ''
        try:
            # NOTE: verify=False disables TLS certificate verification.
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    # Not counted as a retry: the account was removed from
                    # the pool, so the next iteration fetches cookies again.
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text

            if page:
                # Round-trip through utf-8 to drop characters that cannot be
                # encoded.
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                # An empty body counts as a failed attempt; without this the
                # loop would retry forever with no delay.
                count += 1
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    # Account bookkeeping only makes sense when cookies were
                    # fetched; name_cookies is unbound otherwise.
                    if need_login:
                        crawler.warning('account {} has been banned'.format(name_cookies[0]))
                        freeze_account(name_cookies[0], 0)
                        Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    if need_login:
                        crawler.warning('account {} has been locked,you should use your phone to unlock it'.
                                        format(name_cookies[0]))

                        freeze_account(name_cookies[0], -1)
                        Cookies.delete_cookies(name_cookies[0])
                    else:
                        # No account to rotate away from, so count the
                        # attempt to keep the loop bounded.
                        count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('excepitons happens when crawling {},specific infos are {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            # Reached only when the try body finished without `continue` or
            # `return`.
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {},check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Exemplo n.º 3
0
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    Post a direct message to ``target_uid`` through the weibo webim API,
    using an account taken from the cookies pool.

    :param target_uid: uid of the user the message is sent to
    :param adver_message: text of the message
    :param user_verify: not used by this function; kept so existing callers
        keep working
    :param need_login: if True, the message is posted with pooled cookies;
        if False, no request is sent at all
    :return: None when the message was posted (or when need_login is False);
        '' when the cookies pool is empty or ``max_retries`` attempts have
        been used up
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            # The None check has to come before any indexing, otherwise an
            # empty pool raises TypeError and the warning is never sent.
            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                # Ask the parent process to shut down.
                os.kill(os.getppid(), signal.SIGTERM)
                # The signal targets the parent, so this process keeps
                # running; bail out instead of indexing into None below.
                return ''

            # check adver_timers: skip accounts whose counter has reached the
            # configured limit. Counted as an attempt so the loop stays
            # bounded when every fetched account is over the limit.
            if int(name_cookies[3]) >= int(adver_timers):
                count += 1
                continue
        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source='+str(name_cookies[2]),
                              data={'text': adver_message, 'uid':str(target_uid)},
                              cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    # Not counted as a retry: the account was removed from
                    # the pool, so the next iteration fetches cookies again.
                    continue
                else:
                    # update adver_times (presumably bumps the per-account
                    # counter checked above -- confirm in Cookies.store_cookies)
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('excepitons happens when send_personal_message {},specific infos are {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)

        else:
            # Only reachable when need_login is False: nothing was sent.
            return None

    crawler.warning('max tries for {},check the target_uid in redis db2'.format(target_uid))
    return ''
Exemplo n.º 4
0
 def test_send_email(self):
     """Smoke test: call ``send_email`` once; passes if nothing is raised."""
     # Imported inside the test, as in the original, so the module is only
     # loaded when this test runs.
     from utils.email_warning import send_email as dispatch_warning_email

     dispatch_warning_email()
Exemplo n.º 5
0
 def test_send_email(self):
     """Smoke test: call ``send_email`` once; passes if nothing is raised."""
     # Imported inside the test, as in the original, so the module is only
     # loaded when this test runs.
     from utils.email_warning import send_email as dispatch_warning_email

     dispatch_warning_email()