Example #1
    def __init__(self,
                 argv,
                 ip_addr,
                 baseline_type="OS",
                 base_dir="/opt/apache-tomcat-8.5.35"):
        self.parse_argv(argv)
        session = HTMLSession()
        session.mount('file://', FileAdapter())
        # On Windows the path separator is a backslash, but get() needs forward slashes, so replace them first
        pwd = os.getcwd().replace("\\", "/")
        # Testing showed the file cannot be read via a relative path; an absolute path is required
        baseline_type = baseline_type.lower()

        self.ip_addr = ip_addr
        self.baseline_type = baseline_type
        self.base_dir = base_dir
        # ip_reg = "(\d{1,3}\.{1}){3}\d{1,3}"
        # full_reg = f"{ip_reg}_{baseline_type}\.html"
        # pwd_file_list = os.listdir()
        # for file in pwd_file_list:
        #     if re.search(full_reg,file):
        #         ip_addr = re.search(ip_reg,file).group()
        self.html_obj = session.get(
            f'file:///{pwd}/../4_report/{ip_addr}_{baseline_type}_report.html')
        self.shell_script_obj = open(
            f"../6_fix/{ip_addr}_{baseline_type}_fix.sh",
            "w+",
            encoding='utf-8',
            newline='\n')
        self.fix_item_list = {}
Example #2
    def _init_session(self):
        sess = HTMLSession()
        adapter = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        sess.mount('http://', adapter)
        sess.mount('https://', adapter)

        return sess
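A minimal usage sketch for the factory above, assuming it lives on a hypothetical owner class; the point is to build the pooled session once and reuse it for every request:

class Crawler:
    def __init__(self):
        # build the pooled session once; all requests share its connection pool
        self.session = self._init_session()

    def fetch(self, url):
        return self.session.get(url, timeout=10)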
Example #3
 def parse_home(self):
     session = HTMLSession()
     session.mount('http://', HTTPAdapter(max_retries=3))
     session.mount('https://', HTTPAdapter(max_retries=3))
     print("parsing: " + self.homepage + "/read-the-story/")
     try:
         with session.get(self.homepage + "/read-the-story/", timeout=(5, 10)) as buf:
             chapters = buf.html.find('#chapters', first=True)
             if chapters is None:
                 return
             chapter_list = chapters.find('.chapter__box')
             for chapter in chapter_list:
                 url = chapter.links.pop()
                 name = re.sub(r'Chapter [\d]*', '',
                               chapter.full_text.strip())
                 name = name.strip()
                 index = re.search(
                     r'Chapter [\d]*', chapter.full_text.strip())
                 index = index.group()
                 chapter = BookChapter(name, index, url)
                 self.chapters.append(chapter)
     except Exception as e:
         print(e)
     print("finish: " + self.homepage + "/read-the-story/")
     session.close()
Example #4
 def parse_chapters(self):
     session = HTMLSession()
     session.mount('http://', HTTPAdapter(max_retries=3))
     session.mount('https://', HTTPAdapter(max_retries=3))
     for chapter in self.chapters:
         html = PAGES_DIR + os.sep + self.name + os.sep + chapter.index + ".html"
         if os.path.exists(html):
             continue
         self.parse_chapter(session, chapter)
         time.sleep(5)
     session.close()
Example #5
class MensaBase(object):

    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language specific endpoints
        # { Language : url-string }
        self.endpoints = endpoints

        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
        self.session = HTMLSession()
        self.session.mount('https://', adapter)

    def retrieve(self, datum=None, language=None, meals=None, emojize=None) -> Plan:
        # overwrite this
        # TODO how to make design more pythonic?
        # In Java terms: abstract class -> two implementation classes
        pass

    # Helper method to make a language-specific request
    def do_request(self, language=Language.DE):
        resp = self.session.get(self.endpoints[language.name])
        code = resp.status_code
        if code != 200:
            logger.warning(f'Non-200 status: {code}')
        logger.debug(f'Status Code: {code}')
        return resp.html

    @staticmethod
    def _normalize_key(k: str) -> str:
        return None if not k else k.strip().lower().replace(' ', '_')

    @staticmethod
    def _strip_additives(text: str) -> str:
        return re.sub(r'\((\s*(\d+)?[a-z]?[,.]?\s*)+\)', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        return re.sub(r'\s{2,}', ' ', text)

    @staticmethod
    def _normalize_orthography(text: str) -> str:
        return re.sub(r'\s,', ',', text)

    @staticmethod
    def _clean_text(text: str) -> str:
        return MensaBase._normalize_orthography(MensaBase._normalize_whitespace(MensaBase._strip_additives(text.strip())))

    @staticmethod
    def _text_replace(text: str) -> str:
        return re.sub('Züricher', "Zürcher", text)
Example #6
def get_signature(user_id):
    """获取所需的签名信息
    
    @oaram: user_id
    @return: signature
    """
    
    session = HTMLSession()    
    signature_url = 'file://' + os.getcwd() + os.sep +'signature.html?user_id=' + str(user_id)
    session.mount("file://", LocalFileAdapter())
    r = session.get(signature_url, headers=MOBIE_HEADERS)
    r.html.render()
    sign = r.html.find('#signature', first=True)
    return sign.text
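A possible call site for the helper above (assuming signature.html and MOBIE_HEADERS exist next to the script); note that r.html.render() needs pyppeteer and downloads Chromium on first use:

if __name__ == '__main__':
    # render the local page headlessly, then print the text of the #signature node
    print(get_signature(10001))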
Example #7
    def session(self) -> HTMLSession:
        """
        Often when using a third-party API you want to verify that the returned response is indeed valid.
        Requests offers the shorthand helper raise_for_status(),
        which asserts that the response HTTP status code is not a 4xx or a 5xx.
        """
        session = HTMLSession()
        adapter = HTTPAdapter(max_retries=self.retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        assert_status_hook = (
            lambda response, *args, **kwargs: response.raise_for_status()
        )
        # the requests library offers a 'hooks' interface
        # where you can attach callbacks on certain parts of the request process.
        session.hooks["response"] = [assert_status_hook]
        return session
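The property references self.retry_strategy without defining it; a plausible definition is urllib3's Retry, sketched here with assumed parameters:

from urllib3.util.retry import Retry

# hypothetical policy: 3 attempts with exponential backoff on transient
# server errors, limited to idempotent methods
# (older urllib3 releases spell allowed_methods as method_whitelist)
retry_strategy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)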
Example #8
def get_report_values() -> tuple:
    r"""获取测试报告中的数据值,用于传参与email中的数据统计
    """
    session = HTMLSession()
    session.mount('file://', FileAdapter())
    filepath = (os.path.join(setting.BASE_DIR, setting.REPORT,
                             'Report.html')).replace("\\", "/")
    html_obj = session.get(f'file:///{filepath}')
    test_pass_pattern = re.findall(
        r'"testPass": \d+,',
        html_obj.html.text)[0].split(':')[1].replace(',', '')
    test_all_pattern = re.findall(r'"testAll": \d+,',
                                  html_obj.html.text)[0].split(':')[1].replace(
                                      ',', '')
    test_fail_pattern = re.findall(
        r'"testFail": \d+,',
        html_obj.html.text)[0].split(':')[1].replace(',', '')
    test_skip_pattern = re.findall(
        r'"testSkip": \d+,',
        html_obj.html.text)[0].split(':')[1].replace(',', '')
    return test_all_pattern, test_pass_pattern, test_fail_pattern, test_skip_pattern
Example #9
def main():
    login_session = login(username, password)
    code, msg = report(login_session)
    code_run, msg_run = runway(login_session)

    nameservers = ['8.8.8.8', "210.39.39.153"]

    session = HTMLSession()
    session.mount('http://', CustomAdapter(nameservers))
    session.mount('https://', CustomAdapter(nameservers))
    if server_chan_enable == 1:
        session.get('https://sctapi.ftqq.com/' + sckey + '.send',
                    params={'text': str(msg) + str(msg_run)},
                    timeout=5)
    if telegram_bot_enable == 1:
        session.get('https://api.telegram.org/bot' + telegram_bot_token +
                    '/sendMessage',
                    params={
                        'chat_id': telegram_chat_id,
                        'text': str(msg) + str(msg_run)
                    },
                    timeout=5)
    print(str(msg) + str(msg_run))
Example #10
# Please refer to `https://api.fanyi.baidu.com/doc/21` for complete api document

import json
import random
from hashlib import md5

# import requests
# basic configuration
from loguru import logger
from requests.adapters import HTTPAdapter
from requests_html import HTMLSession

# ua = UserAgent(use_cache_server=False)
# ua = UserAgent(verify_ssl=False)
session = HTMLSession()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
session.keep_alive = False

# Set your own appid/appkey.
appid = '20190508000295298'
appkey = 'SpZnEM6HliTHK1Mlp96I'

# For list of language codes, please refer to `https://api.fanyi.baidu.com/doc/21`
from_lang = 'en'
to_lang = 'zh'

endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
url = endpoint + path
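The snippet above only sets up the session and constants; per the linked Baidu doc, a request is signed with MD5(appid + query + salt + appkey) and sent through the mounted session. A minimal sketch, with the timeout and result handling as assumptions:

def translate(query: str) -> str:
    salt = random.randint(32768, 65536)
    # sign = MD5 of appid + query + salt + appkey, per the API documentation
    sign = md5(f'{appid}{query}{salt}{appkey}'.encode('utf-8')).hexdigest()
    payload = {'appid': appid, 'q': query, 'from': from_lang, 'to': to_lang,
               'salt': salt, 'sign': sign}
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    r = session.post(url, params=payload, headers=headers, timeout=5)
    result = r.json()
    logger.debug(json.dumps(result, ensure_ascii=False))
    return result['trans_result'][0]['dst']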
Example #11
    def __init__(self, **kwargs):
        '''
        Base class for common scraping tasks

        Args:

        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()

        # delay/expire
        if kwargs.get('delay'):
            self.delay = kwargs['delay']
        else:
            self.delay = 2

        if kwargs.get('expire_hours'):
            self.expire_hours = kwargs['expire_hours']
        else:
            self.expire_hours = 168

        # add cookies
        if kwargs.get('cookies'):
            _s.cookies = kwargs['cookies']
        else:
            try:
                import cookielib
                _s.cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError):
                import http.cookiejar
                _s.cookies = http.cookiejar.MozillaCookieJar()
                
        # add headers
        if kwargs.get('headers'):
            _s.headers = kwargs['headers']
        else:
            ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
            _s.headers = {'User-Agent': ua}

        # add proxies
        if kwargs.get('proxies'):
            _s.proxies = kwargs['proxies']

        # add cache (fall back to a default name under /tmp; 'scraper_cache' is an arbitrary default)
        if '/' not in kwargs.get('cache_name', ''):
            self.cache_name = os.path.join('/tmp', kwargs.get('cache_name') or 'scraper_cache')
        else:
            self.cache_name = kwargs['cache_name']
        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount('http://', CacheControlAdapter(cache=FileCache(self.cache_name), 
                                    cache_etags = False,
                                    heuristic=ExpiresAfter(hours=self.expire_hours)))
        except ImportError as e:
            try:
                import requests_cache
                requests_cache.install_cache(self.cache_name)
            except:
                logging.exception('could not install cache')
        self.s = _s
Example #12
class Worker(multiprocessing.Process):

    def __init__(self, unvisited_urls_queue, fetched_urls_queue, result_queue, counter, config):
        multiprocessing.Process.__init__(self)
        self.unvisited_urls_queue = unvisited_urls_queue
        self.fetched_urls_queue = fetched_urls_queue
        self.result_queue = result_queue
        self.counter = counter
        self.config = config
        self.kwargs = config["kwargs"]
        self.session = HTMLSession()

        a = adapters.HTTPAdapter(
            pool_connections = 100,
            pool_maxsize = 100
        )
        self.session.mount("http://", a)
        self.session.mount("https://", a)

    def get_url_type(self, url, resp):

        for include_snippet in self.config["include"]:
            if include_snippet in url:
                content_type = resp.headers.get('Content-Type', None)
                if content_type and "text/html" in content_type:
                    url_type = 'recursive'
                else:
                    url_type = 'static'

                return url_type
            else:
                continue

        return "external"

    def check_url_info(self, url):
        for exclude_snippet in self.config["exclude"]:
            if exclude_snippet in url:
                status_code = None
                url_type = "exclude"
                return (status_code, url_type)

        try:
            resp = self.session.head(url, **self.kwargs)
            status_code = resp.status_code
            url_type = self.get_url_type(url, resp)
        except exceptions.ConnectTimeout as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectTimeout"
            url_type = None
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectionError"
            url_type = None

        return (status_code, url_type)

    def get_hyper_links(self, url):
        # session.browser
        status_code = None
        hyper_links = set()

        try:
            resp = self.session.get(url, **self.kwargs)
            status_code = resp.status_code
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
            status_code = "ConnectionError"
            # the request itself failed, so resp is undefined; bail out early
            return (status_code, hyper_links)

        try:
            resp.html.render(sleep=1, timeout=30)
            hyper_links = resp.html.absolute_links
        except lxml.etree.ParserError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except UnicodeDecodeError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except MaxRetries as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')

        return (status_code, hyper_links)

    def run(self):
        while True:
            unvisited_url = self.unvisited_urls_queue.get()
            if unvisited_url is None:
                # Poison pill means shutdown
                color_logging(f'{self.name}: Exiting')
                self.unvisited_urls_queue.task_done()
                break

            start_time = time.time()
            status_code, url_type = self.check_url_info(unvisited_url)

            method = "HEAD"
            if url_type in ["exclude"]:
                color_logging(f"skip url: {unvisited_url}", color="blue")
                self.unvisited_urls_queue.task_done()
                continue
            if url_type in ['static', 'external']:
                hyper_links = set()
            elif url_type in ['recursive']:
                method = "GET & Render"
                status_code, hyper_links = self.get_hyper_links(unvisited_url)
            else:
                # url_type is None
                # TODO: raise exception
                hyper_links = set()

            duration_time = time.time() - start_time
            result = (unvisited_url, status_code, duration_time, hyper_links)
            self.result_queue.put(result)

            for link in hyper_links:
                self.fetched_urls_queue.put(link)

            self.unvisited_urls_queue.task_done()
            self.counter.value += 1

            color_logging(f"index: {self.counter.value}, {method} {unvisited_url}, status_code: {status_code}, duration_time: {duration_time}, worker: {self.name}", color="white")
Example #13
    def __init__(self, **kwargs):
        """
        """
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()
        self.delay = kwargs.get("delay", 2)
        self.expire_hours = kwargs.get("expire_hours", 168)

        # add cookies
        if kwargs.get("cookies"):
            _s.cookies = kwargs["cookies"]
        else:
            import http.cookiejar

            _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers
        default_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "accept": "application/json, text/plain, */*",
        }
        _s.headers.update(default_headers)
        if kwargs.get("headers"):
            _s.headers.update(kwargs["headers"])

        # add proxies
        if kwargs.get("proxies"):
            _s.proxies = kwargs["proxies"]

        # add cache
        if not kwargs.get("cache_name"):
            self.cache_name = os.path.join("/tmp", random_string(32))
        elif "/" not in kwargs.get("cache_name", ""):
            self.cache_name = os.path.join("/tmp", kwargs["cache_name"])
        else:
            self.cache_name = kwargs.get("cache_name")

        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache

            _s.mount(
                "http://",
                CacheControlAdapter(
                    cache=FileCache(self.cache_name),
                    cache_etags=False,
                    heuristic=ExpiresAfter(hours=self.expire_hours),
                ),
            )
        except ImportError:
            try:
                import requests_cache

                requests_cache.install_cache(self.cache_name)
            except BaseException:
                logging.exception("could not install cache")
        self.session = _s
Example #14
import os
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from requests.adapters import HTTPAdapter


SITE_URL = 'https://itpanda.net'

session = HTMLSession()
session.mount(SITE_URL, HTTPAdapter(max_retries=5))

PROXIES_SERVER = os.environ.get('PROXIES_SERVER')
PROXIES = {'http': PROXIES_SERVER, 'https': PROXIES_SERVER}


class Spider:
    def get_markdown(self):
        r = session.get(SITE_URL, proxies=PROXIES)

        nav = r.html.find('ul.nav', first=True)
        soup = BeautifulSoup(nav.html, features="lxml")
        ul = soup.find('ul')
        items = ul.find_all('li', recursive=False)
        print('# IT eBOOK')
        for item in items:
            sub_items = item.ul.find_all('li')
            cate_title = item.a.string.split('(')[0].strip()
            cate_link = f'{SITE_URL}{item.a["href"]}'
            print(f'## {cate_title}')
            for sub in sub_items:
                sub_cate_title = sub.a.string.split('(')[0].strip()
Example #15
#     fb_post_id = row[8]
#     csv_writer.writerow([fb_post_id])

# multithread for each post id, request the story url and extract
start_time = datetime.now()
log_format = '%(relativeCreated)8d %(threadName)4s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=log_format)
file_handler = logging.FileHandler('logs/converter.txt', 'w')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(log_format))
logger = logging.getLogger()
logger.addHandler(file_handler)
threading.current_thread().name = 'M'

ses = HTMLSession()
ses.mount('https://', HTTPAdapter(pool_maxsize=2000))
base_url = 'https://m.facebook.com/story.php?story_fbid=%s&id=695707917166339'
user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "Gecko/20100101 Firefox/86.0")
default_headers = {
    'User-Agent': user_agent,
    'Accept-Language': 'en-US,en;q=0.5'
}
ses.headers.update(default_headers)

# prev_data = ses.get('https://github.com/davidchoo12/nuswhispers-analysis/releases/latest/download/data-converted.csv').text
# with open('data-converted.csv', 'w') as fd:
#     fd.write(prev_data)
# last_no = 0
# with open('data-converted.csv', 'r') as fd:
#     csv_reader = csv.reader(fd)
Example #16
import time

from requests_html import HTMLSession
from requests import adapters
from tqdm import tqdm
import filtering
import multiprocessing as mp
import os
import json

session = HTMLSession()
adapter = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount("https://", adapter)
url = "https://dbpedia.org/sparql"

# !!! must be modified when running on other servers
proxy = {"http": "http://127.0.0.1:10809", "https": "http://127.0.0.1:10809"}

with open("meaningless_predicates.txt", "r", encoding="utf-8") as fmp:
    bad_predicates = fmp.readline()


# position
# 0: query by the subject
# 1: query by the predicate
# 2: query by the object
# 3: query by the subject and the predicate
# 4: query by the predicate and the object
def crawl(query: tuple,
          position: int,
          lock: mp.Lock = None,
Example #17
import os

from requests_html import HTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)

    return session.get(url)


def test_file_get():
    r = get()
    assert r.status_code == 200


def test_css_selector():
    r = get()

    about = r.html.find('#about', first=True)

    for menu_item in (
        'About', 'Applications', 'Quotes', 'Getting Started', 'Help',
        'Python Brochure'
    ):
        assert menu_item in about.text.split('\n')
Example #18
import os
from functools import partial

import pytest
import psutil
from pyppeteer.browser import Browser
from pyppeteer.page import Page
from requests_html import HTMLSession, AsyncHTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join(
        (os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)

    return session.get(url)


@pytest.fixture
def async_get(event_loop):
    """ AsyncSession cannot be created global since it will create
        a different loop from pytest-asyncio. """
    async_session = AsyncHTMLSession()
    async_session.mount('file://', FileAdapter())
    path = os.path.sep.join(
        (os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
Example #19
class KaniRequests(object):
    def __init__(self,
                 headers={},
                 proxy={},
                 default_timeout=None,
                 max_retries=3):
        def __init__(self, *args, **kwargs):
            if kwargs["connect"] is None:
                kwargs["connect"] = default_timeout
            if kwargs["read"] is None:
                kwargs["read"] = default_timeout
            return TimeoutSauce.__init__(self, *args, **kwargs)

        DefaultTimeout = type("DefaultTimeout", (TimeoutSauce, ),
                              {"__init__": __init__})

        self.headers = headers
        self.proxy = proxy
        self.session = HTMLSession()
        self.session.headers.update(headers)
        if proxy != {}:
            self.session.proxies = proxy
            # self.session.verify = os.path.join(os.path.dirname(__file__), "FiddlerRoot.pem")
            self.session.verify = None
        self.adapters = requests.adapters.HTTPAdapter(max_retries=max_retries)
        self.adapters.TimeoutSauce = DefaultTimeout
        requests.adapters.TimeoutSauce = DefaultTimeout
        self.session.mount("http://", self.adapters)
        self.session.mount("https://", self.adapters)
        self.yag = None
        self.mail_to = None
        self.subject = None
        self.log = logging.getLogger(self.__class__.__name__)

    def set_error_mailer(self, yag, mail_to, subject):
        self.yag = yag
        self.mail_to = mail_to
        self.subject = subject

    def mount(self, prefix, adapters):
        self.session.mount(prefix, adapters)

    def get(self, url, *args, **kwargs):
        try:
            kwargs["cookies"] = self.session.cookies
            result = self.session.get(url, *args, **kwargs)
            if self.yag is not None:
                if result.status_code != 200:
                    status_code = result.status_code
                    body = f"status_code is not 200 on Get {url=} {args=} {kwargs=}\n"
                    body += f"{status_code=}"
                    self.yag.send(
                        to=self.mail_to,
                        subject=self.subject,
                        contents=body,
                    )
                    self.log.error(
                        "Sending error email because of status_code=%s.",
                        status_code)
            return result
        except Exception as e:
            if self.yag is not None:
                body = f"Error on Get {url=} {args=} {kwargs=}"
                body += "\n[sys.exe_info]\n"
                body += str(sys.exc_info())
                body = "\n[traceback.format_exc]\n"
                body += traceback.format_exc()
                self.yag.send(
                    to=self.mail_to,
                    subject=self.subject,
                    contents=body,
                )
                self.log.error("Sending error email because of Exception=%s.",
                               e)
            raise

    def post(self, url, *args, **kwargs):
        try:
            kwargs["cookies"] = self.session.cookies
            result = self.session.post(url, *args, **kwargs)
            if self.yag is not None:
                if result.status_code != 200:
                    status_code = result.status_code
                    body = f"status_code is not 200 on Get {url=} {args=} {kwargs=}\n"
                    body += f"{status_code=}"
                    self.yag.send(
                        to=self.mail_to,
                        subject=self.subject,
                        contents=body,
                    )
                    self.log.error(
                        "Sending error email because of status_code=%s.",
                        status_code)
            return result
        except Exception as e:
            if self.yag is not None:
                body = f"Error on Get {url=} {args=} {kwargs=}\n"
                body += "\n[sys.exe_info]\n"
                body += sys.exc_info()
                body = "\n[traceback.format_exc]\n"
                body += traceback.format_exc()
                self.yag.send(
                    to=self.mail_to,
                    subject=self.subject,
                    contents=body,
                )
                self.log.error("Sending error email because of Exception=%s.",
                               e)
            raise

    def put(self, url, *args, **kwargs):
        kwargs["cookies"] = self.session.cookies
        return self.session.put(url, *args, **kwargs)

    def delete(self, url, *args, **kwargs):
        kwargs["cookies"] = self.session.cookies
        return self.session.delete(url, *args, **kwargs)

    def close(self):
        self.session.close()

    def cookies_to_dict(self):
        return dict_from_cookiejar(self.session.cookies)

    def add_cookies(self, cookies):
        add_dict_to_cookiejar(self.session.cookies, cookies)
Example #20
def _get_page_posts(path, pages=10, timeout=5, sleep=0, credentials=None, extra_info=False, begin_url=None, max_retries=5):
    """Gets posts for a given account."""
    global _session, _timeout

    _session = HTMLSession()
    _session.headers.update(_headers)
    a = HTTPAdapter(max_retries=max_retries)
    b = HTTPAdapter(max_retries=max_retries)
    _session.mount('http://', a)
    _session.mount('https://', b)

    if credentials:
        _login_user(*credentials)

    _timeout = timeout

    html = None
    cursor_blob = None
    if begin_url:
        try:
            response = _session.get(begin_url, timeout=timeout)
            response.raise_for_status()
            data = json.loads(response.text.replace('for (;;);', '', 1))
        except (RequestException, ValueError):
            return

        for action in data['payload']['actions']:
            if action['cmd'] == 'replace':
                html = HTML(html=action['html'], url=_base_url)
            elif action['cmd'] == 'script':
                cursor_blob = action['code']
        if not html:
            html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
        if not cursor_blob:
            cursor_blob = html.html
    else:
        url = f'{_base_url}/{path}'
        response = _session.get(url, timeout=_timeout)
        html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
        cursor_blob = html.html
    cursor = None
    next_url = None
    try:
        while True:
            for article in html.find('article'):
                try:
                    post = _extract_post(article)
                    if extra_info:
                        post = fetch_share_and_reactions(post)
                    yield post
                except:
                    print(traceback.format_exc())
                    print("But continuing...")

            pages -= 1
            if pages <= 0:
                return

            cursor = _find_cursor(cursor_blob)
            if not cursor:
                return
            next_url = f'{_base_url}{cursor}'

            if sleep:
                time.sleep(sleep)

            try:
                response = _session.get(next_url, timeout=timeout)
                response.raise_for_status()
                data = json.loads(response.text.replace('for (;;);', '', 1))
            except (RequestException, ValueError):
                raise

            for action in data['payload']['actions']:
                if action['cmd'] == 'replace':
                    html = HTML(html=action['html'], url=_base_url)
                elif action['cmd'] == 'script':
                    cursor_blob = action['code']
    except:
        print(f"Current url: {next_url}")
        print("-----------------------------------")
        print(f"Current page (total-cur_page): {pages}")
        print("-----------------------------------")
        print("Traceback:")
        print(traceback.format_exc())
        raise
Example #21
    def download_resource(self):

        text = ''  # default to empty string

        # Was this file already downloaded?
        if (len(self.content_type) >= 1):
            print("ALREADY DOWNLOADED.")
            return self.request_dict

        # Is the file cached locally?

        # Does this already exist in database? ***************************************

        file_dict = get_from_cache(self.url_protocol_removed())
        if (len(file_dict['text']) > 0):
            self.request_dict = file_dict
            return file_dict['text']

        # --------- Download file from internet -------------
        try:
            self.increment_num_downloads()
            error = ''
            url = self.url

            # Use a User Agent to simulate what a Firefox user would see
            # session = requests.Session()
            session = HTMLSession()

            retry = Retry(connect=5, backoff_factor=0.5)
            adapter = HTTPAdapter(max_retries=retry)
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            try:
                r = session.get(url, headers=HEADERS, verify=False)

            # Invalid URL
            except requests.exceptions.MissingSchema:
                return {
                    'text': '',  # unicode
                    'unicode': '',
                    'content': '',  # raw
                    'encoding': '',
                    'error': "Connection refused",
                    'language': '',
                    'content_type': ''
                }

            except requests.exceptions.ConnectionError:
                # r.status_code = "Connection refused"
                return {
                    'text': '',  # unicode
                    'unicode': url,
                    'content': url,  # raw
                    'encoding': '',
                    'error': "Connection refused",
                    'language': '',
                    'content_type': ''
                }

            print('Downloaded ' + url)
            self.request_stop = datetime.now()
            print("Encoding: %s" % r.encoding)
            print("num downloads: " + str(self.num_downloads))

            # Correct the Character Encoding

            if url in self.url_encoding_hardcoded():
                hardcoded_encoding = self.url_encoding_hardcoded()[url]
                r.encoding = hardcoded_encoding

            text = r.text

            self.unicode = r.text
            self.content = r.content

            self.encoding = r.encoding
            self.error = error
            self.language = detect(
                r.text
            )  # https://www.geeksforgeeks.org/detect-an-unknown-language-using-python/

            if 'Content-Type' in r.headers.keys():
                self.content_type = r.headers['Content-Type']
            else:
                self.content_type = 'application/html'
                print(r.headers
                      )  # TODO: Research why content-type is not always set

            print('Content-Type: ' + self.content_type)
            print('Language:     ' + self.language)
            print('Length: ' + str(len(self.content)))
            print("Attempting to save ..  ")

            print(self.unicode)

            ####### Archive a Copy of the Original File ########
            doc_type = self.doc_type()
            print("DocType::: " + doc_type)

            if (doc_type == 'pdf'):
                text = r.content  # file contents

            if (settings.SAVE_DOWNLOADS_TO_FILE):
                write_format = 'w'

                local_filename = self.filename_original()
                remote_path = ''.join(["archive/", self.canonical_url()])
                content_type = self.content_type
                if not content_type:
                    content_type = 'text/html'

                # Create Directory if it doesn't exist
                dirname = os.path.dirname(local_filename)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                if (content_type == 'application/pdf'):
                    text = self.content
                    write_format = 'wb'

                # Archive file
                print(
                    "ARCHIVE: local filename----------------------------------"
                )
                print(local_filename)

                my_file = Path(local_filename)
                if my_file.is_file():
                    pass

                else:
                    try:
                        with open(local_filename, write_format) as f:
                            f.write(text)
                    except IsADirectoryError:
                        pass

                # Add file extension if there is none
                filename, file_extension = os.path.splitext(remote_path)
                if not ((file_extension == '.html') or
                        (file_extension == '.htm')):
                    remote_path = os.path.splitext(
                        remote_path)[0] + 'index.html'
                    print("Remote path:")
                    print(remote_path)

                else:
                    remote_path = remote_path + '.' + doc_type

                save_file_to_cloud(local_filename, remote_path, content_type,
                                   'gzip')

            print("Saved original: " + self.filename_original())
            print('Content-Type: ' + self.content_type)
            print('Language:     ' + self.language)
            print('Length: ' + str(len(r.content)))

        except requests.HTTPError:
            self.request_stop = datetime.now()
            """ TODO: Add better error tracking """
            error = "document: HTTPError"

        self.request_dict = {
            'text': text,  # unicode
            'unicode': self.unicode,
            'content': self.content,  # raw
            'encoding': self.encoding,
            'error': self.error,
            'language': self.language,
            'content_type': self.content_type
        }

        # SAVE DB: TODO *********************************

        return self.request_dict
Example #22
def retry_session(url):
    session = HTMLSession()
    session.mount(url, HTTPAdapter(max_retries=5))
    return session
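A possible usage of the helper above; HTTPAdapter's max_retries only retries failed connections, and the mount prefix only matches URLs that start with the given string:

s = retry_session('https://example.com')
r = s.get('https://example.com/page')
r.raise_for_status()
s.close()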