Exemplos de HTMLSession.request em Python, exemplos de requests_html.HTMLSession.request em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: jsrequest.py Projeto: kostiukoleg/python-parser

def fetch(url):
    """Fetch ``url`` with a throw-away ``HTMLSession`` and return its rendered HTML.

    A random User-Agent header is set, the call sleeps for ``uniform(1, 6)``
    seconds, and the GET request is issued without following redirects.

    Returns:
        ``r.html`` (after ``render``) when the response status is 200;
        the empty string ``""`` when the request fails, rendering fails or
        any other status code is returned.
    """
    data = ""
    asession = HTMLSession()
    # A single ``finally`` guarantees the session is closed on every path,
    # including failures while configuring it or while sleeping.
    try:
        asession.headers.update({'User-Agent': fake_useragent.UserAgent().random})
        asession.max_redirects = 60
        time.sleep(uniform(1, 6))
        try:
            r = asession.request('GET', url, allow_redirects=False)
        except Exception as e:
            print('Failed to get page %s. Reason: %s' % (url, e))
            return data
        if r.status_code != 200:
            return data
        try:
            r.html.render(sleep=2, timeout=200)
        except Exception as e:
            print('Failed to render page %s. Reason: %s' % (url, e))
            return data
        return r.html
    finally:
        asession.close()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: dojo_requests.py Projeto: tcmRyan/DojoMiner

class DojoRequests:
    """
    Thin wrapper around an ``HTMLSession`` for calling the dojo/gdp API.

    Every request is sent with the cookies read from the local browser
    (Firefox when ``config.get('browser') == 'firefox'``, Chrome otherwise).
    All other keyword arguments are passed straight to ``session.request``;
    see the Requests library documentation for their meaning.
    """

    def __init__(self):
        self.session = HTMLSession()
        use_firefox = config.get('browser') == 'firefox'
        load_cookies = browser_cookie3.firefox if use_firefox else browser_cookie3.chrome
        self.cookies = load_cookies()

    def request(self, method, url, **kwargs):
        """Send ``method`` (upper-cased) to ``url``; ``kwargs`` override the defaults."""
        call_args = dict(url=url, method=method.upper(), cookies=self.cookies)
        # Caller-supplied values (e.g. explicit ``cookies``) take precedence.
        call_args.update(kwargs)
        return self.session.request(**call_args)

    def get(self, url, **kwargs):
        """Shortcut for ``request('GET', url, **kwargs)``."""
        return self.request('GET', url, **kwargs)

    def post(self, url, **kwargs):
        """Shortcut for ``request('POST', url, **kwargs)``."""
        return self.request('POST', url, **kwargs)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: test_web_control.py Projeto: RCQuinn/reverse-image-scraper

def test_request_to_bs4__strainer_missing(mock_soup):
    """With ``None`` as first argument, ``mock_soup`` is called once with the text and "lxml"."""
    session = HTMLSession()
    with patch.object(session, 'request') as fake_request:
        fake_request.return_value.text = "some_HTML_code"
        fake_response = session.request("method", "http://url")
        web_control.request_to_bs4(None, fake_response)

    mock_soup.assert_called_once_with("some_HTML_code", "lxml")

Exemplo n.º 4

0

Exibir arquivo

def session(populated_cluster):
    """Return an ``HTMLSession`` whose ``request`` prepends the cluster URL to every URL.

    The base URL is ``populated_cluster["url"]`` with trailing slashes removed.
    """
    base_url = populated_cluster["url"].rstrip("/")
    http = HTMLSession()

    def _prefixed(prefix, send, method, url, *args, **kwargs):
        # ``send`` is the session's original bound ``request`` method.
        return send(method, prefix + url, *args, **kwargs)

    http.request = partial(_prefixed, base_url, http.request)
    return http

Exemplo n.º 5

0

Exibir arquivo

Arquivo: grequests.py Projeto: yxm0513/grequests

class AsyncRequest(object):
    """ Deferred request that is performed when :meth:`send` is called.

    Accepts the same parameters as ``Session.request`` plus:

    :param session: Session used to perform the request; when omitted a
                    private one is created and closed after sending
    :param callback: Callable invoked on the response.
                     Same as passing ``hooks={'response': callback}``
    """
    def __init__(self, method, url, **kwargs):
        #: Request method
        self.method = method
        #: URL to request
        self.url = url

        supplied_session = kwargs.pop('session', None)
        owns_session = supplied_session is None
        #: Associated ``Session``
        self.session = Session() if owns_session else supplied_session
        # Only sessions created here are closed after sending; a
        # user-provided session is left open for reuse.
        self._close = owns_session

        on_response = kwargs.pop('callback', None)
        if on_response:
            kwargs['hooks'] = {'response': on_response}

        #: The rest arguments for ``Session.request``
        self.kwargs = kwargs
        #: Resulting ``Response``
        self.response = None

    def send(self, **kwargs):
        """
        Perform the request using the constructor arguments, overridden by
        any ``kwargs`` given here, and store the result in :attr:`response`.

        On failure the exception is stored in ``self.exception`` and the
        formatted traceback in ``self.traceback`` instead of being raised.

        :returns: this ``AsyncRequest`` instance
        """
        call_kwargs = dict(self.kwargs)
        call_kwargs.update(kwargs)
        try:
            self.response = self.session.request(self.method, self.url,
                                                 **call_kwargs)
        except Exception as error:
            self.exception = error
            self.traceback = traceback.format_exc()
        finally:
            # A session created by this object will not be reused, so
            # release it as soon as the request is done.
            if self._close:
                self.session.close()
        return self

Exemplo n.º 6

0

Exibir arquivo

def verfiy_localbitcoins(username, lbc_username):
    """Check whether a LocalBitcoins profile page shows the user's verification code.

    Args:
        username: Passed to ``get_verification_code`` to obtain the expected code.
        lbc_username: LocalBitcoins account name whose profile page is fetched.

    Returns:
        ``True`` when the expected code occurs in the text of the
        ``.overflow-catch`` element; ``False`` when the profile check fails,
        the element is missing, or the code is not present.
    """
    session = HTMLSession()
    try:
        # Fake user agent to reduce chance of getting seen as a bot
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0', }
        response = session.request(url=f'https://localbitcoins.com/accounts/profile/{lbc_username}/', method='GET', headers=headers)

        # If the profile is invalid it will redirect to home
        if response.html.next() == 'https://localbitcoins.com/':
            return False

        code_area = response.html.find('.overflow-catch', first=True)
        # ``find(..., first=True)`` yields None when nothing matches; treat a
        # page without the element as "not verified" instead of crashing.
        if code_area is None:
            return False

        return get_verification_code(username) in code_area.text
    finally:
        session.close()

Exemplo n.º 7

0

Exibir arquivo

Arquivo: mm_req_html_pic.py Projeto: wahlmzr/craw

class MM(object):
    """Downloads the thumbnail images listed on the first 11 list pages of a gallery site."""

    def __init__(self):
        self.__page = 1
        self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
        self.__session = HTMLSession()
        self.__headers = {
            'Referer':
            'http://www.mm131.com/qingchun/',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.__imagePath = r'D:/Photo/MM'
        self.__confirmPath()

    def __confirmPath(self):
        """Create the image directory when it does not exist yet."""
        if not os.path.exists(self.__imagePath):
            os.makedirs(self.__imagePath)

    def download(self, link, fileName):
        """Fetch ``link`` and store the body as ``<imagePath>/<fileName>.jpg``.

        Any exception is printed and swallowed so one bad image does not stop
        the crawl.
        """
        try:
            # Fetch first: opening the file before the request would leave an
            # empty .jpg behind whenever the download fails.
            content = self.__session.request('get',
                                             link,
                                             headers=self.__headers,
                                             allow_redirects=False).content
            with open(self.__imagePath + '/' + fileName + '.jpg', 'wb') as f:
                f.write(content)
        except Exception as e:
            print(str(e))

    def parseData(self):
        """Walk list pages up to page 11 and download every listed image.

        The last ``dd`` entry of each page is skipped. The elapsed time is
        printed at the end.
        """
        start = time.time()
        while self.__page < 12:
            # Page 1 lives at the section root; later pages use list_1_<n>.html.
            if self.__page == 1:
                self.__url = "http://www.mm131.com/qingchun/"
            else:
                self.__url = 'http://www.mm131.com/qingchun/list_1_{}.html'.format(
                    self.__page)
            r = self.__session.get(self.__url)
            main = r.html.find(".main", first=True)
            dl = main.find('dl')[0]
            dds = dl.find('dd')
            for dd in dds[:-1]:
                attr = dd.find('img')[0].attrs
                imageLink = attr['src']
                title = attr['alt']
                self.download(imageLink, title)
            self.__page += 1
        end = time.time() - start
        # The printed label means "crawl time".
        print("爬取时间:", end)

Exemplo n.º 8

0

Exibir arquivo

def parse_course_list(request):
    """Replace all stored ``Course`` rows with the courses listed on the OCW index page.

    Returns an ``HttpResponse`` with a short status message; unauthenticated
    users get "Not authenticated." and nothing is changed.
    """
    if not request.user.is_authenticated():
        return HttpResponse("Not authenticated.")

    listing = HTMLSession().request(method='get',
                                    url='https://ocw.mit.edu/courses/')
    course_elements = listing.html.find('.course_title')

    # Clear out old courses, then run the sequence-reset SQL for the table.
    Course.objects.all().delete()
    reset_statements = connection.ops.sequence_reset_sql(no_style(), [Course])
    with connection.cursor() as cursor:
        for statement in reset_statements:
            cursor.execute(statement)

    # One row per listed course, keyed by one of its absolute links.
    for element in course_elements:
        Course(url=element.absolute_links.pop()).save()

    return HttpResponse("Successfully added all ")

Exemplo n.º 9

0

Exibir arquivo

Arquivo: mm_req_html_pic.py Projeto: wahlmzr/craw

class MM(object):
    """Crawler that saves the images listed on gallery list pages 1 to 11."""

    def __init__(self):
        self.__page = 1
        self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
        self.__session = HTMLSession()
        self.__headers = {
            'Referer': 'http://www.mm131.com/qingchun/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
        }
        self.__imagePath = r'D:/Photo/MM'
        self.__confirmPath()

    def __confirmPath(self):
        """Make sure the image directory exists."""
        if os.path.exists(self.__imagePath):
            return
        os.makedirs(self.__imagePath)

    def download(self, link, fileName):
        """Write the body of ``link`` to ``<imagePath>/<fileName>.jpg``; errors are printed."""
        try:
            target = self.__imagePath + '/' + fileName + '.jpg'
            with open(target, 'wb') as out:
                response = self.__session.request(
                    'get', link, headers=self.__headers, allow_redirects=False)
                out.write(response.content)
        except Exception as err:
            print(str(err))

    def parseData(self):
        """Visit each list page in turn, download its images and print the elapsed time."""
        started = time.time()
        while self.__page < 12:
            # The first page is the section root; the rest are numbered.
            self.__url = (
                "http://www.mm131.com/qingchun/"
                if self.__page == 1
                else 'http://www.mm131.com/qingchun/list_1_{}.html'.format(self.__page)
            )
            listing = self.__session.get(self.__url)
            container = listing.html.find(".main", first=True)
            entries = container.find('dl')[0].find('dd')
            # The last <dd> of every page is skipped.
            for entry in entries[:-1]:
                img_attrs = entry.find('img')[0].attrs
                self.download(img_attrs['src'], img_attrs['alt'])
            self.__page += 1
        elapsed = time.time() - started
        # The printed label means "crawl time".
        print("爬取时间:", elapsed)

Exemplo n.º 10

0

Exibir arquivo

def toRequestHtml(url):
    """Print the page at ``url`` and, for each listed item, the entries of its linked page.

    For every ``div.list-item-desc-top`` element the first ``<a>`` is taken,
    its protocol-relative ``href`` is fetched with an ``https:`` prefix, and
    the text of each ``div.d-left`` element on that page is printed.
    Items without a link are skipped.
    """
    print(url)
    session = HTMLSession()
    r = session.request("GET", url, headers=headerTwo)
    print(r.html.html)

    about = r.html.find('div.list-item-desc-top')
    for i, title in enumerate(about):
        # Search the element itself: ``title.html`` is a plain string, so
        # ``title.html.find('a')`` would be ``str.find`` and return an int.
        aElement = title.find('a', first=True)
        if aElement is None:
            continue
        href = aElement.attrs.get("href")
        if not href:
            continue
        nextUrl = f'https:{href}'
        nextHtml = session.get(nextUrl)
        print(f'{i + 1} [{aElement.text}](https:{href})')
        nextAbout = nextHtml.html.find('div.d-left')
        for j, nextTitle in enumerate(nextAbout):
            print(f'{j + 1} [{nextTitle.text}]')

Exemplo n.º 11

0

Exibir arquivo

class SkywardAPI():
    """Class for Skyward data retrieval.

    Parameters
    ----------
    service: str
        Skyward service for school.
    timout: int
        Request timeout (the default is 60)

    Attributes
    ----------
    timeout : int
        Seconds until request times out.
    base_url: str
        Base url for requests
    login_url: str
        URL for login.
    session_params : Dict[str, Any]
        Parameters for session.

    """
    def __init__(self, service: str, timeout: int = 60) -> None:
        """Store the service URLs and timeout, and open a fresh HTMLSession."""
        service_root = "https://skyward.iscorp.com/scripts/wsisa.dll/WService={0}".format(
            service)
        self.base_url = service_root
        self.login_url = service_root + "/skyporthttp.w"
        self.timeout = timeout
        self.session_params = {}  # type: Dict[str, str]
        self.session = HTMLSession()

    def edit_srcs(self, page: HTMLResponse) -> HTML:
        """Prefix single-quoted ``src``/``href`` values in a page with the base url.

        Parameters
        ----------
        page : HTMLResponse
            HTMLResponse from a request to skyward.

        Returns
        -------
        HTML
            HTML object built from the rewritten text.

        Side Effects
        ------------
        The returned HTML object is bound to self.session. If rendering, make
        sure to close the session so chromiums do not pile up.

        """
        # Rewriting the attribute values makes sure that all requests are made
        # to the skyward site and not the local computer.
        src_prefix = "src='{0}/".format(self.base_url)
        href_prefix = "href='{0}/".format(self.base_url)
        rewritten = page.text.replace("src='", src_prefix)
        rewritten = rewritten.replace("href='", href_prefix)
        return HTML(html=rewritten, session=self.session)

    def timed_request(self,
                      url: str,
                      data: Dict[str, str] = {},
                      headers: Dict[str, str] = {},
                      method: str = "post",
                      params: Dict[str, str] = {}) -> HTMLResponse:
        """Issues a requests-html request, retrying on connection errors until
            ``self.timeout`` seconds have passed.

        Parameters
        ----------
        url : str
            URL for request.
        data : Dict[str, str]
            Data for request (the default is {}).
        headers : Dict[str, str]
            Headers for request (the default is {}).
        method : str
            Method of request (the default is "post").
        params : Dict[str, str]
            Params for request (the default is {}).

        Returns
        -------
        HTMLResponse
            Response of request.

        Raises
        -------
        SkywardError
            Connection errors persisted past the timeout.

        Side Effects
        ------------
        Closes self.session and regenerates it after every attempt.
        """
        first_attempt_at = time.time()
        while True:
            try:
                response = self.session.request(method,
                                                url,
                                                data=data,
                                                headers=headers,
                                                params=params)
            except requests.exceptions.ConnectionError:
                if time.time() > first_attempt_at + self.timeout:
                    raise SkywardError('Request to Skyward failed.')
                # Wait a second before the next attempt.
                time.sleep(1)
            else:
                return response
            finally:
                # Runs on success, on retry and on failure alike.
                self.session.close()
                self.session = HTMLSession()

    def login(self, username: str, password: str) -> Dict[str, Any]:
        """Logs into Skyward and retrieves session data.

        Parameters
        ----------
        username: str
            Skyward username.
        password: str
            Skyward password.

        Returns
        -------
        Dict[str, Any]
            Login data for skyward, as produced by ``parse_login_text``.

        Raises
        -------
        ValueError
            Incorrect username or password.
        SkywardError
            Unable to connect to Skyward, or Skyward keeps returning no data.

        """
        # Copy the shared request template so the credentials are not written
        # into (and left behind in) the module-level configuration.
        # NOTE(review): assumes skyward_req_conf is a flat mapping - confirm.
        params = dict(skyward_req_conf)
        params["codeValue"] = username
        params["login"] = username
        params["password"] = password

        req = self.timed_request(self.login_url, data=params)
        text = req.html.text
        times = 0
        # Sometimes a request does not go through on the first try.
        # Looping to make sure the api catches this, if it occurs.
        while text == "" and times <= 5:
            req = self.timed_request(self.login_url, data=params)
            text = req.html.text
            times += 1

        # Checked after the retries so a rejection that only shows up on a
        # retried response is reported as bad credentials as well.
        if "Invalid" in text:
            raise ValueError("Incorrect username or password")
        if text == "":
            raise SkywardError("Skyward returning no login data.")

        return parse_login_text(self.base_url, text)

    def setup(self, username: str, password: str) -> None:
        """Logs in and stores the resulting login data and session parameters.

        Parameters
        ----------
        username : str
            Skyward username.
        password : str
            Skyward password.
        """
        self.login_data = self.login(username, password)
        # get_session_params reads self.login_data, so it must be set first.
        self.session_params = self.get_session_params()

    @staticmethod
    def from_username_password(username: str,
                               password: str,
                               service: str,
                               timeout: int = 60) -> "SkywardAPI":
        """Builds a SkywardAPI for ``service`` and runs ``setup`` with the credentials.

        Parameters
        ----------
        username : str
            Skyward username.
        password : str
            Skyward password.
        service : str
            Skyward service.
        timeout : int
            Timeout of requests made to Skyward (the default is 60).

        Returns
        -------
        SkywardAPI
            API object logged in with supplied credentials.

        Raises
        -------
        ValueError
            Incorrect username and password (from setup).
        SkywardError
            Unable to connect to Skyward (from setup).

        """
        logged_in = SkywardAPI(service, timeout=timeout)
        logged_in.setup(username, password)
        return logged_in

    @staticmethod
    def from_session_data(service: str,
                          sky_data: Dict[str, str],
                          timeout: int = 60) -> "SkywardAPI":
        """Generates an API given a service and session data.

        Parameters
        ----------
        service : str
            Skyward service to be used.
        sky_data : Dict[str, str]
            Session data from skyward; must contain "encses" and "sessid".
            It becomes the api's ``session_params`` and is updated in place
            with the values read from the rendered page.
        timeout : int
            Timeout of requests made to Skyward (the default is 60).

        Returns
        -------
        SkywardAPI
            An api for the user, given the session info.

        Raises
        -------
        SessionError
            If session credentials are revoked by Skyward, error is raised.

        Side Effects
        ------------
        Closes and regenerates the new api's session.

        """
        api = SkywardAPI(service, timeout=timeout)
        api.session_params = sky_data
        grade_url = api.base_url + "/sfhome01.w"
        sessionp = api.session_params
        req3 = api.timed_request(grade_url,
                                 data={
                                     "encses": sessionp["encses"],
                                     "sessionid": sessionp["sessid"]
                                 })
        new_html = api.edit_srcs(req3)
        try:
            other_data = new_html.render(script="""
                () => {
                    return {
                        dwd: sff.getValue('dwd'),
                        nameid: sff.getValue('nameid'),
                        wfaacl: sff.getValue('wfaacl'),
                    }
                }
            """,
                                         retries=2,
                                         timeout=2.5,
                                         keep_page=False)
        except MaxRetries:
            raise SessionError("Session destroyed by Skyward.")
        finally:
            # Close the session used for rendering on failure as well, so the
            # browser it started is not left running.
            api.session.close()
        api.session = HTMLSession()
        api.session_params.update(other_data)

        return api

    def get_session_params(self) -> Dict[str, str]:
        """Requests the post-login page and extracts the session variables.

        Returns
        -------
        Dict[str, str]
            "sessid" and "encses" read from the page, plus "dwd", "nameid"
            and "wfaacl" copied from the stored login parameters.

        """
        ldata = self.login_data
        page = self.timed_request(ldata["new_url"], data=ldata["params"]).html
        try:
            session_id = page.find("#sessionid", first=True).attrs["value"]
            enc_session = page.find("#encses", first=True).attrs["value"]
        except AttributeError:
            # Again, sometimes this doesn't work on the first try, so ask
            # again. NOTE(review): the retry has no upper bound.
            return self.get_session_params()

        login_params = ldata["params"]
        return {
            "sessid": session_id,
            "encses": enc_session,
            "dwd": login_params["dwd"],
            "nameid": login_params["nameid"],
            "wfaacl": login_params["wfaacl"],
        }

    def get_class_grades(self, sm_grade: Element, grid_count: int,
                         constant_options: Dict[str, str], url: str,
                         sm_num: int) -> SkywardClass:
        """Gets class grades given elements and request options.

        Parameters
        ----------
        sm_grade : Element
            HTML element containing request information in its ``data-*``
            attributes.
        grid_count : int
            Grid count parameter on page (not used by this method).
        constant_options : Dict[str, str]
            Constant options provided to ensure valid request. Left
            unmodified.
        url : str
            Request url.
        sm_num : int
            Semester number in question.

        Returns
        -------
        SkywardClass
            Grades from a class, sorted by date.

        """
        attrs = sm_grade.attrs
        # Build the payload on a copy so the caller's dict, which is shared
        # between classes, is never mutated.
        grade_request_data = dict(constant_options)
        grade_request_data.update({
            "corNumId": attrs["data-cni"],
            "gbId": attrs["data-gid"],
            "stuId": attrs["data-sid"],
            "section": attrs["data-sec"],
            "entityId": attrs["data-eid"]
        })

        grade_req = self.timed_request(url,
                                       data=grade_request_data,
                                       params={"file": "sfgradebook001.w"})
        text = grade_req.text

        # Cut the markup out of the response: it starts right after the
        # "<![CDATA[" marker and runs up to the first "]]".
        start_split = text.find("<![CDATA[") + len("<![CDATA[")
        end_split = text.find("]]")
        text_split = text[start_split:end_split + 1]

        doc = HTML(html=text_split)

        class_name = doc.find(".gb_heading", first=True).text
        class_name = class_name.replace("\xa0", " ")

        sky_class = SkywardClass(class_name, [])

        semester_info = doc.find("th", first=True)
        date_range = semester_info.find("span", first=True).text
        date_range = date_range.replace("(", "").replace(")", "")

        # Date range looks like "(START - END)" so removing ( ) and splitting
        # gives the start date.
        sem_start_date = date_range.split(" - ")[0]

        # The first ".odd" row carries the semester letter grade and percent
        # on separate lines.
        sem_grade = doc.find(".odd", first=True)
        sem_grade_spl = sem_grade.text.split("\n")
        sem_lg = sem_grade_spl[0]
        sem_percent = sem_grade_spl[1]
        sem_asign = Assignment("SEM{0}".format(sm_num), sem_percent, "100",
                               sem_lg, sem_start_date)
        sky_class.add_grade(sem_asign)

        # Of the cells with this exact style, the first is used for the
        # major (category) grades and the second for the assignments.
        style_str = "padding-right:4px"
        scope = [
            row for row in doc.find("td")
            if "style" in row.attrs and row.attrs["style"] == style_str
        ]
        scope_major = scope[0]
        scope_grades = scope[1]

        list_of_grades = scope_grades.find(".even") + scope_grades.find(".odd")
        list_of_major_grades = scope_major.find(".even") + scope_major.find(
            ".odd")
        assignments = [
            assignment for assignment in list_of_grades
            if "zebra-same" not in assignment.attrs
        ]

        major_grades = [
            grade for grade in list_of_major_grades if
            "zebra-same" in grade.attrs and grade.attrs["zebra-same"] == "true"
        ]

        for assignment in assignments:
            assignment_info = assignment.find("td")
            try:
                date = assignment_info[0].text
                name = assignment_info[1].text
            except IndexError:
                # Rows without a date and a name cell are skipped.
                continue
            try:
                lg = assignment_info[2].text
                point_str = assignment_info[4].text
                point_str_spl = point_str.split(" out of ")
                earned = point_str_spl[0]
                out_of = point_str_spl[1]
                assign = Assignment(name, earned, out_of, lg, date)
            except IndexError:
                # No usable score information: record "*" placeholders.
                assign = Assignment(name, "*", "*", "*", date)
            sky_class.add_grade(assign)

        for grade in major_grades:
            grade_info = grade.find("td")
            try:
                # The description is split on ":"; the name is the part
                # before it and the letter grade is the first character
                # after it.
                desc = grade_info[0].text
                desc = desc.replace("\n", "")
                colon_split = desc.split(":")
                name = colon_split[0]
                lg = colon_split[1][0]
            except IndexError:
                continue
            try:
                grade_data = grade_info[2].text
                str_split = grade_data.split(" out of ")
                earned = str_split[0]
                out_of = str_split[1]
                sky_class.add_grade(
                    Assignment(name, earned, out_of, lg, sem_start_date))
            except IndexError:
                sky_class.add_grade(
                    Assignment(name, "*", "*", "*", sem_start_date))

        sky_class.sort_grades_by_date()
        return sky_class

    def get_semester_grades(self, semester_num: int,
                            page: HTML) -> List[SkywardClass]:
        """Gets the grades of every class that has a button for the semester.

        Parameters
        ----------
        semester_num : int
            1 or 2 for first or second semester.
        page : HTML
            HTML Grade page to get buttons/links/etc.

        Returns
        -------
        List[SkywardClass]
            List of class grades.

        """
        sessionp = self.session_params
        semester_label = "SM{0}".format(semester_num)
        # Keep only the grade buttons that belong to the requested semester.
        semester_buttons = [
            candidate for candidate in page.find("#showGradeInfo")
            if candidate.attrs["data-lit"] == semester_label
        ]

        loader_url = "{0}/httploader.p".format(self.base_url)
        # Options sent unchanged with every per-class request.
        shared_options = {
            "encses": sessionp["encses"],
            "sessionid": sessionp["sessid"],
            "ishttp": "true",
            "fromHttp": "yes",
            "action": "viewGradeInfoDialog",
            "bucket": "SEM {0}".format(semester_num)
        }
        grid_count = 1

        return [
            self.get_class_grades(button, grid_count, shared_options,
                                  loader_url, semester_num)
            for button in semester_buttons
        ]

    def get_grades(self) -> List[SkywardClass]:
        """Gets grades from both semesters.

        Returns
        -------
        List[SkywardClass]
            Grades from both semesters.

        Raises
        ------
        SessionError
            If the session is destroyed, no data can be received. Raised when
            the page reports a timed-out/expired session or when no grades
            are found at all.

        Side Effects
        ------------
        Closes and regenerates self.session.

        """
        grade_url = self.base_url + "/sfgradebook001.w"
        sessionp = self.session_params
        req1 = self.timed_request(grade_url,
                                  data={
                                      "encses": sessionp["encses"],
                                      "sessionid": sessionp["sessid"]
                                  })
        new_html = self.edit_srcs(req1)
        page_text = new_html.text
        if "Your session has timed out" in page_text or "session has expired" in page_text:
            raise SessionError("Session destroyed. Session timed out.")

        try:
            new_html.render()
            grades = self.get_semester_grades(1, new_html)
            grades += self.get_semester_grades(2, new_html)
        finally:
            # Release the session even when rendering or parsing fails.
            self.session.close()
            self.session = HTMLSession()

        # ``grades`` is a list, so test for emptiness rather than comparing
        # against ``{}`` (which is never equal to a list).
        if not grades:
            raise SessionError("Session destroyed. No grades returned.")
        return grades

    def get_grades_text(self) -> Dict[str, List[str]]:
        """Maps each class title to its grades rendered as text.

        Returns
        -------
        Dict[str, List[str]]
            ``grades_to_text()`` of every class from get_grades(), keyed by
            ``skyward_title()``.

        """
        text_by_title = {}
        for class_grades in self.get_grades():
            as_text = class_grades.grades_to_text()
            text_by_title[class_grades.skyward_title()] = as_text
        return text_by_title

    def get_grades_json(self) -> Dict[str, List[Dict[str, Any]]]:
        """Maps each class title to its grades as plain dictionaries.

        Returns
        -------
        Dict[str, List[Dict[str, Any]]]
            For every class from get_grades(), the ``__dict__`` of each grade
            object, keyed by ``skyward_title()``.

        """
        json_by_title = {}
        for class_grades in self.get_grades():
            grade_dicts = [entry.__dict__ for entry in class_grades.grades]
            json_by_title[class_grades.skyward_title()] = grade_dicts

        return json_by_title

    def keep_alive(self) -> None:
        """Sends a "mySession" GET request built from the stored session parameters.

        The response is discarded.
        """
        sessionp = self.session_params
        payload = {
            "dwd": sessionp["dwd"],
            "idleTimeout": 300000,
            "myIdleSeconds": 60,
            "nameid": sessionp["nameid"],
            "requestAction": "mySession",
            "wfaacl": sessionp["wfaacl"]
        }
        self.timed_request(self.base_url + "/qsuprhttp000.w?",
                           data=payload,
                           method="get")

Exemplo n.º 12

0

Exibir arquivo

from requests_html import HTMLSession
import pymongo
import json
from util import *


if __name__ == "__main__":
    # For every date in the range, fetch the box-office payload and store its
    # 'data' field on the document for that date (created when missing).
    session = HTMLSession()
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    database = client["film"]
    connection = database["area_detail_data"]
    for day in get_date_str("2011-01-01", "2019-10-18"):
        url = 'http://www.films.cn/api/top/theater/boxoffice/local?date=%s&size=30' % day
        payload = json.loads(session.request("GET", url).content)
        data_theater = payload['data']
        print(data_theater)
        connection.update_one(
            {'time': day},
            {'$set': {'data_theater': data_theater}},
            upsert=True,
        )
        print("=======")

Exemplo n.º 13

0

Exibir arquivo

_HELP_MODAL_MARKER = '<!--googleoff: index-->'


def _strip_help_modals(section_html):
    """Return *section_html* cut off at the help-modal marker.

    The help modals follow the ``<!--googleoff: index-->`` comment.  When the
    marker is absent the HTML is returned unchanged instead of raising.
    """
    kept, _, _ = section_html.partition(_HELP_MODAL_MARKER)
    return kept


def _fetch_course_section(session, url):
    """Fetch a course sub-page and return its ``#course_inner_section`` HTML.

    Returns the string ``"N/A"`` when the response is falsy (a non-OK
    response).
    """
    response = session.request(url=url, method='get')
    if not response:
        return "N/A"
    section = response.html.find('#course_inner_section', first=True)
    return _strip_help_modals(str(section.html))


def parse_course(request, course_id):
    """Scrape a course's pages and store the extracted fields on the model.

    Parameters
    ----------
    request
        The incoming request object (not used by the scraper itself).
    course_id
        Primary key of the ``Course`` to refresh.

    Returns
    -------
    HttpResponse
        A plain-text status message: "Bad Course Id" when the course does not
        exist, "URL not valid." when the course page cannot be fetched (the
        course is then marked inactive and saved), or a success message.
    """
    try:
        course = Course.objects.get(pk=course_id)
    except Course.DoesNotExist:
        return HttpResponse("Bad Course Id")

    # The context manager closes the session on every exit path.
    with HTMLSession() as session:
        info_r = session.request(url=course.url, method='get')
        # Deactivate bad courses -- and persist the flag, otherwise the
        # deactivation is lost as soon as this view returns.
        if not info_r:
            course.active = False
            course.save()
            return HttpResponse("URL not valid.")

        # Make sure course is active
        course.active = True
        # Set course name
        course.name = info_r.html.find('#course_title',
                                       first=True).find()[0].text
        # Set course info
        course_info = info_r.html.find('#course_info', first=True)
        authors = course_info.find('[itemprop=author]', first=False)
        course.instructors = ", ".join(author.text for author in authors)
        # The course number is the paragraph without an itemprop attribute;
        # when several match, the last one wins.
        for paragraph in course_info.find('p'):
            if 'itemprop' not in paragraph.attrs:
                course.course_num = paragraph.text
        course.asTaughtIn = course_info.find('[itemprop=startDate]',
                                             first=True).text
        course.level = course_info.find('[itemprop=typicalAgeRange]',
                                        first=True).text
        course.description = info_r.html.find(
            '#description', first=True).find('p', first=True).text

        # Syllabus, readings and tools share one page layout.
        course.syllabus = _fetch_course_section(session,
                                                course.url + '/syllabus/')
        course.readings = _fetch_course_section(session,
                                                course.url + '/readings/')
        course.tools = _fetch_course_section(session, course.url + '/tools/')

    # Save course
    course.save()
    return HttpResponse("Successfully parsed course " + str(course_id) + ".")

Exemplo n.º 14

0

Exibir arquivo

Arquivo: spider_films.py Projeto: Gyue15/MovieAnalysis

from requests_html import HTMLSession
import pymongo
import json
from util import *

if __name__ == "__main__":
    session = HTMLSession()
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    database = client["film"]
    connection = database["film_detail_data"]
    for i in get_date_str("2011-01-01", "2019-10-18"):
        url1 = 'http://www.films.cn/api/top/movie/boxoffice?boxofficeTime=%s&movieType=all&topType=day&size=100' % i
        r1 = session.request("GET", url1).content
        raw_data1 = json.loads(r1)
        data_movie = []
        # print(raw_data1)
        for data in raw_data1['data']:
            data_movie.append({
                'rank':
                data['rank'],
                'id':
                data['id'],
                'movieName':
                data['movieName'],
                'boxOfficeIndexNum':
                data['boxOfficeIndexNum'],
                'popShowIndexNum':
                data['popShowIndexNum'],
                'showDays':
                int(data['showDays']),
                'boxOffice':

Exemplo n.º 15

0

Exibir arquivo

class Aqualink(object):
    """Client that scrapes the Aqualink web pages through an ``HTMLSession``.

    The constructor logs in immediately.  ``refresh()`` re-reads the home and
    devices pages and keeps ``self._devices`` (entity id -> device object) in
    sync with what the pages show.
    """

    def __init__(self, username: str, password: str):
        """Store the credentials, create the HTTP session and log in."""
        self.username = username
        self.password = password
        self.session = HTMLSession()
        self.login_link = None
        self.home_link = None
        self.home_cache = None
        self.devices_link = None
        self.devices_cache = None
        self._devices = {}
        # Serializes refresh() calls.
        self.lock = threading.Lock()
        self.last_refresh = 0

        self.login()

    def request(self, url: str, method: str = 'get', **kwargs) -> Response:
        """Send a request through the session and log the outcome.

        A 200 response is logged at debug level, any other status at warning
        level.  The response is returned unchanged in both cases.
        """
        r = self.session.request(method, url, **kwargs)
        if r.status_code == 200:
            logger.debug(f"<- {r.status_code} {r.reason} - {url}")
        else:
            logger.warning(f"<- {r.status_code} {r.reason} - {url}")
        return r

    def login(self) -> None:
        """Discover the login link and authenticate if necessary.

        Raises
        ------
        Exception
            When the page returned after posting the credentials still has no
            ``div.temps`` element.
        """
        logger.debug("Getting Aqualink start page...")
        start = self.request(ACTION_BASE_URL)
        form = start.html.find('form', first=True)
        action = form.xpath('//input[@id = "actionID"]', first=True).attrs['value']
        self.login_link = ACTION_BASE_URL % action
        logger.debug("Login Link: %s" % self.login_link)

        # Make sure our credentials work.
        self.home_cache = self.request(self.login_link)
        # NOTE(review): when this first page already shows div.temps,
        # home_link is left untouched -- confirm that is intended.
        if len(self.home_cache.html.find("div.temps")) == 0:
            payload = {'userID': self.username, 'userPassword': self.password}
            logger.info("Logging in to Aqualink...")
            self.home_cache = self.request(self.login_link, 'post', data=payload)
            if len(self.home_cache.html.find("div.temps")) == 0:
                self.home_link = None
                self.home_cache = None
                raise Exception("Check your username and password.")
            else:
                self.home_link = self.home_cache.html.find('li#tabHeader_1', first=True).absolute_links.pop()
                logger.debug("Home Link: %s" % self.home_link)

    def refresh(self, force_refresh=False) -> None:
        """Refresh the device list, rate-limited by ``MIN_SECS_TO_REFRESH``.

        Does nothing when the last successful refresh was less than
        ``MIN_SECS_TO_REFRESH`` seconds ago, unless *force_refresh* is true.
        Exceptions raised while refreshing are logged and swallowed;
        ``last_refresh`` is only advanced after a successful refresh.
        """
        # ``with`` guarantees the lock is released on every exit path,
        # including exceptions that are not subclasses of Exception.
        with self.lock:
            # Be nice to Aqualink servers since we rely on polling.
            now = int(time.time())
            delta = now - self.last_refresh
            if delta < MIN_SECS_TO_REFRESH and not force_refresh:
                return

            try:
                self._refresh()
            except Exception as e:
                logger.error(f"Unhandled exception: {e}")
                for line in traceback.format_exc().split('\n'):
                    logger.error(line)
            else:
                self.last_refresh = int(time.time())

    def _refresh(self, force_refresh=False) -> None:
        """Re-scrape the home and devices pages and update ``self._devices``.

        *force_refresh* is accepted but not used here.  Entities present
        before the call that are no longer seen on the pages are removed.
        """
        logger.debug("Refreshing device list...")

        if self.home_link is None:
            self.login()
        else:
            self.home_cache = self.request(self.home_link)

        self.devices_link = self.home_cache.html.find('li#tabHeader_3', first=True).absolute_links.pop()
        logger.debug("Devices Link: %s" % self.devices_link)

        self.devices_cache = self.request(self.devices_link)

        # Keep track of devices in case they change. This might be overkill;
        # it would probably be safer to restart the process altogether.
        previous = set(self._devices.keys())
        seen = set()

        home = self.home_cache.html.find("div#home", first=True)

        elements = home.find("div.top,div.inbetween,script")

        # Remove the last element that's a script we're not interested in.
        elements.pop()

        def _parse_temperatures(
            e: Element,
            devices: Dict[str, AqualinkDevice]
        ) -> List[str]:
            """Update/create a sensor per ``div.temps`` and return entity ids."""
            temps = self.home_cache.html.find("div.temps")

            sensors = []
            for i in temps:
                # Text before and including "Temp" is the name; the rest is
                # the reading ("--" means no reading).
                name = re.sub(r"(Temp).*$", r"\1", i.text)
                entity = name.lower().replace(' ', '_')
                temp = re.sub(r"^.*Temp", "", i.text)
                if temp == "--":
                    temp = None
                else:
                    temp = int(temp.rstrip("°F"))
                if entity in devices:
                    devices[entity].state = temp
                else:
                    ss = AqualinkSensor(self, name, entity, temp)
                    devices[entity] = ss
                sensors.append(entity)
            return sensors

        def _parse_set_temperatures(
            e: Element,
            devices: Dict[str, AqualinkDevice]
        ) -> List[str]:
            """Update/create thermostats from the sub-page linked by *e*."""
            # First, open the sub-page.
            sub = self.request(BASE_URL + (e.links.pop()))

            # Then get the action link to set thermostats. Same for all of them.
            script = sub.html.find('script')[-2]
            action = re.match(r".*actionID=(\w+)&temp.*", script.text).groups()[0]

            control = sub.html.find('div.set_temp_label')

            thermostats = []

            for c in control:
                (name, temp) = c.text.split("\n")
                temp = int(temp.rstrip("°F"))
                entity = c.find('span')[1].attrs['id']
                if entity in devices:
                    devices[entity].state = temp
                    devices[entity].action = action
                else:
                    ts = AqualinkThermostat(self, name, entity, temp, action)
                    devices[entity] = ts
                thermostats.append(entity)

            return thermostats

        # Go through all the elements on the page.
        for e in elements:
            if e.tag == 'script':
                continue

            if 'top' in e.attrs['class']:
                # Current Temperatures.
                seen |= set(_parse_temperatures(e, self._devices))
                continue

            if 'inbetween' in e.attrs['class']:
                # Set Temperatures for Pool/Spa heaters.
                # This "Set Temperatures" string seems to be safe to use.
                if e.text == 'Set Temperatures':
                    seen |= set(_parse_set_temperatures(e, self._devices))
                    continue

                # At this point, we're pretty sure it's a toggle.
                # The 'inbetween' element gives us the name/state. The
                # following 'script' element gives the link to flip it.
                pass

        # XXX - This code needs to be made more robust, like the devices page.
        # Now find the switches for pool/spa.
        # This is a bit convoluted but labels, states and scripts don't live
        # in the same element so we need to find all of them individually and
        # put them together.
        labels = home.find("div.inbetween")
        states = home.find("div#home", first=True).find("img")
        scripts = home.find("script")

        # Drop the first label, the first image and the last script so the
        # three lists can be zipped together below.
        labels.pop(0)
        states.pop(0)
        scripts.pop()

        for label, state, script in zip(labels, states, scripts):
            name = label.find('span', first=True).text
            entity = state.attrs['id'].replace('_state', '')
            state = DEVICE_STATE_MAP[state.attrs['src']]
            action = re.match(r".*actionID=(\w+)", script.text).groups()[0]

            if entity in self._devices:
                self._devices[entity].state = state
                self._devices[entity].action = action
            else:
                sw = AqualinkSwitch(self, name, entity, state, action)
                self._devices[entity] = sw
            seen.add(entity)

        # Now go through auxiliary devices. These typically include water
        # features, pool cleaner, lights, ...
        # Here again, we look for labels, states and scripts separately and
        # put them all together.
        devices = self.devices_cache.html.find('div#devices', first=True)

        objs = []
        for e in devices.find('div.inbetween,script'):
            if e.tag == 'div':
                label = e.find('span.row_label', first=True)
                name = " ".join([x.capitalize() for x in label.text.split()])
                entity = label.text.lower().replace(' ', '_')
                state = e.find('img', first=True)
                state = DEVICE_STATE_MAP[state.attrs['src']]
                # Create a Light
                if len(e.links) > 0:
                    # Device is a dimmable light.
                    sub = re.match(r".*actionID=(\w+)", e.links.pop()).groups()[0]
                    # Browse sub-menu. Find the dimming action url.
                    sub_cache = self.request(ACTION_BASE_URL % sub)
                    script = sub_cache.html.find('script')[-1]
                    cur = sub_cache.html.find('span.button-dimmer-selected', first=True)
                    cur = int(cur.attrs['id'].split('_')[-1])
                    action = re.match(r".*actionID=(\w+)&level=0.*", script.text).groups()[0]
                    sw = AqualinkLight(self, name, entity, state, action, brightness=cur)
                    objs.append(sw)
            else:
                action = re.match(r".*actionID=(\w+)", e.text).groups()[0]
                # This is script with an action for the previous element, so
                # name/entity/state still hold the values of the last div.
                # Going to assume that people used sensible names for lights.
                # At least my installer did.
                if 'Light' in name:
                    sw = AqualinkLight(self, name, entity, state, action)
                else:
                    sw = AqualinkSwitch(self, name, entity, state, action)
                objs.append(sw)

        for obj in objs:
            entity = obj.entity
            if entity in self._devices:
                self._devices[entity].state = obj.state
                self._devices[entity].action = obj.action
                if isinstance(obj, AqualinkLight) and obj.is_dimmable:
                    self._devices[entity].brightness = obj.brightness
            else:
                self._devices[entity] = obj
            seen.add(entity)

        # Get rid of devices that went away.
        missing = previous - seen
        for i in missing:
            del self._devices[i]

    @property
    def devices(self) -> List[AqualinkDevice]:
        """Return a list snapshot of the currently known devices."""
        # A real list (matching the annotation), not a live dict view that
        # changes underneath the caller while refresh() mutates the mapping.
        return list(self._devices.values())

Exemplo n.º 16

0

Exibir arquivo

from requests_html import HTMLSession
import hashlib
import requests, os

# Product id read from the ``productid`` environment variable; "9" when unset.
productid = os.getenv("productid", "9")

# One shared session for every request issued by this script.
session = HTMLSession()

# Base URL of the target server (hard-coded).
host = "http://183.131.202.93:9090"

# Fetch the login page and read the value of the ``input#verifyRand`` field;
# it is mixed into the password hash further down.  Indexing with [0] raises
# IndexError when the page has no such field.
z = session.request(url=host + "/zentao/index.php?m=user&f=login",
                    method="get")
verifyRand = z.html.find("input#verifyRand")[0].attrs["value"]
print("随机数:%s" % verifyRand)


def getMd5(s):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of *s*."""
    return hashlib.md5(s.encode("utf-8")).hexdigest()


# Password sent to the server: md5(md5(plain password) + verifyRand).
# NOTE(review): the account name and plain-text password are hard-coded in
# this script -- consider reading them from the environment instead.
pwd = getMd5(getMd5("wujingjing123456") + verifyRand)
print("密码:%s" % pwd)

# Login form fields.
data = {"account": "wujingjing", "password": pwd, "keepLogin[]": "on"}

# Headers for a form-encoded request body.
headers = {
    "Content-Type": "application/x-www-form-urlencoded",
}

Exemplo n.º 17

0

Exibir arquivo

Arquivo: CSRFDetector.py Projeto: mukeran/dinlas

class CSRFDetector:
    """Scan crawled pages for forms that look vulnerable to CSRF.

    For every form that is not filtered out, the detector checks for a CSRF
    token and for a captcha image, then requests the form target with and
    without a ``Referer`` header and compares the responses.
    """

    # Image extensions regarded as ordinary pictures (not captchas).
    IMG_SUFFIXES = ('jpg', 'png', 'jpeg', 'ico', 'gif', 'bmp')
    # Placeholder keywords marking forms that are not worth testing.
    FILTER_WORDS = ('跳', '搜', '查', '找', '登陆', '注册', 'search', 'register', 'login', 'log in', 'sign')
    # Substrings of a form action that mark the form as not worth testing.
    ACTION_FILTERS = ('search', 'find', 'login', 'reg', 'logout',
                      'baidu.com', 'google.com', 'so.com', 'bing.com',
                      'soso.com', 'sogou.com')

    def __init__(self, results, reports, **kwargs):
        """Prepare the report skeleton, the HTTP session and the filters.

        :param results: crawl results; ``results['requests']`` is read by
            :meth:`exec`.
        :param reports: list the finished report is appended to.
        :param kwargs: extra options; a non-empty ``cookie`` string is parsed
            and loaded into the session's cookie jar.
        """
        report = {'title': 'CSRF Detector',
                  'overview': "CSRF vulnerabilities is an attack that forces an end user to execute "
                              "unwanted actions on a web application in which they're currently authenticated."
                              "<br> This Detector can automatic detect CSRF Token "
                              "and try to detect 'Referer' header check."
                              "If any following form is important, please consider adding protect.",
                  'entries': [], 'header': ['URL', 'Method', 'Data', 'Request with referer', 'Request without referer']}
        self.results = results
        self.reports = reports
        self.csrf_report = report
        self._session = HTMLSession()
        # Per-instance copies so callers may still mutate the lists.
        self.img_suffix = list(self.IMG_SUFFIXES)
        self.filter_words = list(self.FILTER_WORDS)
        self.action_filters = list(self.ACTION_FILTERS)
        self.args = kwargs
        if 'cookie' in self.args and self.args['cookie']:
            jar = SimpleCookie()
            jar.load(self.args['cookie'])
            self.cookie = dict(jar.items())
            for key, morsel in self.cookie.items():
                self._session.cookies.set(key, morsel)

    @staticmethod
    def meta():
        """Return the detector's name and version."""
        return {
            'name': 'CSRF Detector for all',
            'version': '1.0'
        }

    def find_recaptcha(self, form):
        """
        Find a captcha by looking for an image with a non-typical suffix.

        Only the first image whose suffix is shorter than five characters
        decides the result.

        :param form: form element; ``form.find('img')`` must return elements
            exposing an ``attrs`` mapping.
        :return: ``True`` when that image's suffix matches none of
            ``self.img_suffix``, otherwise ``False``.
        """
        for img in form.find('img'):
            src = img.attrs.get('src', '')
            if not src:
                continue
            # Drop the query string, then take what follows the last dot of
            # the last path segment.
            src = src.split('?')[0]
            suffix = src.split('/')[-1].split('.')[-1]
            logging.debug("img suffix: {}".format(suffix))
            if len(suffix) < 5:
                if any(fil in suffix for fil in self.img_suffix):  # or ==
                    logging.info("non recaptcha img in form detected: {}".format(img.attrs['src']))
                    return False
                logging.info("recaptcha detected: {}".format(img.attrs['src']))
                return True
        return False

    def report(self, url, method, data, req, req_no_ref):
        """Append one entry describing a suspicious form to the report.

        Each response is summarized as its redirect chain of status codes
        followed by the body, or by the body length when the body has 150
        characters or more.
        """
        text = ''.join([str(res.status_code) + '->' for res in req.history]) + str(req.status_code) + ' '
        text += req.text if len(req.text) < 150 else 'data length: {}'.format(len(req.text))
        text_no_ref = ''.join([str(res.status_code) + '->' for res in req_no_ref.history]) \
                      + str(req_no_ref.status_code) + ' '
        text_no_ref += req_no_ref.text if len(req_no_ref.text) < 150 \
            else ' data length: {}'.format(len(req_no_ref.text))
        entry = [url, method, urlencode(data), text, text_no_ref]

        self.csrf_report['entries'].append(entry)

    def _has_filtered_placeholder(self, form):
        """Return True when a placeholder in *form* contains a filter word."""
        for ph in form.find('[placeholder]'):
            placeholder = ph.attrs['placeholder'].lower()
            if any(fil in placeholder for fil in self.filter_words):
                logging.debug('keyword in placeholder, skip {}'.format(form))
                return True
        return False

    def exec(self):
        """Run the scan over every distinct location in the crawl results.

        The finished report is appended to ``self.reports``.
        """
        visited = set()
        for form_ in self.results['requests'].values():
            location = form_['location']
            if location in visited:  # should this be a prefix (startswith) match?
                continue
            visited.add(location)

            res = self._session.get(location)
            # render ??
            forms = res.html.find('form')
            for form_index, form in enumerate(forms):
                # skip useless form by black list
                action = form.attrs['action'] if 'action' in form.attrs else location
                lowered_action = action.lower()
                if any(fil in lowered_action for fil in self.action_filters):
                    logging.debug('keyword in action, skip {}:'.format(form))  # identify the form by an id instead?
                    continue

                # skip by placeholder filter
                if self._has_filtered_placeholder(form):
                    continue
                # skip form without submit button
                if not form.find('input[type="submit"], button[type="submit"]'):  # :submit
                    continue
                # CSRF Token check: fetch the page a second time and let
                # find_token() compare the two copies of the form.
                if form.find(':hidden'):
                    res2 = self._session.get(location)
                    forms2 = res2.html.find('form')
                    # The second fetch may contain fewer forms; guard the
                    # index instead of crashing the whole scan.
                    if form_index < len(forms2) and find_token(form, forms2[form_index]):
                        continue

                if form.find('img'):
                    if self.find_recaptcha(form):
                        continue

                # Referer check
                # NOTE(review): ``data`` is only used for the report entry;
                # the requests below are sent without it -- confirm that is
                # intended.
                data = craft_field(form, location)
                url = urljoin(location, form.attrs['action']) if 'action' in form.attrs else location
                method = form.attrs['method'].upper() if 'method' in form.attrs else 'GET'
                with_referer = {'referer': location}
                req1 = self._session.request(method, url, headers=with_referer)  # "allow redirect but check"
                req_no_ref = self._session.request(method, url)
                req2 = self._session.request(method, url, headers=with_referer)  # ensure referer

                # Verify on the request actually sent (not on the response
                # headers) that no Referer leaked into the control request.
                sent_headers = req_no_ref.request.headers
                logging.debug('assert no referer: {}'.format(sent_headers))
                if 'referer' in sent_headers:
                    logging.warning('Referer header present on the control request, skip {}'.format(url))
                    continue

                if compare(req1, req_no_ref):
                    if not compare(req1, req2):
                        logging.critical('Double check failed: {}\ndata2: {}'.format(req1.text, req2.text))
                        continue
                    logging.critical(
                        'CSRF Detected: text length:{}, without referer: text length:{}'.format(len(req1.text),
                                                                                                len(req_no_ref.text)))
                    self.report(url, method, data, req1, req_no_ref)
                else:
                    # Responses differ once the Referer is dropped, so the
                    # server appears to check it.
                    logging.warning(
                        'Referer check detected: text length:{}, without referer: text length:{}'.format(
                            len(req1.text), len(req_no_ref.text)))

        self.csrf_report['overview'] = 'Found {} possible csrf forms. <br>'.format(len(self.csrf_report['entries'])) + \
                                       self.csrf_report['overview']
        self.reports.append(self.csrf_report)
        logging.info("CSRF scan finished !")