Example #1
    def retrieve_product_data(self, product_link):
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                             ('From', '*****@*****.**')]
        mechanize.install_opener(opener)
        browser = mechanize.Browser()
        product_data = browser.open(product_link).get_data()
        soup = BeautifulSoup(product_data)

        product_name = soup.find('title').string.encode('ascii', 'ignore')

        product_prices = soup.find('div', 'price').contents

        try:
            cash_price = int(clean_price_string(product_prices[4]))

            product_data = ProductData()
            product_data.custom_name = product_name
            product_data.price = cash_price
            product_data.url = product_link
            product_data.comparison_field = product_link

            return product_data
        except IndexError:
            return None
Example #2
def readUrl(inUrl):
    # Retry up to five times before giving up on the URL.
    response = None
    tryCount = 0
    while response is None and tryCount < 5:
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        opener.addheaders = [("User-agent",
                              "Mozilla/5.0 (compatible; MyProgram/0.1)"),
                             ("From", "*****@*****.**")]
        mechanize.install_opener(opener)
        try:
            response = mechanize.urlopen(inUrl)
        except Exception:
            tryCount += 1
            print "******** Error on urlopen ***********"
            print "URL: ", inUrl
            print "Trying Again....", tryCount

    if response is None:
        return None

    html = response.read()
    root = lxml.html.fromstring(html)

    return root
Example #3
    def retrieve_product_links(self):
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                             ('From', '*****@*****.**')]
        mechanize.install_opener(opener)
        url_base = 'http://www.globalmac.cl/'

        browser = mechanize.Browser()

        url_extensions = [
            ['Distribuidor-Apple-Chile/MacBook-Air', 'Notebook'],
            ['Distribuidor-Apple-Chile/MacBook-Pro', 'Notebook'],
            ['Hardware-Mac-PC/Discos-Duros-Notebook-SATA-2.5', 'StorageDrive'],
            ['Hardware-Mac-PC/Discos-Duros-SATA-3.5', 'StorageDrive'],
            ['Hardware-Mac-PC/Discos-Duros-SSD-SATA-2.5', 'StorageDrive'],
        ]

        product_links = []

        for url_extension, ptype in url_extensions:
            url = url_base + url_extension
            base_data = browser.open(url).get_data()
            soup = BeautifulSoup(base_data)

            for item in soup.findAll('div', 'name'):
                product_links.append([item.find('a')['href'], ptype])

        return product_links
Example #4
def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create special URL opener (for User-Agent) and cookieJar
    cookieJar = mechanize.CookieJar()

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)
    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"  # use your userid
    form["passwd"] = "password"  # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()
    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
Example #5
def themain():
    #browser=mechanize.Browser()
    #browser.open('http://www.baidu.com')
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
    r = mechanize.urlopen('http://www.baidu.com')
    cj.save('cookie.txt', ignore_discard=True, ignore_expires=True)
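
The cookie.txt written above can be loaded again on a later run so the saved session is reused; a minimal sketch of that reverse step, assuming the file exists:

def reload_cookies():
    # Reload the cookies saved by themain() and install a cookie-aware opener.
    cj = mechanize.LWPCookieJar()
    cj.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
    return mechanize.urlopen('http://www.baidu.com')
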
Example #6
    def resolve(self, url, cookie_jar, user_agent):
        headers = {'User-agent': user_agent, 'Referer': url}

        try:
            cookie_jar.load(ignore_discard=True)
        except Exception as e:
            logger.info(e)

        opener = mechanize.build_opener(
            mechanize.HTTPCookieProcessor(cookie_jar))

        request = mechanize.Request(url)
        for key in headers:
            request.add_header(key, headers[key])

        try:
            response = opener.open(request)
        except mechanize.HTTPError as e:
            response = e

        body = response.read()

        cookie_jar.extract_cookies(response, request)
        cookie_helper.check_cookies(cookie_jar)

        parsed_url = urlparse(url)
        submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme,
                                                      parsed_url.netloc)

        params = {}

        try:
            params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"',
                                           body).group(1)
            params["pass"] = re.search(r'name="pass" value="(.+?)"',
                                       body).group(1)

            js = self._extract_js(body)
        except AttributeError:
            # re.search() found no challenge form in the body
            return None

        params["jschl_answer"] = str(js + len(parsed_url.netloc))

        sParameters = urllib.urlencode(params, True)

        request = mechanize.Request("%s?%s" % (submit_url, sParameters))
        for key in headers:
            request.add_header(key, headers[key])

        sleep(5)

        try:
            response = opener.open(request)
        except mechanize.HTTPError as e:
            response = e

        return response
Example #7
    def __callRequest(self):
        if self.caching and self.cacheTime > 0:
            sContent = self.readCache(self.getRequestUri())
            if sContent:
                return sContent

        cookieJar = mechanize.LWPCookieJar(filename=self._cookiePath)
        try:  # TODO: possibly drop the try/except
            cookieJar.load(ignore_discard=self.__bIgnoreDiscard,
                           ignore_expires=self.__bIgnoreExpired)
        except Exception as e:
            logger.info(e)

        sParameters = urllib.urlencode(self.__aParameters, True)

        handlers = [
            mechanize.HTTPCookieProcessor(cookiejar=cookieJar),
            mechanize.HTTPEquivProcessor, mechanize.HTTPRefreshProcessor
        ]
        if sys.version_info >= (2, 7, 9) and sys.version_info < (2, 7, 11):
            handlers.append(newHTTPSHandler)
        opener = mechanize.build_opener(*handlers)
        if (len(sParameters) > 0):
            oRequest = mechanize.Request(self.__sUrl, sParameters)
        else:
            oRequest = mechanize.Request(self.__sUrl)

        for key, value in self.__headerEntries.items():
            oRequest.add_header(key, value)
        cookieJar.add_cookie_header(oRequest)

        user_agent = self.__headerEntries.get('User-Agent',
                                              common.FF_USER_AGENT)

        try:
            oResponse = opener.open(oRequest, timeout=self.requestTimeout)
        except mechanize.HTTPError as e:
            if e.code == 503 and e.headers.get(
                    "Server") in ('cloudflare-nginx', 'cloudflare'):
                html = e.read()
                oResponse = self.__check_protection(html, user_agent,
                                                    cookieJar)
                if not oResponse:
                    logger.error("Failed to get CF-Cookie for Url: " +
                                 self.__sUrl)
                    return ''
            elif not self.ignoreErrors:
                xbmcgui.Dialog().ok('xStream', 'Fehler beim Abrufen der Url:',
                                    self.__sUrl, str(e))
                logger.error("HTTPError " + str(e) + " Url: " + self.__sUrl)
                return ''
            else:
                oResponse = e
Example #8
def initialize_browser():
    """Configurações para contornar os cookies, robots.txt e outros para fingir ser um browser normal."""
    cookiejar = cookielib.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookiejar))
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_redirect(True)
    browser.set_cookiejar(cookiejar)
    browser.set_handle_equiv(True)
    browser.set_handle_referer(True)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=2)
    browser.addheaders = [('User-agent', 'Google Chrome')]
    return browser, cookiejar  
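
For context, a short usage sketch of the helper above; the target URL and cookie filename are placeholders rather than part of the original snippet:

browser, cookiejar = initialize_browser()
response = browser.open('http://example.com/')  # placeholder URL
html = response.read()
# Persist the session cookies for a later run.
cookiejar.save('cookies.txt', ignore_discard=True, ignore_expires=True)
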
Example #9
 def __init__(self):
     self.cj = mechanize.LWPCookieJar()
     opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
     mechanize.install_opener(opener)
     self.br = mechanize.Browser()
     self.br.set_cookiejar(self.cj)
     self.sessionkey = 'None'
     self.br.set_header(
         'User-Agent',
         value=
         'Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0'
     )
     # self.br.set_debug_http(True)
     self.br.set_debug_redirects(True)
Example #10
def get_trash_zone(address, zip):
    #Make cookie jar.  See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    #Save cookies
    cj.save(
        "/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
        ignore_discard=True,
        ignore_expires=True)

    #Create a browser
    browser = mechanize.Browser()

    #Fill in form
    browser.open('http://lmt-web.lowermerion.org/cgi-bin/refuse2.plx')
    browser.form = list(browser.forms())[0]
    browser.form['askrecycle'] = address
    browser.form['postcode'] = zip

    #Submit form
    browser.submit()

    #Extract content
    content = browser.response().read()

    #Use pattern match to extract fields
    m = re.search('<b>(Monday|Tuesday|Wednesday|Thursday|Friday)</b>', content)
    if m:
        day, = m.groups()
        #Convert day to number
        day_number = schedule_helpers.get_day_number(day)
    else:
        #Failed
        return

    m = re.search('<b>Zone ([1-4])</b>', content)
    if m:
        zone, = m.groups()
    else:
        #Failed
        return

    #Match for both day and zone
    return day_number, zone
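
A small usage sketch for the function above; the street address and ZIP code are placeholders:

result = get_trash_zone('123 Main St', '19003')  # placeholder address and ZIP
if result:
    day_number, zone = result
    print "Collection day %s, zone %s" % (day_number, zone)
else:
    print "Address or zone not found"
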
Example #11
def get_trash_zone(address, zip):

    #Make cookie jar.  See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    #Create a browser
    browser = mechanize.Browser()

    #User-Agent (this is cheating, ok?)
    browser.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    #Save cookies
    cj.save(
        "/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
        ignore_discard=True,
        ignore_expires=True)

    #Fill in form
    #browser.open('http://citymaps.phila.gov/portal/')
    #browser.select_form(name="form1")
    #browser.form['txtSearchAddress'] = address

    #Fill in form
    #browser.open('https://alpha.phila.gov/property/')
    #browser.open('http://www.lowermerion.org/cgi-bin/recycle2.plx/')
    browser.open(
        'http://www.lowermerion.org/services/public-works-department/refuse-and-recycling/how-to-determine-your-recycling-collection-day'
    )
    #browser.form = list(browser.forms())[0]
    #browser.form['askrecycle'] = address
    #browser.form['postcode'] = zip

    #Submit form
    #browser.submit()

    #Extract content
    content = browser.response().read()

    return content
Example #12
    def __init__(self, username, password):
        mechanize.Browser.__init__(self)
        cj = mechanize.LWPCookieJar()
        self.set_cookiejar(cj)
        self.set_handle_equiv(True)
        self.set_handle_redirect(True)
        self.set_handle_referer(True)
        self.set_handle_robots(False)
        self.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]
        self.open(self.base_url)

        self.username = username
        self.password = password
        self.login()

        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
        mechanize.install_opener(opener)
Example #13
def go():
    '''
    Main procedure of the scraper. Creates a browser, loads the list of tasks and executes them.
    '''
    try:
        # Prepare the browser
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        mechanize.install_opener(opener)
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.set_handle_refresh(False)
        br.set_handle_referer(False)
        br.open("http://www.infogreffe.fr/infogreffe/process.do")

        # Get the list of tasks
        tasks = load_task_queue()
        if len(tasks) == 0:
            # If there is no task to execute, init/reset the table
            init_task_queue()
            tasks = load_task_queue()

        for task in tasks:
            try:
                # Execute the task
                results = get_companies(br, task['name'], task['dept'])

                # If we hit the soft limit, add more refined searches to the queue
                if results == 100:
                    print "Limit reached for %s in %s, adding new tasks" % (
                        task['name'], task['dept'])
                    expand_task_queue(task['name'], task['dept'])

                # Mark the task as done
                mark_task_done(task['name'], task['dept'], results)
            except Exception as detail:
                # We may get an exception for using too much CPU time.
                print "Exception raised", detail
    except Exception as detail:
        # If we can't open the browser, just skip running the scraper
        print "Failed starting browser ", detail
Example #14
    def resolve(self, req, error, cookieJar):
        sleep(5)

        useragent = req.headers.get('User-agent')

        body = error.read()
        parsed_url = urlparse(error.url)
        submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme,
                                                      parsed_url.netloc)

        params = {}

        try:
            params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"',
                                           body).group(1)
            params["pass"] = re.search(r'name="pass" value="(.+?)"',
                                       body).group(1)

            js = self._extract_js(body)
        except:
            raise

        params["jschl_answer"] = str(js + len(parsed_url.netloc))

        opener = mechanize.build_opener(
            mechanize.HTTPCookieProcessor(cookieJar))

        sParameters = urllib.urlencode(params, True)

        request = mechanize.Request("%s?%s" % (submit_url, sParameters))
        request.add_header('Referer', error.url)
        request.add_header('User-agent', useragent)

        try:
            response = opener.open(request)
        except:
            raise

        return response, cookieJar
Example #15
    def logIn(self):
        """
        Logs in to private archives using the supplied email and password.
        Stores the cookie so we can continue to get subsequent pages.
        """

        cookieJar = mechanize.CookieJar()

        opener = mechanize.build_opener(
            mechanize.HTTPCookieProcessor(cookieJar))
        opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
        mechanize.install_opener(opener)

        self.message('Logging in to ' + self.list_url)
        fp = mechanize.urlopen(self.list_url)
        forms = ClientForm.ParseResponse(fp, backwards_compat=False)
        fp.close()

        form = forms[0]
        form['username'] = self.username
        form['password'] = self.password
        fp = mechanize.urlopen(form.click())
        fp.close()
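
Because install_opener() registered the cookie-bearing opener globally, later mechanize.urlopen calls reuse the login cookie; a hedged sketch of a hypothetical follow-up method (getPage is not part of the original class):

    def getPage(self, url):
        # Hypothetical helper: the opener installed in logIn() already carries
        # the session cookie, so a plain urlopen() fetches members-only pages.
        fp = mechanize.urlopen(url)
        html = fp.read()
        fp.close()
        return html
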
Example #16
    def BROWSER(self, cookie3=''):
        """
        :param url:
        """
        # global br, cj, r, proxy, User_Pass


        br = mechanize.Browser()
        # print br

        # Cookie Jar
        # fo=os.getcwd()+"\\cookies\\"
        # try :
        #     os.mkdir(fo)
        # except:
        #     pass
        # os.chdir(fo)
        # folder=sys.path.insert(0,'/cookies')
        if self.cookie3=='':
            fo = os.getcwd().replace('\\','/')
            # pathname = os.path.join("cookies", cookie3)
            site = urlparse2(self.url).hostname
            if not os.path.isdir(fo + "/cookies/"+site ):os.mkdir(fo + "/cookies/"+site )
            chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            self.cookie3 = fo + "/cookies/"+site +'/'+''.join([random.choice(chars) for x in range(5)]) + ".txt"
            self.cj = cookielib.LWPCookieJar()
        else:
            self.cj = cookielib.LWPCookieJar()
            self.cj.revert(self.cookie3)
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))

        br.set_cookiejar(self.cj)
        # os.chdir(..)


        # Browser options
        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_referer(True)    # send the Referer header
        br.set_handle_robots(False)   # no robots
        br.set_handle_refresh(True)  # can sometimes hang without this
        br.set_handle_redirect(True)

        # Follows refresh 0 but not hangs on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # Want debugging messages?
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        # User-Agent (this is cheating, ok?)
        br.addheaders = [('User-Agent', 'Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; T-Mobile myTouch 3G Slide Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'),
                         ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                         ('Accept-Language', 'en-gb,en;q=0.5'),
                         ('Accept-Encoding', 'gzip,deflate'),
                         ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
                         ('Keep-Alive', '115'),
                         ('Connection', 'keep-alive'),
                         ('Cache-Control', 'max-age=0'),
                         ('Referer', 'http://yahoo.com')]
        # # If the protected site didn't receive the authentication data you would
        # # end up with a 410 error in your face
        # br.add_password('http://safe-site.domain', 'username', 'password')
        # br.open('http://safe-site.domain')

        # Open some site, let's pick a random one, the first that pops in mind:
        # Proxy and user/password
        #proxy = "61.233.25.166:80"

        # proxy = "202.202.0.163:3128"
        # proxy=self.proxy
        # Proxy
        # dd=re.findall('None:None', proxy)
        if self.proxy != [] and self.proxy != '' and not (re.findall('None', self.proxy)):
            br.proxies = br.set_proxies({"http": self.proxy})
            # br.proxies=br.set_proxies( proxy)

        if self.User_Pass != [] and self.User_Pass != '' and not (re.findall('None:None', self.User_Pass)):
            br.add_proxy_password(self.User_Pass.split(":")[0], self.User_Pass.split(":")[1])

        # if  r!={}:
        # rr = br.open(url)

        # c= cookielib.Cookie(version=0, name='PON', value="xxx.xxx.xxx.111", expires=365, port=None, port_specified=False, domain='xxxx', domain_specified=True, domain_initial_dot=False, path='/', path_specified=True, secure=True, discard=False, comment=None, comment_url=None, rest={'HttpOnly': False}, rfc2109=False)
        # cj.set_cookie(c0)

        self.cj.save( self.cookie3)

        return br
Example #17
    def BROWSER(self):
        """
        :param url:
        """
        # global br, cj, r, proxy, User_Pass

        br = mechanize.Browser()
        # print br

        # Cookie Jar
        # fo=os.getcwd()+"\\cookies\\"
        # try :
        #     os.mkdir(fo)
        # except:
        #     pass
        # os.chdir(fo)
        # folder=sys.path.insert(0,'/cookies')
        chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        cookie3 = ''.join([random.choice(chars) for x in range(5)]) + ".txt"
        cj = cookielib.LWPCookieJar()
        # cj.revert(cookie3)
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))

        br.set_cookiejar(cj)
        fo = os.getcwd()
        # pathname = os.path.join("cookies", cookie3)
        cj.save(fo + "\\cookies\\" + cookie3)
        # os.chdir(..)

        # Browser options
        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)

        # Follows refresh 0 but not hangs on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                              max_time=1)

        # Want debugging messages?
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        # User-Agent (this is cheating, ok?)
        br.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]

        # # If the protected site didn't receive the authentication data you would
        # # end up with a 410 error in your face
        # br.add_password('http://safe-site.domain', 'username', 'password')
        # br.open('http://safe-site.domain')

        # Open some site, let's pick a random one, the first that pops in mind:
        # Proxy and user/password
        #proxy = "61.233.25.166:80"

        # proxy = "202.202.0.163:3128"
        # proxy=self.proxy
        # Proxy
        # dd=re.findall('None:None', proxy)
        if self.proxy != [] and self.proxy != '' and not (re.findall(
                'None', self.proxy)):
            br.proxies = br.set_proxies({"http": self.proxy})
            # br.proxies=br.set_proxies( proxy)

        if self.User_Pass != [] and self.User_Pass != '' and not (re.findall(
                'None:None', self.User_Pass)):
            br.add_proxy_password(
                self.User_Pass.split(":")[0],
                self.User_Pass.split(":")[1])

        # if  r!={}:
        # rr = br.open(url)
        return br
Example #18
            except:
                oResponse.set_data(gzipper.extrabuf)

        if self.__aResponses:
            forms = mechanize.ParseResponse(oResponse, backwards_compat=False)
            form = forms[self.__formIndex]
            for field in self.__aResponses:
                #logger.info("Field: " + str(not field in form))
                try:
                    form.find_control(name=field)
                except:
                    form.new_control("text", field, {"value": ""})
                    form.fixup()
                form[field] = self.__aResponses[field]
            o = mechanize.build_opener(
                mechanize.HTTPCookieProcessor(cookieJar))
            oResponse = o.open(form.click(), timeout=self.requestTimeout)

        sContent = oResponse.read()

        checked_response = self.__check_protection(sContent, user_agent,
                                                   cookieJar)
        if checked_response:
            oResponse = checked_response
            sContent = oResponse.read()

        cookie_helper.check_cookies(cookieJar)
        cookieJar.save(ignore_discard=self.__bIgnoreDiscard,
                       ignore_expires=self.__bIgnoreExpired)

        if (self.__bRemoveNewLines == True):
Example #19
    def __init__(self, ip):
        self.ip = ip
        self.neighbours = {}
        self.laser_ports = {}
        self.new_adm = False
        #print(self.ip, type(self.ip))
        self.baseurl = 'http://%s:20080/' % (self.ip)

        try:
            br = mechanize.Browser()  #Create mechanize browser object
            #Added false headers
            try:
                cookies = mechanize.CookieJar()
                opener = mechanize.build_opener(
                    mechanize.HTTPCookieProcessor(cookies))
                opener.addheaders = [(
                    "User-agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
                )]
                mechanize.install_opener(opener)
            except Exception as e:
                print(str(e))

            try:
                if 'TJ1400' in br.open(self.baseurl, timeout=5.0).read():
                    self.new_adm = True
                br.form = list(br.forms())[0]
                controls = list(br.form.controls)
                controls[0].value = 'tejas'
                controls[1].value = 'j72e#05t'
                page = br.submit()
                self.new_adm = True
                time.sleep(5)
                page = br.open(self.baseurl, timeout=5.0).read()
            except Exception as e:
                #print("{}-{}".format(str(e), self.ip))
                br = mechanize.Browser()
                br.add_password(
                    self.baseurl, username, passw
                )  #Get user id and password from command line arguements
                page = br.open(self.baseurl,
                               timeout=5.0).read()  #Check if NE is accessible
                self.new_adm = False
            if 'alarmBanner' in page:
                print "Logged in to %s" % (self.baseurl)

            loggedIn = self.get_laser_data(br)  #Read laser data of STM ports
            failTime = threading.Thread(target=self.get_fail_time, args=(br, ))
            failTime.start()
            #self.get_fail_time(br)                                                 #Read alarams (MS DCC Fail only)

            addNeighbours = threading.Thread(target=self.add_neighbours,
                                             args=(br, ))
            addNeighbours.start()
            #self.add_neighbours(br)                                                #Add neighbours

            if loggedIn:
                self.backup(br)  #Backup cross-connect info
            failTime.join()
            addNeighbours.join()
            #print(self.neighbours)
            if self.alarams_dict:
                for stm in self.alarams_dict.keys():
                    if stm in self.neighbours.keys():
                        fail_node_times = [[
                            self.ip, self.neighbours[stm][0],
                            self.alarams_dict[stm]
                        ]]
                        fail_times.extend(fail_node_times)

        except Exception as e:
            print("\nError reading {} \n-+--+- {} -+--+-".format(ip, str(e)))
        br.close()
        return (None)
Example #20
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy.item import Item, Field
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, TakeFirst

from scrapy.conf import settings
from scrapy import log

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals

import mechanize
cookies = mechanize.CookieJar()
opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
opener.addheaders = [("User-agent", 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/534.56.5 (KHTML, like Gecko) Version/5.1.6 Safari/534.56.5'),
                     ("From", "*****@*****.**")]

class MoviefoneSpider(CrawlSpider):
    name = 'moviefone'
    allowed_domains = [ 'www.moviefone.com' ]
    start_urls = [ 'http://www.moviefone.com/dvd/' ]

    rules = (
        ## Pitchfork - Top 50 - 2011
        Rule(SgmlLinkExtractor(allow=('/dvd/?page'))),
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="hub-body"]/div[43]/div[2]/div[1]/a')), follow=True),
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="hub-body"]/div/div/a[@class="movieTitle"]')), callback='parseItem',  follow=True)
        )           
    
Example #21
def login_to_site(url, form_data, proxy=[], User_Pass=[]):
    username = "******" % form_data
    password = "******" % form_data
    user_tag = "%(user_tag)s" % form_data
    pass_tag = "%(pass_tag)s" % form_data
    Form_id = "%(Form_id)s" % form_data
    log_done = "%(Log_test)s" % form_data
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    # Browser options
    br.set_handle_robots(False)
    br.set_handle_referer(True)
    br.set_handle_refresh(True)

    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)

    chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    cookie3 = ''.join([random.choice(chars) for x in range(5)]) + ".txt"
    cj = cookielib.LWPCookieJar()
    # cj.revert(cookie3)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))

    br.set_cookiejar(cj)
    try:
        fo = os.getcwd()
        os.chdir(fo)
        os.mkdir(fo + "\\cookies\\")
    except:
        pass
    pathname = os.path.join("cookies", cookie3)
    cj.save(pathname)
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    # User-Agent (this is cheating, ok?)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    # txheaders = {
    #     'Accept':'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    #     'Accept-Language':'en,hu;q=0.8,en-us;q=0.5,hu-hu;q=0.3',
    #     'Accept-Encoding': 'gzip, deflate',
    #     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    #     'Keep-Alive': '300',
    #     'Connection': 'keep-alive',
    #     'Cache-Control': 'max-age=0',
    # }
    #
    # req = urllib2.Request(url, txheaders)
    # req2 = urllib2.urlopen(req)
    # print req2

    if proxy != [] and not (re.findall('None:None', proxy)):
        br.proxies = br.set_proxies({"http": proxy})
        # br.proxies=br.set_proxies( proxy)

    if User_Pass != [] and not (re.findall('None:None', User_Pass)):
        br.add_proxy_password(User_Pass.split(":")[0], User_Pass.split(":")[1])

    try:
        br.open(url)
    except urllib2.HTTPError, e:
        print "Got error code", e.code
        try:
            br.open(url)
        except urllib2.HTTPError, e:
            print "Got error code", e.code
Example #22
    def resolve(self, url, cookie_jar, user_agent):
        headers = {'User-agent': user_agent, 'Referer': url}

        try:
            cookie_jar.load(ignore_discard=True)
        except Exception as e:
            logger.info(e)

        opener = mechanize.build_opener(
            mechanize.HTTPCookieProcessor(cookie_jar))

        request = mechanize.Request(url)
        for key in headers:
            request.add_header(key, headers[key])

        try:
            response = opener.open(request)
        except mechanize.HTTPError as e:
            response = e

        body = response.read()

        cookie_jar.extract_cookies(response, request)
        cookie_helper.check_cookies(cookie_jar)

        pattern = 'xhr\.open\("GET","([^,]+),'
        match = cParser.parse(body, pattern)
        if not match[0]:
            return
        urlParts = match[1][0].split('"')
        parsed_url = urlparse(url)
        sid = '1200'
        script_url = '%s://%s%s%s%s' % (parsed_url.scheme, parsed_url.netloc,
                                        urlParts[0], sid, urlParts[2])

        request = mechanize.Request(script_url)
        for key in headers:
            request.add_header(key, headers[key])

        try:
            response = opener.open(request)
        except mechanize.HTTPError as e:
            response = e

        body = response.read()

        cookie_jar.extract_cookies(response, request)
        cookie_helper.check_cookies(cookie_jar)

        if not self.checkBFCookie(body):
            return body  # even if it's False, it's probably not the right content; we'll see
        cookie = self.getCookieString(body)
        if not cookie:
            return

        name, value = cookie.split(';')[0].split('=')
        cookieData = dict(
            (k.strip(), v.strip())
            for k, v in (item.split("=") for item in cookie.split(";")))
        cookie = cookie_helper.create_cookie(name,
                                             value,
                                             domain=cookieData['domain'],
                                             expires=sys.maxint,
                                             discard=False)

        cookie_jar.set_cookie(cookie)

        request = mechanize.Request(url)
        for key in headers:
            request.add_header(key, headers[key])

        try:
            response = opener.open(request)
        except mechanize.HTTPError as e:
            response = e

        return response
Example #23
    def login(self,className):
        """
        Login into coursera and obtain the necessary session cookies.
        """
        hn,fn = tempfile.mkstemp()
        cookies = cookielib.LWPCookieJar()
        handlers = [
            urllib2.HTTPHandler(),
            urllib2.HTTPSHandler(),
            urllib2.HTTPCookieProcessor(cookies)
        ]

        # prepend a proxy handler if defined
        if(self.proxy):
            proxy = urllib2.ProxyHandler({'http': self.proxy})
            handlers = [proxy] + handlers

        opener = urllib2.build_opener(*handlers)

        url = self.lecture_url_from_name(className)
        req = urllib2.Request(url)

        try:
            res = opener.open(req)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise Exception("Unknown class %s" % className)

        # get the csrf token
        csrfcookie = [c for c in cookies if c.name == "csrf_token"]
        if not csrfcookie: raise Exception("Failed to find csrf cookie")
        csrftoken = csrfcookie[0].value
        opener.close()

        # call the authenticator url:
        cj = cookielib.MozillaCookieJar(fn)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                    urllib2.HTTPHandler(),
                                    urllib2.HTTPSHandler())

        opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken))
        opener.addheaders.append(('Referer', 'https://accounts.coursera.org/signin'))
        opener.addheaders.append(('X-CSRFToken', csrftoken))
        req = urllib2.Request(self.LOGIN_URL)

        data = urllib.urlencode({'email': self.username,'password': self.password})
        req.add_data(data)

        try:
            opener.open(req)
        except urllib2.HTTPError as e:
            if e.code == 401:
                raise Exception("Invalid username or password")

        # check if we managed to login
        sessionid = [c.name for c in cj if c.name == "CAUTH"]
        if not sessionid:
            raise Exception("Failed to authenticate as %s" % self.username)

        # all should be ok now, mechanize can handle the rest if we give it the
        # cookies
        br = mechanize.Browser()
        #br.set_debug_http(True)
        #br.set_debug_responses(False)
        #br.set_debug_redirects(True)
        br.set_handle_robots(False)
        br.set_cookiejar(cj)

        if self.proxy:
            br.set_proxies({"http":self.proxy})

        self.browser = br

        # also use this cookiejar for other mechanize operations (e.g., urlopen)
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
        mechanize.install_opener(opener)
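
With the CAUTH cookie in the jar and the opener installed, subsequent requests can go through either self.browser or mechanize.urlopen; a hedged sketch of a hypothetical helper (get_lecture_index is not part of the original class):

    def get_lecture_index(self, className):
        # Hypothetical helper: the browser configured in login() already
        # carries the CAUTH session cookie, so a plain open() is enough.
        url = self.lecture_url_from_name(className)
        return self.browser.open(url).read()
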
Example #24
 def getNewToken(self):
     import mechanize #@UnresolvedImport
     br = mechanize.Browser()
     __addon__ = xbmcaddon.Addon(id='script.facebook.media')
     cookiesPath = os.path.join(xbmc.translatePath(__addon__.getAddonInfo('profile')),'cache','cookies')
     LOG('Cookies will be saved to: ' + cookiesPath)
     cookies = mechanize.LWPCookieJar(cookiesPath)
     if os.path.exists(cookiesPath): cookies.load()
     self.cookieJar = cookies
     opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
     mechanize.install_opener(opener)
     br.set_cookiejar(self.cookieJar)
     br._ua_handlers["_cookies"].cookiejar.clear()
     br.set_handle_robots(False)
     agent = 'XBMC/{0} Facebook-Media/{1}'.format(xbmc.getInfoLabel('System.BuildVersion'),self.version)
     LOG('Setting User Agent: {0}'.format(agent))
     br.addheaders = [('User-agent',agent)]
     scope = ''
     if self.scope: scope = '&scope=' + self.scope
     url = ('https://www.facebook.com/dialog/oauth?client_id=' + self.client_id +
            '&redirect_uri=' + self.redirect +
            '&type=user_agent&display=popup' + scope)
     LOG(url)
     try:
         res = br.open(url)
         html = res.read()
     except:
         LOG("ERROR: TOKEN PAGE INITIAL READ")
         raise
     
     script = False
     try:
         #check for login form
         br.select_form(nr=0)
         LOG("HTML")
     except:
         self.genericError()
         script = True
         LOG("SCRIPT")
         
     if script:
         #no form, maybe we're logged in and the token is in javascript on the page
         url = res.geturl()
         token = self.extractTokenFromURL(url)
         if not token: token = self.parseTokenFromScript(html)
     else:
         try:
             #fill out the form and submit
             br['email'] = self.login_email
             br['pass'] = self.login_pass
             res = br.submit()
             url = res.geturl()
             LOG("FORM")
         except:
             LOG("FORM ERROR")
             raise
             
         script = False
         token = self.extractTokenFromURL(url)
         html = self.browserRead(res,'-noscript')
         if not token:
             #if 'class="checkpoint"' in html:
             token = self.handleLoginNotificationCrap(br)
             
         if not token: script = True
         
         if script:
             LOG("SCRIPT TOKEN")
             #no token in the url, let's try to parse it from javascript on the page
             try:
                 __addon__ = xbmcaddon.Addon(id='script.facebook.media')
                 htmlFile = os.path.join(xbmc.translatePath(__addon__.getAddonInfo('profile')),'cache','DEBUG_HTML.html')
                 open(htmlFile,'w').write(html)
                 LOG('html output written to: ' + htmlFile)
             except:
                 pass
             token = self.parseTokenFromScript(html)
             token = urllib.unquote(token.decode('unicode-escape'))
     
     if not self.tokenIsValid(token):
         #if script: LOG("HTML:" + html)
         return False
     LOG("\n|--------------------\n|TOKEN: %s\n|--------------------"  % token)
     self.saveToken(token)
     if self.cookieJar is not None:
         self.cookieJar.save()
     return token
Example #25
    def retrieve_product_links(self):
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
            ('From', '*****@*****.**')]
        mechanize.install_opener(opener)

        url_buscar_productos = '/cl/'
        product_links = []
        url_base = 'http://www.dell.com'

        # Start home
        url_extensions = [
            'p/laptops?cat=laptops',
            ]

        for url_extension in url_extensions:
            url_webpage = url_base + url_buscar_productos + url_extension

            r = mechanize.urlopen(url_webpage)
            soup = BeautifulSoup(r.read())

            notebook_lines_container = soup.find('div',
                'tabschegoryGroups')
            notebook_lines = notebook_lines_container.findAll('div',
                recursive=False)

            notebook_urls = []
            for line in notebook_lines:
                for container in line.findAll('div', 'prodImg'):
                    link = container.find('a')['href'].replace('pd', 'fs')
                    notebook_urls.append(url_base + link)

            for url in notebook_urls:
                for url in self.retrieve_line_links(url):
                    product_links.append([url, 'Notebook'])

        # Start business

        url_extensions = [
            'empresas/p/laptops',
            ]

        for url_extension in url_extensions:
            url_webpage = url_base + url_buscar_productos + url_extension
            r = mechanize.urlopen(url_webpage)
            soup = BeautifulSoup(r.read())

            product_containers = soup.findAll('div', 'carouselProduct')
            for container in product_containers:
                url = url_base + container.find('a')['href']
                for url in self.retrieve_enteprise_links(url):
                    product_links.append([url, 'Notebook'])

        # Start Monitor
        url_extensions = [
            '/content/products/compare.aspx/19_22widescreen'
            '?c=cl&cs=cldhs1&l=es&s=dhs',
            '/content/products/compare.aspx/23_30widescreen'
            '?c=cl&cs=cldhs1&l=es&s=dhs',
            '/cl/es/empresas/Monitores/19_15widescreen/cp.aspx'
            '?refid=19_15widescreen&s=bsd&cs=clbsdt1',
            '/cl/es/empresas/Monitores/22_20widescreen/cp.aspx'
            '?refid=22_20widescreen&s=bsd&cs=clbsdt1',
            '/cl/es/empresas/Monitores/30_24widescreen/cp.aspx'
            '?refid=30_24widescreen&s=bsd&cs=clbsdt1',
            '/cl/es/empresas/Monitores/20_19flatpanel/cp.aspx'
            '?refid=20_19flatpanel&s=bsd&cs=clbsdt1',
            ]

        for url_extension in url_extensions:
            url_webpage = url_base + url_extension

            r = mechanize.urlopen(url_webpage)
            soup = BeautifulSoup(r.read())

            links = soup.findAll('a', {'class': 'lnk'})
            for link in links:
                if 'configure' in link['href']:
                    product_links.append([link['href'], 'Screen'])

        return product_links