Example #1
def getUdemyLinks(class01):
    # print class01
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    webpage = br.open(class01).read()
    soupPage = BeautifulSoup(webpage, 'xml')
    udemy_links = soupPage.findAll(
        "a",
        {"class": "button large visit"})[0]["href"].encode("ascii", "ignore")
    # print udemy_links
    return udemy_links
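
findAll(...)[0] raises IndexError when the page has no matching anchor. A hedged variant of the same lookup (the function name is ours; it assumes the same button class and that lxml is installed for the 'xml' parser, as the original implies):

import mechanize
from bs4 import BeautifulSoup

def getUdemyLinksSafe(class01):
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    webpage = br.open(class01).read()
    soupPage = BeautifulSoup(webpage, 'xml')
    buttons = soupPage.findAll("a", {"class": "button large visit"})
    if not buttons:
        return None  # no matching anchor on this page
    return buttons[0]["href"].encode("ascii", "ignore")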
Example #2
def patterncounter():  #defining function
    count = 0  #declaring variable count for counting
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    #initialising browser
    browser.set_handle_robots(False)
    browser.open("http://www.minerbots.blogspot.in/")  #opening URL
    html = browser.response().readlines()  #Fetching web contents
    for i in range(0, len(html)):  #Searching for pattern 'Vicz' line by line
        if 'Vicz' in html[i]:
            count = count + 1
    print "%d No of times found" % count  #analyzing and producing results
Example #3
File: xbmctools.py Project: noba3/KoTos
def sifre2():

    filepath = os.path.join(folders, 'nfo.txt')
    cj = mechanize.CookieJar()
    name = __settings__.getSetting("Name")
    login = __settings__.getSetting("Username")
    password = __settings__.getSetting("password")
    if not login:
        __settings__.openSettings()
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [
        ('User-agent',
         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
         ),
        ('Accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
        ('Accept-Encoding', 'none'), ('Accept-Language', 'en-US,en;q=0.8'),
        ('Connection', 'keep-alive')
    ]
    br.open('https://koditr.org/wp-login.php')
    br.title()
    br.select_form(nr=0)
    br.form['log'] = __settings__.getSetting("Username")
    br.form['pwd'] = __settings__.getSetting("password")
    br.submit()
    html2 = br.response().read()
    if "welcome" in html2:
        print "basarili bir login yapildi"
    else:
        dialog = xbmcgui.DialogProgress()
        dialog1 = xbmcgui.Dialog()
        dialog1.ok(
            '[COLOR red][B]IPTV HATA UYARISI[/B][/COLOR]',
            '[COLOR yellow][B]Bronze Uye Olmaniz Gerekiyor!!! Eger Bronze Uye Iseniz ve Bu Mesaji Goruyorsaniz[/B][/COLOR]',
            '[COLOR red][B]Yanlis Kullanici adi veya Sifre Girdiniz!!! Lutfen Tekrar Deneyiniz.[/B][/COLOR]'
        )
    br.open('https://koditr.org/greating1/')
    html = br.response().read()
    return html
Example #4
    def __init__(self, config, cookie_jar):
        # fix #218
        try:
            mechanize.Browser.__init__(self, factory=mechanize.RobustFactory())
        except BaseException:
            PixivHelper.GetLogger().info(
                "Using default factory (mechanize 3.x ?)")
            mechanize.Browser.__init__(self)

        self._configureBrowser(config)
        self._configureCookie(cookie_jar)
Example #5
 def __init__(self):
     self._browser = mechanize.Browser(factory=mechanize.RobustFactory())
     self._browser.set_handle_equiv(True)
     self._browser.set_handle_redirect(True)
     self._browser.set_handle_referer(True)
     self._browser.set_handle_robots(False)
     self._browser.set_handle_refresh(
         mechanize._http.HTTPRefreshProcessor(), max_time=1)
     self._browser.addheaders = [(
         'User-agent',
         'Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'
     )]
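
The same set_handle_* boilerplate recurs in Examples #3, #12, #13 and #27. A hedged helper (the name make_browser is ours) consolidating those exact calls:

import mechanize

def make_browser(user_agent='Mozilla/5.0'):
    # bundle the browser configuration repeated across these examples
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', user_agent)]
    return br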
Example #6
def donate(donation_amount, tid):

    logging.info('Donating ${} for tid {}'.format(donation_amount, tid))

    donation_amount = str(donation_amount)

    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    )]
    br.set_handle_robots(False)
    br.open('https://www.propublica.org/donate/')
    br.select_form('payment_form')

    prices = br.find_control('UnitPrice1')
    custom_price = mechanize.Item(prices, {
        'contents': donation_amount,
        'value': donation_amount
    })
    custom_price.selected = True

    br.form['c_amount'] = donation_amount

    br.form['BillingFirstName'] = billing['first']
    br.form['BillingLastName'] = billing['last']

    # .format(tid) is a no-op when the email has no '{}' placeholder, so the
    # original if/else branches were identical; one assignment suffices
    br.form['BillingEmail'] = billing['email'].format(tid)

    br.form['CardNumber'] = billing['cc']
    br.form['ExpirationMonth'] = [billing['exp_mo']]
    br.form['ExpirationYear'] = [billing['exp_yr']]
    br.form['Cvv2'] = billing['cvv']

    br.form['BillingAddress1'] = billing['street']
    br.form['BillingCity'] = billing['city']
    br.form['BillingStateProvince'] = [billing['state']]
    br.form['BillingPostalCode'] = billing['zip']
    br.form['BillingCountryCode'] = [billing['country']]

    response = br.submit()
    body = response.read()  # read once; a response body cannot be read twice
    if 'Thank You.' in body:
        logging.info('Donation success: ${} for tid {}'.format(
            donation_amount, tid))
        return True
    else:
        logging.warning(body)
        logging.warning('Donation failed: ${} for tid {}'.format(
            donation_amount, tid))
        return False
Example #7
 def __init__(self):
     """ creates a mechanize.Browser with some custom settings
         creates (empty) custom attributes
     """
     LOG.info("")
     LOG.info("Creating Browser")
     mechanize.Browser.__init__(self, factory=mechanize.RobustFactory())
     self.set_handle_equiv(True)
     self.set_handle_redirect(True)
     self.set_handle_robots(False)
     self.addheaders = [('User-agent', 'Mozilla/5.0')]
     self.userURL = None
     self.guessURL = {"index": None, "login": None, "upload": None}
Example #8
    def __init__(self, username, password, ignore_re=None, retries=3, skip_existing=True):
        self.username = username
        self.password = password
        self.ignore_re = ignore_re
        self.retries = retries
        self.skip_existing = skip_existing

        self.br = mechanize.Browser(factory=mechanize.RobustFactory())
        self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) \
                                    Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        self.br.open(self.ping_url).read()
Example #9
def browser():
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(url)
    br.select_form(nr=0) # select 1st form
    br.find_control(name="sl").value = [flang]
    br.find_control(name="tl").value = [tlang]
    br.form["text"] = words
    br.submit()
    return etree.HTML(br.response().read())
Example #10
def download_mechanize():
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_handle_robots(False)

    browser.open("http://pypi.python.org/pypi")
    browser.select_form(name="searchform")
    browser.form["term"] = "mechanize"
    browser.submit()
    browser.follow_link(text_regex="mechanize-?(.*)")
    link = browser.find_link(text_regex=r"\.tar\.gz")
    filename = os.path.basename(urlparse.urlsplit(link.url)[2])
    if os.path.exists(filename):
        sys.exit("%s already exists, not grabbing" % filename)
    browser.retrieve(link.url, filename)
Example #11
 def __init__(self):
     print("Initializing moodle ... ")
     self.br = mechanize.Browser(factory=mechanize.RobustFactory())
     self.br.set_handle_equiv(False)
     self.br.set_handle_robots(False)
     self.br.set_handle_referer(False)
     self.br.set_handle_redirect(True)
     self.br.set_debug_redirects(True)
     self.br.set_debug_responses(False)
     self.br.set_debug_http(False)
     self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                                max_time=2)
     self.br.addheaders = [('User-agent',
                            'Mozilla/5.0 (X11; U; Linux 1686; en-US;\
         rv:1.9.0.1) Gecko/201171615 Ubuntu/11.10-1 Firefox/3.0.1')]
Example #12
def GetContent():

    # Browser
    #br = mechanize.Browser()
    #br = mechanize.Browser(factory=mechanize.DefaultFactory(i_want_broken_xhtml_support=True))
    br = mechanize.Browser(factory=mechanize.RobustFactory())

    br.add_handler(PrettifyHandler())

    # Cookie Jar
    #cj = cookielib.LWPCookieJar()
    #br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    br.open('http://www.zvg-portal.de/index.php?button=Termine%20suchen')

    for form in br.forms():
        print "Form name:", form.name
        print form

    br.select_form(name='globe')
    #br.select_form("globe")         # works when form has a name
    #br.form = list(br.forms())[0]  # use when form is unnamed

    #Tests
    for control in br.form.controls:
        print control
        #print "type=%s, name=%s value=%s" % (control.type, control.name, br[control.name])

    br.form["land_abk"] = "by"

    # Login
    br.submit()

    return br.response().read()
Example #13
def sifre3():

    filepath = os.path.join(folders, 'nfo.txt')
    cj = mechanize.CookieJar()
    name = __settings__.getSetting("Name")
    login = __settings__.getSetting("Username")
    password = __settings__.getSetting("password")
    if not login:
        __settings__.openSettings()
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]
    br.open('http://koditr.org/wp-login.php')
    br.title()
    br.select_form(nr=0)
    br.form['log'] = __settings__.getSetting("Username")
    br.form['pwd'] = __settings__.getSetting("password")
    br.submit()
    html2 = br.response().read()
    if "welcome" in html2:
        print "basarili bir login yapildi"
    else:
        dialog = xbmcgui.DialogProgress()
        dialog1 = xbmcgui.Dialog()
        dialog1.ok(
            '[COLOR red][B]Vip +  HATA UYARISI[/B][/COLOR]',
            '[COLOR yellow][B]Silver Uye Olmaniz Gerekiyor!!! Eger Silver Uye Iseniz ve Bu Mesaji Goruyorsaniz[/B][/COLOR]',
            '[COLOR red][B]Yanlis Kullanici adi veya Sifre Girdiniz!!! Lutfen Tekrar Deneyiniz.[/B][/COLOR]'
        )
    br.open('http://koditr.org/xman/')
    html = br.response().read()
    return html
Example #14
def cv_parser(file):
    browser = mechanize.Browser(factory=mechanize.RobustFactory()) 
    browser.set_handle_robots(False)
    browser.open("http://recruitplushrxmlapidemo.onlineresumeparser.com/Default.aspx")
    browser.select_form(nr=0)
    browser.form.set_all_readonly(False)
    # the original overwrote the `file` argument with a hard-coded local test
    # path here; open in binary mode since the CV may be a PDF
    browser.form.add_file(open(file, 'rb'), 'text/plain', file)
    response = browser.submit()
    soup = BeautifulSoup(response.read().decode('utf-8'), 'html.parser')
    extracted_cv = soup.find(id="txtResume")
    text = extracted_cv.get_text().encode('utf-8')
    print(text)
    ext_file = open("./cv_parsed.txt", "w")
    ext_file.write(text)
    ext_file.close()
Example #15
def getBrowser(config=None, cookieJar=None):
    global defaultCookieJar
    global defaultConfig

    if config is not None:
        defaultConfig = config
    if cookieJar is not None:
        defaultCookieJar = cookieJar
    if defaultCookieJar is None:
        PixivHelper.GetLogger().info(
            "No default cookie jar available, creating... ")
        defaultCookieJar = cookielib.LWPCookieJar()
    browser = Browser(factory=mechanize.RobustFactory())
    configureBrowser(browser, defaultConfig)
    configureCookie(browser, defaultCookieJar)
    return browser
Example #16
def robots():
    import mechanize  # sudo apt-get install python-mechanize
    import re  # regular expressions (text filter)
    import sys  # to get the script path for the output filename
    import os  # needed for os.system below
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_handle_robots(False)
    browser.open("http://www.magistrix.de/")
    browser.select_form(nr=1)
    browser.form["q"] = "Kraftwerk,Roboter"
    browser.submit()
    browser.follow_link(text="Die Roboter", nr=0)
    html = browser.response().readlines()
    ##### get lyrics #################
    start = False
    lyrics = []
    for i in range(0, len(html)):
        line = html[i]
        OK = False
        if '<i>' in line:  # find start of the lyrics, here it was "<i>"
            start = True
        if "class" in line:  # find end of the lyrics , here it wass "class"
            start = False
        if '<p>' in line:
            OK = True
        if '<br' in line:
            OK = True
        if OK and start:
            match = re.search('>[^<>]+', line)  # <p>Text</p> --> ">Text"
            if match:
                lyrics.append(match.group()[1:])
    lyrics.append(
        '\n\n\nIch hoffe hiermit geholfen zu haben\nund verbleibe mit freundlichen Grüssen\nund schüss'
    )
    #### save Lyrics to Text File ##################
    Filename = sys.argv[0]
    Filename = Filename[0:-3] + '-lyrics.txt'
    SaveFile = open(Filename, 'w')
    for line in lyrics:
        SaveFile.write(line + '\n')
        #print (line)
    SaveFile.close()
    print('Lyrics saved to ' + Filename)
    ### espeak lyrics ###########################
    os.system('espeak -v de -p0 -s150 -a200 -f "' + Filename +
              '" 2> /dev/null')
Example #17
File: address.py Project: gauravp2003/yp
def getDetails(lastname, add, state):
    """
    address.getDetails("lastname","city","state","|")
    """
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    # Cookie Jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # Don't handle HTTP-EQUIV headers (HTTP headers embedded in HTML).
    br.set_handle_equiv(False)
    # Don't Ignore robots.txt.
    br.set_handle_robots(True)
    #  add Referer (sic) header
    br.set_handle_referer(True)
    # Log information about redirects
    br.set_debug_redirects(True)
    # Log HTTP response bodies (i.e. the HTML, most of the time).
    br.set_debug_responses(True)
    # Print HTTP headers.
    #br.set_debug_http(True)

    # Follow refresh 0 but don't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=10)
    i = 0
    gonext = True
    details = []
    while gonext:
        urlCreated = 'http://www.yellowpages.com/whitepages?first=&last=' + str(
            lastname) + '&zip=' + str(add) + '&state=' + str(
                state) + '&start=' + str(i * 10)
        #Opens the site to be navigated
        response = br.open(urlCreated)
        soup = BeautifulSoup(br.response().read())
        allLi = soup.findAll("div", {"class": "phone-result-container"})
        gonext = len(allLi) > 0
        if gonext:
            for item in allLi:
                d = []
                d.append(item.find('a', {"class": 'fullname'}).text)
                d.append(item.find('p', {"class": 'address'}).text)
                d.append(item.find('p', {"class": 'phone'}).text)
                details.append(d)
            i = i + 1
        else:
            print "Processing Complete for", lastname, add, state
            return details
Example #18
	def __init__(self, cur_cfg, cgen):
		self.br = mechanize.Browser(factory=mechanize.RobustFactory())
		self.cj = cookielib.LWPCookieJar()
		self.br.set_cookiejar(self.cj)
		self.br.set_handle_equiv(True)
		self.br.set_handle_redirect(False)
		self.br.set_handle_referer(True)
		self.br.set_handle_robots(False)
		self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
		self.cur_cfg = cur_cfg		
		self.timeout = cgen['default_timeout']
		self.baseURL = self.cur_cfg['url']
		#~ print self.cur_cfg['url']
		humanprovider = urlparse.urlparse(self.baseURL).hostname			
		self.name = humanprovider.replace("www.", "")
		self.basic_sz = 1024*1024
		#~ self.dologin()
		self.typesrch = 'DSNINIT'
Example #19
 def __init__(self):
     print("Initializing notepal ... ")
     self.url = "https://doqcs.ncbs.res.in/notepal2015"
     self.proxy = None
     self.br = mechanize.Browser(factory=mechanize.RobustFactory())
     cj = cookielib.LWPCookieJar()
     self.br.set_cookiejar(cj)
     self.br.set_handle_equiv(False)
     self.br.set_handle_robots(False)
     self.br.set_handle_referer(False)
     self.br.set_handle_redirect(True)
     self.br.set_debug_redirects(True)
     self.br.set_debug_responses(False)
     self.br.set_debug_http(True)
     self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                                max_time=2)
     self.br.addheaders = [('User-agent',
                            'Mozilla/5.0 (X11; U; Linux 1686; en-US;\
         rv:1.9.0.1) Gecko/201171615 Ubuntu/11.10-1 Firefox/3.0.1')]
Example #20
def scrape_websites(latitude, longitude, radius):
    count = 1
    places = get_places_in_radius(latitude, longitude, radius, False)
    places = places.filter(website__isnull=True)
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0')]
    for place in places:
        try:
            print count
            wt = random.uniform(1, 2)
            time.sleep(wt)
            br.open("http://google.com")
            br.select_form('f')
            br.form['q'] = remove_non_ascii(place.name) + " seattle wa"  # note the separating space
            data = br.submit()
            soup = BeautifulSoup(data.read())
            num = 0
            while num < 3:
                url = urlparse.parse_qs(
                    urlparse.urlparse(
                        soup.select('.r a')[num]['href']).query)['q'][0]
                strings_to_exclude = [
                    'plus.google', 'yelp', 'facebook', 'urbanspoon',
                    'foursquare', 'zomato', 'tripadvisor', 'allmenus',
                    'thestranger', 'seattlemet', 'thrillist', 'seattle.eater',
                    'yahoo', 'capitolhillseattle', 'eventful', 'groupon',
                    'clubplanet', 'postfastr', 'opentable', 'menupix',
                    'menuism', 'letzgoout', 'barmano', '2findlocal',
                    'whitepages', 'manta', 'gigsounds', 'mapquest',
                    'www.restaurant.com', 'nochelatina'
                ]
                if 'http' in url and not any(string in url
                                             for string in strings_to_exclude):
                    place.website = url
                    place.save()
                    break
                num += 1
            count += 1
        except Exception:
            traceback.print_exc()
    print "Scrape websites successful"
Example #21
 def __init__(self, num, keyword):
     self.num = num
     self.keyword = keyword
     self.br = Browser(factory=mechanize.RobustFactory())
     self.br.set_handle_robots(False)
     self.br.addheaders = [
         ('User-Agent', userAgent),
         ('Accept',
          'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
     ]
     self.cj = mechanize.LWPCookieJar()
     self.br.set_cookiejar(self.cj)
     self.br._factory.is_html = True
     self.br.set_handle_refresh(False)
     self.idletime = 0
     threading.Thread.__init__(self)
     self.url = ""
     self.depth = 0
     self.output = ""
Example #22
def get_address_from_eircode(eircode):
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    url = "http://correctaddress.anpost.ie/pages/Search.aspx"
    browser.addheaders = [('User-agent', random_user_agent() )]
    browser.open(url)
    html = browser.response().read()
    browser.select_form(nr=0)
    browser.form.set_all_readonly(False)
    browser["ctl00$body$txtEircode"] = str(eircode)
    request = browser.form.click()
    response = browser.submit()
    html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    tag = soup.find(id="ctl00_body_hfTextToCopy")
    try:
        value = tag['value']
        address = value.replace("\n",", ")
        return str(unicodedata.normalize('NFKD', address).encode('ascii','ignore'))
    except (TypeError, KeyError):  # tag not found or missing a value attribute
        return ""
Example #23
def GetSaleInfo(url):
    while True:
        try:
            print "Getting Sale Info from " + url
            useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0"
            browser = mechanize.Browser(factory=mechanize.RobustFactory())
            browser.addheaders = [('User-agent', useragent)]
            browser.open(url)
            html = browser.response().read()
            try:
                soup = BeautifulSoup(html, "html.parser")
            except:
                soup = BeautifulSoup(html)
            results1 = soup.find("table", {"id": "SaleInfo"})
            results2 = soup.find("table", {"id": "OtherInfo"})
            # str(None) is the string "None", so the original concatenation was
            # never None; test the find() results themselves before returning
            if results1 is not None or results2 is not None:
                return str(results1) + str(results2)
        except:
            pass  # retry the whole fetch on any error
Example #24
def build_browser(cookiejar):
    """ Returns a mechanize.Browser object properly configured """
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_cookiejar(cookiejar)
    #browser.set_handle_gzip(True)
    browser.set_handle_equiv(True)
    browser.set_handle_refresh(False)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows '
         'NT 5.1; en-US; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10')
    ]
    #browser.set_debug_http(True)
    if PARSER.has_section('general') and PARSER.has_option('general', 'proxy'):
        proxy = PARSER.get('general', 'proxy')
        if proxy:
            browser.set_proxies({"http": proxy, "https": proxy})
    return browser
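
The PARSER lookups above imply a module-level ConfigParser reading an INI-style file with a [general] section. A hypothetical setup (the filename is an assumption):

import ConfigParser  # configparser in Python 3

PARSER = ConfigParser.ConfigParser()
PARSER.read('scraper.cfg')  # hypothetical file containing, e.g.:
#   [general]
#   proxy = http://127.0.0.1:8080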
Example #25
def getFreeLinks(
        nisdon_query_link='http://www.nisdon.com/search/label/free?max-results=',
        max_query=200):
    """
    Get the list of classes and links.
    """

    full_link = '%s%d' % (nisdon_query_link, max_query)
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    br.set_handle_robots(False)
    webpage = br.open(full_link).read()
    soupPage = BeautifulSoup(webpage, 'xml')
    # blog_post_div = soupPage.findAll("div",{"class":"blog-posts hfeed"})
    blog_post = soupPage.findAll("div", {"class": "post-outer"})

    all_link = map(
        lambda x: x.findAll("div", {"class": "thumb"})[0].find('a')['href'].
        encode('ascii', 'ignore'), blog_post)

    return all_link
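
A usage sketch for the function above; in Python 2, map() returns a plain list of the extracted hrefs:

links = getFreeLinks(max_query=50)
for link in links:
    print link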
Example #26
    def __init__(self,
                 username,
                 password,
                 ignore_re=None,
                 retries=3,
                 skip_existing=True):
        self.username = username
        self.password = password
        self.ignore_re = ignore_re
        self.retries = retries
        self.skip_existing = skip_existing

        self.br = mechanize.Browser(factory=mechanize.RobustFactory())
        self.br.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        )]
        self.br.set_handle_refresh(True)
        self.br.set_handle_redirect(True)

        self.br.open(self.ping_url).read()
Example #27
def start_browser(url=None):
    br = mechanize.Browser(factory=mechanize.RobustFactory())
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]
    if not url:
        br.open('http://denver.craigslist.org')
    else:
        br.open(url)
    print br.title()
    return br
Example #28
def browser(honor_time=True,
            max_time=2,
            mobile_browser=False,
            user_agent=None,
            use_robust_parser=False,
            verify_ssl_certificates=True):
    '''
    Create a mechanize browser for web scraping. The browser handles cookies,
    refresh requests and ignores robots.txt. Also uses proxy if available.

    :param honor_time: If True honors pause time in refresh requests
    :param max_time: Maximum time in seconds to wait during a refresh request
    :param verify_ssl_certificates: If False, SSL certificate errors are ignored
    '''
    from calibre.utils.browser import Browser
    if use_robust_parser:
        import mechanize
        opener = Browser(factory=mechanize.RobustFactory(),
                         verify_ssl=verify_ssl_certificates)
    else:
        opener = Browser(verify_ssl=verify_ssl_certificates)
    opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
    opener.addheaders = [('User-agent', user_agent)]
    proxies = get_proxies()
    to_add = {}
    http_proxy = proxies.get('http', None)
    if http_proxy:
        to_add['http'] = http_proxy
    https_proxy = proxies.get('https', None)
    if https_proxy:
        to_add['https'] = https_proxy
    if to_add:
        opener.set_proxies(to_add)

    return opener
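
A usage sketch, assuming this function is importable from the calibre package; the URL is a placeholder:

from calibre import browser  # assumes calibre's package layout

br = browser(use_robust_parser=True, verify_ssl_certificates=False)
raw = br.open('http://example.com').read()  # bytes of the fetched page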
Example #29
    def check_indeed(self, title, city):
        br = mechanize.Browser(factory=mechanize.RobustFactory())
        br.set_handle_robots(False)

        indeed_url = 'http://www.indeed.com'

        br.open(indeed_url)

        br.form = list(br.forms())[0]

        br["q"] = title  # The What id
        br["l"] = city  # The Where id
        response = br.submit()
        print br.geturl()
        response = br.open(
            br.geturl() + '&limit=20'
        )  # 20 items per page, this is good to keep only relevant items
        print br.geturl()
        response = response.read()

        soup = BeautifulSoup(response)

        titles_soup = soup.findAll("a", attrs={"data-tn-element": "jobTitle"})
        titles = [item.text for item in titles_soup]
        urls = [
            'http://www.indeed.com' + item.get('href') for item in titles_soup
        ]
        companies = self._find_field_in_soup(soup, "company")
        locations = self._find_field_in_soup(soup, "location")
        summaries = self._find_field_in_soup(soup, "summary")
        dates = self._find_field_in_soup(soup, "date")

        return self._create_jobs_dict(title=titles,
                                      company=companies,
                                      location=locations,
                                      summary=summaries,
                                      date_posted=dates,
                                      job_url=urls)
Example #30
def post_data():

    # url
    url_add = "http://www.indeed.com"

    # details
    details = {"q": "Python Developer", "l": "Santa Clara, CA"}

    # search based on job title and place
    browser = mechanize.Browser(factory=mechanize.RobustFactory())
    browser.set_handle_robots(False)

    # open the absolute url
    browser.open(url_add)

    # select the form
    browser.select_form(nr=0)

    # fill the form
    browser.form["q"] = details.get("q", "")
    browser.form["l"] = details.get("l", "")

    return browser.submit()
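
browser.submit() returns a response object, so a caller can read the results page directly. A minimal usage sketch:

response = post_data()
print response.read()[:500]  # first bytes of the Indeed results page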