Example #1
def getNews():
  client = mc(host)
  collection = client['mydb']['nba_news']
  lines = open('links.tmp', 'r').readlines()

  toggle, title, link = True, None, None
  for l in lines:
    if toggle:
      title = l.strip()
    else:
      link = l.strip()
      
      req = requests.get('{}/{}'.format(head, link))
      page = soup(req.text, 'html.parser')
      section = page.find('section')
      section = '<html><body>{}</body></html>'.format(str(section)) 
      article = soup(section, 'html.parser').find_all('p')

      content = ''.join([ p.text for p in article ])
      print title, link, content

      doc = {
          "title": title,
          "link": '{}/{}'.format(head, link),
          "content": content.replace("\"", "\'")
      }
      collection.insert_one(doc)

    toggle = not toggle

  print collection.count()
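
For context, a minimal sketch of the module-level setup the snippet above appears to assume; the mc, soup, host and head names are not shown in the source, and the values below are hypothetical placeholders:

import requests
from bs4 import BeautifulSoup as soup
from pymongo import MongoClient as mc

host = 'mongodb://localhost:27017/'  # hypothetical MongoDB connection string
head = 'http://www.nba.com'          # hypothetical base URL that article links are joined onto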
Example #2
def nextborrower(url, urls): #there's no centralized page that lists all past loans on Zidisha, so we need to do some crawling to find the next loan page
	maxtries = 30
	borrowurl = ""
	html = urlopen(url)
	bsobj = soup(html.read(), 'html.parser')
	mydivs = bsobj.findAll("div", {"class" : "lender-thumbnail"}) #get all the lenders who contributed
	otherborrowers = []
	tries = 0
	#keep trying until we find a lender with at least one other borrower listed on their page. there should be a more systematic way to do this to avoid repeats and reduce runtime. 
	while (len(otherborrowers) == 0) and (tries < maxtries):
		choice = mydivs[randint(0,len(mydivs)-1)]
		lendurl = choice.a.get('href')
		html = urlopen(lendurl)
		bsobj = soup(html.read(), 'html.parser')
		mydivs2 = bsobj.findAll("div", {"class" : "lender-thumbnail"}) #find all the borrowers that lender has given to
		if len(mydivs2) > 1:
			otherborrowers = mydivs2
			choice = mydivs2[randint(0,len(mydivs2)-1)]
			borrowurl = choice.a.get('href')
		tries += 1
	if borrowurl in urls:
		return nextborrower(url, urls) #if this borrower has already been used, recursively go back to the beginning. A bit kludgy. 
	html = urlopen(borrowurl)
	bsobj = soup(html.read(), 'html.parser')
	col = bsobj('div', {'class' : 'col-sm-6'})[2].get_text()
	if "Date Disbursed" not in col: #if the loan hasn't been disbursed yet, don't use it for training or validation
		return nextborrower(url, urls)
	assert tries < maxtries
	return borrowurl
Example #3
def frontpage(n): #generates scores for the first n loans listed on Zidisha's main page and writes a csv file of them
	url = "https://www.zidisha.org/lend"
	html = urlopen(url)
	bsobj = soup(html.read(), 'html.parser')
	mydivs = bsobj.findAll("div", {"class" : "profile-image-container"})
	fpfile = open('frontpage.csv','w')
	fpwriter = csv.writer(fpfile)
	fpwriter.writerow(['url','amount','cost','ratio','duration','city','country','ontime','notontime','history','posvote','negvote','fees','feeratio','title', 'description', 'pastscore'])
	links = [prof.a.get('href') for prof in mydivs]
	titles = []
	for i in range(n):
		beforescore, afterscore, ontime = getscore(links[i])
		fpwriter.writerow(profile(links[i]) + [beforescore])
		html = urlopen(links[i])
		bsobj = soup(html.read(), 'html.parser')
		hits = bsobj.findAll('p',{'class' : 'alpha'})
		titles.append(hits[0].get_text().replace('  ','').replace('\n',''))
	fpfile.close()
	h2o.init()
	from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glme
	fpdf = h2o.import_file(path = abspath('./frontpage.csv'))
	result = evalmodel(fpdf)
	resultfile = open('results.csv','w')
	resultwriter = csv.writer(resultfile)
	resultwriter.writerow(['project','url','score'])
	for i in range(n):
		resultwriter.writerow([titles[i],links[i],result[i]])
Example #4
def crawl(href,count):
	print(get_time() + ", Parsing Link: " + href)
	

	req = Request(href, headers={'User-Agent': 'Mozilla/5.0'})

	uClient = uReq(req)
	page_html = uClient.read()
	uClient.close()
	
	page_soup = soup(page_html, "html.parser")
	heading = page_soup.find('center')
	content_container = page_soup.find('table', attrs={'style' : "background:transparent; text-align:justify;"}).prettify()
	
	table = soup(content_container,"html.parser")	
	
	para = table.find_all('p')
	
	#name = str(count)+".html"
	with io.open("para_hn.html", "a", encoding="utf-8") as fout:
		#fout.write("\n\n" + heading.text + "\n\n")
		#	for i in para:
	 	#print(para[i])
		fout.write(str(para))
		

	link = page_soup.find('img', attrs={'alt' : 'Next.png'})
	next_link = link.findPrevious('a')['href']
	complete_link = "http://hi.krishnakosh.org" + quote(next_link, safe='%,/')

	return complete_link
Example #5
def imdb():
    res = r.get("http://www.imdb.com/chart/top")
    s = soup(res.text)
    x = []
    for i in s.findAll("a"):
        x.append(i.get("href"))
    l = []
    for i in x:
        if i.startswith("/title"):
            l.append("http://imdb.com" + i)
    l = list(set(l))

    d = {}
    for k in l:
        k = k.split("?")
        k = k[0]
        res = r.get(k)
        s = soup(res.text)
        for j in s.findAll("span", {"class": "itemprop"}):
            q = s.title.text.split("-")
            q = q[0].strip()
            if d.get(j.text):
                d[j.text].append(q)
            else:
                d.setdefault(j.text, []).append(q)

    return d

    # <a href="fullcredits?ref_=tt_cl_sm#cast">See full cast</a>
    """
Example #6
def appendFacebookInfo(stars):

    for key in stars:

        if stars[key]["url"]["facebook"] != "":


            if stars[key]["url"]["facebook"].endswith('/'):
                stars[key]["url"]["facebook"] = stars[key]["url"]["facebook"][:-1]


            try:
                url = stars[key]["url"]["facebook"] + "/info/?tab=page_info"

                try:
                    web_soup = soup(urllib2.urlopen(url),'lxml')

                    infoString = web_soup.find(name="div", attrs={'data-id': 'page_info'})
                    emails = get_emails(str(infoString).lower())
                    for email in emails:
                        #print email
                        stars[key]["email"].append(email)

                except urllib2.HTTPError:
                    #print "Invalid Facebook URL Format :("
                    pass
                except:
                    web_soup = soup(urllib2.urlopen(url),'lxml')

            except KeyError:
                print "f****n key error"
                pass

    return stars
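
The snippet above relies on a get_emails() helper that is not shown in the source. A hypothetical stand-in, assuming it simply extracts e-mail-looking strings from the page text:

import re

EMAIL_RE = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+')

def get_emails(text):
    # return the unique e-mail-like strings found in the given text
    return set(EMAIL_RE.findall(text))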
Example #7
def getLinks():
  cache = open('links.tmp', 'a')

  navigation = '{}/2016/news/archive/{}/index.html'
  for week in xrange(10, 20):
    req = requests.get(navigation.format(head, week))
    page = soup(req.text, 'html.parser')
    div = page.find('div', class_='nbaNAContent')
    div = '<html><body>{}</body></html>'.format(str(div)) 
    links = soup(div, 'html.parser').find_all('a')
    for l in links:
      cache.write(l.get_text() + '\n')
      cache.write(l.get('href') + '\n')
Example #8
    def test_menu_item(self):

        output = soup(menu_item({}, link='/', name='bob', always_display=True)).find('li')
        self.assertEqual('bob', output.text)
        self.assertTrue('menu-bob' in output.attrs['class'])
        self.assertEqual('/', output.find('a').attrs['href'])

        output = soup(menu_item({}, link='/', name='bob', id='bobby', current=True, unavailable=True, always_display=True)).find('li')
        self.assertEqual('bob', output.text)
        self.assertTrue('menu-bobby' in output.attrs['class'])
        self.assertTrue('current' in output.attrs['class'])
        self.assertTrue('unavailable' in output.attrs['class'])
        self.assertEqual('/', output.find('a').attrs['href'])
Example #9
def getscore(url): #does sentiment analysis on the comment thread for a given loan
	html = urlopen(url + '/discussion') 
	bsobj = soup(html.read(), 'html.parser')
	html2 = urlopen(url)
	bsobj2 = soup(html2.read(), 'html.parser')
	col = bsobj2('div', {'class' : 'col-sm-6'})[2]
	if "Date Disbursed" in col.get_text(): 
		cutoff = datetime.strptime(col('strong')[1].get_text(), '%b %d, %Y').date()
		if len(col('strong', text = re.compile(r'On Time'))) > 0:
			ontime = 1
		else:
			ontime = 0
	else:
		cutoff = datetime.now().date()
		ontime = 1
	mydivs = bsobj.findAll("div", {"class" : "media-body"})
	comments = [div.p.get_text() for div in mydivs]
	spans = bsobj('span', {'class' : 'comment-actions'})
	dates = [datetime.strptime(span.get_text(), '%b %d, %Y').date() for span in spans]
	beforecomments = [comments[i] for i in range(len(comments)) if dates[i] < cutoff]
	aftercomments = [comments[i] for i in range(len(comments)) if dates[i] >= cutoff]
	if len(beforecomments) > 0:
		comment = " ".join(beforecomments)
		comment = comment.replace("   ", "").replace("&","and").replace("#","") #there is often a lot of extra whitespace. get rid of that. Also, ampersands and pound signs seem to cause a problem, so toss 'em.
		chunks = re.findall(re.compile(r'.{1,1000}', re.DOTALL),comment) #chunks of text larger than 1-2k characters often don't seem to get processed properly. this is really kludgy, though. 
		chunks = [''.join(s for s in chunk if ord(s)>31 and ord(s)<126) for chunk in chunks] #get rid of special and non-ascii characters
		#print(chunks)
		scores = []
		for chunk in chunks:
			analysis = client.get_request({"text" : chunk}, HODApps.ANALYZE_SENTIMENT, async=False) #sentiment analysis of each chunk
			scores.append(analysis["aggregate"]["score"])
		beforescore = mean(scores)
	else:
		beforescore = 0.
	if len(aftercomments) > 0:
		comment = " ".join(aftercomments)
		comment = comment.replace("   ", "") #there is often a lot of extra whitespace. get rid of that. 
		chunks = re.findall(re.compile(r'.{1,1000}', re.DOTALL),comment) #chunks of text larger than 1-2k characters often don't seem to get processed properly. this is really kludgy, though. 
		chunks = [''.join(s for s in chunk if ord(s)>31 and ord(s)<126) for chunk in chunks] #get rid of special and non-ascii characters
		#print(chunks)
		scores = []
		for chunk in chunks:
			analysis = client.get_request({"text" : chunk}, HODApps.ANALYZE_SENTIMENT, async=False) #sentiment analysis of each chunk
			scores.append(analysis["aggregate"]["score"])
		afterscore = mean(scores)
	else:
		afterscore = 0.
	return beforescore, afterscore, ontime
Example #10
    def get_title(self, url):
        """
        Get the title of the specified url.  If there are any redirects, they
        will first be followed before pulling the title.  Image and pdf links
        will be ignored.

        @param url  - url to pull title for.
        @return     - title if found.
        """
        while True:
            try:
                html = requests.get(url, verify=False)
                html.raise_for_status()
            except requests.exceptions.RequestException, e:
                log.err(str(e))
                return

            if html.headers['content-type'].startswith('image'):
                return
            elif html.headers['content-type'].startswith('application/pdf'):
                return
            else:
                parsed = soup(html.text, 'html.parser')
                if parsed.title is None:
                    redirect = self._meta_redirect(parsed)
                    if not redirect:
                        log.err("Couldn't parse content from %s" % (url,))
                        return
                    else:
                        url = redirect
                else:
                    break
Example #11
def gcoos_describe_sensor(r_a, urn):
    """
    Notes:
    We get all settings from the .cfg file and build
    the_url. Different RAs are running different versions
    of SOS so the XML parsing might need some tweaking. This
    code is known to work with the GCOOS-RA SOS server.
    """
    the_url = CONFIG.get('ra_servers', r_a)
    the_url = the_url + CONFIG.get('base_urls', 'describe_sensor')
    the_url = the_url.replace('[anyURI]', urn)
    if DEBUG:
        print "gcoos_describe_sensor(%s, %s)..." % (r_a, urn)

    the_soup = soup(urllib2.urlopen(the_url).read(), 'html.parser')
    #get position
    the_pos = the_soup.find('gml:pos').contents[0]
    latitude = float(the_pos.split(' ')[0])
    longitude = float(the_pos.split(' ')[1])
    #slurp up the rest of the tasty bits...
    the_org = the_soup.find('sml:organizationname').contents[0]
    the_description = the_soup.find('gml:description').contents[0]
    sensor_list = []
    for sensor in set(the_soup.find_all('sml:output')):
        sensor_list.append(sensor['name'])
    #Get GeoJSON with it...
    my_feature = Feature(geometry=Point(([longitude, latitude])))
    my_feature.header = {'Organization' : the_org,
                         'Station' : urn,
                         'Description' : the_description,
                         'Sensors' : sensor_list}
    return my_feature
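
A hedged sketch of the module-level setup gcoos_describe_sensor() appears to depend on; the .cfg file name and DEBUG value are assumptions, and the imports follow the Python 2 style (urllib2, print statement) used above:

import urllib2
import ConfigParser
from bs4 import BeautifulSoup as soup
from geojson import Feature, Point

CONFIG = ConfigParser.ConfigParser()
CONFIG.read('gcoos.cfg')  # hypothetical config file with [ra_servers] and [base_urls] sections
DEBUG = False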
Example #12
def scrape_news(input_from_click):
    input_from_click_processed = input_from_click.replace(" ", "%20")
    country_name = input_from_click_processed
    search_term = country_name + '%20happiness'
    news_url=f"https://news.google.com/rss/search?q={search_term}"
    Client=urlopen(news_url)
    xml_page=Client.read()
    Client.close()

    soup_page=soup(xml_page,"xml")
    news_list=soup_page.findAll("item")

    news_articles = ""
    # Print news title, url and publish date
    for news in news_list[:25]:
        #print(news.title.text)
        news_articles+=news.title.text+'</br>'
        #print(news.link.text)
        news_articles+=news.link.text+'</br>'
        #print(news.pubDate.text)
        news_articles+=news.pubDate.text+'</br>'
        #print("-"*60)
        news_articles+="-"*60+'</br>'
        
    #print(f"Number of articles: {len(news_list)}")
    return news_articles
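
The imports this snippet appears to rely on (an assumption; parsing with the "xml" feature also requires lxml to be installed):

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

# hypothetical usage:
# print(scrape_news("Finland"))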
Example #13
def inbound_sms():
    response = twiml.Response()
    # we get the SMS message from the request. we could also get the 
    # "To" and the "From" phone number as well
    inbound_message = request.form.get("Body")
    # we can now use the incoming message text in our Python application
    chromedriver = '//Users/Pranavtadepalli/Downloads/chromedriver'
    browser = webdriver.Chrome(chromedriver)
    browser.get('https://lgusd.powerschool.com/public/home.html')
    username = browser.find_element_by_name("account")
    password = browser.find_element_by_name("pw")
    
    username.send_keys("prantadepa")
    password.send_keys("17802249")
    
    browser.find_element_by_id("btn-enter").click()
    raw=soup(browser.page_source)
    browser.quit()
    simp='-'.join([elem.text for elem in raw.findAll('td') if elem.text!='\xa0']).split('-.-.-.-.-')
    teachers=[]
    for grade in simp[1:]:
        current=[]
        current.append(grade.split('\xa0')[0])
        current.append(' '.join(grade.split(',')[1].split('\xa0Email')))
        current.append(grade.split(':')[1].split('(')[0].split('-')[-4])
        teachers.append(current)
    if inbound_message != '':
        response.message(str(teachers))
    else:
        response.message("Hi! Not quite sure what you meant, but okay.")
    # we return back the mimetype because Twilio needs an XML response
    return Response(str(response), mimetype="application/xml"), 200
Example #14
def get_leetcode_url():
    urls = []
    # fetch the page contents
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/43.0.2357.130 Safari/537.36'
    }
    cur_url = 'https://leetcode.com/problemset/algorithms/'
    s = requests.session()
    r = s.get(cur_url, headers=headers)
    r.encoding = 'utf-8'
    html = soup(r.text)
    problem_list = html.findAll('table', {"id": "problemList"})
    tbody = problem_list[0].find('tbody')
    common_url = 'https://leetcode.com'
    trs = tbody.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        prob_id = tds[1].text
        prob_name = tds[2].text
        url = tds[2].find('a')['href']
        # print prob_id + ' ' + prob_name
        # print common_url + url
        urls.append(prob_id + '||' + prob_name + '||' + common_url + url)
    return urls
Example #15
def serializar_ocorrencias(lista_urls, palavra):

	# Will hold the array with the occurrences found for each link
	ocorrencias = []

	# The block below runs for each link in the list
	for link in lista_urls:

		# fetch the page and parse it as HTML
		pag_url = str(uReq(link).read())
		pag_soup = soup(pag_url, "html.parser")

		# number of occurrences in the <p> paragraphs of the HTML
		ct = paragrafos(pag_soup, palavra)

		# create a dictionary with the link and the number of occurrences
		ocorrencia = {"url": link, "qtd_de_ocorrencias": str(ct)}
		# add the dictionary to the list of occurrences
		ocorrencias.append(ocorrencia)

		# print a separator to the console when moving on to the next URL
		print("*************************************************************************")


	# return the array of dictionaries (JSON serialization left commented out)
	#return json.dumps(ocorrencias, indent=4)
	return ocorrencias
Example #16
def request_unique(session, num):
    unique = str(num)
    url = registrar_url + "/" + fall_2016_str + "/" + str(num)

    r = session.get(url)
    if r.status_code != 200:
        raise HTTPException("Got status code: " + str(r.status_code) + " when requesting unique range.")
    r = check_for_lares(r)

    results = soup(r.text).findAll("table", {"id": "details_table"})

    if len(results) < 1:
        return []
    elif len(results) > 1:
        raise HTTPException("Too many tables?")

    classes = []
    table = results[0]
    rows = table.findAll("tr")
    for tr in rows:
        unique = get_unique(tr)
        status = get_status(tr)
        days = get_days(tr)
        hours = get_hours(tr)
        if unique is not None:
            classes.append((unique, status, days, hours))

    sleep(0.02)
    return classes
Example #17
    def get_auth_token(self):
        url = self.endpoint + '/' + self.authEndpoint
        res = self.session.get(url)
        dom = soup(res.text)
        token = dom.find('input', {'name': 'token'}).get('value')
        self.token = token
        return token
Example #18
    def intitle(self, pattern, url):
        html = self.ResList[url].HTML
        try:
            title = soup(html).title
        except:
            return False
        return re.search(pattern, title.string) is not None
Example #19
def content_seen_test(page):
	'''
	Performs a document-seen test: checks whether the document has already
	been downloaded, using the sha224 algorithm.
	>>> content_seen_test(page)
	True
	'''

	try:
		from pymongo import MongoClient
		from hashlib import sha224

		connection = MongoClient()
		db = connection.jobsdbs

		assert db.connection == connection

		if page == soup('', 'lxml'):
			return False

		if db.crawler_page_info.find_one({'doc_digest': sha224(page.body.encode('utf-8')).hexdigest()}):
			return True
		else:
			return False
	except:
		import sys
		db.crawler_error_log.insert({'error_type': str(sys.exc_info()), 'from_module': str(__file__)})
		print 'problem with document finger printing algorithm'

		return False
Example #20
def crawl_full():
    page = 1
    while True:
        params = {'page': page}
        response = requests.get(url + '/home/index.html', headers=headers, params=params, timeout=45)
        timestamp = time.time()
        with open('tmp', 'w') as fw:
            fw.write(response.text)
        s = soup(response.text, 'lxml')
        matches = s.find('div', {'class': 'items'}).findAll('a')
        pg = s.find('li', {'class': 'page active'})
        for match in matches:
            if not 'dota2-icon' in match.find('i').get('class'):
                continue
            series = match.find('span', {'class': 'spinach-league'}).text.strip().split('【')[0]
            try:
                matchtime_rlt = convert_time(match.find('div', {'class': 'pull-right spinach-league-right'}).text.strip())
            except:
                print('***ERROR***', match.find('div', {'class': 'pull-right spinach-league-right'}))
            try:
                notes = match.find('span', {'class': 'spinach-league'}).text.strip().split('【')[1][:-1]
            except IndexError:
                notes = None
            href = url + match.get('href')
            yield crawl_details(href, series, matchtime_rlt, notes)
        if not pg.findNextSibling() or 'Market' in pg.findNextSibling().find('a').get('href'):
            break
        page += 1
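
crawl_full() references several module-level names (url, headers, convert_time, crawl_details) that are not shown. A minimal sketch of the configuration part, with placeholder values; the two helper functions are project-specific and are not sketched here:

import time
import requests
from bs4 import BeautifulSoup as soup

url = 'https://example-match-results-site.com'  # placeholder: site root the crawler targets
headers = {'User-Agent': 'Mozilla/5.0'}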
Example #21
    def meta_scrape_table(self, url):
        from bs4 import BeautifulSoup as soup
        import requests

        type_map = {
        'Char':'varchar', 
        'Character':'varchar', 
        'Charter':'varchar', # A spelling error in the source page. 
        'Decimal':'real',
        'Date':'datetime'}

        with self.session:
            self.database.create()

            r = requests.get(url)

            for tr in soup(r.content).find('table').find_all('tr')[1:]:
                cells = [td.text for td in tr.find_all('td')]

                yield {
                    'field': cells[1].strip(),
                    'datatype': type_map[cells[2].strip()], 
                    'size': int(cells[3].strip()), 
                    'description': cells[4].strip()
                }
Example #22
File: qrz.py Project: hink/qrzpy
def login(username, password):
    # Login to QRZ - Must have access to XML API
    login_url = ('{0}?username={1};password={2};agent=qrzpy1.0'
        .format(api_root, username, password))
    
    # Send request
    try:
        res = requests.get(login_url)
    except requests.exceptions.Timeout:
        _error('Login request to QRZ.com timed out', True)

    # Check Response code
    if res.status_code != 200:
        _error('Invalid server response from QRZ.com', True)

    # Parse response and grab session key
    data = soup(res.content)
    if data.session.key:
        session_key = data.session.key.text
    else:
        if data.session.error:
            err = data.session.error.text
            _error('Could not login to QRZ.com - {0}'.format(err), True)
        else:
            _error('Unspecified error logging into QRZ.com', True)

    return session_key
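
Both qrz.py snippets (this one and Example #23 below) assume a module-level api_root constant and an _error() helper. A hypothetical stand-in; the endpoint value is an assumption:

import sys
import requests
from bs4 import BeautifulSoup as soup

api_root = 'http://xmldata.qrz.com/xml/current/'  # assumed QRZ XML API root

def _error(message, fatal=False):
    # minimal stand-in for the project's error helper
    sys.stderr.write('ERROR: %s\n' % message)
    if fatal:
        sys.exit(1)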
Example #23
File: qrz.py Project: hink/qrzpy
def lookup_callsign(callsign, session_key):
    # Check for no callsign
    if not callsign:
        return

    search_url = ('{0}?s={1};callsign={2}'
        .format(api_root, session_key, callsign))

    # Send request
    try:
        res = requests.get(search_url)
    except requests.exceptions.Timeout:
        _error('Login request to QRZ.com timed out', True)

    # Check response code
    if res.status_code != 200:
        _error('Invalid server response from QRZ.com')
        return

    # Parse response and grab operator info
    data = soup(res.content)
    if not data.callsign:
        print 'No data found on {0}'.format(callsign)
    else:
        display_callsign_info(data.callsign)
Example #24
def request_unique_range(session, start, finish):
    assert start < finish
    url = registrar_url + "/" + fall_2016_str + "/results/?search_type_main=UNIQUE&ccyys="
    url += fall_2016_str + "&start_unique=" + str(start) + "&end_unique=" + str(finish)
    r = session.get(url)
    if r.status_code != 200:
        raise HTTPException("Got status code: " + str(r.status_code) + " when requesting unique range.")
    r = check_for_lares(r)

    results = soup(r.text).findAll("table", {"class": "results"})

    if len(results) < 1:
        return []
    elif len(results) > 1:
        raise HTTPException("Too many tables?")

    classes = []
    table = results[0]
    rows = table.findAll("tr")
    for tr in rows:
        unique = get_unique(tr)
        status = get_status(tr)
        days = get_days(tr)
        hours = get_hours(tr)
        if unique is not None:
            classes.append((unique, status, days, hours))

    sleep(0.02)
    return classes
Example #25
def get_github_url():
    urls = []
    # fetch the page contents
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/43.0.2357.130 Safari/537.36'
    }
    cur_url = 'https://github.com/wzqwsrf/Leetcode/tree/master/Java'
    s = requests.session()
    r = s.get(cur_url, headers=headers)
    # mainly to escape some special characters here; e.g. < can cause problems
    r.encoding = 'utf-8'
    html = soup(r.text)
    problem_list = html.findAll('table', {"class": "files"})
    tbody = problem_list[0].find('tbody')
    common_url = 'https://github.com'
    spans = tbody.findAll('span', {"class": "css-truncate css-truncate-target"})
    k = 0
    slen = len(spans)
    for x in xrange(0, slen, 3):
        span = spans[x]
        print span
        a = span.find('a')
        title = a['title']
        url = a['href']
        urls.append(title)
    return urls
Example #26
def findKonjugation(string):
    req = Request(
        f"{url}{quote(string)}",
        data=None,
        headers={
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        })
    req.add_header('Referer', 'https://www.verbformen.de/konjugation/')

    try:
        s = soup(urlopen(req).read(), "html.parser")
    except urllib.error.HTTPError as e:
        print("Error while searching: %s" % e)
        sys.exit(1)

    result = {}
    for tense in filtered_tenses:
        result[tense] = []

    for table in s.findAll('div', attrs={'class': 'rAufZu'}):
        for column in table.findAll('div', attrs={'class': 'vTbl'}):
            tense = column.find('h3').text
            if (tense in filtered_tenses) and not result[tense]:
                for tr in column.findAll('tr'):
                    result[tense].append(tr.text.strip())

    if result["Präsens"]:
        for tense in filtered_tenses:
            print(f"\033[1m{tense}\033[0m")
            print(" / ".join(result[tense]))
            print("")
        print(f"Quelle: {url}{quote(string)}\n")
    else:
        print("No result found.")
Example #27
    def send(self, request, **kwargs):
        response = super(HTTPAdapter, self).send(request, **kwargs)

        # ignore redirects
        if 'location' in response.headers:
            pass

        # look for the login page url
        elif response.url.startswith(auth_url + u'authenticate.html'):
            form = soup(response.text, 'lxml').find('form')
            if not form:
                raise ParseError("Could not parse login form", response)

            # build the login form param dict
            data = {
                i['name']: i.attrs.get('value')
                for i in form.find_all('input')
                if i['type'] not in ('checkbox', 'radio', 'submit')
            }
            data['userid'] = self.userid
            data['pwd'] = self.pwd

            # make a second request
            auth_request = requests.Request('POST', auth_url + 'authenticate2.html', data=data)
            auth_request = auth_request.prepare()

            response = self.send(auth_request)
        # look for the login page url
        elif response.url.startswith(auth_url + u'authenticate2.html'):
            raise CredentialError("Invalid credentials")

        return response
Example #28
    def get_soup_from_url(self, url_in):
        """
        Return data loaded from a URL, as a BeautifulSoup(3) object.

        Wrapper helper function around self.get_data_from_url()
        """
        return soup(self.get_data_from_url(url_in), 'html.parser')
Example #29
def standards (user, year):
	standards = []
	#Get the standards given a user from the nzqa.login function
	s = soup(requests.get(nzqa_standards_url, cookies=user).text, "html.parser")
	for tbody in s.find_all("tbody", class_='results-{}'.format(year)):
	#Filter the results to the year chosen by the user
		for tr in tbody.find_all("tr")[0:]:
		#Find all 'tr' tags within the 'tbody' tags of the page
			tds = tr.find_all("td")
			#Find all 'td' tags within the 'tr' tags of the page


			if tds[0].span and tds[1].span and tds[2] and tds[3]:
				standard = {
				# Filter the scraped data to contain only assessment standard data
				   "Standard": tds[0].span.contents[0].strip(),
				   #The standard number for this result
				   "Assessment type": tds[2].contents[0].strip(),
				   #The Assessment type: Internal or External
				   "Description": re.sub('\s\s+',"",tds[3].span.contents[0]),
				   #Assessment description: What is the assessment?
				   "Level": tds[4].contents[0].strip(),
				   #Assessment level: What year of NCEA is this?
				   "Result": tds[10].span.contents[0].strip()
				   #What the student has achieved: A, M or E
				}
Example #30
def solveIXL(problem):
    print(problem)        
    browser = webdriver.Chrome(chromedriver)
    browser.get('https://www.wolframalpha.com/input/?i='+problem.replace('+','%2B'))
    browser.find_element_by_name('equal').click()
    time.sleep(3)
    source=browser.page_source
    source=soup(source,'lxml')
    browser.quit()
    #print(source)
    try:
        stuff=source.findAll('wa-no-query-link',class_="ng-scope ng-isolate-scope")
        #print(stuff)
        for elem in stuff:
                pie=elem
        return str(pie).split('title')[-1].split('"')[1].strip('.')
    except:
        stuff=source.findAll('a',class_="ng-scope ng-isolate-scope")
        #print(stuff)
        for elem in stuff:
                pie=elem
        try:
            return str(pie).split('title')[-1].split('"')[1].strip('.')
        except:
            return "don't know"
Example #31
def latest_ca_scrape():
    res = requests.get('https://www.bseindia.com/corporates/corporate_act.aspx')
    res.raise_for_status()
    page_soup = soup(res.content,features='lxml')

    no_of_pages_tab=page_soup.find('tr',{'class':'pgr'})
    no_of_pages=len(no_of_pages_tab.find_all('a'))+1

    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless') #This prevents the browser from opening up
    driver = webdriver.Chrome("/Users/pratikbaid/Developer/chromedriver", chrome_options=options)

    pageSource=res.content
    dataList=[]
    page_soup = soup(pageSource,features='lxml')
    dataRows=page_soup.find_all('tr',{"class":"TTRow"})
    for dataRow in dataRows:
        dataColumns=dataRow.find_all('td')
        data=[]
        for dataColumn in dataColumns:
            data.append(dataColumn.text)
        dataList.append(data)

    if(no_of_pages>1):
        print('Entered first if')

        for i in range (2,no_of_pages+1):
            print("Entered ",i)
            xpath=f'//*[@id="ContentPlaceHolder1_gvData"]/tbody/tr[1]/td/table/tbody/tr/td[{i}]/a'
            print(xpath)
            driver.get('https://www.bseindia.com/corporates/corporate_act.aspx')
            driver.find_element_by_xpath(xpath).click()
            pageSource=driver.page_source
            page_soup = soup(pageSource,features='lxml')
            dataRows=page_soup.find_all('tr',{"class":"TTRow"})
            for dataRow in dataRows:
                dataColumns=dataRow.find_all('td')
                data=[]
                for dataColumn in dataColumns:
                    data.append(dataColumn.text)
                dataList.append(data)
            
        

    ca_array=[]
    for data in dataList:
        corporate_action={
            'secuarity_code':data[0],
            'secuarity_name':data[1],
            'ex_date':data[2],
            'purpose':data[3],
            'record_date':data[4],
            'bc_start_date':data[5],
            'bc_end_date':data[6],
            'nd_start_date':data[7],
            'nd_end_date':data[8],
            'actual_payment_date':data[9]
        }
        ca_array.append(corporate_action)
    latest_ca_json={
        'Latest_CA':ca_array
    }
    json_data=json.dumps(latest_ca_json)
    return(json_data)

    '''//*[@id="ContentPlaceHolder1_gvData"]/tbody/tr[1]/td/table/tbody/tr/td[2]/a'''
    '''//*[@id="ContentPlaceHolder1_gvData"]/tbody/tr[1]/td/table/tbody/tr/td[3]/a'''
Example #32
import random

for x in range(1):
	x=random.randint(1,101)
	print(x)

myUrl='https://www4.bing.com/search?q=beans='+str(x)
print(myUrl)

#Opening connection
uClient=uReq(myUrl)
pageHtml=uClient.read()
uClient.close()

#Parsing part
pageSoup=soup(pageHtml, "html.parser")

# Grab products
boop=pageSoup.findAll("li",{"class":"b_algo"})

filename="Fact Spreadsheet.csv"
f=open(filename,"w")

headers="title, desc, link\n"

f.write("")

# Get Data
beep=boop[0]

for beep in boop:
Example #33
def wiley(input, name):
    filename = "Wiley_" + name + ".xlsx"
    filepath = "wiley/csv/" + filename
    now = datetime.datetime.now()
    workbook = xlsxwriter.Workbook(filepath)
    f = workbook.add_worksheet()
    f.write('A1', 'Keyword : ')
    f.write('B1', input)
    f.write('A2', 'Database : ')
    f.write('B2', 'https://onlinelibrary.wiley.com/')
    f.write('A3', 'Date : ')
    f.write('B3', str(now.isoformat()))
    count = 1
    n = 4
    f.write('A' + str(n), 'S.No')
    f.write('B' + str(n), 'Website')
    f.write('C' + str(n), 'Title')
    f.write('D' + str(n), 'Journal name')
    f.write('E' + str(n), 'Volume')
    f.write('F' + str(n), 'Date')
    f.write('G' + str(n), 'Doi number')
    f.write('H' + str(n), 'Author name')
    f.write('I' + str(n), 'E-mail by method1')
    f.write('J' + str(n), 'E-mail by method2')
    f.write('K' + str(n), 'Affiliation')
    f.write('L' + str(n), 'Country')
    n += 1
    for i in range(0, 999999):
        print("Page : " + str(i))
        stop = True
        try:
            headers = {
                'user-agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
            }
            a = 'https://onlinelibrary.wiley.com/action/doSearch?AllField=' + input.replace(
                " ", "+") + '&startPage=&PubType=journal'
            b = 'https://onlinelibrary.wiley.com/action/doSearch?AllField=' + input.replace(
                " ", "%20") + '&startPage=' + str(i) + '&PubType=journal'
            my_url = ""
            if (i == 0):
                my_url = a
            else:
                my_url = b
            response = requests.get(my_url, headers=headers)
            page = soup(response.content, "html5lib")
            body = page.findAll("div", {"class": "item__body"})
            for each in body:
                link = each.h2.span.a['href']
                title = each.h2.text
                info = each.find("div", {"class": "meta__info"})
                date = info.find("span", {"class": "meta__epubDate"}).text
                doi = each.h2.span.a['href']

                #-------------------Initialization--------------------------------------------------------
                print("link : " + link)
                f.write('A' + str(n), str(count))
                f.write('B' + str(n), 'https://onlinelibrary.wiley.com' + link)

                #--------------Title----------------------------------------------
                print("Title : " + title)
                f.write('C' + str(n), title)

                #--------------Journal----------------------------------------------
                journal = info.find("a", {"class": "meta__serial"}).text
                print("Journal : " + journal)
                f.write('D' + str(n), journal)
                try:
                    vol = info.find("a", {"class": "meta__volume"}).text
                    print("Volume : " + vol)
                    f.write('E' + str(n), vol)
                except Exception as e:
                    print("Exception volume : " + str(e))
                    f.write('E' + str(n), 'Cannot get volume')
                #--------------Date----------------------------------------------
                try:
                    print("Date : " + date)
                    f.write('F' + str(n), date)
                except Exception as e:
                    print("Exception date : " + str(e))
                    f.write('F' + str(n), 'Cannot get date')

                #--------------Doi----------------------------------------------
                try:
                    print("Doi : https://nph.onlinelibrary.wiley.com" + doi)
                    f.write('G' + str(n),
                            'https://nph.onlinelibrary.wiley.com' + doi)
                except Exception as e:
                    print("Exception doi : " + str(e))
                    f.write('G' + str(n), 'Cannot get doi')

                #--------------Authors and email----------------------------------------------
                parse = "https://nph.onlinelibrary.wiley.com" + doi
                n = contact(parse, f, n)
                print("-------------------------------------------")
                count += 1
                n += 1
                stop = False
            if (stop):
                break
        except Exception as e:
            print("Exception big : " + str(e))
            print("Page : " + str(i))
            break
    print("Jimmy")
    workbook.close()
Example #34
def contact(input, f, n):
    print("enter contact")
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    response = requests.get(input, headers=headers)
    page = soup(response.content, "html5lib")
    body = page.findAll(
        "div", {"class": "accordion-tabbed__tab-mobile accordion__closed"})
    print(len(body))
    for i in range(len(body) // 2):
        email = []
        country = []
        affiliation = []
        #--------------Authors----------------------------------------------
        print("Author : " + body[i].a.span.text)
        f.write('H' + str(n), body[i].a.span.text)
        try:
            add = body[i].find(
                "div", {"class": "author-info accordion-tabbed__content"})
            try:
                allP = add.findAll("p")
                for each in allP:
                    print("Address : " + each.text)
                    affiliation.append(each.text)
                    match = re.search(
                        "(( )*[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)",
                        each.text)
                    if (match):
                        email.append(match.group(0))
                        print("Found email in author : " + match.group(0))
            except Exception as e:
                print("Exception address1 : " + str(e))
                f.write('K' + str(n), "Cannot get affiliation")
        except Exception as e:
            print("Exception address2 : " + str(e))
            f.write('K' + str(n), 'Cannot get affiliation')

        #--------------email 1----------------------------------------------
        print("Len email : " + str(len(email)))
        try:
            info = body[i].find("div", {"class": "bottom-info"})
            match = re.search(
                "(( )*[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)",
                info.text)
            if match:
                print("Email : " + match.group(0))
                email.append(match.group(0))
            else:
                print("Email not match :" + info.text)
                print("Email not match")
                if (len(email) == 0):
                    print("Enter if len(email)")
                    email.append("Cannot get email")
        except Exception as e:
            print("Exception email : " + str(e))
            if (len(email) == 0):
                print("Enter if len(email)")
                email.append("Cannot get email")

        if (len(email) == 0):
            f.write('I' + str(n), 'Cannot get email')
        else:
            f.write('I' + str(n), email[0])

        #--------------email 2----------------------------------------------
        try:
            text = page.find("div",
                             {"class": "article-header__correspondence-to"})
            match = re.search(body[i].a.span.text, text.text)
            if (match):
                match = re.search(
                    "(( )*[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)",
                    text.text)
                if (match):
                    f.write('J' + str(n), match.group(0))
                else:
                    f.write('J' + str(n), 'Cannot get email')
            else:
                f.write('J' + str(n), 'Cannot get email')
        except Exception as e:
            print("Exception email2 : " + str(e))
            f.write('J' + str(n), 'Cannot get email')
        print("-----------------------------------------")
        #--------------Country and affiliation----------------------------------------------
        for each in affiliation:
            checkCountry(each, country)
        try:
            for i in range(0, len(affiliation)):
                f.write('K' + str(n), affiliation[i])
                f.write('L' + str(n), country[i])
                print("Affiliation : " + affiliation[i])
                print("Country : " + country[i])
                n += 1
        except Exception as e:
            print("Exception country : " + str(e))
    return n
Example #35
import csv  # CSV output
from urllib.request import urlopen as uReq  # web client
from bs4 import BeautifulSoup as soup  # HTML parser / data structure

# URL to web-scrape from www.startech.com.bd
my_url = 'https://www.startech.com.bd/component/graphics-card'

# opening up connection, grabbing the page URL
uClient = uReq(my_url)

# parses html into a soup data structure to traverse html
# as if it were a json data type.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# grab the product store page container
containers = page_soup.findAll('div', {'class': 'row main-content'})

# loop over each product and grab its attributes
container = containers[0]
#print(container.div.img['alt'])

# finds each product name on the store page
product_container = container.findAll('h4', {'class': 'product-name'})
#product(product_container[0].text)

# finds each product price on the store page
price_container = container.findAll('div', {'class': 'price'})
#price(price_container[0].text)

# name the output file to write to local disk
Example #36
category = 'landscape'
url = 'https://unsplash.com/s/photos/' + category
foldername = category
quality = 7 #1-7
regex = r'(?<=' + str(quality) + '00w,\s)https:\/\/images.unsplash.com\/photo.*w=' + str(quality+1) + '00&q=60(?=\s' + str(quality+1) + '00w)'

try:
    request = req(url)
    pageHtml = request.read()
    print('loaded successfully')
except Exception as exception:
    print('Error :( ', exception)

links = []
sFile = soup(pageHtml, "html.parser")

try:
    imagesUncut = sFile.select('a > div > img')
    amount = len(imagesUncut)
    print('There are ' + str(amount) + ' images')
except Exception as exception:
    print('Error while souping ...\n', exception)

images=[]


for img in imagesUncut:
    picUrl = img.get('srcset')
    images.append(picUrl)
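
The req and soup aliases used above are presumably bound as follows (an assumption; the regex built earlier would be applied with re in code that is not shown):

from urllib.request import urlopen as req
from bs4 import BeautifulSoup as soup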
Example #37
    def getPageSoup(self, url):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        page = requests.get(url, verify=False)
        page_html = page.text
        page_soup = soup(page_html, "html.parser")
        return page_soup
Example #38
from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/p/pl?d=graphic+card&cm_sp=KeywordRelated-_-graphics%20cards-_-graphic%20card-_-INFOCARD'

uClient = uReq(my_url)

page_html = uClient.read()

uClient.close()

page_soup = soup(page_html, 'html.parser')

print(page_soup.body)
Example #39
def atualizaFipe():
    carsTree = None
    carrosList = []
    carrosMap = {}
    fipeMap = dicionario.HashMap()

    fipeURL = "http://fipeapi.appspot.com/api/1/carros/marcas.json"
    tknzr = WhitespaceTokenizer()

    chromeOptions = webdriver.ChromeOptions(
    )  # set options so images are not loaded (speeds up the crawler)
    prefs = {"profile.managed_default_content_settings.images": 2}
    chromeOptions.add_experimental_option("prefs", prefs)
    chromeOptions.add_argument("--incognito")
    browser = webdriver.Chrome(chrome_options=chromeOptions)

    browser.get(fipeURL)
    page_json = soup(browser.page_source,
                     'html5lib').find("body").find("pre").text
    filename = 'fipe.csv'
    f = open(filename, "w")
    jsonMarcas = json.loads(page_json)
    mapMarcas = dicionario.HashMap()
    mapVeiculos = dicionario.HashMap()
    for marca in jsonMarcas:
        browser.get('http://fipeapi.appspot.com/api/1/carros/veiculos/' +
                    str(marca['id']) + '.json')
        time.sleep(1)
        modelos = soup(browser.page_source,
                       'html5lib').find("body").find("pre").text
        modelos = json.loads(modelos)
        mapMarcas.put(marca["fipe_name"], modelos)
        for modelo in modelos:
            print(modelo)
            modeloNome = tknzr.tokenize(modelo['name'])
            if modeloNome[0].upper() == "GRAND" or (
                    len(modeloNome) > 1 and modeloNome[1].upper()
                    == "LOUNGE") or modeloNome[0].upper() == "XC":
                modeloNome = str(modeloNome[0] + modeloNome[1])
            elif modeloNome[0].upper() == "SANTA":
                modeloNome = str(modeloNome[0] + modeloNome[1][:2])
            else:
                modeloNome = modeloNome[0]
            modeloNome = modeloNome.upper()
            modeloNome = modeloNome.replace("-", "")
            modeloNome = modeloNome.replace("!", "")
            if modelo['fipe_marca'].upper() == 'VW - VOLKSWAGEN':
                modelo['fipe_marca'] = 'VOLKSWAGEN'
            elif modelo['fipe_marca'].upper() == 'GM - CHEVROLET':
                modelo['fipe_marca'] = 'CHEVROLET'
            elif modelo['fipe_marca'] == 'Citro\u00ebn':
                modelo['fipe_marca'] = 'CITROEN'
            elif modelo['fipe_marca'].upper() == 'KIA MOTORS':
                modelo['fipe_marca'] = 'KIA'
            f.write(modelo["fipe_marca"].upper() + "," + str(marca['id']) +
                    "," + modeloNome + "," + modelo["id"] + "\n")

    with open('fipe.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            fipeMap.put(
                str(row[0]) + str(row[2]),
                str(row[1]) + " " +
                str(row[3]))  #pair(fipeMarca+fipeNome,marcaID+nomeID)

    with open('carros.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            chave = str(row[1]) + str(row[2]) + str(
                row[3])  # manufacturer + model + year
            objId = fipeMap.getObj(
                str(row[1]) + str(row[2])
            )  # holds the object with the various FIPE codes for this vehicle (we will need to iterate over it later when accessing the data via Selenium)
            car1 = {
                "chave": chave,
                "fabricante": row[1],
                "modelo": row[2],
                "ano": row[3],
                "objID": objId
            }  # this iteration works around missing vehicle information from the resellers (FIPE uses the full name with specifications, the resellers do not)
            if carrosList.count(car1) == 0:
                carrosList.append(car1)

    print(len(carrosList))
    print(carrosList)
    i = 0
    filename = "carrosEFipe.csv"  #abe arquivo para escrita
    fw = open(filename, "w")
    for car1 in carrosList:
        i += 1
        print(i)
        for j in range(1, 4, 1):
            for ID in car1['objID'].listValues:
                ID = tknzr.tokenize(str(ID))
                marcaID = ID[0]
                fipeID = ID[1]
                print("\t" + fipeID)
                year = str(car1["ano"]) + "-" + str(j)
                print("\t" + year)
                time.sleep(
                    0.5
                )  # the API server rate-limits requests per minute
                browser.get(
                    "http://fipeapi.appspot.com/api/1/carros/veiculo/" +
                    marcaID + "/" + fipeID + "/" + year + ".json")
                elem = soup(browser.page_source,
                            "html5lib").find("body").find("h1")
                if elem is not None and elem.text == '500 Internal Server Error':
                    continue
                else:
                    break
            if elem is not None and elem.text == '500 Internal Server Error':
                continue
            else:
                carroFipeInfo = soup(browser.page_source,
                                     'html5lib').find("body").find("pre").text
                carroFipeInfo = json.loads(carroFipeInfo)
                preco = tknzr.tokenize(carroFipeInfo["preco"])
                preco = preco[1]
                preco = preco[:len(preco) - 3].replace(".", "")
                print(car1["chave"])
                fw.write(car1["chave"] + "," + preco + "\n")
                break
Example #40
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup as soup
import sys
import time
import smtplib

#set options for headless browser & getting the text i need
options = Options()
options.headless = True
driver = webdriver.Firefox(firefox_options=options)
driver.get("https://www.ubnt.com/download/edgemax/edgerouter-x")
time.sleep(5) # wait 5 sec to make sure page loads. Could be done with the driver.wait but this is easier
gethtml = driver.page_source # gets the source 
html = soup(gethtml, 'html.parser') # parse it to bs4
htmlFind = html.find('td', class_='downloadResults__name') # find the strings i want
text = htmlFind.get_text()
test = "EdgeRouter ER-X/ER-X-SFP/EP-R6: Firmware v1.10.8"

if text == test:
    driver.quit()
    sys.exit()
else:
    driver.quit()
#Makes a connection to Google's mail server.
    user = ''
    password = ''
    sendFrom = user
    to = ""
    msg = "\n Nieuwe update beschikbaar voor unifi router"
Example #41
for page in pages:

    my_url = 'https://www.arrow.com/en/products/search?page=' + page + '&prodline=Telecom%20Transformers&selectedType=plNames&perPage=100'
    user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)'
    headers = {'User-Agent': user_agent}

    #opening up connection, grabbing the page
    #uClient = uReq.(my_url, headers=headers)
    uClient = requests.get(my_url, headers=headers)
    page_html = uClient.content
    uClient.close()

    sleep(randint(30, 50))

    #html parsing
    page_soup = soup(page_html, "html.parser")
    table_body = page_soup.find("tbody")
    containers = table_body.find_all("tr")

    f.write(header)

    for container in containers:

        mfr_prt = container.find_all("span",
                                     {"class": "SearchResults-productName"})
        try:
            manufacturer_part = mfr_prt[0].text.strip()
        except IndexError:
            manufacturer_part = 'null'

        mfr = container.find_all(
Example #42
def getStockData():
    data = [[], []]

    for i in range(2):
        temp_data = list()

        geturl = finance_url + stock_url[i]

        driver = webdriver.Chrome(executable_path=path_to_chromedriver,
                                  chrome_options=options)

        driver.get(geturl)

        time.sleep(5)

        for j in range(1000):

            t = str(10000 * (j + 1))
            driver.execute_script("window.scrollTo(0, " + t + ")")

        res = driver.execute_script(
            "return document.documentElement.outerHTML")
        driver.quit()

        page_soup = soup(res, "lxml")
        containers = page_soup.findAll("tr", {"class": "Whs(nw)"})
        # status = containers.findAll("small", {"class":"intraday__status"})
        x = 0
        for obj in containers:
            val_obj = obj.findAll("td", {"class": "Py(10px)"})
            x = x + 1
            # print(val_obj[0].text, val_obj[1].text, val_obj[2].text, val_obj[3].text, val_obj[4].text, val_obj[5].text)
            # print(stockNames[i], val_obj)
            Date = ""
            Open = ""
            High = ""
            Low = ""
            Close = ""
            AdjClose = ""
            Volume = ""
            # if(len(val_obj) != 7):
            #     temp_data.append(val_obj)
            if (len(val_obj) == 2):
                Date = val_obj[0].text
                Open = val_obj[1].text
                High = ""
                Low = ""
                Close = ""
                AdjClose = ""
                Volume = ""
            else:
                Date = val_obj[0].text
                Open = val_obj[1].text
                High = val_obj[2].text
                Low = val_obj[3].text
                Close = val_obj[4].text
                AdjClose = val_obj[5].text
                Volume = val_obj[6].text

            temp = [
                stockNames[i], Date, Open, High, Low, Close, AdjClose, Volume
            ]
            temp_data.append(temp)

        data[i] = temp_data

        time.sleep(3)

    return data
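
A hypothetical sketch of the module-level configuration getStockData() relies on; all values below are placeholders, and the Yahoo-Finance-style URLs are an assumption based on the markup classes queried above:

import time
from selenium import webdriver
from bs4 import BeautifulSoup as soup

path_to_chromedriver = '/usr/local/bin/chromedriver'  # placeholder path
options = webdriver.ChromeOptions()
options.add_argument('--headless')

finance_url = 'https://finance.yahoo.com/quote/'            # assumed base URL
stock_url = ['AAPL/history?p=AAPL', 'MSFT/history?p=MSFT']  # assumed: two history pages to scrape
stockNames = ['AAPL', 'MSFT']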
Example #43
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

# url for future undergrad faq
# url = 'https://itsc.ontariotechu.ca/faqs/faqs-students.php'

# faculty itcs link
url = 'https://itsc.ontariotechu.ca/faqs/faqs-faculty-staff.php'

# open connection to webpage and read the html
client = uReq(url)
pagehtml = client.read()
client.close()

# parse the html
html = soup(pagehtml, "html.parser")

# loop through all the html data and remove break tags
for i in html.findAll('br'):
    i.extract()

data = html.find_all('a', {'class': 'accordion-title'})

string_data = []
for i in data:
    string_data.append(str(i.text))

df = pd.DataFrame(string_data)

df.insert(1, '1', 'itcs_faculty-Staff_faq')
Example #44
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import os
import csv
import pandas as pd
from datetime import datetime
import calendar
import re
from io import StringIO

my_url = "https://www.mohfw.gov.in/"
uClient = uReq(my_url)
pageHTML = uClient.read()
uClient.close()
pageSoup = soup(pageHTML, "html.parser")
containers = pageSoup.findAll("tbody")
raw_data = containers[-1].text.strip()
#print(raw_data)
data = ""
for c in raw_data:
    if c == '\n':
        data += ','
    else:
        data += c
#print(data)
data_array = data.split(',')
#print(data_array)
column_count = 6
mydata = ""
i = 0
for c in data_array:
Example #45
                "Best places to visit around the world",
                "Adventurous places to visit", "Best luxuries of the world",
                "Top webseries to watch"
            ]
            searchFor = random.choice(topics)
            webbrowser.open('https://www.google.co.in/search?q=' +
                            "+".join(searchFor.split()))
            talk("If you did not find this helpful try watching youtube.")

        elif 'news for today' in Input:
            try:
                news_url = "https://news.google.com/news/rss"
                Client = urlopen(news_url)
                xml_page = Client.read()
                Client.close()
                soup_page = soup(xml_page, "xml")
                news_list = soup_page.findAll("item")
                talk("Okk sir tell me how may top news you wanna know ?")
                n = int(input("Enter the number -> "))
                talk("Here is what you need to know.")
                for news in news_list[:n]:
                    newsList = news.title.text.split("-")
                    talk("According to " + newsList[1])
                    talk(newsList[0])
                    print("\n")

                talk("That's about updates...")
            except Exception as e:
                print(e)

        else:
Example #46
def scrape(baseURL, county):
    global cancelButtonFlag
    startPage = 1

    url = baseURL + "1"
    print("URL IS: " + url)
    try:
        request = Request(url, headers = {'User-Agent' :\
                  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"})
        uClient = urlopen(request)
        page_html = uClient.read()
        uClient.close()

        pageSoup = soup(page_html, "html.parser")
        pageSoup = pageSoup.body

        # find out how many pages of results there are and obtain that number
        pagination = pageSoup.find("nav", {"class": "pagination"})
        pageList = str(pagination)
        try:
            pageList = pageList.split("\n", 7)[-2]
        except:
            messagebox.showerror(
                "Form Error",
                "Make sure you spelled everything correctly in the forms and try agian."
            )
        result = re.search("/page-(.*)<", str(pageList))
        almostThere = result.group(1)
        pageTotal = ""
        for char in almostThere:
            if char.isdigit():
                pageTotal += char
                continue
            else:
                break
        pageTotal = int(pageTotal) + 1
        workingPage = 1

        for page in range(1, pageTotal + 1):
            if page == 0:
                continue
            else:
                url = baseURL + str(workingPage)

                print("DEBUG:  I'm opening result url " + url)
                print("DEBUG")

                request = Request(url, headers = {'User-Agent' :\
                          "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"})
                print("1")
                uClient = urlopen(request)
                page_html = uClient.read()
                uClient.close()
                print("2")
                # find list of results on result page
                pageTag = soup(page_html, "html.parser")
                pageTag = pageTag.body.tbody

                # for each result in the result page, go to that result and pull data
                for i in pageTag:
                    print("in pagetag for loop                              3")
                    if cancelButtonFlag:
                        print(
                            "in cancelButtonFlag condition: should only be here if cancelButtonFlag == True                             4"
                        )
                        scrapeCanceled()
                        sys.exit()
                    print(
                        "after cancelButtonFlag condition                                         5"
                    )

                    i = i.a
                    i = str(i)

                    i = re.search("href=\"(.*)\">", i)
                    i = i.group(1)

                    url = "https://okcountyrecords.com" + i

                    print("DEBUG:  I'm opening page url" + url)
                    print("DEBUG")

                    # Open next result from result page
                    request = Request(url, headers = {'User-Agent' :\
                              "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"})
                    uClient = urlopen(request)
                    page_html = uClient.read()
                    uClient.close()

                    # Program has reached the destination page for desired data
                    finalPage = soup(page_html, "html.parser")

                    print("DEBUG:  I'm looking in tables for data")
                    print("DEBUG")

                    # find all data fields in the table that contains the desired data
                    tables = finalPage.find_all('table')
                    for tbl in tables:
                        if tbl == tables[0]:
                            tds = tbl.findChildren('td')
                        else:
                            tds += tbl.findChildren('td')

                    # TODO: Add better handling here.  could result in shifted CSV rows if any of these data are missing.
                    book = re.search(">(.*)</td>", str(tds[0]))
                    book = book.group(1)

                    page = re.search(">(.*)</td>", str(tds[1]))
                    page = page.group(1)

                    instrument = re.search("heavy\">(.*)</td>", str(tds[2]))
                    instrument = instrument.group(1)

                    documentStamps = re.search("<td>(.*)</td>", str(tds[6]))
                    documentStamps = documentStamps.group(1)

                    recordedOn = re.search("<td>(.*)</td>", str(tds[7]))
                    recordedOn = recordedOn.group(1)

                    if len(tds) > 8:
                        instrumentDate = re.search("<td>(.*)</td>",
                                                   str(tds[8]))
                        instrumentDate = instrumentDate.group(1)
                    else:
                        instrumentDate = ""

                    # write the data to CSV
                    writeCSV(county, book, page, instrument, documentStamps,
                             recordedOn, instrumentDate, url)
                    # delay so we don't overwhelm the web servers and get blocked or something
                    sleep(5)

                # increment page number to go to next page
                workingPage += 1
    except HTTPError:
        messagebox.showerror(
            "URL/HTTP Error", "Could not access " + url +
            " Check your internet connection and try again")
    except URLError:
        messagebox.showerror(
            "URL/HTTP Error", "Could not access " + url +
            " Check your internet connection and try again")
Example #47
0
from bs4 import BeautifulSoup as soup  #parsing
from urllib.request import urlopen as uReq  #requesting url

my_url = 'https://store.steampowered.com/app/1091500/Cyberpunk_2077/'  # paste any link you want

# open the URL set above and fetch the page's HTML
uClient = uReq(my_url)

#Turning html into text-variable
page_html = uClient.read()

uClient.close()

#html parser
page_soup = soup(
    page_html, "html.parser")  # parse the page as HTML (not XML, JSON and so on)

page_soup__directItem = page_soup.findAll("div", {
    "class": "game_purchase_price price"
})  # grab every div on the page with the class "game_purchase_price price"
directItem__content = page_soup__directItem[0].text.strip(
)  #Getting exact value of the container

print("Actual product price:", directItem__content)
Example #48
0
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

url = 'https://www.fool.ca/recent-headlines/'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
title = page_soup.find("title")
print(title)
containers = page_soup.findAll("p", "promo")
for container in containers:
    print(container)
Example #49
0
    def scrape(urlOriginal, data_list):

        i = 0
        #7
        for value in range(1, 7):
            url = ""
            url = urlOriginal + format(value)
            print(url)
            try:
                uClient = uReq(url)
            except:
                continue  # skip this page if the request fails, instead of reusing a stale handle
            page_html = uClient.read()
            uClient.close()
            #Parsing
            page_soup = soup(page_html, "html.parser")
            #article = page_soup.findAll('ul',class_='search-main-content__events-list')
            article_1 = page_soup.findAll('div',
                                          class_='search-event-card-wrapper')

            # fetching each details
            for container in article_1:
                title = container.findAll(
                    'div', class_='eds-event-card__formatted-name--is-clamped'
                )[0].text

                try:
                    Date_time = container.findAll(
                        'div',
                        class_=
                        'eds-text-color--primary-brand eds-l-pad-bot-1 eds-text-weight--heavy eds-text-bs'
                    )[0].text
                except:
                    Date_time = 'None'
                # try:
                #     Location = container.findAll('div',class_='card-text--truncated__one')[0].text
                # except:
                #     Location='None'
                try:
                    Price = container.findAll(
                        'div',
                        class_=
                        'eds-media-card-content__sub eds-text-bm eds-text-color--grey-600 eds-l-mar-top-1 eds-media-card-content__sub--cropped'
                    )[1].text
                except:
                    Price = 'None'
                a_tags = container.findAll('a')
                try:
                    image = a_tags[0].img['src']
                except:
                    image = 'None'
                read_more = a_tags[0]['href']
                print(read_more)

                category = 'EDUCATION, BUSINESS & TECHNOLOGY'
                if category == 'EDUCATION, BUSINESS & TECHNOLOGY' and image == 'None':
                    image = 'https://uindia.net/assets/img/MediaTechnology.jpg'

                # description

                descurl = read_more
                #Opening connection , grabbing the page
                try:
                    uClient = uReq(descurl)
                except:
                    continue  # skip this event if the description page cannot be fetched
                desc_html = uClient.read()
                uClient.close()
                #Parsing
                desc_soup = soup(desc_html, "html.parser")
                #print(desc_soup)

                desc = desc_soup.findAll(
                    'div', class_='js-xd-read-more-contents l-mar-top-3'
                ) or desc_soup.findAll(
                    'div',
                    class_=
                    'structured-content-rich-text structured-content__module l-align-left l-mar-vert-6 l-sm-mar-vert-4 text-body-medium'
                )
                if len(desc) > 0:
                    try:
                        p_tags = desc[0].findAll('p')
                    except:
                        continue

                    descrip = []
                    for i in range(len(p_tags)):
                        descript = p_tags[i].text
                        descrip.append(descript)
                    description = ''.join(str(e) for e in descrip)
                else:
                    description = 'None'

                # date fetching and formatting

                time = desc_soup.findAll('time', class_='clrfix')
                if len(time) > 0:
                    time_tags = time[0].findAll('p')
                    date_check = time_tags[0].text
                    if date_check == 'Multiple Dates' or date_check == 'Multiple Dates GMT':
                        Final_Date = date_check

                    else:
                        Date_time = date_check.split(',')
                        #print(Date_time)
                        #print(len(Date_time))
                        if (len(Date_time)) == 2:
                            Final_Date = Date_time[1].strip(' ')

                        else:
                            Mon_Date = Date_time[1].split(' ')
                            if len(Mon_Date) == 3:
                                Date = Mon_Date[2]
                                month = Mon_Date[1]
                                if len(month) <= 3:
                                    Month = datetime.datetime.strptime(
                                        month, '%b').strftime('%B')
                                else:
                                    Month = month
                                year = Date_time[2]
                                Final_Date = Date + (' ') + Month + year

                            elif len(Mon_Date) == 4:
                                Date = Mon_Date[1]
                                month = Mon_Date[2]
                                Month = datetime.datetime.strptime(
                                    month, '%b').strftime('%B')
                                year = Mon_Date[3]
                                Final_Date = Date + (' ') + Month + (
                                    ' ') + year

                else:
                    Final_Date = 'None'

                #location fetching
                location_div = desc_soup.findAll('div',
                                                 class_='event-details__data')
                if len(location_div) > 0:
                    location_tags = location_div[1].findAll('p')
                    locat = location_tags[0].text
                    location = locat + (' ') + "Dublin"
                else:
                    location = 'Dublin'

                print(location)

                try:

                    if location == 'Dublin':
                        # build the list directly so it exists even on the first iteration
                        ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]

                    else:
                        ordinates = getOrdinates(location)

                except:
                    continue

                try:
                    d1 = datetime.datetime(int(year),
                                           int(month_string_to_number(Month)),
                                           int(Date))
                except:
                    continue

                d2 = datetime.datetime.now()

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = title
                    data.time = ''
                    data.location = location
                    data.summary = description
                    data.img = image
                    data.category = category
                    data.startdate = Final_Date
                    data.read_more = read_more
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = ''
                    data.price = Price
                    data_list.append(data)
                    i = i + 1

            # print(len(data))

        print(len(data_list))
        return data_list
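
The date handling above normalizes Eventbrite's display strings case by case; a hedged alternative sketch that leans on datetime.strptime instead of manual splitting (the format strings are assumptions about the page, and the raw string is returned unchanged if nothing matches):

def normalize_event_date(date_check):
    parts = date_check.replace(',', ' ').split()
    tail = ' '.join(parts[-3:])  # e.g. "15 Feb 2020"
    for fmt in ('%d %b %Y', '%d %B %Y'):
        try:
            return datetime.datetime.strptime(tail, fmt).strftime('%d %B %Y')
        except ValueError:
            continue
    return date_check  # fall back to the raw string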
Example #50
0
def getpics(url):
	request=requests.get(url)
	page=request.text
	doc=soup(page,'html.parser')
	imglink=[element.get('src') for element in doc.find_all('img')]
	return imglink
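
A short usage sketch for getpics (the page URL and the .jpg extension are placeholders): relative src values are resolved against the page URL before downloading with requests, which the function above already depends on.

import requests
from urllib.parse import urljoin

page_url = 'https://example.com/gallery'  # placeholder URL
for n, src in enumerate(getpics(page_url)):
    if not src:
        continue  # some <img> tags have no src attribute
    img_url = urljoin(page_url, src)  # handle relative src values
    with open('image_{}.jpg'.format(n), 'wb') as f:
        f.write(requests.get(img_url).content)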
Example #51
0
    params = {
        '_from': 'R40',
        '_sacat': 0,
        'LH_Complete': 1,
        'LH_Sold': 1,
        'LH_ItemCondition': 1000,
        '_nkw': shoe_search,
        '_dcat': 15709,
        "US%20Shoe%20Size%20%28Men%27s%29": size,
        'rt': 'nc',
    }

    r = requests.get(my_url, params=params)

    # html parsing
    page_soup = soup(r.text, "html.parser")

    #class nllclt is only there when there are 0 results
    if page_soup.find("span", {"class": "nllclt"}):
        continue

    #find the first of this only because Ebay sometimes adds suggested results that don't match right away
    matches = page_soup.find("ul", {"class": "gv-ic"})

    # grabs each sale
    containers = matches.findAll("li", {"class": "sresult"})

    # Create table, comment out after making it the first time
    create_table()

    for container in containers:
Example #52
0
        bar.update(i)
        i += 1
    f.close()
    return


# <a href="http://webmusic.cc/hindi_music.php?id=5012">Hate Story IV</a>
# expression for movie names
expr = re.compile(
    r'(http://webmusic.cc/hindi_music.php\?id=\d+)">([\w\s.]+)</a>')

#home page for the latest bollywood entries
url = 'http://webmusic.cc/music/mobile/hindi/latest.php'

response = urlopen(url)
bs_obj = soup(response.read(), "html.parser")
data = str(bs_obj)
response.close()

# get the current directory and then create a new folder named 'latest-bollywood'
new_path = os.path.dirname(os.path.realpath(__file__)) + '\\latest-bollywood'

if not os.path.exists(new_path):
    os.makedirs(new_path)

subprocess.Popen(r'explorer ".\latest-bollywood"')

result = re.findall(expr, data)
#print(len(result)," results found :---->")

print("TOTAL", len(result), " files to download")
Example #53
0
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd

url = 'https://etherscan.io/token/generic-tokentxns2?m=normal&contractAddress=0xd6a55c63865affd67e2fb9f284f87b7a9e5ff3bd&a=0xd071f6e384cf271282fc37eb40456332307bb8af'
req = Request(
    url,
    headers={
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    }
)  # I got this line from another post since "uClient = uReq(URL)" and "page_html = uClient.read()" would not work (I believe that etherscan is attempting to block web scraping or something?)
with urlopen(req, timeout=20) as conn:
    response = conn.read()
page_soup = soup(response, "html.parser")
Transfers_info_table_1 = page_soup.find(
    "table", {"class": "table table-md-text-normal table-hover mb-4"})
df = pd.read_html(str(Transfers_info_table_1))[0]
df.to_csv("TransferTable.csv", index=False)
Example #54
0
def get_links(url):
    result = requests.get(url)
    page = result.text
    doc = soup(page, 'html.parser')
    links = [element.get('href') for element in doc.find_all('a')]
    return links
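
A small usage sketch for get_links (the start URL is a placeholder): drop missing hrefs, resolve relative links, and de-duplicate while preserving order.

from urllib.parse import urljoin

start_url = 'https://example.com/'  # placeholder URL
absolute = [urljoin(start_url, href) for href in get_links(start_url) if href]
unique_links = list(dict.fromkeys(absolute))  # de-duplicate, keep first-seen order
print(len(unique_links), 'unique links')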
Example #55
0
        link = link2
    elif l == 3:
        link = link3
    elif l == 4:
        link = link4
    elif l == 5:
        link = link5
    else:
        print("error in links \n\n\n")

    # scrape data into array here while iterating through
    # weekly links
    client = request(link)
    page_html = client.read()
    client.close()
    page_content = soup(page_html, "html.parser")

    # get all rows
    player_row = page_content.findAll('table')[0].find_all('tr')

    for i in range(len(player_row)):

        # grab relevant data from each row

        row_data = player_row[i].find_all('td')

        if (len(row_data) != 0):
            # make sure we are getting stats for QBs only
            if (row_data[1].text == 'QB'):
                # fill matrix row w data
                cmp_percentage = row_data[14].text
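
For a table-shaped page like this, a hedged alternative to walking the <tr>/<td> cells by hand is pandas.read_html (it needs lxml or html5lib installed), as the etherscan example earlier does; the column positions below are assumptions carried over from the indexing above and would need checking against the real page.

import pandas as pd

stats = pd.read_html(str(page_content))[0]  # same table as findAll('table')[0] above
qbs = stats[stats.iloc[:, 1] == 'QB']       # position assumed to be the second column
print(qbs.head())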
Example #56
0
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

# Visit the mars nasa news site - assign the url and instruct the browser to visit it
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

# Set up the HTML parser
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

# Find the first article title
slide_elem.find("div", class_='content_title')

# Use the parent element to find the first `a` tag and save it as `news_title`
news_title = slide_elem.find("div", class_='content_title').get_text()
news_title

# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

# Get the featured image from nasa.gov - takes us to URL
# Visit URL
Example #57
0
import html5lib
import requests
from bs4 import BeautifulSoup as soup
import io
import logging
import datetime
import copy
import argparse
from arcgis.gis import GIS
from arcgis.features import FeatureLayer
import pprint
import re

src = r'https://www.dhhs.vic.gov.au/coronavirus-update-victoria-2-july-2020'
r = requests.get(src)
page_soup = soup(r.content, 'html5lib')
items = page_soup.findAll(
    'div', {
        'class':
        'field field--name-field-dhhs-rich-text-text field--type-text-long field--label-hidden field--item'
    })
for item in items:
    txt = str(item.text)
    x = re.search(
        "Victoria is [0-9]+ with [0-9]+ new cases reported yesterday", txt)
    print(x.group())
    d = re.findall('[0-9]+', x.group())
    print(d)
    """paras = item.findAll('p')
    for para in paras:
        txt = para.text"""
Example #58
0
from requests import get
from bs4 import BeautifulSoup as soup

url = "http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1"
response = get(url)

html_soup = soup(response.text, "html.parser")
movie_container = html_soup.find_all("div",
                                     {"class": "lister-item mode-advanced"})

headers = "Movie_Name,Movie_Year,Movie_Rating,Movie_MetaScore,Movie_Votes\n"
whandle = open("Movie_List.csv", "w")
whandle.write(headers)

for movie in movie_container:
    movie_metascore = "Not Available"
    movie_name_container = movie.find("h3", {"class": "lister-item-header"})
    movie_name = movie_name_container.a.text.replace(",", " |")

    movie_year_container = movie.find(
        "span", {"class": "lister-item-year text-muted unbold"})
    movie_year = movie_year_container.text

    movie_rating_container = movie.find(
        "div", {"class": "inline-block ratings-imdb-rating"})
    movie_rating = (movie_rating_container["data-value"])

    movie_metascore_container = movie.find(
        "div", {"class": "inline-block ratings-metascore"})
    if movie_metascore_container != None:
        movie_metascore = (movie_metascore_container.span.text.strip())
Example #59
0
async def get_img_urls(link: str) -> AsyncIterator[str]:
    plain = await get_plain_text(link)
    s = soup(plain, "html.parser")
    for link in s.find_all("img", class_="imagecontent"):
        yield link.get("src")
Example #60
0
# Scrape with BeautifulSoup - basic parsing example
# -----------------------------------------------------------------------------
# If you already have the HTML data from a website and just want to extract
# from it, BeautifulSoup is a good choice. HTML parsing is a headache because
# much of the HTML on public web pages is technically invalid: unclosed tags,
# incorrect nesting, and other complications.

# $ pip3 install beautifulsoup4

import requests
from bs4 import BeautifulSoup as soup

r = requests.get('http://pythonhow.com/example.html')
c = r.content  # the entire source code
s = soup(c, 'html.parser')  # feed the content to the bs4 parser
a = s.find('div', {'class': 'cities'})  # returns the first div
a = s.find_all('div', {'class': 'cities'})  # returns a list of all divs

# print(s.prettify)

print(type(r))  # <class 'requests.models.Response'>
print(type(c))  # <class 'bytes'>
print(type(s))  # <class 'bs4.BeautifulSoup'>
print(type(a))  # <class 'bs4.element.ResultSet'>
print(type(a[2]))  # <class 'bs4.element.Tag'>
print(a)  # prints all the divs and their contents
print(a[2])  # prints the 3rd div and its contents
print(a[0].find_all('h2'))  # list of all the h2 tags from the 1st div
print(a[0].find_all('h2')[0])  # just the 1st h2
print(a[0].find_all('h2')[0].text)  # just the text inside the h2 tag
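
When the built-in parser struggles with badly broken markup, a more forgiving parser can be swapped in; a hedged variation assuming html5lib is installed (pip3 install html5lib), as in the DHHS example earlier:

s_lenient = soup(c, 'html5lib')  # slower, but repairs invalid markup the way a browser would
print(type(s_lenient))           # <class 'bs4.BeautifulSoup'>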