import io
import urllib.request

import PyPDF2
from bs4 import BeautifulSoup

def processPDFLink(df):
    # testing
    # url = "http://www.silicontao.com/ProgrammingGuide/other/beejnet.pdf"
    writer = PyPDF2.PdfFileWriter()  # was undefined below; pages would need to be added via writer.addPage(...)
    for datarow in df:
        if datarow['whitepaper'][-3:] == 'pdf':
            URL = datarow['whitepaper']
            # was urllib.request(URL, ...) / urllib.urlopen(req): neither is a valid call
            req = urllib.request.Request(URL, headers={'User-Agent': "Magic Browser"})
            remote_file = urllib.request.urlopen(req).read()
            memory_file = io.BytesIO(remote_file)
            read_pdf = PyPDF2.PdfFileReader(memory_file)
            number_of_pages = read_pdf.getNumPages()
            for i in range(number_of_pages):
                pageObj = read_pdf.getPage(i)
                page = pageObj.extractText()
                #print(page)
        elif datarow['whitepaper']:
            URL = datarow['whitepaper']
            htmlString = urllib.request.urlopen(URL).read()
            html = BeautifulSoup(htmlString, 'html.parser')
            texts = html.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
            page = " ".join(t.strip() for t in visible_texts)
        else:
            print("{0} has no whitepaper!".format(datarow['name']))
    outputStream = open("output.pdf", "wb")
    writer.write(outputStream)
    outputStream.close()
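# processPDFLink filters visible text through a tag_visible helper that is not
# defined in this snippet. A minimal sketch of the usual BeautifulSoup
# visibility filter it presumably refers to (an assumption, not the original):
from bs4.element import Comment

def tag_visible(element):
    # drop text nodes that live inside non-rendered tags or HTML comments
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    if isinstance(element, Comment):
        return False
    return True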
def check_registration_time(self):
    html = self.session.post(
        'https://mystudentrecord.ucmerced.edu/pls/PROD/bwskrsta.P_RegsStatusDisp',
        data={'term_in': self.term}).text
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={'class': 'datadisplaytable'})
    times = table.findAll('td')
    date = times[0].text
    time = times[1].text
    # the page gives a bare local time; pin it to US Pacific before parsing
    tzinfos = {"PST": gettz("America/Los_Angeles")}
    return parse("{} {} {}".format(date, time, "PST"), tzinfos=tzinfos)
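# A sketch of the surrounding class this method presumably hangs off of;
# the session/term wiring below is an assumption, not part of the snippet.
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse
from dateutil.tz import gettz

class StudentRecord:
    def __init__(self, term):
        self.session = requests.Session()  # would need to be authenticated first
        self.term = term                   # e.g. "201910" (hypothetical term code)

    check_registration_time = check_registration_time  # bind the function above as a method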
# (fragment: the tail of a text-cleaning helper precedes the scrape loop)
        rs = rs.strip("\r")
        rs = rs.strip("\n")
        return rs
    else:
        return ''

pagine = range(1, 97)
for count in pagine:
    try:
        searchURL = "http://www.comuni-italiani.it/cap/%02d.html" % count
        print searchURL
        html = scraperwiki.scrape(searchURL)
        html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        tables = html.findAll('table')
    except:
        print "error for WKI_ID:" + str(count)  # was str(id): id is the builtin, not this page
    rows = tables[5].findAll('tr')
    li = {}
    for row in rows:
        try:
            cells = row.findChildren('td')
            if len(cells) <= 4:
                li['CAP'] = cells[0].text
                li['COMUNE'] = cells[1].text
                #print cells
                #print cells[1]
# for other purposes.
import scraperwiki
import urllib2
import urllib
import lxml.html
import BeautifulSoup as bs
import re
import xml.etree.ElementTree as ET

for i in range(2380, 5000):
    url = 'http://www.dlapiper.com/us/people/detail.aspx?attorney=' + str(i)
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
        lawer_name = html.findAll('h1')
        lawer_post = html.findAll('h2')
        root = lxml.html.fromstring(page)
        links = root.cssselect("div.bio a")
        contact = root.cssselect("div.bio td")
        data = {
            'name': lawer_name[4].getText(),
            'post': lawer_post[3].getText(),
            'email': links[0].text,
            'contact': contact[0].text_content(),
            'id': i,
        }
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
    except:
        pass
for content in mainContent:
    if content != "http://www.dlapiper.com/us/people/#":
        totalLinks.append(content)

i = 0
for url in totalLinks:
    if i <= 481:  # presumably resumes a previous run past the first 482 links
        i = i + 1
        continue
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
        lawer_name = html.findAll('h1')
        lawer_post = html.findAll('h2')
        root = lxml.html.fromstring(page)
        links = root.cssselect("div.bio a")
        contact = root.cssselect("div.bio td")
        data = {
            'name': lawer_name[4].getText(),
            'post': lawer_post[3].getText(),
            'email': links[0].text,
            'contact': contact[0].text_content(),
            'id': i,
        }
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
        i = i + 1
    except:
        pass

# PLEASE READ THIS BEFORE EDITING
def extract_data():
    #soup = BeautifulSoup(f)
    #matches = soup.find(id = "zapasVypis")
    #matches = matches.findAll('tr')
    output_structure = []
    counter = 0
    seasons = range(2015, 2016)
    URL = "http://www.hc-kometa.cz/zapas.asp?stats=true&sezona=%s&kde="
    for season in seasons:
        html = BeautifulSoup(get_webpage(URL % season).decode('windows-1250').encode('utf-8'))
        matches = html.findAll('tr')
        # match rows come in pairs: the match line followed by its detail line
        for i in range(1, len(list(matches)), 2):
            temp_hash = {}
            detailed_structure = parse_detailed_info(matches[i + 1])
            match_structure = parse_match_info(matches[i])
            output_structure.append([detailed_structure, match_structure])
            #if i == 11:
            #    break

    root = etree.Element('kometa')
    season_element = etree.Element('season')
    season_element.attrib['year'] = str(season)
    for data in output_structure:
        match = etree.Element('match')
        match_round = etree.Element('round')
        match_round.text = strip_accents(data[1]['round'])
        home_element = etree.Element('home')
        home_element.text = str(data[1]['is_it_home'])

        participants_element = etree.Element('participants')
        parti_home = etree.Element('home')
        parti_home.text = strip_accents(data[1]['home_team'])
        parti_visi = etree.Element('visitors')
        parti_visi.text = strip_accents(data[1]['visitor_team'])
        participants_element.append(parti_home)
        participants_element.append(parti_visi)

        # placeholder result: the original hard-codes 1/2 and swaps the tag names
        result_element = etree.Element('result')
        parti_home = etree.Element('visitors')
        parti_home.text = str(1)
        parti_visitor = etree.Element('home')
        parti_visitor.text = str(2)
        result_element.append(parti_home)
        result_element.append(parti_visitor)
        part_result = etree.Element('endOfPeriods')

        date = etree.Element('date')
        date.text = strip_accents(data[1]['date'])

        referees = etree.Element('referees')
        main = etree.Element('mainReferee')
        main.text = strip_accents(data[0]['main_referee'])
        line = etree.Element('assistantReferees')
        temp_line = etree.Element('assistantReferee')
        temp_line.text = strip_accents(data[0]['line_referee'][0])
        line.append(temp_line)
        temp_line1 = etree.Element('assistantReferee')
        try:
            temp_line1.text = strip_accents(data[0]['line_referee'][1])
        except IndexError:  # was a bare except; only a missing second referee is expected
            temp_line1.text = '0'
        line.append(temp_line1)
        video = etree.Element('videoReferee')
        video.text = strip_accents(data[0]['video_referee'])
        referees.append(main)
        referees.append(line)
        referees.append(video)

        penalty = etree.Element('penalty')
        penalty_home = etree.Element('home')
        penalty_visitor = etree.Element('visitor')
        penalty_home.text = data[0]['disqualification'][0]
        try:
            penalty_visitor.text = strip_accents(data[0]['disqualification'][1])
        except IndexError:
            penalty_visitor.text = ''
        penalty.append(penalty_home)
        penalty.append(penalty_visitor)

        powerplay = etree.Element('powerplay')
        powerplay_home = etree.Element('home')
        powerplay_visitor = etree.Element('visitor')
        powerplay_home.text = data[0]['success_rate'][0]
        # presumably meant index 1; the original wrote the home value twice
        powerplay_visitor.text = data[0]['success_rate'][1]
        powerplay.append(powerplay_home)
        powerplay.append(powerplay_visitor)

        shorthanded = etree.Element('shorthandedGoal')
        shorthanded_home = etree.Element('home')
        shorthanded_visitor = etree.Element('visitor')
        try:
            shorthanded_home.text = data[0]['shorthanded'][0]
            shorthanded_visitor.text = data[0]['shorthanded'][1]
        except IndexError:
            shorthanded_visitor.text = str(0)
        shorthanded.append(shorthanded_home)
        shorthanded.append(shorthanded_visitor)

        shots_on_goal = etree.Element('shotsOnGoal')
        shots_on_goal_home = etree.Element('home')
        shots_on_goal_visitor = etree.Element('visitor')
        shots_on_goal_home.text = data[0]['shots_on_net'][0]
        shots_on_goal_visitor.text = data[0]['shots_on_net'][1]
        shots_on_goal.append(shots_on_goal_home)
        shots_on_goal.append(shots_on_goal_visitor)

        audience = etree.Element('audience')
        audience.text = strip_accents(data[0]['visitors'])

        players = etree.Element('players')
        players_data = data[1]['players_stats']
        for player in players_data:
            player_element = etree.Element('player')
            player_element.attrib['id'] = str(player['player_id'])
            name = etree.Element('name')
            name.text = strip_accents(str(player['name']))
            goals = etree.Element('goals')
            goals.text = str(player['goals'])
            assistances = etree.Element('assistances')
            assistances.text = str(player['asistances'])
            tm = etree.Element('tm')
            tm.text = str(player['tm'])
            radegast = etree.Element('radegastIndex')
            radegast.text = str(player['bilance'])
            player_element.append(name)
            player_element.append(goals)
            player_element.append(assistances)
            player_element.append(tm)
            player_element.append(radegast)
            players.append(player_element)

        match.append(match_round)
        match.append(home_element)
        match.append(participants_element)
        match.append(result_element)
        match.append(part_result)
        match.append(date)
        match.append(referees)
        match.append(penalty)
        match.append(powerplay)
        match.append(shorthanded)
        match.append(shots_on_goal)
        match.append(audience)
        match.append(players)
        season_element.append(match)

    root.append(season_element)
    string = etree.tostring(root, pretty_print=True)
    f = open(XML_LOCATION, 'w')
    f.write(string)
    f.close()
    print string
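# Hypothetical entry point; XML_LOCATION and the get_webpage / strip_accents /
# parse_match_info / parse_detailed_info helpers are assumed to be defined
# elsewhere in the scraper.
if __name__ == '__main__':
    extract_data()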
import csv

from bs4 import BeautifulSoup
from tqdm import tqdm


def scapper_tsv(n_links, path_html, path_tsv):
    # loading bar
    with tqdm(total=n_links) as pbar:
        for article in range(n_links):  # range of files
            try:
                html = BeautifulSoup(
                    open(path_html + "/article_{}.html".format(article)),
                    'html.parser')
            except Exception as e:  # the article doesn't exist
                print(article, e)
                continue
            title = html.select("h1")[0].text
            # initialize tmp as intro
            tmp = 'intro'
            sections = {'intro': '', 'plot': ''}
            # walk the paragraphs section by section and keep only intro and plot
            for section in html.select('div.mw-parser-output > *'):  # first-level nodes only
                if section.name == 'p' and tmp == 'intro':
                    sections['intro'] += section.text.strip()
                # change tmp on section headings
                if section.name in ['h2', 'h3']:
                    tmp = section.span['id']
                # keep only the sections we are interested in,
                # checking the different names used for plot sections
                if section.name == 'p' and tmp in ['Plot', 'Plot_summary', 'Premise']:
                    sections['plot'] += section.text.strip()
            # skip pages without a Plot section
            if sections['plot'] == '':
                print(article, 'No Plot')
                continue
            # dictionary for the infobox
            d = {
                'film_name': title,
                'Directed by': 'NA',
                'Produced by': 'NA',
                'Written by': 'NA',
                'Starring': 'NA',
                'Music by': 'NA',
                'Release date': 'NA',
                'Running time': 'NA',
                'Country': 'NA',
                'Language': 'NA',
                'Budget': 'NA',
            }
            # pull the fields from the infobox
            info_box = html.findAll(['th', 'td'])
            for elem in info_box:
                info = elem.text.strip('\n')  # text of the table cell
                if info in d:
                    d[info] = info_box[info_box.index(elem) + 1].text.strip('\n')
            # order the values as a list to save to .tsv
            ld = list(d.values())
            columns = ['title', 'intro', 'plot', 'film_name', 'Directed by',
                       'Produced by', 'Written by', 'Starring', 'Music by',
                       'Release date', 'Running time', 'Country', 'Language',
                       'Budget']
            data = [title, sections['intro'], sections['plot']] + ld[0:]
            # create and save a tsv
            with open(path_tsv + '/article_{}.tsv'.format(article), 'w',
                      newline='', encoding='utf-8') as f_output:
                tsv_output = csv.writer(f_output, delimiter='\t')
                tsv_output.writerow(columns)
                tsv_output.writerow(data)
            pbar.update(1)
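# Hypothetical invocation of scapper_tsv: the count and paths below are
# illustrative, assuming the pages were saved beforehand as
# article_0.html, article_1.html, ... under path_html.
scapper_tsv(n_links=30, path_html='./html', path_tsv='./tsv')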
def filter(html):  # note: shadows the builtin filter()
    imgs = html.findAll("img")
    if imgs:
        return imgs
    else:
        sys.exit("[~] No images detected on the page.")
def getHyperlinks(html):
    # (text, href) pairs for every anchor on the page
    return [(a.text, a['href']) for a in html.findAll("a")]
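# A usage sketch for the two helpers above; the URL is illustrative, and
# requests stands in for whatever fetch layer the original script used.
import sys

import requests
from bs4 import BeautifulSoup

page = requests.get("https://example.com")
soup = BeautifulSoup(page.text, "html.parser")

for text, href in getHyperlinks(soup):
    print(text, "->", href)

imgs = filter(soup)  # the filter() defined above; exits if no <img> tags exist
print(len(imgs), "images found")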
scraperwiki.sqlite.attach("elencocapitalia")
data = scraperwiki.sqlite.select('''distinct CAP from ElencoCapItalia.swdata''')

for d in data:
    searchURL = "http://www.youinweb.it/profiles_it/" + d["CAP"]
    try:
        html = scraperwiki.scrape(searchURL)
        time.sleep(1)  # don't overload
        html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        tables = html.findAll('table')
    except:
        sys.exc_clear()
    rows = tables[0].findAll('tr')
    li = {}
    for row in rows:
        try:
            cells = row.findChildren('td')
            #print cells[1].text
            URL = searchURL + get_href(cells[1])
            #print URL
            li['CAP'] = d["CAP"]
            li['URL'] = URL