def parse_itunes_transactions(mail_body):
    # `account`, `csv_text_encoding` and `get_itunes_values` are expected to be
    # defined at module level in the original source.
    soup = BeautifulSoup(mail_body)
    receipt = soup.findChildren('table')[2]
    date = list(receipt.findChildren('td')[2].stripped_strings)[3]
    items = []
    number_of_categories = 5
    i = 0
    transaction_table = soup.findChildren('table')[4]
    transactions = transaction_table.findChildren(['tr'])
    for transaction in transactions:
        cells = transaction.findChildren('td')
        row_values = [account]
        for cell in cells:
            values = list(cell.stripped_strings)
            if len(values) > 0:
                text = values[0].encode(csv_text_encoding)
                if i > 0:
                    row_values.append(get_itunes_values(len(row_values), text))
                else:
                    row_values.append(text)
        if len(row_values) == number_of_categories:
            if i > 0:
                row_values.append(date)
            items.append(row_values)
        i += 1
    return items
def extract_details(painting_table: BeautifulSoup) -> dict:
    """Extracts necessary metadata for an image from a table found on the
    image's information page.

    Parameters
    ----------
    painting_table : BeautifulSoup Tag, required
        Table found on the info page of an image. Holds important metadata
        for the image including title, artist, medium, date, etc.

    Returns
    -------
    dict
        Dictionary containing all of the metadata for the image
    """
    details = {}
    # Each 'dd' tag holds the actual piece of metadata
    painting_info = painting_table.findChildren("dd")
    # Each 'dt' tag holds the category name tied to the 'dd' tag
    info_heading = painting_table.findChildren("dt")
    # For each piece of metadata, tie the info to the category in the dict
    for child in range(len(painting_info)):
        details[info_heading[child].text.strip()] = painting_info[child].text.strip()
    return details
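# Minimal usage sketch for extract_details(); the HTML below is a made-up
# example, not taken from the original site.
if __name__ == "__main__":
    sample_html = "<dl><dt>Artist</dt><dd>Jane Doe</dd><dt>Medium</dt><dd>Oil on canvas</dd></dl>"
    sample_table = BeautifulSoup(sample_html, "html.parser")
    print(extract_details(sample_table))
    # -> {'Artist': 'Jane Doe', 'Medium': 'Oil on canvas'}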
def drugMentions():
    # Python 2 style I/O (str.decode/encode); paths are relative to this module.
    dir = os.path.dirname(__file__)
    finalList = []
    textList = []
    input = open(os.path.join(dir, "output/output.txt"), "r")
    htmlParse = input.read().decode("utf-8")
    soup = BeautifulSoup(htmlParse)
    tables = soup.findChildren(["table"])
    tableIDs = [n["value"] for n in soup.findChildren("input")]
    del tableIDs[0]
    for c in range(0, len(tables)):
        dictList = tables[c].findChildren(["td"])
        for t in range(0, len(dictList)):
            textList.append(re.sub(' +', ' ',
                dictList[t].getText().strip("\t\n\r").replace("\n", "").strip().upper().encode("utf-8")))
        finalList.append([tableIDs[c].encode("utf-8"), textList])
        textList = []
    file = open(os.path.join(dir, "output/finalList.txt"), "w")
    file.write(str(finalList))
    for c in range(0, len(finalList)):
        file = open(os.path.join(dir, finalList[c][0].replace("output/TABLE-", "")), "w")
        file.write(str(finalList[c][1]))
def _crawl_level_2(self, link): """ :rtype: dict :type link: str """ response = requests.post(link) soup = BeautifulSoup(response.content, "html.parser") soup.findChildren() comic_name = soup.find(id="breadcrumbs") main_section = soup.find(id="wrapper") \ .find("section", class_="main-content") \ .div \ .find("div", class_="col-md-8") \ .section thumbnail = main_section.find("div", class_="thumbnail") \ .img description = main_section.find("div", class_="detail") \ .find("div", class_="content") generic_information = main_section.find("div", class_="description") chapters = soup.find(id="list-chapters") information = self._get_generic_information(generic_information) result = self._get_chapters(chapters) chapters_link_list = result.get("chapters_link") chapters_name_list = result.get("chapters_name") chapter_page_list = list() for chapter_link in chapters_link_list: chapter_page_list.append( CrawlLevel1().crawl("https://blogtruyen.com" + chapter_link)) pass # print(information) # print(chapters_link_list) # print(chapters_name_list) # print(self.get_comic_name(comic_name)) # print(self.get_thumbnail(thumbnail)) # print(self.get_description(description)) # print(chapter_page_list) # print(chapters_dict) # print("~~~~~~~~~~~~~~~~~~~~~~~~~~") # comic_name: #breadcrumbs > span:nth-child(2) # thumbnail: #wrapper > section.main-content > div > div.col-md-8 > section > div.thumbnail > img # description: #wrapper > section.main-content > div > div.col-md-8 > section > div.detail > div.content # generic_information: #wrapper > section.main-content > div > div.col-md-8 > section > div.description # chapters: #list-chapters return dict(comicname=self._get_comic_name(comic_name), thumbnail=self._get_thumbnail(thumbnail), description=self._get_description(description), authors=information.get("tacgia"), translators=information.get("nhomdich"), genres=information.get("theloai"), finishstatus=information.get("trangthai")[0], chaptersname=chapters_name_list, chapterspage=chapter_page_list) pass
def parseHTMLtoJSON(htmlText):
    soup = BeautifulSoup(htmlText, "html.parser")
    global caseData
    global headerData
    caseData = []
    headerData = {}
    allTables = soup.findChildren("table")
    storeHeader(allTables[0])
    bodyTables = soup.findChildren("table", {"class": "style3"})
    storeBody(bodyTables)
def scrape_stats(): #urls run from 0 to 402 url_count = 0 #src_url = 'http://crimereporting.ncdoj.gov/public/2013/LEPersonnel/LEPerPopRatAgyTrd/leperpopratagytrd/' src_url = 'http://crimereporting.ncsbi.gov/public/2014/LEOKillAsslt/LEOAssltWeaAgyTrd/leoassltweaagytrd/' src_url_end = '.htm' fieldnames = ['agency_id','agency_name','Year', 'Firearm', 'Knife or Other Cutting Instrument','Other Dangerous Weapon','Hands, Fists, Feet, etc.',' Total Officer Assaults'] writer = csv.DictWriter(open('lea_assaults.csv', 'wb'),fieldnames=fieldnames) lea_row = {'agency_id':'','agency_name':'','Year':'', 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0} writer.writeheader() print 'ALERT: New log created...' while url_count < 402: expect_year = 2005 print src_url + str(url_count) + src_url_end html_file = urllib2.urlopen(src_url + str(url_count) + src_url_end).read() soup = BeautifulSoup(html_file, 'html.parser') for lea_detail in soup.findChildren('table')[11].findChildren('td'): lea_name = lea_detail.string data_table = soup.findChildren('table')[12] rows = data_table.findChildren('tr') header = 1 header_row = [] for row in rows: cell_count = 0 lea_row = {'agency_id':url_count,'agency_name':lea_name,'Year':'', 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0} cells = row.findChildren('td') current_year = cells[0].string if header == 1: for cell in cells: header_row.append(cell.string) header = 0 else: while int(current_year) != expect_year: lea_row = {'agency_id':url_count,'agency_name':lea_name,'Year':expect_year, 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0} writer.writerow(lea_row) expect_year += 1 for cell in cells: try: value = int(cell.string) lea_row[header_row[cell_count]] = value except ValueError: pass cell_count += 1 writer.writerow(lea_row) expect_year += 1 while int(expect_year) <= 2014: lea_row = {'agency_id':url_count,'agency_name':lea_name,'Year':expect_year, 'Firearm':0, 'Knife or Other Cutting Instrument':0,'Other Dangerous Weapon':0,'Hands, Fists, Feet, etc.':0,' Total Officer Assaults':0} writer.writerow(lea_row) expect_year += 1 url_count += 1
def parse_response(self, response): """ Parses Apache serve-status response. """ VHOST_List = [] REQUEST_URI_List = [] FULL_URL_List = [] CLIENT_IP_ADDRESS_List = [] # URL-related. soup = BeautifulSoup(response, 'lxml') try: table_index_id = 0 VHOST_index_id = -2 REQUEST_URI_index_id = -1 CLIENT_IP_ADDRESS_index_id = -3 for _ in range(len(soup.findChildren('table')[table_index_id].findChildren('tr'))): if _ != 0: try: VHOST = soup.findChildren('table')[table_index_id].findChildren('tr')[_].findChildren('td')[VHOST_index_id].getText() except Exception as e: Exception_Handler(e) VHOST = '' try: REQUEST_URI = soup.findChildren('table')[table_index_id].findChildren('tr')[_].findChildren('td')[REQUEST_URI_index_id].getText().split(' ')[1] except Exception as e: Exception_Handler(e) REQUEST_URI = '' try: if (VHOST == REQUEST_URI == ''): FULL_URL = '' else: FULL_URL = 'http://' + str(VHOST) + str(REQUEST_URI) except Exception as e: Exception_Handler(e) FULL_URL = '' VHOST_List.append(VHOST) REQUEST_URI_List.append(REQUEST_URI) FULL_URL_List.append(FULL_URL) # Client-related. try: CLIENT_IP_ADDRESS = soup.findChildren('table')[table_index_id].findChildren('tr')[_].findChildren('td')[CLIENT_IP_ADDRESS_index_id].getText() except: CLIENT_IP_ADDRESS = '' CLIENT_IP_ADDRESS_List.append(CLIENT_IP_ADDRESS) except Exception as e: Exception_Handler(e) pass output = {"VHOST": VHOST_List, "REQUEST_URI": REQUEST_URI_List, "FULL_URL": FULL_URL_List, "CLIENT_IP_ADDRESS": CLIENT_IP_ADDRESS_List} return(output)
def scrape_page(self, page_request: Future) -> Generator[Vod, None, None]: page_content = page_request.result().content page_strainer = SoupStrainer("table") page_soup = BeautifulSoup(page_content, "lxml", parse_only=page_strainer) vod_requests = [ self.request(tr.findChild("a")["href"]) for tr in page_soup.findChildren("tr") ] for table in page_soup.findChildren(recursive=False): date = table.caption.span.getText() for i, row in enumerate(table.tbody.findChildren(recursive=False)): cells = row.findChildren(recursive=False) try: vod_id = re.search(r".*\/(.*)", cells[1].a["href"]).group(1) try: best_of = int( re.search(r"Bo([\d]*)", cells[3].getText()).group(1)) except AttributeError: continue players = [] player = Vod.Player("Unknown", []) for tag in cells[1].a.span.findChildren(recursive=False): if tag.name == "b": if len(player.characters) != 0: players.append(player) player = Vod.Player("Unknown", []) player.alias = tag.getText() elif tag.name == "img": player.characters.append( guess_character(tag["src"][24:-4])) players.append(player) video_ids, casters = self.scrape_vod_page( vod_id, vod_requests[i]) tournament = re.search(r"[^\s].*[^\s]", cells[0].getText()).group() _round = re.search(r"[^\s].*[^\s]", cells[4].getText()).group() yield Vod(vod_id, video_ids, date, tournament, players, casters, _round, best_of) except InvalidVideoError as e: if self.verbose: print(e, file=sys.stderr)
def import_from_file(path_to_html):
    if not os.path.isfile(path_to_html):
        sys.stdout.flush()
        err = f"Could not open doc file '{path_to_html}': No such file or directory."
        raise FileNotFoundError(err)
    with open(path_to_html, "r") as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, "html5lib")
    headers = soup.findChildren("h1")
    tables = soup.findChildren("table")
    assert len(headers) == len(tables)
    reqs_array_array = []
    for reqs_table in soup.findChildren("table"):
        reqs = ConfluenceHTMLTableImport.parse_table(reqs_table)
        reqs_array_array.append(reqs)
    document = Document(None, "Imported Doc", None, [], [])
    for section_idx, reqs in enumerate(reqs_array_array):
        section_name = headers[section_idx].text
        section = Section(document, 1, section_name, [], [])
        document.section_contents.append(section)
        for req in reqs:
            uid = req["UID"]
            title = req["TITLE"]
            statement = req["STATEMENT"]
            rationale = req["RATIONALE"]
            comment = req["COMMENT"]
            sreq = Requirement(
                section, None, statement, uid, None, None, None, title,
                None, None, rationale,
                [RequirementComment(None, None, comment)] if comment else [],
                None,
            )
            sreq.ng_level = 2
            section.section_contents.append(sreq)
    return document
def execution(list1, id1): for domain_name in list1: try: driver = webdriver.Firefox() driver.get("http://www.dmoz.com") name = '' if 'http://' in domain_name: name = domain_name[7:] else: name = domain_name elem = driver.find_element_by_name("q") elem.send_keys(domain_name) elem.send_keys(Keys.RETURN) soup = BeautifulSoup(driver.page_source) c = soup.findChildren("ol", {"class": "dir"}) str1 = "" for item in c: for link in item.find_all('a'): str1 = str1 + link.get('href') + '\n' d = soup.findChildren("ol", {"class": "site"}) flag = 0 for i in d: for link in i.find_all('a'): x = link.get('href') if domain_name in x or x in domain_name: flag = 1 path1 = 'D://Thesis//data//domain_name//sources_in_dmoz//' if len(str1) > 0: f = open(path1 + name + '.txt', 'w') f.write(str1) f.close() print 'completed', domain_name, id1 break if flag == 1: break if flag == 0: path2 = 'D://Thesis//data//domain_name//source_not_in _dmoz//unavailable_sources.txt' f1 = open(path2, 'a+') f1.write(domain_name + '\n') f1.close() print 'not found in dmoz', domain_name, id1 #print 'completed ',domain_name,id1 driver.close() except: path3 = 'D://Thesis//data//domain_name//source_not_in _dmoz//invalid1.txt' f2 = open(path3, 'a+') f2.write(domain_name + '\n') f2.close() print 'url not valid', domain_name, id1
def write(f):
    data = f.read()
    soup = BeautifulSoup(data, "lxml")
    worksheet = workbook.add_worksheet()
    title = soup.find("h3", {"class": "sectionTitle"}).contents[0]
    if title != None:
        worksheet.write(0, 0, title)
    table = soup.findChildren("table")
    rows = soup.findChildren(["th", "td"])
    b = 0
    try:
        i = 0
        while i < len(rows):
            # for i in xrange(len(rows)):
            # print rows[i].get("class")
            if rows[i].get("class")[0] == "groupHead":
                # print "Heading: " + rows[i].text
                worksheet.write(b + 1, 0, rows[i].text)
                b += 1
                i += 1
            elif rows[i].get("class")[0] == "specsKey":
                worksheet.write(b + 1, 0, rows[i].text)
                worksheet.write(b + 1, 1, rows[i + 1].text)
                # print rows[i].text + ":" + rows[i+1].text
                b += 1
                i += 2
            elif rows[i].get("class")[0] == "specsValue":
                # print "Value:" + rows[i].text
                worksheet.write(b + 1, 0, rows[i].text)
                b += 1
                i += 1
            else:
                print "Un" + rows[i].text
                i += 1  # advance past unrecognized rows
    except Exception as e:
        print "Filename: " + str(i)  # `i` is an int, convert before concatenating
        print e
    workbook.close()
def scrape_stats(): #urls run from 0 to 605 url_count = 0 src_url = 'http://crimereporting.ncsbi.gov/public/2014/LEPersonnel/LEPerPopRatAgyTrd/leperpopratagytrd/' src_url_end = '.htm' fieldnames = ['agency_id','Agency Name','Year','Reporting Status','Fulltime Male Sworn','Fulltime Female Sworn','Fulltime Male Civilian','Fulltime Female Civilian','Total Employees','Population Coverage','Sworn Rate per 1,000 Population'] writer = csv.DictWriter(open('lea_personnel.csv', 'wb'),fieldnames=fieldnames) lea_row = {'agency_id':'','Agency Name':'','Year':'','Reporting Status':'','Fulltime Male Sworn':0,'Fulltime Female Sworn':0,'Fulltime Male Civilian':0,'Fulltime Female Civilian':0,'Total Employees':0,'Population Coverage':0,'Sworn Rate per 1,000 Population':0} writer.writeheader() print 'ALERT: New log created...' while url_count < 605: expect_year = 2005 print src_url + str(url_count) + src_url_end html_file = urllib2.urlopen(src_url + str(url_count) + src_url_end).read() soup = BeautifulSoup(html_file, 'html.parser') for lea_detail in soup.findChildren('table')[11].findChildren('td'): lea_name = lea_detail.string data_table = soup.findChildren('table')[12] rows = data_table.findChildren('tr') header = 1 header_row = [] for row in rows: cell_count = 0 cells = row.findChildren('td') if header == 1: for cell in cells: header_row.append(cell.string) header = 0 else: while int(cells[1].string) != expect_year: lea_row = {'agency_id':url_count,'Agency Name':cells[0].string,'Year':expect_year,'Reporting Status':'','Fulltime Male Sworn':'','Fulltime Female Sworn':'','Fulltime Male Civilian':'','Fulltime Female Civilian':'','Total Employees':'','Population Coverage':'','Sworn Rate per 1,000 Population':''} writer.writerow(lea_row) expect_year += 1 lea_row = {'agency_id':url_count,'Agency Name':cells[0].string,'Year':'','Reporting Status':'','Fulltime Male Sworn':0,'Fulltime Female Sworn':0,'Fulltime Male Civilian':0,'Fulltime Female Civilian':0,'Total Employees':0,'Population Coverage':0,'Sworn Rate per 1,000 Population':0} for cell in cells: try: value = int(cell.string.replace(',','')) lea_row[header_row[cell_count].lstrip()] = value except ValueError: if cell.string == 'Does Not Participate' or cell.string == 'Reporting': lea_row[header_row[cell_count].lstrip()] = cell.string cell_count += 1 writer.writerow(lea_row) expect_year += 1 url_count += 1
def extract_code(url, file_name): print 'file : ', url global file_count # if(file_count>=50): # sys.exit page = urllib2.urlopen(url) soup = BeautifulSoup(page, 'xml') file = open('codes/' + file_name, 'w') file_count += 1 # repcontent=(soup.find(class_="repository-content")).find_all('table') # for row in soup.find(class_="repository-content").find_all("tr"): # print(row,'\n') # date=soup.find_all(datetime=True) try: urllib2.urlopen(url) driver.get(url) sleep(6) date = soup.findChildren(['relative-time']) # print("Date is ",date) if (date != []): datetime = date[0]['datetime'] # print "Date : ",datetime else: datetime = 'Date Not Available' tables = soup.findChildren('table') # print(tables) my_table = tables[0] rows = my_table.findChildren(['th', 'tr']) line = 0 for row in rows: # print(line) line += 1 for string in row.stripped_strings: string = (unicode(string)).encode('utf-8') # print(string) file.write(string) file.write(' ') # spans=cells[1].findChildren('span') # for span in spans: # print(span.string) file.write('\n') label_file.write('codes/' + str(file_count - 1) + '.txt ' + url + '\t' + (unicode(datetime)).encode('utf-8') + '\n') file.close() return datetime except Exception as inst: print("Error : ", inst) file_count -= 1 return -1
def get_points_bw(self, m, p, s, rs):
    try:
        # BROWSE
        logging.info('Loading Browser')
        b = new_browser()
        b.open(s.page)
        b.form = list(b.forms())[0]
        b[s.form_user] = m.username
        b[s.form_pass] = p  # m.password
        b.submit()
        html = b.open('/mt/www.bestwestern.com/rewards/').read()
        b.close()
        # TRAVERSE
        logging.info('Traversing')
        soup = BeautifulSoup(html).find(id=s.match)
        rs['content'] = soup.findChildren()[1]
        rs['points'] = rs['content'].div('div')[1].contents[1][2:]
        rs['success'] = True
    except Exception, e:
        logging.info(e)
        rs['success'] = False
def get_blurb(first, last, sport, player_url=None):
    # for some weird reason it's actually better to omit the first name in the search form
    response = get(player_url if player_url
                   else blurb_search_url.format(first="", last=last, sport=sport))
    soup = BeautifulSoup(response.text, 'html.parser')
    # did we land on a result page?
    if not soup.findChild('div', class_='RW_pn'):
        name_map = {}
        results_table = soup.find('table', attrs={'id': 'cp1_tblSearchResults'})
        # filter results, omitting duplicate "position" links that don't include the player's name
        filtered_results = results_table.findChildren(
            lambda tag: tag.name == 'a' and 'player' in tag['href'] and len(tag.text) > 3)
        if not filtered_results:
            raise NoResultsError("No results for %s %s" % (first, last))
        else:
            for result in filtered_results:
                name = " ".join(result.text.split())
                name_map[result] = SequenceMatcher(None, first + " " + last, name).ratio()
            # sort names by similarity to the search criteria
            sorted_names = sorted(name_map, key=name_map.get, reverse=True)
            return get_blurb(first, last, sport,
                             player_url='http://www.rotoworld.com' + sorted_names[0].get('href'))
    else:
        news = soup.findChildren('div', class_='playernews')
        if news:
            recent_news = news[0]
            report = recent_news.find('div', class_='report')
            impact = recent_news.find('div', class_='impact')
            blurb = report.text + '\n\n' + impact.text
            return blurb
        else:
            raise NoResultsError("No recent player news for %s %s" % (first, last))
def parse_day(canteen, url):
    content = urlopen(url).read()
    data = BeautifulSoup(content.decode('utf-8'), 'xml')
    for group in data.findChildren('group'):
        date = group['productiondate']
        category = group.findChild('name').getText()
        prices = parse_prices(group.findChild('prices').findChildren('price'))
        components = group.findChild('components').findChildren('component')
        components = [c.findChild("name1").getText() for c in components]
        tags = group.findChild('taggings').findChildren('tagging')
        tags = [t.getText() for t in tags if not t.is_empty_element]
        if '1' == group['type']:
            # meal consisting of multiple parts, use first component as name
            if len(components) < 1:
                print("meal without component: {}".format(group))
                continue
            notes = components[1:] + tags
            canteen.addMeal(date, category, components[0], notes, prices)
        elif '2' == group['type']:
            # multiple components to choose from
            for component in components:
                canteen.addMeal(date, category, component, tags, prices)
        else:
            print('unknown meal type: {}'.format(group['type']))
def query_profile(url, first_pass=False): html = urlopen(url).read() soup = BeautifulSoup(html) if html.find("Sorry, no content found for this URL") > 0: return 404, "", "", "", "", "" else: tables = soup.findChildren("table") pub_table = tables[1] stats_table = tables[0] if html.find("There are no articles in this profile.") > 0: return None elif first_pass == True: scholar = soup.find("div", {"id": "gsc_prf_i"}) name = str(scholar.find("div", {"id": "gsc_prf_in"}).get_text()) # try: institution = str(scholar.find("div", {"class": "gsc_prf_il"}).get_text()) # except: # institution = str(scholar.find('div', {'class':'gsc_prf_il'}).get_text()) # # interests = ",".join([i.get_text() for i in scholar.find_all(id='gsc_prf_ila')]) # interests=[] interests = scholar.find_all("div", {"class": "gsc_prf_il"})[1] # # for a in interests: # # interests.append(a.get_text()) # try: # interests=interests.get_text() # except: interests = ", ".join([i.get_text() for i in interests.find_all("a", {"class": "gsc_prf_ila"})]) email = str(scholar.find_all("div", {"class": "gsc_prf_il"})[2].get_text()) return name, institution, parse_pubs(pub_table), parse_stats(stats_table), interests, email else: return parse_pubs(pub_table)
def run_crawl(self):
    start = time.time()
    domains = []
    url = 'http://cybercrime-tracker.net/ccam.php'
    source = 'cybercrime-tracker.net'
    _info = self.get(url=url)
    if _info is None:
        self.logger.warning("request returned None " + source)
        return None
    soup = BeautifulSoup(_info, 'lxml')
    table = soup.findChildren('tbody')[2]
    rows = table.findChildren('tr', attrs={'class': 'monitoring'})
    for row in rows:
        date_str = row.findChildren('td')[1].string
        time_obj = time.strptime(date_str, "%d/%m/%Y %H:%M:%S")
        updatetime = time.strftime("%Y-%m-%d", time_obj)
        domain = row.findChildren('td')[2].string
        hashstr = row.findChildren('td')[3].string
        if self.is_ip(domain):
            continue
        block = [domain, updatetime, source]
        domains.append(block)
    stop = time.time()
    crawl_time = str(stop - start) + "秒"  # "秒" means "seconds"
    self.save_info(domains, source, crawl_time)
def getNews(): page = urllib2.urlopen("https://www.inshorts.com/en/read") soup = BeautifulSoup(page, 'lxml') news_image = soup.findChildren('div', {"class": "news-card-image"}) news_title = soup.findAll('div', {'class': 'news-card-title news-right-box'}) news_read_more = soup.findAll("div", {"class": "read-more"}) news_body = soup.findAll("div", {"itemprop": "articleBody"}) news_readmore = [] news_img = [] news_tit = [] news_con = [] for x in range(len(news_image)): news_img.append(re.findall("url\((.*)\)", news_image[x]['style'])[0]) news_tit.append(re.split("\\n\n", news_title[x].text)[1]) news_con.append(news_body[x].text) news_readmore.append(news_read_more[x].findChildren()) news_readmore = news_readmore[0:3] news_img = news_img[0:3] news_tit = news_tit[0:3] news_con = news_con[0:3] newshtml = "" for i in range(3): newshtml += """<div style="background-color:#FFFFFF;color:#000000;padding:15px"> <img src = %s, align = "left", width = "171", height = "128", style = "padding:15px"></img></div> <div style = "display:inline"><h4>%s</h4> <p style = "font-size:12px;padding-left:15px">%s</p> <p style = "font-size:8px;color:#A9A9A9;padding-left:15px">read more at%s</p></div><br>""" % ( news_img[i], news_tit[i], news_con[i], news_readmore[i]) return (newshtml)
def main():
    url = 'http://www.ieee.org/conferences_events/conferences/search/index.html?KEYWORDS=&CONF_SRCH_RDO=conf_date&RANGE_FROM_DATE=&RANGE_TO_DATE=&REGION=Region10-Asia+and+Pacific&COUNTRY=Bangladesh&RowsPerPage=10&PageLinkNum=10&ActivePage=1&SORTORDER=desc&SORTFIELD=start_date'
    content = urlopen(url)
    soup = BeautifulSoup(content, 'lxml')
    conference_table = soup.findChildren('table', class_='nogrid-nopad')
    rows = conference_table[0].findChildren('td', class_='pad10')
    events = []
    for row in rows:
        event = row.find_all('p')
        for info in event:
            events.append(get_text(str(info)))
    label = [
        "Event title: ",
        "Date of Submissions:",
        "Event Date:",
        "Event Location:"
    ]
    extra_decoration = 0
    print("*" * 60, "\n")
    for lab, event in zip(label * len(events), events):
        print(lab, event, end="\n")
        extra_decoration += 1
        if extra_decoration == 4:
            print("\n", "*" * 60, "\n")
            extra_decoration = 0
def scrape_leaderboard(url): request = urllib2.Request(url) page = urllib2.urlopen(request) content = page.read() soup = BeautifulSoup(content) soupTable = soup.findChildren('table',{'class':"leaderboard-table"}) headers = soupTable[0].findChildren('th') rows = soupTable[0].findChildren('tr') rows = filter(lambda r: len(r) > 5, rows) tableheaders = [i.text.replace(' ','') for i in headers] cutplace = 1000 cutcheck = True row_details = [] for i, row in enumerate(rows): tt = [j.find("a", {"class":"full-name"}).text if j.get("class")[0] == "playerName" else j.text for j in row.findAll('td')] if len(tt) > 5: if cutcheck and tt[tableheaders.index("POS")] == "-": cutplace = i + 5 cutcheck = False tt[tableheaders.index("POS")] = cutplace elif tt[tableheaders.index("POS")] == "-": tt[tableheaders.index("POS")] = cutplace if "T" in str(tt[tableheaders.index("POS")]): tt[tableheaders.index("POS")] = tt[tableheaders.index("POS")][1:] try: tt[tableheaders.index("POS")] = int(tt[tableheaders.index("POS")]) except ValueError: tt[tableheaders.index("POS")] = cutplace try: row_details.append(leaderboard(*tt)) except: continue return pd.DataFrame(row_details)
def parseNoteFirst(self, text=None, infile=None):
    """Parse NoteFirst record (xml format), return self"""
    if isinstance(text, basestring):
        pass
    elif isinstance(infile, basestring):
        f = open(infile)
        text = f.read()
        f.close()
    elif isinstance(infile, file):
        text = infile.read()
    else:
        # Do nothing
        return None
    soup = BeautifulSoup(text, "html.parser")
    self.title = soup.primarytitle.text
    doi = soup.doi.text
    self.doi = doi[doi.find("10."):]
    self.journal = soup.media.info.text
    self.year = soup.year.text
    self.volume = soup.volume.text
    self.issue = soup.issue.text
    self.pages = soup.pagescope.text
    authors = soup.findChildren("fullname")
    self.authors = [author.info.text for author in authors]
    # self.issn = ""
    return self
def table_parse(page):
    data = []
    soup = BeautifulSoup(page)
    table = soup.findChildren('table')[0]
    heads = table.findChildren(['th'])
    rows = table.findChildren(['tr'])
    rows.pop(0)  # Pop first row when it is the head
    for row in rows:
        tmp = []  # organized data record
        cells = row.findChildren('td')
        # cells.pop(0)  # Pop first cell when there is special flag/symbol/space
        for cell in cells:
            if cell.string is not None:
                value = cell.string
                value = value.replace('\\n\\t', '')
                value = value.replace('\\n', '')
                value = value.strip()
                tmp.append(value)
            else:
                value = cell.findChildren('a')[0].string
                value = value.replace('\\n\\t', '')
                value = value.replace('\\n', '')
                value = value.strip()
                tmp.append(value)
                url = cell.find('a').get('href')
                tmp.append(url)
        if tmp[3] == 'United States':
            country = 'USA'
        else:
            country = tmp[3]
        data.append([tmp[0], tmp[1]] + location_parse(tmp[4]) + [country, tmp[2]])
    return data
def get_categories(web_data): """Put items into readable nested dict, and return for building the menu""" soup = BeautifulSoup(web_data, 'html.parser') #xbmc.log("soup: {0}.".format(soup),level=xbmc.LOGERROR) ##Grab only the section list items: children = soup.findChildren("li") my_dict = {} for child in children: try: if child['class'][0] == "_depth0": key0 = child.text.lstrip(" ").encode('utf-8') my_href = child.find("a")["href"] if len(my_href) > 0: my_dict[key0] = my_href else: my_dict[key0] = {} elif child['class'][0] == "_depth1": key1 = child.text.lstrip(" ").encode('utf-8') my_href = child.find("a")["href"] if len(my_href) > 0: my_dict[key0][key1] = my_href else: my_dict[key0][key1] = {} elif child['class'][0] == "_depth2": key2 = child.text.lstrip(" ").encode('utf-8') my_href = child.find("a")["href"] if len(my_href) > 0: my_dict[key0][key1][key2] = my_href else: my_dict[key0][key1][key2] = {} elif child['class'][0] == "_depth3": key3 = child.text.lstrip(" ").encode('utf-8') my_href = child.find("a")["href"] if len(my_href) > 0: my_dict[key0][key1][key2][key3] = my_href else: pass except: pass #xbmc.log("My_dict: {0}".format(my_dict),level=xbmc.LOGERROR) return my_dict
def get_images(self, url, headers):
    """Collect all the urls, iterate over them, downloading every one."""
    res = requests.get(url=url, headers=headers)
    if not res.status_code == requests.codes.ok:
        raise res.raise_for_status()
    content = res.content
    soup = BeautifulSoup(content)
    link_tags = soup.findChildren(attrs={'class': 'item view album-view-image-link'})
    for elem in link_tags:
        url = elem.find('a').get('href')
        self.urls.append(url)
    folder = 'Imgur_Album'
    # if not os.path.exists(folder):
    #     os.makedirs(folder)  # mkdir -p
    # The above way is bad, because a dir can be created between the 2 function
    # calls, thus causing a race condition. Better way:
    try:
        os.makedirs(folder)
    except OSError, e:
        if e.errno == errno.EEXIST:
            print "Directory Already exists."
            print "Download to existing directory?"
            input = raw_input("[y/n]")
            if not input.lower() == 'y':
                print "Rename, or modify the directory name in the program."
                sys.exit()
def generate_html(self, url, status_url, last_checked_time):
    with open("AppView.html") as inf:
        txt = inf.read()
    soup = BeautifulSoup(txt, "html.parser")
    # print(soup.prettify())
    new_tr = soup.new_tag('tr')
    new_td_url = soup.new_tag('td')
    new_td_url.append(soup.new_string(url))
    new_td_status_url = soup.new_tag('td')
    new_td_status_url.append(soup.new_string(status_url))
    new_td_last_checked_time = soup.new_tag('td')
    new_td_last_checked_time.append(soup.new_string(last_checked_time))
    # insert it into the document
    new_tr.append(new_td_url)
    new_tr.append(new_td_status_url)
    new_tr.append(new_td_last_checked_time)
    old_tr = soup.findChildren('tr')
    for tr in old_tr:
        old_td = tr.findChildren('td')
        url_string = old_td[0].getText()
        if url_string != '':
            if url_string == url:
                soup.table.tr.replaceWith(new_tr)
            else:
                soup.table.append(new_tr)
        else:
            soup.table.tr.replaceWith(new_tr)
    # save the file again
    with open("AppView.html", "w") as outf:
        outf.write(str(soup))
def extract_workshops(url):
    """
    Extracts all information available for workshops provided at
    https://coling2020.org/pages/workshops
    :return: list of dictionaries with a workshop represented as one dictionary.
    """
    workshops = []
    # url = "https://coling2020.org/pages/workshops"
    try:
        page = request.urlopen(url)
    except:
        print("Could not connect to url.")
    soup = BeautifulSoup(page, 'html.parser').find("section", {"id": "main_content"})
    for child in soup.findChildren('h3'):
        for i in child.findNext('ul').find_all('li'):
            workshop = {
                attribute: None
                for attribute in [
                    "workshop_name", "workshop_organizer", "workshop_description",
                    "workshop_day", "workshop_location", "workshop_link"
                ]
            }
            workshop['workshop_day'] = child.text
            workshop['workshop_name'] = util.basic_string_clean(i.find('a').text)
            workshop['workshop_link'] = i.find('a')['href']
            workshops.append(copy.copy(workshop))
    # print(json.dumps(workshops, indent=1))
    return workshops
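# Usage sketch for extract_workshops(), using the URL already mentioned in its
# docstring; the printed fields follow the dictionary keys defined above.
if __name__ == "__main__":
    for w in extract_workshops("https://coling2020.org/pages/workshops"):
        print(w["workshop_day"], "-", w["workshop_name"], "->", w["workshop_link"])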
def calculateLineTagRatio(line):
    """
    Calculation of the line's text-to-tag ratio.

    :param line: the line containing HTML data
    :return: the text-to-tag ratio is returned
    """
    soup = BeautifulSoup(line, "html.parser")
    tags = []
    non_tag_data = ""
    # Loop through all the children of the HTML text
    for tag in soup.findChildren():
        # Append the tag name to the tags list
        tags.append(tag.name)
        # If the tag has content as an immediate descendant, add it to the non-tag data
        if len(tag.contents) == 1:
            # child is HTML content
            if isinstance(tag.contents[0], basestring):
                non_tag_data += tag.contents[0]
    # Count the number of tags seen
    tag_count = len(tags)
    # Compute the TTR for the line
    if tag_count == 0:
        return len(non_tag_data)
    else:
        return len(non_tag_data) / tag_count
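# Worked example for calculateLineTagRatio() (assumes the Python 2 environment
# implied by `basestring` above):
#   calculateLineTagRatio('<p>hi <b>there</b></p>')
#   tags seen: ['p', 'b']; immediate text content: 'there' (5 characters)
#   ratio = 5 / 2 = 2 under Python 2 integer division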
def CymathFunction():
    sitedemo = "https://www.cymath.com/answer?q=sin(x)%3D24"
    site = formaturlexpr(uinput, "https://www.cymath.com/answer?q=", "cymath")
    browser.set_window_size(1120, 550)
    browser.get(site)
    browser.save_screenshot('cymathscreenshot.png')
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    stepsdivlist = soup.find_all(id="steps_div")
    itnlist = soup.find_all(class_='itn')
    katexlist = soup.find_all(class_='katex')
    listmord = soup.find_all(class_='mord mathrm')
    sollist = soup.findChildren(class_='base textstyle uncramped')
    hiddenanswers = soup.find(id="answer")
    print("CYMATH")
    hiddenanswertext = hiddenanswers.get_text()
    # print(hiddenanswertext)
    hat1 = hiddenanswertext.replace('),sequence(', ' & ')
    hat2 = hat1.replace('sequence(', ' ')
    hat3 = hat2[:-1]
    hat4 = hat3.replace('PI', 'π')
    hiddenanswertext = hat4
    print(hiddenanswertext)
def getFacultyAdvisor(self, faculty_advisor):
    # opening faculty advisor details page
    self.br.open("https://academics.vit.ac.in/student/faculty_advisor_view.asp")
    response = self.br.open("https://academics.vit.ac.in/student/faculty_advisor_view.asp")
    # getting the soup
    soup = BeautifulSoup(response.get_data())
    # extracting tables
    tables = soup.findChildren('table')
    myTable = tables[1]
    rows = myTable.findChildren(['th', 'tr'])
    # extracting data
    for row in rows:
        # creating a thread for each row
        thrd = myThread(row, 5, faculty_advisor)
        # starting the thread
        thrd.start()
        # appending into thread list
        threads.append(thrd)
    # waiting for each thread to complete
    for t in threads:
        t.join()
    # returning faculty_advisor
    return faculty_advisor
def collect_content():
    final = []
    URL = 'https://www.mohfw.gov.in/'
    response = requests.get(URL).content
    soup = BeautifulSoup(response, "html.parser")
    table = soup.findChildren('table')
    global needed_table
    needed_table = table[0]
    all_rows = needed_table.find_all('tr')
    for row in all_rows:
        stats_row = []
        stats_row.append(row.find_all('td'))
        for spec_row in stats_row:
            ans = []
            for stats in spec_row:
                ans.append(stats.string)
            final.append(ans)
    final.pop(0)
    print(final)
    cur_data = {x[0]: {current_time: x[1:]} for x in final}
    past_data = load()
    if past_data != cur_data:
        mail.create_mail(needed_table, "", main.name_email()[0], main.name_email()[1])
        save(cur_data)
    else:
        print(f"No update at {current_time}")
def process_one(self, bundle, index):
    page = self.session.get(self.bundles[bundle]).text
    listpage = BeautifulSoup(page, 'html.parser')
    gamerow = listpage.findChildren('div', attrs={'class': 'game_row'})[index]
    imageurl = gamerow.findChild('div', attrs={'class': 'game_thumb'}).get('data-background_image')
    gamename = gamerow.findChild('h2', attrs={'class': 'game_title'}).getText()
    gamepage = gamerow.findChild('a').get('href')
    linux = False
    mac = False
    windows = False
    if gamerow.findChild('span', attrs={'class': 'icon icon-tux'}):
        linux = True
    if gamerow.findChild('span', attrs={'class': 'icon icon-apple'}):
        mac = True
    if gamerow.findChild('span', attrs={'class': 'icon icon-windows8'}):
        windows = True
    self.cache_game(gamename, imageurl=imageurl, downloadpage=gamepage,
                    linux=linux, windows=windows, mac=mac)
def parse_text_block(self, bs_textblock: BeautifulSoup):
    assert (bs_textblock.name == "p" and bs_textblock.attrs["blocktype"] == "Text"
            or bs_textblock.name == 'td')
    textblock = Abby(bs_textblock.attrs, AbbyType.P)
    els = bs_textblock.findChildren(recursive=False)
    current_word: Abby = Abby({}, AbbyType.WORD)
    current_line: Abby = Abby({}, AbbyType.LINE)
    for i in range(len(els)):
        el = els[i]
        if el.name == 'br':
            # Line break
            current_line.add_child(current_word)
            textblock.add_child(current_line)
            current_word = Abby({}, AbbyType.WORD)
            current_line = Abby({}, AbbyType.LINE)
        elif el.text == " " or el.text == "\n":
            # Space / Word end
            current_line.add_child(current_word)
            current_word = Abby({}, AbbyType.WORD)
        elif el.name == 'span':
            # Character
            char_attributes = el.attrs
            char_attributes["text"] = el.text
            char = Abby(char_attributes, AbbyType.CHAR)
            current_word.add_child(char)
        else:
            # raise a proper exception type (raising a bare string is invalid)
            raise ValueError("CANNOT PARSE UNKNOWN TYPE")
    return textblock
def getAttendance(self, attendance):
    # opening the attendance page
    self.br.open("https://academics.vit.ac.in/student/attn_report.asp?sem=WS&fmdt=09-Jul-2015&todt=%(to_date)s" % {"to_date": today})
    response = self.br.open("https://academics.vit.ac.in/student/attn_report.asp?sem=WS&fmdt=09-Jul-2015&todt=%(to_date)s" % {"to_date": today})
    soup = BeautifulSoup(response.get_data())
    # extracting tables
    tables = soup.findChildren('table')
    myTable = tables[3]
    rows = myTable.findChildren(['th', 'tr'])
    rows = rows[1:]
    i = 1
    # extracting data
    for row in rows:
        # creating a thread for each row
        thrd = myThread(row, 2, attendance, i, self.br)
        # starting the thread
        thrd.start()
        # appending into thread list
        threads.append(thrd)
        i = i + 1
    # waiting for each thread to end
    for t in threads:
        t.join()
    return attendance
def getNewData(thisURL):
    # reads XML file, converts to pandas DataFrame. Each row is one station.
    cabiBase = requests.get(thisURL)
    cabiSoup = BeautifulSoup(cabiBase.content, "lxml")
    CC = cabiSoup.findChildren()
    fnSoup = [x.name for x in CC]
    sta = cabiSoup.findAll('station')
    allContents = [x.contents for x in sta]
    fieldsHere = [[re.search('(?<=\<)\w+(?=>)', str(entry)).group(0)
                   for entry in x] for x in allContents]
    # unescape XML-escaped ampersands ('&amp;') back to '&'
    valuesHere = [[re.sub('&amp;', '&', re.search('(?<=>)[^\<]*(?=\<)', str(entry)).group(0))
                   for entry in x] for x in allContents]
    dNew = {}
    for ff in range(len(fieldsHere[0])):  # assumes they're all identical!
        thisField = fieldsHere[0][ff]
        thisType = getDtype(thisField)
        try:
            dNew.update({thisField: [thisType(x[ff]) for x in valuesHere]})
        except:
            temptemp = [x[ff] for x in valuesHere]
            temp2 = [thisType(x) if len(x) else -999 for x in temptemp]
            dNew.update({thisField: temp2})
    overall_LastUpdate_sec = [int(CC[fnSoup.index('stations')].attrs['lastupdate']) / sec_2_msec] * len(sta)
    zipIt = zip([1000000 * OLU for OLU in overall_LastUpdate_sec], dNew['id'])
    DF = pd.DataFrame(dNew, index=[sum(zz) for zz in zipIt])
    return [DF, (cabiBase.content)]
def scrape(self, url):
    """
    Here's the general algorithm...
        - perform these actions for EACH <table> within <body>
        - hit each <tr class="evenColor">
    """
    events = []
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    # this .findChildren is giving off way too many false positives. must be a better call here. [todo]
    tables = soup.findChildren("table")
    data_tables = []
    for table in tables:
        if table.findParent("table") is None:
            data_tables.append(table)
    print "count: ", len(data_tables)
    print "other count: ", len(tables)
    # parse out individual rows.
    for table in data_tables:
        rows = table.findAll("tr", {"class": "evenColor"})
        print "row count: ", len(rows)
        for row in rows:
            event = self.parseEventRow(row)
            print event
            events.append(event)
    # pdb.set_trace()
    return events
def getStats(link, games):
    score_page = requests.get(BASEURL + link)
    score_soup = BeautifulSoup(score_page.text, 'html.parser')
    # Team names
    headings = score_soup.find_all(class_='section_anchor')
    away_team = headings[3]['data-label'].split('(')[0].rstrip()
    home_team = headings[5]['data-label'].split('(')[0].rstrip()
    tables = score_soup.findChildren('table')
    # Basic stats
    away_basic_table = tables[0]
    home_basic_table = tables[2]
    away_basic_stats = ripStatsFromTable(away_basic_table, False)
    home_basic_stats = ripStatsFromTable(home_basic_table, True)
    # 'Advanced' stats
    away_adv_table = tables[1]
    home_adv_table = tables[3]
    away_adv_stats = ripStatsFromTable(away_adv_table, False)
    home_adv_stats = ripStatsFromTable(home_adv_table, True)
    # combine both sets of stats into one dictionary
    home_stats = {**home_adv_stats, **home_basic_stats}
    away_stats = {**away_adv_stats, **away_basic_stats}
    title = score_soup.title.getText().split('|')[0].rstrip().split(' Box Score')
    title2 = title[0] + title[1]
    game = {title2: {home_team: home_stats, away_team: away_stats}}
    games.append(game)
def test_only_the_custom_region_is_created(self):
    caption_set = DFXPReader().read(
        SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)
    new_region = Layout(alignment=Alignment(HorizontalAlignmentEnum.LEFT,
                                            VerticalAlignmentEnum.TOP))
    dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
    # Using a different parser, because it preserves letter case.
    # The output file is ok, but when parsing it, the "regular" parser
    # loses letter case.
    layout = BeautifulSoup(dfxp, features='xml').findChild('layout')
    self.assertEqual(len(layout.findChildren('region')), 1)
    region = layout.findChild('region')
    text_align = region['tts:textAlign']
    display_align = region['tts:displayAlign']
    internal_alignment = _create_internal_alignment(text_align, display_align)  # noqa
    self.assertEqual(internal_alignment.horizontal, HorizontalAlignmentEnum.LEFT)  # noqa
    self.assertEqual(internal_alignment.vertical, VerticalAlignmentEnum.TOP)  # noqa
def results(reg_no="", pswd=""):
    # logging into student login
    br = login(reg_no, pswd)
    # checking whether we are logged in or not
    if br.geturl() == "https://academics.vit.ac.in/student/stud_home.asp" or br.geturl() == "https://academics.vit.ac.in/student/home.asp":
        print "SUCCESS"
        br.open("https://academics.vit.ac.in/student/grade.asp?sem=WS")
        response = br.open("https://academics.vit.ac.in/student/grade.asp?sem=WS")
        soup = BeautifulSoup(response.get_data())
        # extracting tables
        tables = soup.findChildren('table')
        try:
            myTable = tables[1]
        except IndexError:
            myTable = 'null'
            return {"status": "Not_Updated"}
        rows = myTable.findChildren(['th', 'tr'])
        result = {}
        return {"status": "Updated"}
    else:
        print "FAIL"
        return {"status": "Failure"}
def get_class_instructor(class_number, term):
    """
    :returns: a string that is the instructor for CLASS_NUMBER in term TERM
    :param: class_number: String, class number
    :param: term: String, term number
    """
    url = 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM={}&TERM={}'.format(
        class_number, term)
    page = urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    table = soup.findChildren('table')[0]
    rows = table.findChildren('tr')
    for row in rows:
        cells = row.findChildren('td')
        try:
            for index, cell in enumerate(cells):
                if len(cell.contents) > 0 and str(cell.contents[0]) == 'Description':
                    prev = cells[index - 1]
                    return prev.string.strip()
        except Exception:
            print("blah")
def scrape_web(path: str, file: str) -> None:
    init_data = JsonFile(path, file)
    arguments = init_data.load()
    number_of_elements = len(arguments["search"])
    for index in range(number_of_elements):
        try:
            search_name = arguments["search"][index]
            web_request = requests.get(arguments["url"] + "/currencies/" + search_name)
            content = BeautifulSoup(web_request.content, 'lxml')
            table = content.findChildren('table')[0]
            rows = table.find_all('td')
            information = (search_name, ) + format_table(rows)
            current_coin = Coin(*information)
            if index == number_of_elements - 1:
                print(current_coin)
            else:
                print(current_coin, end="\n\n")
            file = current_coin.time + ".json"
            path = "out/" + current_coin.name
            coin_info = current_coin.__dict__
            result = JsonFile(path, file)
            result.save(coin_info)
        except Exception as error:
            print(f"Error: {error}")
def get_course_numbers(subject, term, course_title):
    url = 'http://www.courses.as.pitt.edu/results-subja.asp?TERM={}&SUBJ={}'.format(
        term, subject)
    page = urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    table = soup.findChildren('table')[0]
    rows = table.findChildren('tr')
    course_numbers = []
    for row in rows:
        cells = row.findChildren('td')
        try:
            for index, cell in enumerate(cells):
                if len(cell.contents) > 0 and str(cell.contents[0]) == course_title:
                    prev = cells[index - 1]
                    course_numbers.append(prev.find('a').contents[0])
                    # print(prev.find('a').contents[0])
                    # print(cells)
        except Exception:
            print("blah")
    return course_numbers
def get_class_time(class_number, term):
    """
    :returns: a string that is the class time for CLASS_NUMBER in term TERM
    :param: class_number: String, class number
    :param: term: String, term number
    """
    url = 'http://www.courses.as.pitt.edu/detail.asp?CLASSNUM={}&TERM={}'.format(
        class_number, term)
    page = urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    table = soup.findChildren('table')[0]
    rows = table.findChildren('tr')
    has_time = False
    for row in rows:
        cells = row.findChildren('td')
        for cell in cells:
            if has_time:
                if len(cell) > 1:
                    return (cell.contents[0].string.strip() + ' and '
                            + cell.contents[2].string.strip())
                else:
                    return cell.contents[0].string.strip()
            if str(cell.contents[0]) in ('AT', 'SE3', 'ST', '6W1', '6W2', '12W'):
                has_time = True
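# Hypothetical example of chaining the course helpers above; the subject, term
# and course title strings are placeholders, not verified against the live site.
if __name__ == "__main__":
    term = "2201"
    for number in get_course_numbers("CS", term, "INTRO TO PROGRAMMING"):
        print(number,
              get_class_instructor(number, term),
              get_class_time(number, term))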
def details(br):
    details = []
    r = br.submit()
    dsoup = BeautifulSoup(r.get_data())
    dtables = dsoup.findChildren('table')
    try:
        dmyTable = dtables[2]
        drows = dmyTable.findChildren(['th', 'tr'])
        drows = drows[2:]
        for drow in drows:
            dcells = drow.findChildren('td')
            details.append({
                "date": dcells[1].getText(),
                "slot": dcells[2].getText(),
                "status": dcells[3].getText(),
                "class_units": dcells[4].getText(),
                "reason": dcells[5].getText()
            })
    except:
        print "No_table"
    br.open(
        "https://academics.vit.ac.in/student/attn_report.asp?sem=WS&fmdt=09-Jul-2015&todt=%(to_date)s"
        % {"to_date": today})
    return details
def parse_data(url):
    response = requests.get(url)
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    tables = soup.findChildren('table')
    my_table = tables[0]
    return my_table.findChildren(['tr'])
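# Minimal usage sketch for parse_data(); the URL and column layout are
# placeholders, not taken from the original project.
if __name__ == "__main__":
    for row in parse_data("https://example.com/some-table-page"):
        cells = [td.get_text(strip=True) for td in row.findChildren('td')]
        if cells:
            print(cells)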
def parseData(epicNo):
    global iterator, formParams, headerList, notFoundCount
    # print "Finding data for Epic No", epicNo
    response = requests.get("http://164.100.180.4/searchengine/SearchEngineEnglish.aspx")
    soup = BeautifulSoup(response.text, 'lxml')
    # Prepare the initial POST request to get the form that contains the input field for the EPIC No
    formParams = {}
    formParams = extractHiddenFields(formParams, soup)
    formParams = setDefaultFormFields(formParams)
    # Get the page to enter the EPIC No
    selectedDistrictForm = requests.post('http://164.100.180.4/searchengine/SearchEngineEnglish.aspx', data=formParams)
    selectedDistrictSoup = BeautifulSoup(selectedDistrictForm.text, 'lxml')
    # Prepare the final request to get the required information
    finalFormParams = {}
    finalFormParams = extractHiddenFields(finalFormParams, selectedDistrictSoup)
    finalFormParams = setDefaultFormFields(finalFormParams)
    finalFormParams['txtEPICNo'] = epicNo
    finalFormParams['RdlSearchBy'] = 0
    getVoterDetails = requests.post('http://164.100.180.4/searchengine/SearchEngineEnglish.aspx', data=finalFormParams)
    finalDetailsSoup = BeautifulSoup(getVoterDetails.text, 'lxml')
    # Details received, now write them to a file
    with open('data.csv', 'a') as csvWriterFile:
        csvWriter = csv.writer(csvWriterFile)
        # Check whether the EPIC number is valid or not
        if len(finalDetailsSoup.findChildren('table', {'id': 'gvSearchResult'})) > 0:
            dateTable = finalDetailsSoup.findChildren('table', {'id': 'gvSearchResult'})[0]
            dataRow = dateTable.findChildren(['tr'])[1]
            dataCell = dataRow.findChildren('td')
            dataList = []
            dataCell.pop(0)
            for cell in dataCell:
                value = "" + cell.string
                dataList.append(cell.string)
            # print dataList
            csvWriter.writerow(dataList)
        else:
            notFoundCount += 1  # count EPIC numbers that returned no result
            notFound = ["Not Found"] * 10
            notFound.append(epicNo)
            csvWriter.writerow(notFound)
def test_only_the_default_region_is_created(self):
    caption_set = DFXPReader().read(
        SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)
    dfxp = SinglePositioningDFXPWriter().write(caption_set)
    layout = BeautifulSoup(dfxp, features='html.parser').findChild('layout')  # noqa
    self.assertEqual(len(layout.findChildren('region')), 1)
def GET(self, inLeague, inYear, inTeam=0): web.header('Content-Type', 'application/json') web.header('Access-Control-Allow-Origin', '*') web.header('Access-Control-Allow-Credentials', 'true') # return 1 #http://games.espn.go.com/ffl/clubhouse?leagueId=716644&teamId=5&seasonId=2014 foundTeamRosters=[] if inTeam==0: inds = range(1,12) else: inds = [];inds.append(inTeam) for teamId in inds: url = 'http://games.espn.go.com/ffl/clubhouse?leagueId=%s&teamId=%s&seasonId=%s' %(inLeague, teamId, inYear) # return url soup = BeautifulSoup(urllib2.urlopen(url).read()) # found = soup.findChildren('table')[0].findChildren('td', class_='playertablePlayerName') found = soup.findChildren('table')[0].findChildren('tr', class_='pncPlayerRow') foundPlayers=[] foundPositions=[] teamName = soup.findChildren('table')[0].findChildren(class_='team-name')[0].text realName = soup.findChildren('table')[0].findChildren(class_='per-info')[0].text for f in found: try: fullString = str(f.findChildren('td')[1]).replace('\xa0',' ').replace('\xc2',' ') posString = BeautifulSoup(fullString.replace(str(BeautifulSoup(fullString).a),'')).body.string.replace(', ','') thisPlayerName = f.a.string thisPlayerSlot = f.findChildren('td')[0].string posStr = fullString.find('</td>') strLen = len(fullString) aLoc = fullString.find('</a>') subString = fullString[strLen-7:strLen] pos = subString[0:2] #print fullString #print subString objPlayer = player(name=thisPlayerName, position=pos, slot=thisPlayerSlot) #print pos foundPlayers.append(objPlayer.__dict__) except: pass foundTeamRosters.append(team(name=teamName, roster=foundPlayers, realName=realName).__dict__) return json.dumps(foundTeamRosters)
def calScrape(br, row, i, calmarks): details = [] cells = row.findChildren('td') br.select_form(nr=i) i = i+1 r = br.submit() dsoup = BeautifulSoup(r.get_data()) dtables = dsoup.findChildren('table') #if table is present try: dmyTable = dtables[2] #if table is absent except: br.open("https://academics.vit.ac.in/student/cal_da.asp?sem=WS") if cells[2].getText().replace("\r\n\t\t","") not in calmarks.keys(): calmarks[cells[2].getText().replace("\r\n\t\t","")] = {"course_type" : cells[4].getText().replace("\r\n\t\t",""), "faculty" : cells[5].getText().replace("\r\n\t\t",""), "details" : details} else: if cells[4].getText().replace("\r\n\t\t","") == "Embedded Lab": calmarks[cells[2].getText().replace("\r\n\t\t","")+"L"] = {"course_type" : cells[4].getText().replace("\r\n\t\t",""), "faculty" : cells[5].getText().replace("\r\n\t\t",""), "details" : details} elif cells[4].getText().replace("\r\n\t\t","") == "Embedded Project": calmarks[cells[2].getText().replace("\r\n\t\t","")+"P"] = {"course_type" : cells[4].getText().replace("\r\n\t\t",""), "faculty" : cells[5].getText().replace("\r\n\t\t",""), "details" : details} else: drows = dmyTable.findChildren(['th','tr']) drows = drows[2:-1] for drow in drows: dcells = drow.findAll('td') details.append({"assignment_title" : dcells[1].getText(), "due_date" : dcells[2].getText(),"max_marks" : dcells[3].getText() ,"assignment_status" : dcells[5].getText() if dcells[5].getText() else "NA", "marks_status" : dcells[7].getText() if dcells[7].getText() else "NA", "marks_score" : dcells[8].getText() if dcells[3].getText() else "NA"}) br.open("https://academics.vit.ac.in/student/cal_da.asp?sem=WS") if cells[2].getText().replace("\r\n\t\t","") not in calmarks.keys(): calmarks[cells[2].getText().replace("\r\n\t\t","")] = {"course_type" : cells[4].getText().replace("\r\n\t\t",""), "faculty" : cells[5].getText().replace("\r\n\t\t",""), "details" : details} else: if cells[4].getText().replace("\r\n\t\t","") == "Embedded Lab": calmarks[cells[2].getText().replace("\r\n\t\t","")+"L"] = {"course_type" : cells[4].getText().replace("\r\n\t\t",""), "faculty" : cells[5].getText().replace("\r\n\t\t",""), "details" : details} elif cells[4].getText().replace("\r\n\t\t","") == "Embedded Project": calmarks[cells[2].getText().replace("\r\n\t\t","")+"P"] = {"course_type" : cells[4].getText().replace("\r\n\t\t",""), "faculty" : cells[5].getText().replace("\r\n\t\t",""), "details" : details} return calmarks
def parse(self, response): sel = Selector(response) profile = {'url': response.url, 'skills': [], 'experience': []} # Parse current page URL (public profile URL) # Read Skills section skills_list = sel.xpath('//a[@class="endorse-item-name-text"]').extract() for skill in skills_list: skill = self.remove_tag('a', skill) profile['skills'].append(skill) # List of experience items exp_items = [] # Read Companies and Titles exp_entries = sel.xpath('//div[contains(@id, "experience-") and contains(@id, "-view")]').extract() for exp_entry in exp_entries: b_soup = BeautifulSoup(exp_entry) #Get company name exp_company_matches = b_soup.findChildren('a', href=re.compile(r'prof-exp-company-name')) exp_company = exp_company_matches[len(exp_company_matches) - 1].get_text()\ if len(exp_company_matches) > 0 else None # Get title within company exp_title = b_soup.findChild('a', {'name': 'title'}).get_text() # Get work description exp_desc_match = b_soup.findChild('p', {'class': 'description'}) exp_desc = exp_desc_match.get_text() if exp_desc_match is not None else None # Get work date-locale exp_date_loc = b_soup.findChild('span', {'class': 'experience-date-locale'}) exp_duration_items = exp_date_loc.findChildren('time') exp_is_current = 'Present' in exp_duration_items[1].get_text() exp_duration = re.sub(r'[^a-zA-Z0-9 ]', '', exp_duration_items[2].get_text()).strip() exp_location_item = exp_date_loc.findChild('span', {'class': 'locality'}) exp_location = None if exp_location_item is not None: exp_location = re.sub(r'^[^"]*"', '', exp_location_item.get_text()) exp_location = exp_location.replace("\"", "").strip() exp_items.append(ExperienceItem(exp_is_current, exp_title, exp_company, exp_location, exp_duration, exp_desc)) profile['experience'] = exp_items # Sleep to appease LinkedIn rate limiting time.sleep(5) self.profile_map[response.url] = profile return LinkedInItem(profile)
def main(): numOfArgs=len(sys.argv) if numOfArgs<4 or numOfArgs>4: print'Usage: A1.py <university> <sec> < URI>' print'e.g.: A1.py "old dominion" 60 http://sports.yahoo.com' sys.exit(1) print 'Number of arguments:', len(sys.argv), 'arguments.' univ = str(sys.argv[1]) sec = int(sys.argv[2]) uri = str(sys.argv[3]) print 'Team Name: ' ,univ print 'Time to Sleep: ' ,sec print 'URI: ' ,uri response = requests.get(uri) soup = BeautifulSoup(response.content)#gives you the html content of that page tables = soup.findChildren('table')#finds all the children of type table print "-" * 72 #print tables[1].prettify() score_table = tables[1]#storing the results of the second table in score_table variable as our intersting stuff is in table # when you extract data from web and use beautiful soup it is stored in the form of array nothing but list in python while True: for row in score_table('tr', {'class' : 'game link' }): if univ.lower() in str(row).lower() : td_team_home = row('td', {'class' : 'home' }) span_home = td_team_home[0]('em')[0].contents[0]#the td_team_home is treated as a list so you have to get the contents of it td_team_away = row('td', {'class' : 'away' }) span_away = td_team_away[0]('em')[0].contents[0] td_score = row('td', {'class' : 'score' }) span_home_score = td_score[0]('span')[1].contents[0] span_away_score = td_score[0]('span')[0].contents[0] print "*" * 8 print span_home print span_home_score print print span_away print span_away_score print print 'Press ctrl+c to exit getting the scores' time.sleep(sec) # delays for 60 seconds print "*" * 8 print "-" * 72
def add_page_to_index(url, html):
    body_soup = BeautifulSoup(html, "html.parser").find('body')
    for child_tag in body_soup.findChildren():
        if child_tag.name == 'script':
            continue
        child_text = child_tag.text
        for line in child_text.split('\n'):
            line = line.strip()
            for word in _split_to_word(line):
                add_to_index(word, url)
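# Hypothetical sketch of the helpers add_page_to_index() relies on; the real
# _split_to_word / add_to_index implementations in the original project may differ.
import re

_index = {}

def _split_to_word(line):
    # naive word split on non-alphanumeric boundaries
    return re.findall(r"\w+", line.lower())

def add_to_index(word, url):
    _index.setdefault(word, set()).add(url)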
def GetCourses(link):
    html = requests.get(link).text
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.findChildren('tr', {'class': 'title'})
    results = []
    for row in table:
        results.append(row.find('a').find(text=True))
    return results
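# Minimal usage sketch for GetCourses(); the URL is a placeholder.
if __name__ == "__main__":
    for course in GetCourses("https://example.edu/course-catalog"):
        print(course)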
def parse_html(link, tag, tag_name):
    try:
        html = ""
        try:
            html = urlopen(link).read()
        except Exception as e:
            print "Error1 = " + str(e)
        soup = BeautifulSoup(html)
        data = soup.findChildren(attrs={tag: re.compile(tag_name)})
        return data[0]
    except Exception as e:
        print "Error2 = " + str(e)