def parse_obituary(url, category):
    """ Extracts the necessary information from a single obituary page """
    page = requests.get(url)
    soup = Soup(page.text)
    try:
        date = select(soup, 'p strong')[0].contents[0]
        date = date[date.rfind('died ') + 5:].strip()
        cal = pdt.Calendar()
        print >> sys.stderr, 'parsing', date
        date = cal.parseDateText(date)
    except:
        print >> sys.stderr, 'failed to parse'
        return
    date = str('%s/%s/%s' % (date[2], date[1], date[0]))
    publisher = 'Telegraph'
    type = 'obituaries'
    name = select(soup, '.storyHead h1')[0].contents[0]
    content = ''
    for para in select(soup, '#mainBodyArea p'):
        if len(para.contents) > 0:
            content = content + para.contents[0]
    content = content.strip().replace('"', '\'')
    content = content.strip().replace('\n', '')
    print >> sys.stdout, '%s,%s,%s,%s,"%s","%s"' % (
        date.encode("UTF-8"), publisher.encode("UTF-8"), type.encode("UTF-8"),
        name.encode("UTF-8"), content.encode("UTF-8"), category.encode("UTF-8"))
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()
        matchtime = datetime.datetime.strptime(
            select(soup, "div.next-match p")[1].contents[-1].strip(),
            u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
def expandDocument(self, header, content, config=None):
    # String exceptions are not valid Python; keep the method disabled with a real exception.
    raise NotImplementedError("obsolete")
    part = self.partDocument(header["document"], config)
    soup = part.expandSoup(content)
    header = part.get_collapsed_header(header=header)
    stateful_doc = "stateful" in header and header["stateful"] is True
    if stateful_doc:
        script = part.statefulConfigScript()
        if script:
            script_tag = soup.new_tag("script")
            script_tag["type"] = "application/config"
            script_tag.string = script
            soup.body.append(script_tag)
    # fill in meta tags
    self._applyMetaAndTitle(soup, header, config)
    if config["appcache"] == False:
        for h in select(soup, "html"):
            del h["manifest"]
    elif "manifest" in header:
        for h in select(soup, "html"):
            h["manifest"] = header["manifest"]
    if "Content-Language" in header:
        for h in select(soup, "html"):
            h["lang"] = header["Content-Language"]
    # offline markers
    lists = {
        "offline": self._getOfflineList(soup, header),
    }
    return soup.prettify(), lists
def Loop_Through_Messages(i):
    #i = start ID - 1
    while i < MaxMSG:
        i += 1
        Humanize(2)  #Humanize the program by sleeping 0-2 seconds
        try:
            soup = Make_Soup("http://groups.yahoo.com/group/freecycledc/message/" + str(i))
            MSG_Title = select(soup, 'title')[0].text.replace('\n', '~n-break~')
            msgbodyhtml = select(soup, '.msgarea')[0]
            MSG_Body = unicode.join(u' ', map(unicode, msgbodyhtml)).replace('<br />', '~break~').replace('\n', '~n-break~')
            if MSG_Title == '':
                MSG_Title = '(none)'
            if MSG_Body == '':
                MSG_Body = '(none)'
            Message_Data_to_Table(i, MSG_Title, MSG_Body)
            print i, "of", MaxMSG
        except:
            print "ERROR: SCRAPE FAIL ON POSTING ID", i
            Check_Column("Title", MSG_Title)
            Check_Column("Body HTML", msgbodyhtml)
            Check_Column("Body Text", MSG_Body)
            if MSG_Title == 'freecycledc' or 'message' not in MSG_Title.lower():
                Message_Data_to_Table(i, 'Message does not exist', 'NOTHING TO SEE HERE, FOLKS')
            else:
                Message_Data_to_Table(i, 'FAIL', 'FAIL')
def getClassTable(self):
    br = mechanize.Browser()
    br.set_handle_robots(False)
    #directly open the U of I login page
    br.open("https://eas.admin.uillinois.edu/eas/servlet/EasLogin?redirect=https://webprod.admin.uillinois.edu/ssa/servlet/SelfServiceLogin?appName=edu.uillinois.aits.SelfServiceLogin&dad=BANPROD1")
    br.select_form(name="easForm")
    br["inputEnterpriseId"] = self.username  #self.username
    br["password"] = self.password  #self.password
    br.submit()
    br.open("https://ui2web1.apps.uillinois.edu/BANPROD1/bwskcrse.P_CrseSchdDetl")
    try:
        br.select_form(nr=1)
    except:
        return None
    resp = br.submit()
    soup = BeautifulSoup(resp.read())
    br.close()
    sem_info_row = BeautifulSoup(str(select(soup, "div.pagetitlediv table tr td")[2]))
    #get course metadata and append it to courses data
    course_sch_table = BeautifulSoup(str(select(soup, "div.pagebodydiv table")[-2]))
    courses = self.parseSchTable(course_sch_table)
    return courses
def sees_an_element(self, doc, element=None, css_class=None, id=None, css_selector=None):
    """
    Tests for the presence of a specified element on the current page...

    self.alice.sees_an_element(doc, id="element_id")
    self.alice.sees_an_element(doc, "element")
    self.alice.sees_an_element(doc, "div", "element_css_class")
    self.alice.sees_an_element(doc, css_selector="#myid element.bar")
    """
    selector = "any"
    if id:
        displayed_element = doc.find(id=id)
        selector = id
    elif css_selector:
        displayed_elements = select(doc, css_selector)
        displayed_element = displayed_elements[0] if displayed_elements else None
        selector = css_selector
    else:
        if css_class:
            selector = "%s.%s" % (element, css_class)
            displayed_element = select(doc, selector)
        else:
            displayed_element = doc.find(element)
            selector = element
    self.failUnless(displayed_element, "Could not find %s" % (selector))
    return displayed_element
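# Hedged usage sketch for sees_an_element above, restating the docstring call forms;
# the persona attribute (self.alice) and the ids/classes used here are assumptions for illustration:
#   banner = self.alice.sees_an_element(doc, id="site-banner")
#   heading = self.alice.sees_an_element(doc, "h1")
#   button = self.alice.sees_an_element(doc, "a", "call-to-action")
#   legal = self.alice.sees_an_element(doc, css_selector="div#footer p.legal")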
def fetch_review_counts(appid):
    class FetchError(StandardError):
        pass

    url = 'http://store.steampowered.com/app/%i/' % appid
    request = urllib.urlopen(url)
    if request.code < 200 or request.code > 299:
        raise FetchError('Unable to fetch %s' % url, {
            'appid': appid,
            'status': request.code})
    soup = BeautifulSoup(request)
    positive_count = ''
    positive_count_elements = select(soup, '#ReviewsTab_positive .user_reviews_count')
    if len(positive_count_elements) > 0:
        positive_count = get_count(positive_count_elements[0])
    if not positive_count:
        print >>sys.stderr, "Warning: Unable to find positive user review count on page %s" % url
    negative_count = ''
    negative_count_elements = select(soup, '#ReviewsTab_negative .user_reviews_count')
    if len(negative_count_elements) > 0:
        negative_count = get_count(negative_count_elements[0])
    if not negative_count:
        print >>sys.stderr, "Warning: Unable to find negative user review count on page %s" % url
    return positive_count, negative_count
def scrapeBlog(blog):
    global completed
    blogurl = blog['postUrl']
    blogData = {}
    try:
        soup = Soup(urllib2.urlopen(blogurl))
        post = select(soup, 'div.post-body')
        title = select(soup, 'h1.title')
        titleNoTags = Soup(str(title))
        rawTitle = ''.join(filter(visible, titleNoTags.findAll(text=True))).strip()
        #print rawTitle
        noScript = Soup(str(post))
        rawText = ''.join(filter(visible, noScript.findAll(text=True))).strip()
        #print raw_text
        blogData['source'] = str(rawTitle)
        blogData['title'] = blog['titleNoFormatting']
        blogData['content'] = str(rawText)
        blogData['date'] = blog['publishedDate']
        blogData['url'] = str(blogurl)
    except Exception:
        pass
    with dataLock:
        data.append(blogData)
        completed += 1
def get_raw_boxscore_data(self, boxscore_soup):
    # Load boxscore data. No logic here, just splitting from HTML into more
    # processable data.
    boxscore_data = []
    boxscore_rows = select(boxscore_soup, '#my-players-table tbody tr')
    for player_data in boxscore_rows:
        cells = select(player_data, 'td')
        if len(cells) == 13:
            # This order should match the boxscore table on espn
            (player_name, minutes, fgma, tpma, ftma, oreb, reb, ast, stl, blk,
             to, pf, pts) = [cell.text for cell in cells]
            if not player_name:
                continue
            fgm, fga = fgma.split('-')
            tpm, tpa = tpma.split('-')
            ftm, fta = ftma.split('-')
            (minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk,
             to, pf, pts) = map(int, [minutes, fgm, fga, tpm, tpa, ftm, fta,
                                      oreb, reb, ast, stl, blk, to, pf, pts])
            boxscore_data.append({
                'name': player_name,
                'minutes': minutes,
                'fgm': fgm,
                'fga': fga,
                'tpm': tpm,
                'tpa': tpa,
                'ftm': ftm,
                'fta': fta,
                'oreb': oreb,
                'reb': reb,
                'ast': ast,
                'stl': stl,
                'blk': blk,
                'to': to,
                'pf': pf,
                'pts': pts,
            })
    return boxscore_data
def _extract_predictions(self, html):
    if '<p class="predictHead"><nobr><span id=\'i18n_en\'>No current prediction' in html:
        return None
    else:
        predictions = []
        soup = BeautifulSoup(html)
        # get the primary/imminent prediction
        try:
            minutes = self._clean_prediction_html(select(soup, '.predictionNumberForFirstPred')[0])
        except:
            return None
        if ('departing' in minutes.lower()) or ('arriving' in minutes.lower()):
            predictions.append(0)
        else:
            predictions.append(int(minutes))
        # get the other predictions
        for m in select(soup, '.predictionNumberForOtherPreds'):
            m = self._clean_prediction_html(m)
            try:
                predictions.append(int(m))
            except:
                pass
        return predictions
def extractPage(url, pagination=True):
    print 'Extracting : %s' % url
    result = []
    page = request(url)
    soup = BeautifulSoup(page)
    info = select(soup, '.courseInfo')
    for record in info:
        courseNumber = record.find('span', {'class': 'courseNumber'}).text
        courseTitle = record.find('span', {'class': 'courseTitle'}).text
        courseAttrs = record.find('div', {'class': 'courseAttributes'}).text
        terms = [x for x in courseAttrs.split('|') if 'terms' in x.lower()]
        if terms:
            courseTime = str(terms[0].split(':')[1]).strip()
        else:
            courseTime = "not given this year"
        obj = {
            'title': courseTitle,
            'number': courseNumber,
            'time': courseTime
        }
        result.append(obj)
    subresults = []
    if pagination:
        pages = select(soup, '#pagination a')
        pagesLinks = href(pages)
        for l in set(pagesLinks):
            subresults.extend(extractPage(BASE + l, False))
    if subresults:
        result.extend(subresults)
    return result
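# Hedged driver sketch for extractPage above; BASE and the starting path are assumptions
# carried over from the function, and json is imported here only for pretty-printing:
# import json
# courses = extractPage(BASE + '/search?view=catalog', pagination=True)
# print json.dumps(courses, indent=2)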
def expand(self, header, content, markup=None, config=None):
    """ General header/content expansion replacing expandDocument and expandScss """
    lists = {
        "offline": [],
    }
    if "charset" not in header and markup is not None:
        header["charset"] = config["charset"]
    parent_doc = None
    if "document" in header:
        parent_doc = self.partDocument(header["document"], config)
        header = parent_doc.get_collapsed_header(header=header)
    if markup == "scss":
        content = self.expandScss(header, content, config=config)
    elif markup in ("text", "xml"):
        pass  #TODO consider what to do
    elif markup == "html":
        soup = None
        if parent_doc:
            soup = parent_doc.expandSoup(content)
        else:
            soup = BeautifulSoup(content, "html5lib")
        if "lang" in header:
            pass  #TODO mark html element
        # print soup.head
        stateful_doc = "stateful" in header and header["stateful"] is True
        if stateful_doc:
            script = parent_doc.statefulConfigScript()
            if script:
                script_tag = soup.new_tag("script")
                script_tag["type"] = "application/config"
                script_tag.string = script
                soup.body.append(script_tag)
        # fill in meta tags
        self._applyMetaAndTitle(soup, header, config)
        if config["appcache"] == False:
            for h in select(soup, "html"):
                del h["manifest"]
        elif "manifest" in header:
            for h in select(soup, "html"):
                h["manifest"] = header["manifest"]
        if "Content-Language" in header:
            for h in select(soup, "html"):
                h["lang"] = header["Content-Language"]
        # offline markers
        lists["offline"] = self._getOfflineList(soup, header)
        content = soup.encode()
    return header, content, lists
def get_info_from_title(soup, name):
    stats = select(soup, "dt.stat-title")
    for stat in stats:
        stat_name = select(stat, "span.title")
        if stat_name:
            if stat_name[0].text == name:
                return select(stat, "span.stat-point")[0].text
def get_games(page=1):
    def select_first(soup, selector):
        result = select(soup, selector)
        if result and len(result) > 0:
            return result[0]
        else:
            return None

    def inner_text(soup):
        if isinstance(soup, NavigableString):
            return unicode(soup)
        elif soup.contents:
            return u"".join(inner_text(c) for c in soup.contents)
        else:
            return unicode(soup)

    result = []
    soup = BeautifulSoup(urllib.urlopen(search_result_url(page)))
    games = select(soup, "a.search_result_row")
    for game in games:
        href = str(game["href"])
        if re.search("http://store.steampowered.com/app/(\\d+)/", href):
            id = re.search("http://store.steampowered.com/app/(\\d+)/", href).group(1)
        else:
            logging.error("Error extracting ID, skipping")
            continue
        name = inner_text(select(game, "h4")[0])
        price = select_first(game, ".search_price")
        if price and price.contents:
            price = price.contents[-1].lower()
            if price.find("free") != -1:
                price = float(0)
            elif price.startswith("$"):
                # Grab the last node, which is either the price or the "reduced
                # price"
                try:
                    price = float(price[5:])
                except:
                    logging.error("Price conversion error for %s: '%s'" % (name, price))
                    price = None
            else:
                price = None
                logging.error("Price parse error for %s: '%s'" % (name, price))
        else:
            price = None
        metascore = select_first(game, ".search_metascore")
        if metascore and metascore.string:
            metascore = int(metascore.string)
        else:
            metascore = None
        result.append(Game(id=id, name=name, price=price, metascore=metascore))
    return result
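# Hedged driver sketch for get_games above; the page range is an arbitrary assumption:
# all_games = []
# for page in range(1, 4):
#     all_games.extend(get_games(page))
# print "%d games scraped" % len(all_games)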
def raw_events(file):
    match = open(file, 'r')
    soup = BeautifulSoup(match.read())
    events = select(soup, 'div#live-text-commentary-wrapper div#live-text')
    more_events = select(soup, 'div#live-text-commentary-wrapper div#more-live-text')
    for event in events + more_events:
        for child in event.children:
            if type(child) is bs4.element.Tag:
                yield child.getText().strip()
def get_resources(self, doc):
    resources = []
    for a in select(doc, 'a'):
        url = a.get('href')
        img = select(a, 'img[src]')[0]
        src = img.get('src')
        f_type = REG_URL_FILE.search(src).group(1).lower()
        resources.append((url, f_type))
    return resources
def get_info_from_description(soup, desc_name):
    stats = select(soup, "dd.stat-description")
    for stat in stats:
        stat_name_decs = select(stat, "li")
        if stat_name_decs:
            for stat_name in stat_name_decs:
                if select(stat_name, "span.title")[0].text == desc_name:
                    return select(stat_name, "span.stat-point")[0].text
def parse(self):
    if not self.soup:
        return
    out = []
    for tr in select(self.soup, '#content table tr'):
        td = select(tr, 'td')
        if len(td) != 3:
            continue
        name = select(td[1], 'strong')[0].string
        msg = urlizetrunc(striptags(select(td[2], 'div')[0].renderContents()), 30)
        out.append((name, msg))
    self.data = out[:]
def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])
    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i + 1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])
    return footnote_anchors, footnotes
def getLinks(cat, sponsor=True):
    _links = []
    r = s.get(cat)
    soup = soupify(r)
    table = select(soup, 'table.categories')[0] if page != 1 or sponsor == False \
        else select(soup, 'table.categories')[1]
    tr = select(table, 'tr')
    for t in tr:
        link = select(t, 'h3 a')
        if link:
            _links.append(str(dict(link[0].attrs)['href']))
    return _links
def process(d, i=None):
    ''' function to process one entry of the table '''
    # to keep a small idea if this is still working (output)
    if i:
        print '%s' % i
    else:
        print '.'
    # extraction of the link of interest
    link = d['penalty_notice_link']
    # if we haven't downloaded the link yet, we do it and keep it in an html file in the temp folder
    if not os.path.exists('./temp/%s.html' % hash(link)):
        r = requests.get(link)
        with open('./temp/%s.html' % hash(link), 'w') as h:
            h.write(r.text.encode('utf-8'))
    # load the html markup
    with open('./temp/%s.html' % hash(link), 'r') as h:
        source = h.read()
    # if we haven't previously extracted the info, we do it now
    if not os.path.exists('./temp/%s.pickle' % hash(link)):
        # to extract info it's usually the same way:
        # - use BeautifulSoup to create the soup of the source
        # - use select and some css classes/ids to extract info
        # => it's exactly what is done below
        soup = BeautifulSoup(source)
        div = select(soup, 'div.cim_content')[0]
        table = select(div, 'table')[0]
        rows = select(table, 'tr')
        address = str(select(rows[2], 'td')[-1].contents[0])
        offence_code = str(select(rows[5], 'td')[-1].contents[0])
        nature = str(select(rows[6], 'td')[-1].contents[0])
        amount = str(select(rows[7], 'td')[-1].contents[0])
        data_penalty = str(select(rows[9], 'td')[-1].contents[0])
        issued_by = str(select(rows[10], 'td')[-1].contents[0])
        d['address'] = address
        d['offence_code'] = offence_code
        d['nature'] = nature
        d['amount'] = amount
        d['data_penalty'] = data_penalty
        d['issued_by'] = issued_by
        with open('./temp/%s.pickle' % hash(link), 'w') as h:
            pickle.dump(d, h)
    else:
        # we have previously extracted the info, we simply load it avoiding extra work
        with open('./temp/%s.pickle' % hash(link), 'r') as h:
            d = pickle.load(h)
    return d
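# Hedged driver sketch for process above; `table` stands for the scraped list of dicts
# (each holding a 'penalty_notice_link') and is an assumption, as is a pre-existing ./temp folder:
# results = [process(d, i) for i, d in enumerate(table, start=1)]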
def mine_city_weather(city_name, airport_name, filename_to_save):
    with open(filename_to_save) as data_file:
        weather_storage = json.load(data_file)
    for single_date in daterange(start_date, end_date):
        dict_key = single_date.strftime("%Y-%m-%d")
        year = single_date.strftime("%Y")
        month = single_date.strftime("%m")
        day = single_date.strftime("%d")
        if (dict_key not in weather_storage.keys()):
            print dict_key
            soup = Soup(urllib2.urlopen(
                'https://www.wunderground.com/history/airport/' + airport_name +
                '/' + year + '/' + month + '/' + day +
                '/DailyHistory.html?req_city=' + city_name +
                '&req_statename=Denmark'))
            tekst = str(select(soup, '#observations_details'))
            column_counter = 0
            weather_conditions = []
            for column in select(soup, '#obsTable tbody tr'):
                #print column.text.split(';')[-1]
                time_clock = column.text.split(';')[0].split(' ')[0].split(':')[0]
                time_clock = int(time_clock)
                am_pm = column.text.split(';')[0].split(' ')[1]
                if ('AM' in am_pm):
                    am_pm = 'AM'
                else:
                    am_pm = 'PM'
                if (am_pm == 'AM' and time_clock > 6 and time_clock != 12):
                    weather_conditions.append(column.text.split(';')[-1])
                elif (am_pm == 'PM' and time_clock <= 10):
                    weather_conditions.append(column.text.split(';')[-1])
                #if(column_counter % 13 == 12):
                #    print '-------------------'
                #    print column.text
                #    weather_conditions.append(column.text)
                #    print '-------------------'
                #column_counter += 1
            weather_storage[dict_key] = weather_conditions
            time.sleep(1)
            with open(filename_to_save, 'w') as outfile:
                json.dump(weather_storage, outfile)
def html_cleanup(html, remove_list=(), encoding=None, log=False):
    """
    Returns (str cleaned_html, bool changes)

    ``remove_list``: is list of selectors, currently supported only attribute
        and class selectors, e.g. ['p.[lang]', u'p.список-western',
        '[orphaned-attribute]', '.orphaned-class-name']

    ``encoding`` is html encoding, autodetected if not passed
    """
    soup = BeautifulSoup(html, fromEncoding=encoding)
    changes = False
    for selector in remove_list:
        m = REC_ATTR.match(selector)
        if m:
            attr, = m.groups()
            for element in select(soup, selector):
                if log:
                    print "removing %s[%s]" % (element.name, attr)
                element.attrs = [item for item in element.attrs if item[0] != attr]
                changes = True
        else:
            m = REC_CLASS.match(selector)
            if m:
                tag, cls = m.groups()
                selector = (tag or '') + u'[class]'
                for element in select(soup, selector):
                    for i, (attr, value) in enumerate(element.attrs):
                        if attr == u'class':
                            class_index = i
                    classes = filter(None, element.attrs[class_index][1].split(' '))
                    try:
                        classes.remove(cls)
                    except ValueError:
                        # not in list
                        pass
                    else:
                        if log:
                            print "removing %s.%s" % (element.name, cls)
                        element.attrs[class_index] = (u'class', ' '.join(classes))
                        changes = True
    if changes:
        return soup.prettify(encoding=soup.fromEncoding or soup.originalEncoding), changes
    else:
        return html, changes
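# Hedged usage sketch for html_cleanup above, reusing the selector forms from its own
# docstring; the sample markup and selector names are invented for illustration:
# cleaned, changed = html_cleanup(u'<p lang="en" class="note orphaned-class-name">Hi</p>',
#                                 remove_list=(u'[lang]', u'.orphaned-class-name'),
#                                 log=True)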
def parse_base(self):
    for page in range(self.pages):
        base_url = "http://www.kissfm.ua/news.html?p=%s" % str(page)
        doc = BeautifulSoup(urlopen(base_url))
        for comm in select(doc, "div.news-item-content"):
            elem = {}
            for item in select(comm, "a.main-item-title"):
                elem["link"] = item["href"]
                elem["title"] = item.string
            for item in select(comm, "img"):
                elem["thumb"] = item["src"]
            for item in select(comm, "div.news-block-item-date"):
                elem["date"] = item.string.strip()
            self.structure.append(elem)
def parseStance(stance):
    issue = select(stance, "div.issue div.issuetext")[0].text
    e = select(stance, "div.quotelink")[0]
    if e.text:
        attrs = map(attrSplit, e.text.split("\" quote"))
        attrMap = {}
        for attr in attrs:
            if len(attr) == 2:
                attrMap[attr[0]] = attr[1]
        value = attrMap["stand"]
        source = attrMap["src"]
    else:
        value = e["quotestand"]
        source = e["quotesrc"]
    value = value == "colgreencheckmark"
    return [issue, value, source]
def test_items_in_id(self):
    els = select(self.soup, 'div#inner p')
    self.assertEqual(len(els), 3)
    for el in els:
        self.assertEqual(el.name, 'p')
    self.assertEqual(els[1]['class'], [u'onep'])
    self.assert_(not els[0].has_key('class'))
def verifyHtml(self, html):
    #reg = '((class|id)=\")[a-zA-Z0-9\-\_\s]*({})[a-zA-Z0-9\-\_\s]*(\")'
    for tag in self.tagObjects:
        if tag.found:
            continue
        for i, t in enumerate(tag.tags):
            if t.find('*') != -1:
                tag.found = True
                continue
            if t.find(':') != -1:
                tag.found = True
                continue
            #print 'finding matches for :', t
            matches = []
            try:
                matches = select(html, t)
            except IndexError as e:
                #print 'Error finding matches', e
                tag.found = True
                tag.tagsFound[i] = True
            if len(matches) > 0:
                tag.found = True
                tag.tagsFound[i] = True
                #print 'Found Match(s)'
            else:
                pass
def legendary_necklace_stage(soup):
    necklace = select(soup, "div.necklace")
    necklace = select(necklace[0], "img")
    if necklace:
        necklace = necklace[0]
    else:
        return False
    data_parts = necklace['item-data'].split(".")
    if data_parts[0] == "3300981":
        return data_parts[1]
    return False
def get_statement_urls(browser, base_url, is_old_statement):
    """Finds the URLs for each other statement, given a base URL for a statement."""
    STATEMENT_SELECT_ID = '#ctl00_ContentInfo_HeaderInformation_drpStatementDates option'
    # There is a <select> on the page with the other statements linked.
    response = browser.open(base_url)
    response_data = response.read()
    soup = BeautifulSoup(response_data)
    statement_dates = select(soup, STATEMENT_SELECT_ID)
    if len(statement_dates) == 0:
        print "Couldn't find statement selector at %s" % base_url
        sys.exit(1)
    statement_urls = []
    if is_old_statement:
        statement_base_url = OLD_URL_STATEMENT
    else:
        statement_base_url = NEW_URL_STATEMENT
    for statement_date in statement_dates:
        url = statement_base_url % urllib.quote(statement_date['value'].split(' ')[0], '')
        statement_urls.append(url)
    time.sleep(CRAWL_DELAY)
    return statement_urls
def parse_statement_flights(soup, opts=None):
    """Takes a BeautifulSoup instance and finds what flights, if any, it contains."""
    notes = select(soup, "span.Notes")
    if len(notes) % 11 == 0:
        delta = 11
    else:
        delta = 10
    # Every 10 or 11 "Notes" is one entry.
    entries = [notes[i:i + delta] for i in range(0, len(notes), delta)]
    trips = []
    for entry in entries:
        values = map(lambda x: x.text.strip(), entry)
        num_empty_values = values.count('')
        # Entries with lots of blanks are mile transfers or car rentals,
        # so don't include them unless they are desired.
        if num_empty_values > 3:
            if opts and opts.include_non_flights:
                trips.append(values)
        else:
            # For flights, also try to look up IATA codes since United doesn't provide them.
            if not opts.skip_iata_codes:
                parser = united_utils.UnitedAirportParser()
                try:
                    codes = parser.get_iata_codes(values[2])
                    values.extend(codes)
                except ValueError:
                    values.extend(('', ''))
            trips.append(values)
    return trips
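# Hedged sketch of how the two United-statement helpers above might be chained; the
# browser, base_url and opts objects are assumptions set up elsewhere in the original script:
# for url in get_statement_urls(browser, base_url, is_old_statement=False):
#     soup = BeautifulSoup(browser.open(url).read())
#     for trip in parse_statement_flights(soup, opts):
#         print ','.join(trip)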
def add_footnotes_to_sections(sections_strings, footnote_anchors, footnotes):
    j = 0
    for i, ss in enumerate(sections_strings):
        ssoup = BeautifulSoup(ss)
        selector = '.sdfootnoteanc'
        footnote_anchors = select(ssoup, selector)
        #print 'For section ', i, ' found footnotes ' \
        #    '\n'.join([str(anc) for anc in footnote_anchors])
        for k in range(len(footnote_anchors)):
            anchor = footnote_anchors[k]
            footnote = footnotes[j]
            if '#' + anchor['name'] == footnote.p.a['href'] \
                    and '#' + footnote.p.a['name'] == anchor['href']:
                #print 'Found match for footnote', j, \
                #    ' in section ', i, ' anchor ', k
                pass
            else:
                print 'ERROR: wrong match for footnote', j, \
                    ' with anchor ', k, ' from section ', i
            sections_strings[i] = sections_strings[i] + str(footnote)
            j += 1
def test_items_in_id(self):
    els = select(self.soup, "div#inner p")
    self.assertEqual(len(els), 3)
    for el in els:
        self.assertEqual(el.name, "p")
    self.assertEqual(els[1]["class"], "onep")
    self.assert_(not els[0].has_key("class"))
def get_resources(self, doc):
    resources = []
    for a in select(doc, 'a'):
        url = a.get('href')
        title = a.get('title').lower()
        resources.append((url, title))
    return resources
def get_photos(race, bibs, path_prefix=''):
    for bib in bibs:
        bib = bib.strip()
        bib_url = FINISHERPIX_URL % (race, bib)
        photo_list_html = requests.get(bib_url)
        soup = Soup(photo_list_html.text)
        photo_list = select(soup, '#photocommerce-photos-gallery .photocommerce-gallery-photo img')
        for photo in photo_list:
            photo_url = photo['src']
            photo_filename = photo_url.split('/')[-1]
            bib_dir_path = os.path.join(path_prefix, race, bib)
            if path_prefix and not os.path.exists(path_prefix):
                os.makedirs(path_prefix)
            if not os.path.exists(os.path.join(path_prefix, race)):
                os.makedirs(os.path.join(path_prefix, race))
            if not os.path.exists(bib_dir_path):
                os.makedirs(bib_dir_path)
            r = requests.get(photo_url, stream=True)
            if r.status_code == 200:
                with open(os.path.join(bib_dir_path, photo_filename), 'wb') as f:
                    for chunk in r.iter_content(1024):
                        f.write(chunk)
                print('Downloaded http:%s to %s' % (photo_url, os.path.join(bib_dir_path, photo_filename)))
    return bib_dir_path
def isohunt_search(q):
    #Query the isohunt search engine and get the results HTML
    q = urllib.quote(q)
    soup = Soup(open_url('http://isohunt.com/torrents/?ihq=%s' % q),
                convertEntities='html', markupMassage=hexentityMassage)
    anchors = select(soup, 'a[id^link]')
    anchors = filter(lambda a: a.parent.name == 'td', anchors)
    results = {}
    for a in anchors:
        if str(a.contents[0]) != '0':
            a = Soup(a.renderContents().split("<br />").pop())
            result = ' '.join([
                unicode(node.renderContents()) if type(node) != NavigableString else unicode(node)
                for node in a.contents
            ])
            result = scene_cleanup(result)
            if result not in results.keys():
                results[result] = 1
            else:
                results[result] += 1
    results = sorted(results.iteritems(), key=operator.itemgetter(1))
    res = []
    for r in results:
        res = [r[0]] + res
    return res
def assertSelects(self, selector, expected_ids):
    el_ids = [el['id'] for el in select(self.soup, selector)]
    el_ids.sort()
    expected_ids.sort()
    self.assertEqual(expected_ids, el_ids,
                     "Selector %r, expected %r, got %r" % (selector, expected_ids, el_ids))
def _apply_styles(self):
    """Steps through CSS rules and applies each to all the proper elements
    as @style attributes prepending any current @style attributes.
    """
    rules = self.stylesheet.cssRules.rulesOfType(1)
    elem_prop_map = {}
    elem_style_map = {}

    # build up a property list for every styled element
    for rule in rules:
        # select elements for every selector
        selectors = map(lambda s: s.strip(), rule.selectorText.split(','))
        elements = []
        for selector in selectors:
            try:
                elements += select(self.soup, selector)
            except SelectorNotSupportedException, ex:
                if self.ingore_unsupported_selectors:
                    pass
                else:
                    raise
        # build prop_list for each selected element
        for elem in elements:
            if elem not in elem_prop_map:
                elem_prop_map[elem] = []
            elem_prop_map[elem].append({
                'specificity': self._get_rule_specificity(rule),
                'props': rule.style.getProperties(),
            })
def assertSelects(self, selector, expected_ids):
    el_ids = [el['id'] for el in select(self.soup, selector)]
    el_ids.sort()
    expected_ids.sort()
    self.assertEqual(
        expected_ids, el_ids,
        "Selector %s, expected [%s], got [%s]" % (selector, ', '.join(expected_ids), ', '.join(el_ids)))
def imdb_movie(url):
    response = open_url(url, html=False)
    soup = Soup(response.read(), convertEntities='html', markupMassage=hexentityMassage)
    #Parse the HTML, fetch movie names and their corresponding page URLs
    h1 = select(soup, 'h1.header')
    return {'url': url, 'title': h1[0].contents[0].strip()}
def get_content(url, num):
    if num[0] >= max_num:
        return
    response = urllib2.urlopen(url)
    msg = response.read()
    soup = BS(''.join(msg))
    news_body = select(soup, 'div.blkContainerSblk')
    if len(news_body) == 1:
        num[0] = num[0] + 1
        title = select(soup, 'h1#artibodyTitle')
        time = select(soup, 'span#pub_date')
        content = select(soup, 'div#artibody.blkContainerSblkCon')
        founds.append({u'url': url, u'timestamp': datetime.now()})
    # select() returns a list of tags, so collect each anchor's href individually
    links = [a['href'] for a in select(soup, 'a') if a.has_key('href')]
    for link in links:
        if link.find("news.sina.com.cn") != -1:
            # recurse into the linked article rather than the current url
            get_content(link, num)
def search(query):
    query = query.replace(' ', '%20')
    query = query.replace('&', '')  # don't want & in get query parameters
    url = base_url + query
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    products = []
    food_descriptions = soupselect.select(soup, 'div.food_description')
    for food_description in food_descriptions:
        try:
            anchors = soupselect.select(food_description, 'a')
            match = re.search(">.+<\/a>", str(anchors[0]))
            name = match.group(0).replace('</a>', '').replace('>', '')
            match = re.search(">.+<\/a>", str(anchors[1]))
            brand = match.group(0).replace('</a>', '').replace('>', '')
            product = {}
            product['name'] = name
            product['brand'] = brand
            products.append(product)
        except:
            pass
    index = 0
    nutritional_info = soupselect.select(soup, 'div.nutritional_info')
    nutritional_data = soupselect.select(soup, 'div.nutritional_facts')
    # print(requests.get('https://www.myfitnesspal.com/food/search?search=chick%20fil%20a%20nuggets').text)
    for nutrition in nutritional_info:
        try:
            product = products[index]
            matches = re.finditer("<label>.+<\/label>.+(,|[\s|\t|\n]*<\/div>)", str(nutrition))
            for match in matches:
                s = match.group(0)
                label = s[:s.index('</label>')]
                label = label.replace('<label>', '').replace(':', '').strip()
                label_value = s[s.index("</label>"):]
                label_value = label_value.replace('</label>', '').replace(',', '').replace('</div>', '').strip()
                product[label] = label_value
            index += 1
        except:
            pass
    return products[:5]
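# Hedged usage sketch for search above; the query text is invented and base_url is
# assumed to be configured elsewhere in the module:
# for product in search('greek yogurt'):
#     print product['name'], product.get('brand')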
def scrape(postcode, page=1):
    if page == 1:
        print "*************** %s *******************" % postcode
    post_url = 'http://finddrivinginstructor.direct.gov.uk/DSAFindNearestWebApp/findNearest.form?postcode=%s&pageNumber=%s' % (postcode, page)
    resp = requests.get(post_url).text
    if len(resp) < 8000:
        print "Invalid post codes"
        return 'Invalid'
    soup = BeautifulSoup(resp)
    results_list = select(soup, 'ul.results-list li')
    if len(results_list) == 0:
        print "No more pages left."
        return "no pages left"
    print "Page %s" % page
    for i in results_list:
        name = select(i, 'h3')[0].get_text()
        detail1 = select(i, 'div.instructor-details')[0]
        mail = select(detail1, 'a')[0].get('href').split(":")[-1]
        phone = select(detail1, 'span')[0].get_text()
        detail2 = select(i, 'div.instructor-details')[1]
        try:
            select(detail2, 'span.cpd')[0]
            cpd = True
        except IndexError:
            cpd = False
            pass
        try:
            select(detail2, 'span.cop')[0]
            cop = True
        except IndexError:
            cop = False
        print name, mail, phone, cpd, cop
        with open("go.txt", "a") as myfile1:
            myfile1.write("%s|%s|%s|%s|%s|%s\n" % (postcode, name, mail, phone, cpd, cop))
    return scrape(postcode, page + 1)
def parse_row(tr):
    if not tr:
        return None
    address = select(tr, '.address-tag a')
    if len(address) != 3:
        warning(tr)
        return None
    tx_id = address[0].contents[0]
    tx_from = address[1].contents[0]
    tx_to = address[2].contents[0]
    tds = select(tr, 'td')
    val = tds[6].text
    num_val = float(val.replace('Ether', '').replace(' ', '')) if 'Ether' in val else 0
    return {
        'tx_id': strip(tx_id),
        'from': strip(tx_from),
        'to': strip(tx_to),
        'value': num_val
    }
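# Hedged driver sketch for parse_row above; the table selector and the html variable are
# assumptions for illustration:
# soup = BeautifulSoup(html)
# transactions = [row for row in (parse_row(tr) for tr in select(soup, 'table tr')) if row]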
def download_subtitle_file_from_episode(url, file_name):
    # first extract the download url
    r2 = requests.get(url)
    requestText = str(r2.text.encode("utf-8"))
    notUnicode = requestText.decode('unicode-escape')
    notUnicodeStr = str(notUnicode.encode("utf-8"))
    #print notUnicodeStr
    soup = BeautifulSoup(notUnicodeStr, 'html.parser')
    download_url = 'https://subscene.com' + select(soup, '#downloadButton')[0]['href']
    print download_url
    urllib.urlretrieve(download_url, file_name + ".zip")
def get_family(character_id):
    # print character_id
    file_name = character_id.split("/")[-1]
    wikia = BeautifulSoup(
        open("data/wikia/characters/{0}".format(file_name), "r"),
        "html.parser")
    family_element = [tag for tag in select(wikia, 'h3') if tag.text == "Family"]
    if len(family_element) > 0:
        family = family_element[0].next_sibling.next_sibling
        collapsed = select(family, "div.mw-collapsed")
        if len(collapsed) > 0:
            return extract_houses(select(family, "div.mw-collapsed")[0].contents)
        else:
            return extract_houses(family.contents)
    else:
        return []
def parse(self):
    if not self.soup:
        return
    book = {}
    for a in select(self.soup, 'ul.detailsList li h3 a'):
        link = self.url.replace('/cz/', '') + a['href']
        xa = str(select(a, 'span.given-name')[0].string)
        xb = str(select(a, 'span.family-name')[0].string)
        name = u"%s %s" % (
            # select(a, 'span.given-name')[0].string.capitalize(),
            # select(a, 'span.family-name')[0].string.capitalize()
            xa.decode('utf-8'), xb.decode('utf-8')
        )
        key = name.split(u' ')
        key.reverse()
        key = slugify(u" ".join(key))
        book[key] = (name, link)
    keys = sorted(book.keys())
    self.data = [book[k] for k in keys]
def remove_footnotes_from_last_section(sections_strings):
    last_ss = sections_strings[-1]
    last_ssoup = BeautifulSoup(last_ss)
    # find_footnotes_and_anchors() returns (anchors, footnotes); only the anchors are needed here
    footnote_anchors, _footnotes = find_footnotes_and_anchors(last_ssoup)
    last_ssfootnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i + 1)
        last_ssfootnotes.extend(select(last_ssoup, selector))
    for f in last_ssfootnotes:
        f.extract()
def download_subtitles():
    #seasons = ['https://subscene.com/subtitles/game-of-thrones-first-season', 'https://subscene.com/subtitles/game-of-thrones-second-season', 'https://subscene.com/subtitles/game-of-thrones-third-season', 'https://subscene.com/subtitles/game-of-thrones-fourth-season', 'https://subscene.com/subtitles/game-of-thrones-fifth-season-2015']
    seasons = ['https://subscene.com/subtitles/game-of-thrones-fourth-season']
    season_counter = 4
    for season in seasons:
        r2 = requests.get(season)
        requestText = str(r2.text.encode("utf-8"))
        notUnicode = requestText.decode('unicode-escape')
        notUnicodeStr = str(notUnicode.encode("utf-8"))
        #print notUnicodeStr
        soup = BeautifulSoup(notUnicodeStr, 'html.parser')
        rows = select(soup, '.a1')
        for episode_number in range(1, 2):
            if (episode_number < 10):
                file_name = 'S0' + str(season_counter) + 'E0' + str(episode_number)
            else:
                file_name = 'S0' + str(season_counter) + 'E' + str(episode_number)
            print file_name
            for row in rows:
                #print row
                soup = BeautifulSoup(str(row.encode("utf-8")), 'html.parser')
                spans = select(soup, 'a span')
                language = spans[0].text.strip()
                title = spans[1].text.strip()
                if (language == 'English'):
                    if (file_name in title):
                        print title
                        #print
                        aTag = select(soup, 'a')
                        url = 'https://subscene.com' + aTag[0]['href']
                        print url
                        download_subtitle_file_from_episode(url, file_name)
                        break
        season_counter += 1
def files_info(self):
    html = Soup(urllib.urlopen(self.url))
    urls = select(html, 'li.file')
    files_info = {}
    for f in urls:
        fileinfo = {
            'url': self.get_url(f),
            'name': self.get_name(f),
            'size': self.get_size(f),
            'modified': self.get_modified(f),
        }
        files_info[fileinfo['name']] = fileinfo
    return files_info
def get_size(self, html):
    # TODO does not exist for folders
    size_desc = select(html, 'span.file_size')
    if not size_desc:
        return 0
    size_desc = size_desc[0].text
    size = int(re.search('([\d]+)', size_desc).groups()[0])
    if "gb" in size_desc.lower():
        size = int(float(size) * 1024 * 1024 * 1024)
    elif "mb" in size_desc.lower():
        size = int(float(size) * 1024 * 1024)
    elif "kb" in size_desc.lower():
        size = int(float(size) * 1024)
    return size
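# Hedged sketch of the unit handling in get_size above with invented markup
# (the surrounding downloader class is assumed):
#   '<li class="file"><span class="file_size">2 MB</span></li>'  -> 2 * 1024 * 1024
#   '<li class="file"><span class="file_size">3 KB</span></li>'  -> 3 * 1024
#   markup without a span.file_size                              -> 0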
def propagate_styles(soup):
    style_contents = str(soup.html.head.style.contents[0])
    styles = style_contents.split('\n')
    for s in styles:
        s = s.strip()
        if s == '' or s == '<!--' or s == '-->':
            continue
        if s.startswith('@page'):
            continue
        selector, rest = s.split('{', 1)
        style = rest.split('}', 1)[0]
        for tag in select(soup, selector.lower()):
            if tag.has_key('style'):
                tag['style'] += style
            else:
                tag['style'] = style
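# Hedged usage sketch for propagate_styles above; the markup is invented and assumes a
# BeautifulSoup build whose tags support has_key(), as the function itself does:
# soup = BeautifulSoup('<html><head><style>p { color: red; }</style></head>'
#                      '<body><p>hi</p></body></html>')
# propagate_styles(soup)
# print soup.p['style']  # the <p> now carries the inlined declaration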
def get_houses(character_id):
    print character_id
    file_name = character_id.split("/")[-1]
    wikia = BeautifulSoup(
        open("data/wikia/characters/{0}".format(file_name), "r"),
        "html.parser")
    allegiance_element = [tag for tag in select(wikia, 'h3') if tag.text == "Allegiance"]
    if len(allegiance_element) > 0:
        houses_elements = allegiance_element[0].next_sibling.next_sibling.contents
        return extract_houses(houses_elements)
    else:
        return []