def main(url):
    if not url:
        print "No url provided"
        sys.exit()
    #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution'
    #h = getHTML(url)
    html = UnicodeRetriever().fetch_data(url)
    tree = make_tree(html)
    lines = article_text(tree)
    file_type = magic.from_buffer(html, mime=True)
    print "File Type: %s" % file_type
    #print html
    url_obj = urlparse(url)
    if not url_obj.path:
        print "URL is top-level"
    if len(lines) < 1:
        print "URL is top-level"
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #print get_attribute(html, 'img', url)
    img = get_attribute(soup, 'img', url)
    title = get_attribute(soup, 'title', url)
    desc = get_attribute(soup, 'description', lines)
    print "Title: %s" % title
    print "Desc: %s" % desc
    print "IMG: %s" % img
def update(seed_id=None):
    """
    Retrieves and saves every new item for every Seed that is an RSS feed.
    """
    retriever = UnicodeRetriever(cache=None)
    logger = logging.getLogger('eb.retrieval.blob_rss')
    qs = Seed.objects.filter(is_rss_feed=True, is_active=True)
    if seed_id is not None:
        qs = qs.filter(id=seed_id)
    for seed in qs:
        updater = FeedUpdater(seed, retriever, logger)
        updater.update()
def add_newsitem(seed_url, seed_name, url, article_headline, article_date, name_excerpts):
    schema = Schema.objects.get(slug='news-articles')
    geocoder = SmartGeocoder()
    try:
        s = Seed.objects.get(url=seed_url)
    except Seed.DoesNotExist:
        s = Seed.objects.create(
            url=seed_url,
            base_url=seed_url,
            delay=0,
            depth=0,
            is_crawled=False,
            is_rss_feed=False,
            is_active='t',
            rss_full_entry=False,
            normalize_www=3,
            pretty_name=seed_name,
            schema=schema,
            autodetect_locations=True,
            guess_article_text=False,
            strip_noise=False,
            city='',
        )
    try:
        p = Page.objects.get(url=url)
    except Page.DoesNotExist:
        html = UnicodeRetriever().fetch_data(url)
        p = Page.objects.create(
            seed=s,
            url=url,
            scraped_url=url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
    data_tuples = []
    for location_name, excerpt in name_excerpts:
        point = geocoder.geocode(location_name)  # Let exceptions bubble up.
        data_tuples.append((location_name, point['point'], excerpt, point['block']))
    return geotag_page(p.id, seed_name, schema, url, data_tuples,
                       article_headline, article_date)
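For illustration only, a sketch of how add_newsitem might be invoked. Every value below (URLs, headline, date, and the (location_name, excerpt) pairs) is a made-up placeholder, not something taken from the original code.

# Hypothetical example call; all values are placeholders.
add_newsitem(
    seed_url='http://example.com/news/',
    seed_name='Example News',
    url='http://example.com/news/2013/10/some-story/',
    article_headline='Some Story Headline',
    article_date=datetime.date(2013, 10, 1),
    name_excerpts=[
        # Each entry pairs a geocodable location name with the excerpt of
        # article text that mentioned it.
        ('123 Main St.', 'A fire broke out at 123 Main St. on Tuesday.'),
    ],
)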
def getHTML(url):
    """Get the HTML for a URL"""
    html = None
    try:
        html = UnicodeRetriever().fetch_data(url)
    except URLError:
        #if printMsg: print "[getHTML]: Error - URLError - %s" % url
        return None
    except (HTTPError, BadStatusLine, InvalidURL):
        #if printMsg: print "[getHTML]: Error - HTTPError - %s" % url
        return None
    except (socket.timeout, ssl.SSLError):
        #if printMsg: print "[getHTML]: Error - Timeout - %s" % url
        return None
    except Exception as e:
        #if printMsg: print "[getHTML]: Error - %s - %s" % (url, e)
        return None
    return html
class SpecializedCrawler(object):
    """
    Base class for Page crawlers.
    """
    schema = None
    seed_url = None
    date_headline_re = None
    date_format = None
    retriever = None

    def __init__(self):
        try:
            self.seed = Seed.objects.get(url=self.seed_url)
        except Seed.DoesNotExist:
            raise NoSeedYet('You need to add a Seed with the URL %r' % self.seed_url)
        self.logger = logging.getLogger('eb.retrieval.%s.%s' % (settings.SHORT_NAME, self.schema))
        if self.retriever is None:
            self.retriever = UnicodeRetriever(cache=None, sleep=self.seed.delay)

    def save_page(self, unique_id):
        """
        Downloads the page with the given unique ID (possibly a numeric ID,
        or a URL) and saves it as a Page object. Returns the Page object,
        or None if the page couldn't be found.

        The page won't be retrieved/saved if it's already in the database.
        In this case, the existing Page object will be returned.
        """
        self.logger.debug('save_page(%s)', unique_id)
        retrieval_url = self.retrieval_url(unique_id)
        public_url = self.public_url(unique_id)
        try:
            p = Page.objects.get(seed__id=self.seed.id, url=public_url)
        except Page.DoesNotExist:
            pass
        else:
            self.logger.debug('Skipping already-saved URL %s', public_url)
            return p
        try:
            html = self.retriever.get_html(retrieval_url).strip()
        except (RetrievalError, UnicodeDecodeError):
            return None
        if not html:
            self.logger.debug('Got empty page for %s', retrieval_url)
            return None
        self.logger.debug('Got VALID page for %s', retrieval_url)
        m = self.date_headline_re.search(html)
        if not m:
            self.logger.debug('Could not find date/headline on %s', retrieval_url)
            return None
        article_date, article_headline = m.groupdict()['article_date'], m.groupdict()['article_headline']
        try:
            article_date = parse_date(article_date, self.date_format)
        except ValueError:
            self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
            return None
        article_headline = strip_tags(article_headline)
        if len(article_headline) > 255:
            article_headline = article_headline[:252] + '...'
        p = Page.objects.create(
            seed=self.seed,
            url=public_url,
            scraped_url=retrieval_url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
        self.logger.debug('Created Page ID %s' % p.id)
        save_locations_for_page(p)
        return p

    ######################################
    # METHODS SUBCLASSES SHOULD OVERRIDE #
    ######################################

    def public_url(self, unique_id):
        "Given the ID value, returns the URL that we should publish."
        raise NotImplementedError()

    def retrieval_url(self, unique_id):
        "Given the ID value, returns the URL that we should scrape."
        return self.public_url(unique_id)
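To make the override contract above concrete, here is a minimal hypothetical subclass sketch. The class name, schema slug, seed URL, regex, and date format are invented for this illustration and are not part of the original codebase; a real subclass would point at an actual Seed and site markup.

import re

# A minimal, hypothetical SpecializedCrawler subclass (names, URLs, and
# patterns are placeholders, not from the original code).
class ExampleCityCouncilCrawler(SpecializedCrawler):
    schema = 'city-council-notices'
    seed_url = 'http://example.com/notices/'
    # Named groups must match what save_page() reads from groupdict().
    date_headline_re = re.compile(
        r'<h1>(?P<article_headline>.*?)</h1>\s*'
        r'<span class="date">(?P<article_date>\d{2}/\d{2}/\d{4})</span>',
        re.DOTALL)
    date_format = '%m/%d/%Y'

    def public_url(self, unique_id):
        # The URL we publish for the item with this numeric ID.
        return 'http://example.com/notices/%s/' % unique_id

    def retrieval_url(self, unique_id):
        # Scrape the printer-friendly version instead of the public page.
        return 'http://example.com/notices/%s/print/' % unique_id

# Usage sketch (assumes the matching Seed row already exists):
#   ExampleCityCouncilCrawler().save_page(1234)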
def __init__(self, letters=None, *args, **kwargs):
    super(RestaurantScraper, self).__init__(*args, **kwargs)
    self.letters = letters or DEFAULT_LETTERS
    self.retriever = UnicodeRetriever()
class ZoningUpdater(object):
    def __init__(self):
        self.url = 'http://sfgov.org/site/planning_meeting.asp?id=15840'
        self.retriever = UnicodeRetriever()
        self.delay = 2

    def update(self):
        for year in self.get_years(self.url):
            self.update_year(year['url'])

    def get_years(self, url):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//table[@id='Table4']//a"):
            year_url = 'http://sfgov.org/site/planning_meeting.asp%s' % a.get('href')[:-8]
            yield {'url': year_url, 'year': a.text}

    def update_year(self, url):
        minutes_schema = Schema.objects.get(slug='zoning-minutes')
        agendas_schema = Schema.objects.get(slug='zoning-agenda')
        for page in self.get_minutes(url):
            self.save_page(page, minutes_schema)
        for page in self.get_agendas(url):
            self.save_page(page, agendas_schema)

    def get_minutes(self, url):
        return self._helper(url, 'Minutes')

    def get_agendas(self, url):
        return self._helper(url, 'Agendas')

    def _helper(self, url, item_type):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//a[@name='%s']/parent::td/parent::tr/following-sibling::*[4]//a" % item_type):
            if '(cancellation notice)' in a.text.lower():
                continue
            url = 'http://sfgov.org/site/%s' % a.get('href')
            yield {'title': a.text, 'url': url}

    def save_page(self, page, schema):
        url = page['url']
        # If we've already retrieved the page, there's no need to retrieve
        # it again.
        try:
            Blob.objects.filter(url=url)[0]
        except IndexError:
            pass
        else:
            #self.logger.debug('URL %s has already been retrieved', url)
            return
        # Fetch the html for the page and save it
        html = self.retriever.get_html(url + '&mode=text')
        b = Blob(
            schema=schema,
            title=page['title'],
            url=url,
            html=html,
            is_pdf=False,
            when_crawled=datetime.now(),
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
        ).save()
        time.sleep(self.delay)
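A minimal usage sketch, assuming the Django settings, the 'zoning-minutes' and 'zoning-agenda' schemas, and the database referenced above are already in place:

# Crawl every year's minutes and agendas linked from the planning page,
# saving each new document as a Blob.
ZoningUpdater().update()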
    # Excerpt: the tail of article_text_sections(), whose result
    # article_text() below flattens into a single list of paragraphs.
    to_delete = []
    for i, paragraph in enumerate(section):
        if paragraph.lower() in ignored_paragraphs:
            to_delete.append(i)
        elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
            count += 1
    percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
    if count >= NUM_PARAGRAPHS_SAFE_GUESS or (count >= MIN_NUM_PUNCTUATED and
                                              percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
        for i in reversed(to_delete):  # Delete in reverse so that index order is preserved.
            del section[i]
        final_sections.append(section)
    return final_sections

def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections
    into a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result

if __name__ == "__main__":
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().get_html(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
def __init__(self, *args, **kwargs):
    self.get_archive = kwargs.pop('get_archive', False)
    super(SeattleFireDispatchScraper, self).__init__(*args, **kwargs)
    self.retriever = UnicodeRetriever()
def main():
    url = 'http://buzzfeed.com/michaelrusch/a-new-trailer-from-anchorman-2-is-released-and-its-awesome'
    html = UnicodeRetriever().fetch_data(url)
    ReadableText(html)