Example #1
def __init__(self):
    try:
        self.seed = Seed.objects.get(url=self.seed_url)
    except Seed.DoesNotExist:
        raise NoSeedYet('You need to add a Seed with the URL %r' %
                        self.seed_url)
    self.logger = logging.getLogger('eb.retrieval.%s.%s' %
                                    (settings.SHORT_NAME, self.schema))
    if self.retriever is None:
        self.retriever = UnicodeRetriever(cache=None,
                                          sleep=self.seed.delay)
Example #2
def main(url):
    if not url:
        print "No url provided"
        sys.exit()

    #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution'
    #h = getHTML(url)
    html = UnicodeRetriever().fetch_data(url)
    tree = make_tree(html)
    lines = article_text(tree)

    file_type = magic.from_buffer(html, mime=True)
    print "File Type: %s" % file_type
    #print html

    url_obj = urlparse(url)
    if not url_obj.path:
        print "URL is top-level"

    if len(lines) < 1:
        print "URL is top-level"

    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #print get_attribute(html, 'img', url)

    img = get_attribute(soup, 'img', url)
    title = get_attribute(soup, 'title', url)
    desc = get_attribute(soup, 'description', lines)

    print "Title: %s" % title
    print "Desc: %s" % desc
    print "IMG: %s" % img
Example #3
def __init__(self):
    try:
        self.seed = Seed.objects.get(url=self.seed_url)
    except Seed.DoesNotExist:
        raise NoSeedYet('You need to add a Seed with the URL %r' % self.seed_url)
    self.logger = logging.getLogger('eb.retrieval.%s.%s' % (settings.SHORT_NAME, self.schema))
    if self.retriever is None:
        self.retriever = UnicodeRetriever(cache=None, sleep=self.seed.delay)
Example #4
def update(seed_id=None):
    """
    Retrieves and saves every new item for every Seed that is an RSS feed.
    """
    retriever = UnicodeRetriever(cache=None)
    logger = logging.getLogger('eb.retrieval.blob_rss')
    qs = Seed.objects.filter(is_rss_feed=True, is_active=True)
    if seed_id is not None:
        qs = qs.filter(id=seed_id)
    for seed in qs:
        updater = FeedUpdater(seed, retriever, logger)
        updater.update()
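
A hedged usage sketch: the task can refresh every active RSS seed, or be restricted to a single Seed by primary key (the id below is a made-up value):

# Update every Seed with is_rss_feed=True and is_active=True.
update()

# Or limit the run to one seed; 42 is a hypothetical primary key.
update(seed_id=42)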
Example #5
def add_newsitem(seed_url, seed_name, url, article_headline, article_date,
                 name_excerpts):
    schema = Schema.objects.get(slug='news-articles')
    geocoder = SmartGeocoder()
    try:
        s = Seed.objects.get(url=seed_url)
    except Seed.DoesNotExist:
        s = Seed.objects.create(
            url=seed_url,
            base_url=seed_url,
            delay=0,
            depth=0,
            is_crawled=False,
            is_rss_feed=False,
            is_active='t',
            rss_full_entry=False,
            normalize_www=3,
            pretty_name=seed_name,
            schema=schema,
            autodetect_locations=True,
            guess_article_text=False,
            strip_noise=False,
            city='',
        )
    try:
        p = Page.objects.get(url=url)
    except Page.DoesNotExist:
        html = UnicodeRetriever().fetch_data(url)
        p = Page.objects.create(seed=s,
                                url=url,
                                scraped_url=url,
                                html=html,
                                when_crawled=datetime.datetime.now(),
                                is_article=True,
                                is_pdf=False,
                                is_printer_friendly=False,
                                article_headline=article_headline,
                                article_date=article_date,
                                has_addresses=None,
                                when_geocoded=None,
                                geocoded_by='',
                                times_skipped=0,
                                robot_report='')
    data_tuples = []
    for location_name, excerpt in name_excerpts:
        point = geocoder.geocode(location_name)  # Let exceptions bubble up.
        data_tuples.append(
            (location_name, point['point'], excerpt, point['block']))
    return geotag_page(p.id, seed_name, schema, url, data_tuples,
                       article_headline, article_date)
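
A hedged call sketch for add_newsitem(); every argument value below is invented for illustration. The (location name, excerpt) pairs are geocoded and then handed to geotag_page():

import datetime

add_newsitem(
    seed_url='http://example.com/news/',           # hypothetical seed URL
    seed_name='Example News',                      # hypothetical seed name
    url='http://example.com/news/park-approved/',  # hypothetical article URL
    article_headline='City approves new park',
    article_date=datetime.date(2013, 10, 1),
    name_excerpts=[('123 Main St.', 'The new park at 123 Main St. opens next spring.')],
)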
Example #6
def getHTML(url):
    """Get the HTML for a URL, or None if retrieval fails for any reason."""
    html = None
    try:
        html = UnicodeRetriever().fetch_data(url)
    except URLError:
        #if printMsg: print "[getHTML]: Error - URLError - %s" % url
        return None
    except (HTTPError, BadStatusLine, InvalidURL):
        #if printMsg: print "[getHTML]: Error - HTTPError - %s" % url
        return None
    except (socket.timeout, ssl.SSLError):
        #if printMsg: print "[getHTML]: Error - Timeout - %s" % url
        return None
    except Exception as e:
        #if printMsg: print "[getHTML]: Error - %s - %s" % (url, e)
        return None
    return html
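
Callers only need a truthiness check, since the helper returns None on any failure instead of raising (the URL below is a placeholder):

html = getHTML('http://example.com/some-article/')
if html is not None:
    tree = make_tree(html)  # e.g. hand the page to the treeutils helpers shown elsewhere on this page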
Example #7
class SpecializedCrawler(object):
    """
    Base class for Page crawlers.
    """

    schema = None
    seed_url = None
    date_headline_re = None
    date_format = None
    retriever = None

    def __init__(self):
        try:
            self.seed = Seed.objects.get(url=self.seed_url)
        except Seed.DoesNotExist:
            raise NoSeedYet('You need to add a Seed with the URL %r' % self.seed_url)
        self.logger = logging.getLogger('eb.retrieval.%s.%s' % (settings.SHORT_NAME, self.schema))
        if self.retriever is None:
            self.retriever = UnicodeRetriever(cache=None, sleep=self.seed.delay)

    def save_page(self, unique_id):
        """
        Downloads the page with the given unique ID (possibly a numeric ID, or
        a URL) and saves it as a Page object. Returns the Page object, or None
        if the page couldn't be found.

        The page won't be retrieved/saved if it's already in the database. In
        this case, the existing Page object will be returned.
        """
        self.logger.debug('save_page(%s)', unique_id)
        retrieval_url = self.retrieval_url(unique_id)
        public_url = self.public_url(unique_id)

        try:
            p = Page.objects.get(seed__id=self.seed.id, url=public_url)
        except Page.DoesNotExist:
            pass
        else:
            self.logger.debug('Skipping already-saved URL %s', public_url)
            return p

        try:
            html = self.retriever.get_html(retrieval_url).strip()
        except (RetrievalError, UnicodeDecodeError):
            return None
        if not html:
            self.logger.debug('Got empty page for %s', retrieval_url)
            return None
        self.logger.debug('Got VALID page for %s', retrieval_url)

        m = self.date_headline_re.search(html)
        if not m:
            self.logger.debug('Could not find date/headline on %s', retrieval_url)
            return None
        article_date, article_headline = m.groupdict()['article_date'], m.groupdict()['article_headline']
        try:
            article_date = parse_date(article_date, self.date_format)
        except ValueError:
            self.logger.debug('Got unparseable date %r on %s', article_date, retrieval_url)
            return None
        article_headline = strip_tags(article_headline)
        if len(article_headline) > 255:
            article_headline = article_headline[:252] + '...'

        p = Page.objects.create(
            seed=self.seed,
            url=public_url,
            scraped_url=retrieval_url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
        self.logger.debug('Created Page ID %s' % p.id)
        save_locations_for_page(p)
        return p

    ######################################
    # METHODS SUBCLASSES SHOULD OVERRIDE #
    ######################################

    def public_url(self, unique_id):
        "Given the ID value, returns the URL that we should publish."
        raise NotImplementedError()

    def retrieval_url(self, unique_id):
        "Given the ID value, returns the URL that we should scrape."
        return self.public_url(unique_id)
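
For context, a minimal hypothetical subclass might look roughly like this; the schema slug, seed URL, regex and date format are all invented placeholders, not values from the project:

import re

class ExampleNewsCrawler(SpecializedCrawler):
    # Every attribute value here is a made-up placeholder.
    schema = 'news-articles'
    seed_url = 'http://example.com/news/'
    # save_page() expects the named groups 'article_date' and 'article_headline'.
    date_headline_re = re.compile(
        r'<span class="date">(?P<article_date>[^<]+)</span>\s*'
        r'<h1>(?P<article_headline>.*?)</h1>', re.DOTALL)
    date_format = '%B %d, %Y'

    def public_url(self, unique_id):
        return 'http://example.com/news/%s/' % unique_id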
Example #8
def __init__(self, letters=None, *args, **kwargs):
    super(RestaurantScraper, self).__init__(*args, **kwargs)
    self.letters = letters or DEFAULT_LETTERS
    self.retriever = UnicodeRetriever()
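
A hedged instantiation sketch, assuming the parent scraper class needs no extra constructor arguments; letters falls back to DEFAULT_LETTERS when omitted:

scraper = RestaurantScraper()              # uses DEFAULT_LETTERS
scraper = RestaurantScraper(letters='ab')  # arbitrary subset, for illustration only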
Example #9
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (
                count >= MIN_NUM_PUNCTUATED
                and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
            # Delete in reverse so that index order is preserved.
            for i in reversed(to_delete):
                del section[i]
            final_sections.append(section)
    return final_sections


def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections into
    a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result


if __name__ == "__main__":
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().fetch_data(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
Example #10
def __init__(self):
    self.url = 'http://sfgov.org/site/planning_meeting.asp?id=15840'
    self.retriever = UnicodeRetriever()
    self.delay = 2
Example #11
class ZoningUpdater(object):
    def __init__(self):
        self.url = 'http://sfgov.org/site/planning_meeting.asp?id=15840'
        self.retriever = UnicodeRetriever()
        self.delay = 2

    def update(self):
        for year in self.get_years(self.url):
            self.update_year(year['url'])

    def get_years(self, url):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//table[@id='Table4']//a"):
            year_url = 'http://sfgov.org/site/planning_meeting.asp%s' % a.get(
                'href')[:-8]
            yield {'url': year_url, 'year': a.text}

    def update_year(self, url):
        minutes_schema = Schema.objects.get(slug='zoning-minutes')
        agendas_schema = Schema.objects.get(slug='zoning-agenda')
        for page in self.get_minutes(url):
            self.save_page(page, minutes_schema)
        for page in self.get_agendas(url):
            self.save_page(page, agendas_schema)

    def get_minutes(self, url):
        return self._helper(url, 'Minutes')

    def get_agendas(self, url):
        return self._helper(url, 'Agendas')

    def _helper(self, url, item_type):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath(
                "//a[@name='%s']/parent::td/parent::tr/following-sibling::*[4]//a"
                % item_type):
            if '(cancellation notice)' in a.text.lower():
                continue
            url = 'http://sfgov.org/site/%s' % a.get('href')
            yield {'title': a.text, 'url': url}

    def save_page(self, page, schema):
        url = page['url']
        # If we've already retrieved the page, there's no need to retrieve
        # it again.
        try:
            Blob.objects.filter(url=url)[0]
        except IndexError:
            pass
        else:
            #self.logger.debug('URL %s has already been retrieved', url)
            return

        # Fetch the html for the page and save it
        html = self.retriever.get_html(url + '&mode=text')
        b = Blob(schema=schema,
                 title=page['title'],
                 url=url,
                 html=html,
                 is_pdf=False,
                 when_crawled=datetime.now(),
                 has_addresses=None,
                 when_geocoded=None,
                 geocoded_by='').save()

        time.sleep(self.delay)
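
A minimal driver for the class, assuming Django is already configured so the Schema and Blob ORM calls succeed:

updater = ZoningUpdater()
updater.update()  # walks each year page and saves new agendas/minutes as Blobs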
Example #12
class SpecializedCrawler(object):
    """
    Base class for Page crawlers.
    """

    schema = None
    seed_url = None
    date_headline_re = None
    date_format = None
    retriever = None

    def __init__(self):
        try:
            self.seed = Seed.objects.get(url=self.seed_url)
        except Seed.DoesNotExist:
            raise NoSeedYet('You need to add a Seed with the URL %r' %
                            self.seed_url)
        self.logger = logging.getLogger('eb.retrieval.%s.%s' %
                                        (settings.SHORT_NAME, self.schema))
        if self.retriever is None:
            self.retriever = UnicodeRetriever(cache=None,
                                              sleep=self.seed.delay)

    def save_page(self, unique_id):
        """
        Downloads the page with the given unique ID (possibly a numeric ID, or
        a URL) and saves it as a Page object. Returns the Page object, or None
        if the page couldn't be found.

        The page won't be retrieved/saved if it's already in the database. In
        this case, the existing Page object will be returned.
        """
        self.logger.debug('save_page(%s)', unique_id)
        retrieval_url = self.retrieval_url(unique_id)
        public_url = self.public_url(unique_id)

        try:
            p = Page.objects.get(seed__id=self.seed.id, url=public_url)
        except Page.DoesNotExist:
            pass
        else:
            self.logger.debug('Skipping already-saved URL %s', public_url)
            return p

        try:
            html = self.retriever.fetch_data(retrieval_url).strip()
        except (RetrievalError, UnicodeDecodeError):
            return None
        if not html:
            self.logger.debug('Got empty page for %s', retrieval_url)
            return None
        self.logger.debug('Got VALID page for %s', retrieval_url)

        m = self.date_headline_re.search(html)
        if not m:
            self.logger.debug('Could not find date/headline on %s',
                              retrieval_url)
            return None
        article_date, article_headline = m.groupdict()['article_date'], m.groupdict()['article_headline']
        try:
            article_date = parse_date(article_date, self.date_format)
        except ValueError:
            self.logger.debug('Got unparseable date %r on %s', article_date,
                              retrieval_url)
            return None
        article_headline = strip_tags(article_headline)
        if len(article_headline) > 255:
            article_headline = article_headline[:252] + '...'

        p = Page.objects.create(
            seed=self.seed,
            url=public_url,
            scraped_url=retrieval_url,
            html=html,
            when_crawled=datetime.datetime.now(),
            is_article=True,
            is_pdf=False,
            is_printer_friendly=False,
            article_headline=article_headline,
            article_date=article_date,
            has_addresses=None,
            when_geocoded=None,
            geocoded_by='',
            times_skipped=0,
            robot_report='',
        )
        self.logger.debug('Created Page ID %s' % p.id)
        save_locations_for_page(p)
        return p

    ######################################
    # METHODS SUBCLASSES SHOULD OVERRIDE #
    ######################################

    def public_url(self, unique_id):
        "Given the ID value, returns the URL that we should publish."
        raise NotImplementedError()

    def retrieval_url(self, unique_id):
        "Given the ID value, returns the URL that we should scrape."
        return self.public_url(unique_id)
Example #13
        to_delete = []
        for i, paragraph in enumerate(section):
            if paragraph.lower() in ignored_paragraphs:
                to_delete.append(i)
            elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
                count += 1
        percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (count >= MIN_NUM_PUNCTUATED and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
            for i in reversed(to_delete): # Delete in reverse so that index order is preserved.
                del section[i]
            final_sections.append(section)
    return final_sections

def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections into
    a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result

if __name__ == "__main__":
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().get_html(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
Example #14
def __init__(self, *args, **kwargs):
    self.get_archive = kwargs.pop('get_archive', False)
    super(SeattleFireDispatchScraper, self).__init__(*args, **kwargs)
    self.retriever = UnicodeRetriever()
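
A hedged instantiation sketch; get_archive is popped from the keyword arguments (defaulting to False) before the rest are forwarded to the parent scraper:

scraper = SeattleFireDispatchScraper()                  # current dispatches only
scraper = SeattleFireDispatchScraper(get_archive=True)  # presumably also walks archived dispatches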
Example #15
def __init__(self):
    self.url = 'http://sfgov.org/site/planning_meeting.asp?id=15840'
    self.retriever = UnicodeRetriever()
    self.delay = 2
Example #16
def main():
    url = 'http://buzzfeed.com/michaelrusch/a-new-trailer-from-anchorman-2-is-released-and-its-awesome'
    html = UnicodeRetriever().fetch_data(url)

    ReadableText(html)
Example #17
class ZoningUpdater(object):
    def __init__(self):
        self.url = 'http://sfgov.org/site/planning_meeting.asp?id=15840'
        self.retriever = UnicodeRetriever()
        self.delay = 2

    def update(self):
        for year in self.get_years(self.url):
            self.update_year(year['url'])

    def get_years(self, url):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//table[@id='Table4']//a"):
            year_url = 'http://sfgov.org/site/planning_meeting.asp%s' % a.get('href')[:-8]
            yield {'url': year_url, 'year': a.text}

    def update_year(self, url):
        minutes_schema = Schema.objects.get(slug='zoning-minutes')
        agendas_schema = Schema.objects.get(slug='zoning-agenda')
        for page in self.get_minutes(url):
            self.save_page(page, minutes_schema)
        for page in self.get_agendas(url):
            self.save_page(page, agendas_schema)

    def get_minutes(self, url):
        return self._helper(url, 'Minutes')

    def get_agendas(self, url):
        return self._helper(url, 'Agendas')

    def _helper(self, url, item_type):
        html = self.retriever.get_html(url)
        t = document_fromstring(html)
        for a in t.xpath("//a[@name='%s']/parent::td/parent::tr/following-sibling::*[4]//a" % item_type):
            if '(cancellation notice)' in a.text.lower():
                continue
            url = 'http://sfgov.org/site/%s' % a.get('href')
            yield {'title': a.text, 'url': url}

    def save_page(self, page, schema):
        url = page['url']
        # If we've already retrieved the page, there's no need to retrieve
        # it again.
        try:
            Blob.objects.filter(url=url)[0]
        except IndexError:
            pass
        else:
            #self.logger.debug('URL %s has already been retrieved', url)
            return

        # Fetch the html for the page and save it
        html = self.retriever.get_html(url + '&mode=text')
        b = Blob(
            schema=schema,
            title=page['title'],
            url=url,
            html=html,
            is_pdf=False,
            when_crawled=datetime.now(),
            has_addresses=None,
            when_geocoded=None,
            geocoded_by=''
        ).save()

        time.sleep(self.delay)