Example #1
0
    def clean_list_record(self, record):
        record.title = convert_entities(record['title'])
        record.description = convert_entities(record['description'])
        # Don't know why, but some feeds have 'id' *instead* of 'link'.
        if record.get('id', '').startswith('http'):
            record['link'] = record['id']

        # This tries GeoRSS, RDF Geo, xCal, ...
        point, location_name = self.get_point_and_location_name(record)

        _short_title = record['title'][:30] + '...'

        if not point:
            raise SkipRecord("couldn't geocode any addresses in item '%s...'"
                             % _short_title)

        if not location_name:
            raise SkipRecord(
                "Skip, no location name and failed to reverse geocode %s for %r" % (point.wkt, _short_title))

        if not intersects_metro_bbox(point):
            # Check if latitude, longitude seem to be reversed; I've
            # seen that in some bad feeds!
            reversed_loc = Point(point.y, point.x)
            if intersects_metro_bbox(reversed_loc):
                self.logger.info(
                    "Got points in apparently reverse order, flipping them")
                point = reversed_loc
            else:
                raise SkipRecord("Skipping %r as %s,%s is out of bounds" %
                                 (_short_title, point.y, point.x))

        record['location_name'] = location_name
        record['location'] = point
        return record
Example #2
0
def main():
    """ Download Calendar RSS feed and update database """

    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'

    parser = OptionParser()
    parser.add_option('-q', '--quiet', action="store_true", dest="quiet",
        default=False, help="no output")

    (options, args) = parser.parse_args()

    if len(args) > 0:
        return parser.error('script does not take any arguments')

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)

    feed = feedparser.parse(url)

    for entry in feed.entries:
        try:
            item = NewsItem.objects.get(title=entry.title,
                description=entry.description)
            status = "Updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "Added"

        try:
            item.location_name = entry.get('xcal_x-calconnect-street')
            item.schema = schema
            item.title = convert_entities(entry.title)
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point((float(entry['geo_long']),
                                   float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                print "Skipping %r, bad location 0,0" % item.title
                continue

            item.save()
            if not options.quiet:
                print "%s: %s" % (status, item.title)
        except ValueError:
            if not options.quiet:
                print "unexpected error:", sys.exc_info()[1]
Example #3
0
def text_from_html(html):
    """Remove ALL tags and return all plain text.
    """
    text = preprocess_to_string(html, drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    text = convert_entities(text)
    return text
Example #4
0
    def clean_detail_record(self, record):
        if record == {}:
            # Parsing the detail page failed.
            return None
        if record['violation_points'].startswith('Not Available'):
            record['violation_points'] = 'N/A'
            record['followup_inspection'] = False
        else:
            if not record['violation_points'].isdigit():
                raise ScraperBroken('Got odd violation points value %r' %
                                    record['violation_points'])
            record['followup_inspection'] = int(
                record['violation_points']) > 27

        # Parse the violations from the HTML chunk. When we're done,
        # record['violation_list'] will be a (possibly empty) list of strings.
        vio_chunk = record.pop('violations')
        if vio_chunk == '':
            record['violation_list'] = []
        else:
            vios = violation_list_re.findall(vio_chunk)
            if not vios:
                raise ScraperBroken(
                    "Violation data not found for restaurant %s" %
                    record['restaurant_name'])
            record['violation_list'] = [
                strip_tags(convert_entities(v.strip())) for v in vios
            ]

        # Remove the ZIP code from the address, as it complicates geocoding.
        record['address'] = re.sub(r'\s*\d{5}\s*$', '', record['address'])
        # Strip extra internal whitespace.
        record['address'] = re.sub(r'\s+', ' ', record['address'])

        return record
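The two address clean-ups at the end are plain re.sub calls; a stdlib-only illustration with a made-up address:

import re

address = '1912 N Main St   Rochester MN  55901'
address = re.sub(r'\s*\d{5}\s*$', '', address)  # drop the trailing ZIP code
address = re.sub(r'\s+', ' ', address)          # collapse internal whitespace
# address -> '1912 N Main St Rochester MN'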
Example #5
0
    def clean_detail_record(self, record):
        if record == {}:
            # Parsing the detail page failed.
            return None
        if record['violation_points'].startswith('Not Available'):
            record['violation_points'] = 'N/A'
            record['followup_inspection'] = False
        else:
            if not record['violation_points'].isdigit():
                raise ScraperBroken('Got odd violation points value %r' % record['violation_points'])
            record['followup_inspection'] = int(record['violation_points']) > 27

        # Parse the violations from the HTML chunk. When we're done,
        # record['violation_list'] will be a (possibly empty) list of strings.
        vio_chunk = record.pop('violations')
        if vio_chunk == '':
            record['violation_list'] = []
        else:
            vios = violation_list_re.findall(vio_chunk)
            if not vios:
                raise ScraperBroken("Violation data not found for restaurant %s", record['restaurant_name'])
            record['violation_list'] = [strip_tags(convert_entities(v.strip())) for v in vios]

        # Remove the ZIP code from the address, as it complicates geocoding.
        record['address'] = re.sub(r'\s*\d{5}\s*$', '', record['address'])
        # Strip extra internal whitespace.
        record['address'] = re.sub(r'\s+', ' ', record['address'])

        return record
Example #6
0
def text_from_html(html):
    """Remove ALL tags and return all plain text.
    """
    text = preprocess_to_string(html,
                                drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    text = convert_entities(text)
    return text
Example #7
0
    def clean_list_record(self, record):
        # clean up a record dict
        # Item date, in timezone of the photo owner.
        # Not sure how to determine what that is, so we'll leave it.
        cleaned = {}
        cleaned['item_date'] = datetime.datetime.strptime(record['datetaken'],
                                                          '%Y-%m-%d %H:%M:%S')
        cleaned['item_date'] = cleaned['item_date'].date()
        # Posted date, UTC timestamp.
        pub_date = datetime.datetime.fromtimestamp(
            float(record['dateupload']), utc)
        cleaned['pub_date'] = pub_date.astimezone(local_tz)

        description = record['description']['_content']
        cleaned['description'] = convert_entities(description.strip())

        cleaned['title'] = convert_entities(record['title'])

        x, y = record['longitude'], record['latitude']
        cleaned['location'] = Point((float(x), float(y)))

        # Possibly we could figure out flickr's geo API and resolve
        # the photo's place_id and/or woeid to the place name?  But
        # those are probably not specific enough; reverse-geocode
        # instead.
        try:
            block, distance = reverse_geocode(cleaned['location'])
            cleaned['location_name'] = block.pretty_name
        except ReverseGeocodeError:
            raise SkipRecord("Could not geocode location %s, %s" % (x, y))

        # Don't think any of the urls returned by the API's "extras"
        # correspond to the page? not sure.
        cleaned['url'] = 'http://www.flickr.com/photos/%(owner)s/%(id)s' % record

        attributes = {}
        attributes['sourcename'] = 'Flickr'
        #attributes['photo_id'] = record['id']
        attributes['user_id'] = record['owner']
        attributes['username'] = record['ownername']
        # Thumbnail. 'Small square' photos are 75x75.
        attributes['photo_href'] = record['url_m']
        cleaned['_attributes'] = attributes
        return cleaned
Example #8
0
    def clean_list_record(self, record):
        # clean up a record dict
        # Item date, in timezone of the photo owner.
        # Not sure how to determine what that is, so we'll leave it.
        cleaned = {}
        cleaned['item_date'] = datetime.datetime.strptime(
            record['datetaken'], '%Y-%m-%d %H:%M:%S')
        cleaned['item_date'] = cleaned['item_date'].date()
        # Posted date, UTC timestamp.
        pub_date = datetime.datetime.fromtimestamp(float(record['dateupload']),
                                                   utc)
        cleaned['pub_date'] = pub_date.astimezone(local_tz)

        description = record['description']['_content']
        cleaned['description'] = convert_entities(description.strip())

        cleaned['title'] = convert_entities(record['title'])
        x, y = record['longitude'], record['latitude']
        cleaned['location'] = Point((float(x), float(y)))

        # Possibly we could figure out flickr's geo API and resolve
        # the photo's place_id and/or woeid to the place name?  But
        # those are probably not specific enough; reverse-geocode
        # instead.
        try:
            block, distance = reverse_geocode(cleaned['location'])
            cleaned['location_name'] = block.pretty_name
        except ReverseGeocodeError:
            raise SkipRecord("Could not geocode location %s, %s" % (x, y))

        # Don't think any of the urls returned by the API's "extras"
        # correspond to the page? not sure.
        cleaned['url'] = 'http://www.flickr.com/photos/%(owner)s/%(id)s' % record

        attributes = {}
        attributes['sourcename'] = 'Flickr'
        #attributes['photo_id'] = record['id']
        attributes['user_id'] = record['owner']
        attributes['username'] = record['ownername']
        # Thumbnail. 'Small square' photos are 75x75.
        attributes['photo_href'] = record['url_sq']
        cleaned['_attributes'] = attributes
        return cleaned
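A side note on the posted-date handling above: passing a tzinfo to datetime.datetime.fromtimestamp yields an aware datetime, which astimezone then converts. A small sketch assuming pytz supplies the utc and local_tz objects used in the example (the zone name and timestamp are made up):

import datetime
import pytz

utc = pytz.utc
local_tz = pytz.timezone('America/New_York')  # assumed local zone for illustration

pub_date = datetime.datetime.fromtimestamp(1307000000.0, utc)  # aware datetime in UTC
pub_date = pub_date.astimezone(local_tz)                       # converted to local time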
Example #9
0
def html_to_paragraph_list(tree):
    """
    Given an HTML tree, removes HTML tags and returns a list of strings, with
    each string representing a paragraph/block.
    """
    block_tags = set([
        'blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'h7', 'h8', 'li', 'p', 'td', 'th', 'tr'
    ])
    drop_tags_only = set([
        'a', 'abbr', 'acronym', 'b', 'center', 'dir', 'dl', 'em', 'font',
        'form', 'hr', 'i', 'label', 'menu', 'ol', 'pre', 'small', 'span',
        'strong', 'sub', 'sup', 'table', 'tbody', 'tfoot', 'thead', 'topic',
        'u', 'ul', 'wbr'
    ])
    drop_tags_and_contents = set([
        'applet', 'area', 'button', 'embed', 'img', 'iframe', 'head', 'input',
        'link', 'map', 'meta', 'noscript', 'object', 'option', 'script',
        'select', 'spacer', 'style', 'textarea', 'title'
    ])

    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring):  # If it's a comment...
            element.drop_tag()
            continue
        if element.text and '\n' in element.text:
            element.text = element.text.replace('\n', ' ')
        if element.tail and '\n' in element.tail:
            element.tail = element.tail.replace('\n', ' ')
        if element.tag in block_tags:
            element.text = '\n' + (element.text or '')
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag == 'br':
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            elements_to_drop.append(element)
        elif element.tag not in ('html', 'body'):  # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()

    try:
        tree.body
    except IndexError:
        return ''
    else:
        new_html = etree.tostring(
            tree.body, method='html')[6:-7]  # strip <body> and </body>
        new_html = convert_entities(new_html)
        return re.split(r'\s*\n+\s*', new_html.strip())
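A minimal usage sketch for html_to_paragraph_list, assuming the function and its helpers (convert_entities, etree) are importable from their module and that lxml is installed; the HTML is made up:

import lxml.html

html = '<div><p>First paragraph.</p><p>Second paragraph.</p></div>'
tree = lxml.html.document_fromstring(html)
paragraphs = html_to_paragraph_list(tree)
# paragraphs -> roughly ['First paragraph.', 'Second paragraph.']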
Example #10
0
def text_from_html(html):
    """Remove ALL tags and return all plain text.
    """
    text = preprocess_to_string(html, drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    if not text:
        # Maybe there was something there but not really HTML.
        if html and not isinstance(html, unicode):
            text = UnicodeDammit(html, isHTML=False).unicode.strip()
        else:
            text = u''
    text = convert_entities(text)
    return text
Example #11
0
def text_from_html(html):
    """Remove ALL tags and return all plain text.
    """
    text = preprocess_to_string(html,
                                drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    if not text:
        # Maybe there was something there but not really HTML.
        if html and not isinstance(html, unicode):
            text = UnicodeDammit(html, isHTML=False).unicode.strip()
        else:
            text = u''
    text = convert_entities(text)
    return text
Example #12
0
    def geocode_if_needed(self,
                          point,
                          location_name,
                          address_text='',
                          **kwargs):
        """
        If either ``point`` or ``location_name`` is not set, try to
        geocode / reverse-geocode as needed to derive one from the
        other.  Returns (point, location_name).

        If neither one is set, try to parse addresses out of
        ``address_text`` and derive both.

        Either value may be None if it can't be determined.

        Any other keyword args are passed to ``full_geocode()``.
        """
        if not point:
            text = convert_entities(location_name or address_text)
            self.logger.debug("...Falling back on geocoding from '%s...'" %
                              text[:50])
            addrs = parse_addresses(text)
            for addr, unused in addrs:
                try:
                    result = self.geocode(addr, **kwargs)
                    if result is not None:
                        point = result['point']
                        self.logger.debug("internally geocoded %r" % addr)
                        # TODO: what if it's a Place?
                        if not location_name:
                            location_name = result['address']
                        break
                except:
                    self.logger.exception(
                        'uncaught geocoder exception on %r\n' % addr)
                    continue

        if point and not location_name:
            # Fall back to reverse-geocoding.
            from ebpub.geocoder import reverse
            try:
                block, distance = reverse.reverse_geocode(point)
                self.logger.debug(" Reverse-geocoded point to %r" %
                                  block.pretty_name)
                location_name = block.pretty_name
            except reverse.ReverseGeocodeError:
                location_name = None

        return (point, location_name)
Example #13
0
    def clean_list_record(self, record):
        record.title = convert_entities(record['title'])
        record.description = convert_entities(record['description'])
        # Don't know why, but some feeds have 'id' *instead* of 'link'.
        if record.get('id', '').startswith('http'):
            record['link'] = record['id']

        # This tries GeoRSS, RDF Geo, xCal, ...
        point, location_name = self.get_point_and_location_name(record)

        _short_title = record['title'][:30] + '...'

        if not point:
            raise SkipRecord("couldn't geocode any addresses in item '%s...'" %
                             _short_title)

        if not location_name:
            raise SkipRecord(
                "Skip, no location name and failed to reverse geocode %s for %r"
                % (point.wkt, _short_title))

        if not intersects_metro_bbox(point):
            # Check if latitude, longitude seem to be reversed; I've
            # seen that in some bad feeds!
            reversed_loc = Point(point.y, point.x)
            if intersects_metro_bbox(reversed_loc):
                self.logger.info(
                    "Got points in apparently reverse order, flipping them")
                point = reversed_loc
            else:
                raise SkipRecord("Skipping %r as %s,%s is out of bounds" %
                                 (_short_title, point.y, point.x))

        record['location_name'] = location_name
        record['location'] = point
        return record
Example #14
0
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting ObituaryScraper")
        feed = feedparser.parse(self.url)
        total_created = 0
        for entry in feed.entries:
            title = convert_entities(entry.title)
            try:
                created = self.parse_entry(entry, title)
                if created:
                    total_created += 1
            except:
                logger.error("unexpected error: %s", sys.exc_info()[1])
                log_exception()
                break
        logger.info("Created %d of %d total" % (total_created, len(feed.entries)))
Example #15
0
    def geocode_if_needed(self, point, location_name, address_text='',
                          **kwargs):
        """
        If either ``point`` or ``location_name`` is not set, try to
        geocode / reverse-geocode as needed to derive one from the
        other.  Returns (point, location_name).

        If neither one is set, try to parse addresses out of
        ``address_text`` and derive both.

        Either value may be None if it can't be determined.

        Any other keyword args are passed to ``full_geocode()``.
        """
        if not point:
            text = convert_entities(location_name or address_text)
            self.logger.debug("...Falling back on geocoding from '%s...'" % text[:50])
            addrs = parse_addresses(text)
            for addr, unused in addrs:
                try:
                    result = self.geocode(addr, **kwargs)
                    if result is not None:
                        point = result['point']
                        self.logger.debug("internally geocoded %r" % addr)
                        # TODO: what if it's a Place?
                        if not location_name:
                            location_name = result['address']
                        break
                except:
                    self.logger.exception('uncaught geocoder exception on %r\n' % addr)
                    continue

        if point and not location_name:
            # Fall back to reverse-geocoding.
            from ebpub.geocoder import reverse
            try:
                block, distance = reverse.reverse_geocode(point)
                self.logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                location_name = block.pretty_name
            except reverse.ReverseGeocodeError:
                location_name = None

        return (point, location_name)
示例#16
0
def html_to_paragraph_list(tree):
    """
    Given an HTML tree, removes HTML tags and returns a list of strings, with
    each string representing a paragraph/block.
    """
    block_tags = set(['blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'li', 'p', 'td', 'th', 'tr'])
    drop_tags_only = set(['a', 'abbr', 'acronym', 'b', 'center', 'dir', 'dl', 'em', 'font', 'form', 'hr', 'i', 'label', 'menu', 'ol', 'pre', 'small', 'span', 'strong', 'sub', 'sup', 'table', 'tbody', 'tfoot', 'thead', 'topic', 'u', 'ul', 'wbr'])
    drop_tags_and_contents = set(['applet', 'area', 'button', 'embed', 'img', 'iframe', 'head', 'input', 'link', 'map', 'meta', 'noscript', 'object', 'option', 'script', 'select', 'spacer', 'style', 'textarea', 'title'])

    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring): # If it's a comment...
            element.drop_tag()
            continue
        if element.text and '\n' in element.text:
            element.text = element.text.replace('\n', ' ')
        if element.tail and '\n' in element.tail:
            element.tail = element.tail.replace('\n', ' ')
        if element.tag in block_tags:
            element.text = '\n' + (element.text or '')
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag == 'br':
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            elements_to_drop.append(element)
        elif element.tag not in ('html', 'body'): # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()

    try:
        tree.body
    except IndexError:
        return ''
    else:
        new_html = etree.tostring(tree.body, method='html')[6:-7] # strip <body> and </body>
        new_html = convert_entities(new_html)
        return re.split(r'\s*\n+\s*', new_html.strip())
Example #17
0
    def parse_entry(self, entry, title):
        try:
            item = NewsItem.objects.get(title=title, schema__id=self.schema.id)
        except NewsItem.DoesNotExist:
            item = NewsItem(title=title, schema=self.schema)
        description = convert_entities(entry.description)
        try:
            location, description = description.split(' -- ', 1)
        except ValueError:
            logger.error("Unable to parse description: %s", description)
            return
        item.url = entry.link
        item.description = description
        item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
        try:
            item.location_name = self.geocoder.geocode(location)
        except geocoder.DoesNotExist:
            logger.error("Failed to geocode %s" % location)
            item.location_name = location
        created = item.pk is None
        item.save()
        return created
Example #18
0
def update():
    """ Download Calendar RSS feed and update database """
    logger.info("Starting add_events")
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'


    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)

    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title).strip()
        # Putting 'event' in the title is redundant, ticket #227
        if title.lower().startswith('event: '):
            title = title[7:]
        try:
            item = NewsItem.objects.get(title=title,
                                        schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
            continue
        try:
            item.location_name = entry.get('xcal_x-calconnect-street') or entry.get('x-calconnect-street') or u''
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point((float(entry['geo_long']),
                                   float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue

            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''

            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("unexpected error:", sys.exc_info()[1])

    logger.info("add_events finished: %d added, %d updated" % (addcount, updatecount))
Example #19
0
    def unique_fields(self, list_record):
        # not necessarily primary key, but for this script's purposes
        # these are the fields that in combination uniquely identify
        # an article.
        date = datetime.date(*list_record['updated_parsed'][:3])
        precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5']
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we can set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        if 'Boston 24' in tags:
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  Make a separate schema for aggregates,
            # with attributes like those used in
            # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
            # These are citywide though, not by precinct.
            # So what would be the Location?  Whole city??
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")

        description = list_record['content'][0]['value']
        # TODO: we should have a stock 'clean up html' function.
        description = preprocess_to_string(
            description,
            drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'p', 'strong', 'map', 'small', 'span', 'sub', 'sup', 'topic', 'u'),
            drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea'),
            drop_attrs=('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target'))
        from ebdata.retrieval.utils import convert_entities
        description = convert_entities(description)
        #description = description.replace('&nbsp;', ' ').replace('&#160;', ' ')

        addrs = parse_addresses(description)
        if not addrs:
            self.logger.info("no addresses found in %r" % list_record['title'])

        location = None
        location_name = u''
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(addr)
                if (x, y) != (None, None):
                    location = Point((float(x), float(y)))
                    location_name = addr.title()
            except:
                print "ugh, %r" % addr
                # XXX log something

        return dict(item_date=date,
                    location=location,
                    location_name=location_name,
                    title=list_record['title'],
                    description=description,
                    )
Example #20
0
def update(url):
    logger.info("Scraping police reports")
    schema_slug = 'police'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    incident_type_field = SchemaField.objects.get(schema=schema, name='incident_type')

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title).strip()
        # The title will be used as the incident type.
        if title in SKIP_TYPES:
            logger.info("Skipping entry of type %s" % title)
            continue
        description = convert_entities(entry.summary)
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.pub_date = datetime(*entry.updated_parsed[:6])
            item.location = Point((float(entry.geo_long), float(entry.geo_lat)))
            item.location_name = get_element(entry, 'address')

            # parse call time
            ct = datetime.strptime(get_element(entry, 'calldatetime'),
                                   r"%m/%d/%Y %I:%M:%S %p")
            #ct = datetime(ct.year, ct.month, ct.day, ct.hour, ct.minute, ct.second, tzinfo=tzlocal())
            #ct = ct.astimezone(tzutc())

            item.item_date = ct
            item.save()


            # extra attributes
            try:
                item.attributes['calldatetime'] = ct
            except: 
                pass

            try: 
                item.attributes['innum'] = int(get_element(entry, 'innum'))
            except: 
                pass
                
            for k in ['disp', 'aptlot', 'address']: 
                try: 
                    item.attributes[k] = get_element(entry, k)
                except: 
                    pass

            # create a lookup based on the title, this is the closest thing to 
            # a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(incident_type_field, title, title, "", False)
            item.attributes['incident_type'] = lu.id


            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))
    logger.info("Finished scraping police reports: %d added, %d updated" % (addcount, updatecount))
Example #21
0
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error( "Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get('xcal_x-calconnect-street') or entry.get('x-calconnect-street') or entry.get('georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." % text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            item.block = result['block']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error('uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug("Skip, couldn't geocode any addresses in item '%s...'"
                                     % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them")
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.info(" Skip, failed to reverse geocode %s for %r" % (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" % _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" % (addcount, updatecount))
Example #22
0
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting EventsCalendarScraper")
        
        feed = feedparser.parse(self.url)
        seencount = addcount = updatecount = 0
        for entry in feed.entries:

            def ns_get(element):
                # work around feedparser unpredictability.
                namespace, element = element.split(':')
                result = entry.get('%s_%s' % (namespace, element))
                if result is None:
                    result = entry.get(element)
                return result

            seencount += 1
            title = convert_entities(entry.title)
            try:
                item = NewsItem.objects.get(title=title,
                                            schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
                continue
            try:
                item.location_name = '%s %s' % (ns_get('xcal:x-calconnect-venue-name'),
                                                ns_get('xcal:x-calconnect-street'))
                item.location_name = item.location_name.strip()
                item.schema = self.schema
                item.title = title
                item.description = convert_entities(entry.description)
                item.url = entry.link
                start_dt = ns_get('xcal:dtstart')
                start_dt = dateutil.parser.parse(start_dt)
                # Upstream bug: They provide a UTC offset of +0000 which
                # means times in UTC, but they're actually times in
                # US/Eastern, so do *not* fix the zone.
                #start_dt = start_dt.astimezone(local_tz)
                item.item_date = start_dt.date()
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                item.location = Point((float(ns_get('geo:long')),
                                       float(ns_get('geo:lat'))))
                if (item.location.x, item.location.y) == (0.0, 0.0):
                    logger.warn("Skipping %r, bad location 0,0" % item.title)
                    continue

                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(item.location)
                        logger.info(" Reverse-geocoded point to %r" % block.pretty_name)
                        item.location_name = block.pretty_name
                        item.block = block
                    except reverse.ReverseGeocodeError:
                        logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                        item.location_name = u''

                item.save()
                item.attributes['start_time'] = start_dt.time()
                end_dt = ns_get('xcal:dtend') or u''
                if end_dt.strip():
                    end_dt = dateutil.parser.parse(end_dt.strip())
                    #end_dt = end_dt.astimezone(local_tz)
                    item.attributes['end_time'] = end_dt.time()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
            except:
                logger.error("unexpected error:", sys.exc_info()[1])
                log_exception()
        logger.info("EventsCalendarScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Example #23
0
def main(argv=None):
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        print "Schema (%s): DoesNotExist" % schema
        sys.exit(1)

    f = feedparser.parse(url)

    for e in f.entries:
        try:
            item = NewsItem.objects.get(title=e.title, description=e.description)
            print "Already have %r (id %d)" % (item.title, item.id)
        except NewsItem.DoesNotExist:
            item = NewsItem()
        try:
            item.schema = schema
            item.title = convert_entities(e.title)
            item.description = convert_entities(e.description)
            item.url = e.link
            item.location_name = e.get('x-calconnect-street') or e.get('georss_featurename')
            item.item_date = datetime.datetime(*e.updated_parsed[:6])
            item.pub_date = datetime.datetime(*e.updated_parsed[:6])
            if 'point' in e:
                x,y = e.point.split(' ')
            elif 'georss_point' in e:
                x,y = e.georss_point.split(' ')
            else:
                text = item.title + ' ' + item.description
                from geocoder_hack import quick_dirty_fallback_geocode
                x, y = quick_dirty_fallback_geocode(text, parse=True)
                if None in (x, y):
                    print " couldn't geocode '%s...'" % item.title[:30]
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                print "Skipping %r as it has bad location 0,0" % item.title
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    print " Reverse-geocoded point to %r" % block.pretty_name
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    print " Failed to reverse geocode %s for %r" % (item.location.wkt, item.title)
                    item.location_name = u''
            item.save()
            print "Added: %s" % item.title
        except:
            print "Warning: couldn't save %r. Traceback:" % item.title
            import cStringIO, traceback
            f = cStringIO.StringIO()
            traceback.print_exc(file=f)
            msg = f.getvalue()
            print msg
Example #24
0
def update(url):
    schema_slug = 'sheriff'
    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    incident_type_field = SchemaField.objects.get(schema=schema, name='incident_type')


    try: 
        innum_field = SchemaField.objects.get(schema=schema, name='innum')
    except SchemaField.DoesNotExist: 
        logger.error( "SchemaField innum Does Not Exist for %s" % schema_slug)
        sys.exit(1)

    logger.info("Scraping %s" % schema.name)


    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        
        innum = int(get_element(entry, 'innum'))
        title = convert_entities(entry.title)
        description = convert_entities(entry.summary)

        try:
            item = NewsItem.objects.filter(schema=schema).by_attribute(innum_field, innum)[0]
            #url=item_url)
            status = 'updated'
        except IndexError:
            item = NewsItem()
            status = 'added'

        try:
            item.title = title
            item.schema = schema
            item.description = description

            try:
                item.location = Point((float(entry.geo_long), float(entry.geo_lat)))
            except:
                logger.info("Skipping item %s with no location information" % innum)
                continue

            item.location_name = get_element(entry, 'address')


            # this feed uses an invalidly formatted pubDate which 
            # appears to be intended to express the time of the 
            # incident, used for publication time as well.
            # 24 hour time. 
            ct = datetime.strptime(entry.updated, r"%m/%d/%Y %H:%M:%S")
            #ct = datetime(ct.year, ct.month, ct.day, ct.hour, ct.minute, ct.second, tzinfo=tzlocal())
            #ct = ct.astimezone(tzutc())

            item.item_date = ct
            item.pub_date = ct.date()
            item.save()

            # extra attributes
            item.attributes['innum'] = innum

            for k in ['address']: 
                try:
                    item.attributes[k] = get_element(entry, k)
                except:
                    pass

            # create a lookup based on the title, this is the closest thing to 
            # a category that is available in the data.
            lu = Lookup.objects.get_or_create_lookup(incident_type_field, title, title, "", False)
            item.attributes['incident_type'] = lu.id


            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback: %s" % (item.title, traceback.format_exc()))

    
    logger.info("Finished scraping %s: %d added, %d updated" % (schema.name, addcount, updatecount))
Example #25
0
def update():
    """ Download Calendar RSS feed and update database """
    logger.info("Starting add_events")
    url = """http://calendar.boston.com/search?acat=&cat=&commit=Search\
&new=n&rss=1&search=true&sort=0&srad=20&srss=50&ssrss=5&st=event\
&st_select=any&svt=text&swhat=&swhen=today&swhere=&trim=1"""
    schema = 'events'

    try:
        schema = Schema.objects.get(slug=schema)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema)
        sys.exit(1)

    feed = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in feed.entries:
        title = convert_entities(entry.title).strip()
        # Putting 'event' in the title is redundant, ticket #227
        if title.lower().startswith('event: '):
            title = title[7:]
        try:
            item = NewsItem.objects.get(title=title, schema__id=schema.id)
            status = "updated"
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = "added"
        except NewsItem.MultipleObjectsReturned:
            logger.warn(
                "Multiple entries matched title %r, event titles are not unique?"
                % title)
            continue
        try:
            item.location_name = entry.get(
                'xcal_x-calconnect-street') or entry.get(
                    'x-calconnect-street') or u''
            item.schema = schema
            item.title = title
            item.description = convert_entities(entry.description)
            item.url = entry.link
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
            item.location = Point(
                (float(entry['geo_long']), float(entry['geo_lat'])))
            if (item.location.x, item.location.y) == (0.0, 0.0):
                logger.warn("Skipping %r, bad location 0,0" % item.title)
                continue

            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.info(" Reverse-geocoded point to %r" %
                                block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''

            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("unexpected error:", sys.exc_info()[1])

    logger.info("add_events finished: %d added, %d updated" %
                (addcount, updatecount))
Example #26
0
def article_text_sections(tree):
    """
    Given an HTML tree of a news article (or blog entry permalink), deduces
    which part of it is text and returns a list of lists of strings, with each
    string representing a paragraph and each list of strings representing a
    "section" of the page.
    """

    # The basic algorithm here is to combine all text within the same block
    # (e.g., a <div>).

    MIN_NUM_PARAGRAPHS = 3
    MIN_NUM_PUNCTUATED = 3

    # In order for a paragraph to be counted toward MIN_NUM_PUNCTUATED, it must
    # have this number of characters.
    MIN_CHARS_IN_PARAGRAPH = 30

    # If this many paragraphs with MIN_CHARS_IN_PARAGRAPH are included in the
    # section, then the section will be included, regardless of failing
    # MIN_PERCENTAGE_PUNCTUATED.
    NUM_PARAGRAPHS_SAFE_GUESS = 6

    # In order for a section to be included in the result, at least this
    # percentage of paragraphs in the section must be punctuated.
    MIN_PERCENTAGE_PUNCTUATED = decimal.Decimal(".5")

    block_tags = set(
        ["blockquote", "dd", "div", "dt", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "li", "p", "td", "th", "tr"]
    )
    drop_tags_only = set(
        [
            "a",
            "abbr",
            "acronym",
            "b",
            "center",
            "dir",
            "dl",
            "em",
            "font",
            "form",
            "hr",
            "i",
            "label",
            "menu",
            "ol",
            "pre",
            "small",
            "span",
            "strong",
            "sub",
            "sup",
            "table",
            "tbody",
            "tfoot",
            "thead",
            "topic",
            "u",
            "ul",
            "wbr",
        ]
    )
    drop_tags_and_contents = set(
        [
            "applet",
            "area",
            "button",
            "embed",
            "img",
            "iframe",
            "head",
            "input",
            "link",
            "map",
            "meta",
            "noscript",
            "object",
            "option",
            "script",
            "select",
            "spacer",
            "style",
            "textarea",
            "title",
        ]
    )
    layout_tags = set(["div", "h1", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "td", "th", "tr"])
    is_open_tag = re.compile("^<[^/][^>]+>$").search
    is_close_tag = re.compile("^</[^>]+>$").search
    ignored_paragraphs = set(
        [
            "del.icio.us",
            "digg",
            "email",
            "e-mail editor",
            "e-mail story",
            "no comments",
            "print",
            "print article",
            "printer-friendly",
            "printer version",
            "reprints",
        ]
    )

    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring):  # If it's a comment...
            element.drop_tag()
            continue
        if element.text and "\n" in element.text:
            element.text = element.text.replace("\n", " ")
        if element.tail and "\n" in element.tail:
            element.tail = element.tail.replace("\n", " ")
        if element.tag in block_tags:
            element.text = "\n" + (element.text or "")
            element.tail = "\n" + (element.tail or "")
        elif element.tag == "br":
            element.tail = "\n" + (element.tail or "")
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            elements_to_drop.append(element)
        elif element.tag not in ("html", "body"):  # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()

    for element in tree.getiterator():
        if element.tag in block_tags:
            if element.tag in layout_tags:
                element.text = "\n<%s>\n%s\n" % (element.tag, (element.text or ""))
                element.tail = "\n</%s>\n%s\n" % (element.tag, (element.tail or ""))
            element.drop_tag()

    try:
        tree.body
    except IndexError:
        # In some cases, the article is missing a <body> tag, and tree.body
        # will result in an IndexError. Just skip these.
        return []

    new_html = etree.tostring(tree.body, method="html")
    new_html = convert_entities(new_html)
    lines = re.split(r"\s*\n+\s*", new_html.strip())
    result = []
    sections = []
    for line in lines:
        if is_open_tag(line):
            result.append([])
        elif is_close_tag(line):
            last_bit = result.pop()
            if len(last_bit) >= MIN_NUM_PARAGRAPHS:
                sections.append(last_bit)
        else:  # It's text, not a tag.
            try:
                result[-1].append(line)
            except IndexError:  # No tags seen yet.
                result.append([line])

    # Cut out the sections that don't contain enough punctuated sentences.
    final_sections = []
    for section in sections:
        count = 0
        to_delete = []
        for i, paragraph in enumerate(section):
            if paragraph.lower() in ignored_paragraphs:
                to_delete.append(i)
            elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
                count += 1
        percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (
            count >= MIN_NUM_PUNCTUATED and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED
        ):
            for i in reversed(to_delete):  # Delete in reverse so that index order is preserved.
                del section[i]
            final_sections.append(section)
    return final_sections
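A hedged usage sketch for article_text_sections, assuming the function and its helpers (is_punctuated, convert_entities, etree) are importable and lxml is installed; the page content is invented:

import lxml.html

html = ('<div>'
        '<p>This opening paragraph is long enough to count as prose, and it ends with a period.</p>'
        '<p>A second paragraph with a reasonable amount of punctuated text in it.</p>'
        '<p>A third paragraph, so the section clears the minimum paragraph count.</p>'
        '</div>')
tree = lxml.html.document_fromstring(html)
sections = article_text_sections(tree)
# sections -> roughly one section containing the three paragraphs above,
# provided they pass the length and punctuation thresholds.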
Example #27
0
def article_text_sections(tree):
    """
    Given an HTML tree of a news article (or blog entry permalink), deduces
    which part of it is text and returns a list of lists of strings, with each
    string representing a paragraph and each list of strings representing a
    "section" of the page.
    """

    # The basic algorithm here is to combine all text within the same block
    # (e.g., a <div>).

    MIN_NUM_PARAGRAPHS = 3
    MIN_NUM_PUNCTUATED = 3

    # In order for a paragraph to be counted toward MIN_NUM_PUNCTUATED, it must
    # have this number of characters.
    MIN_CHARS_IN_PARAGRAPH = 30

    # If this many paragraphs with MIN_CHARS_IN_PARAGRAPH are included in the
    # section, then the section will be included, regardless of failing
    # MIN_PERCENTAGE_PUNCTUATED.
    NUM_PARAGRAPHS_SAFE_GUESS = 6

    # In order for a section to be included in the result, at least this
    # percentage of paragraphs in the section must be punctuated.
    MIN_PERCENTAGE_PUNCTUATED = decimal.Decimal('.5')

    block_tags = set([
        'blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'h7', 'h8', 'li', 'p', 'td', 'th', 'tr'
    ])
    drop_tags_only = set([
        'a', 'abbr', 'acronym', 'b', 'center', 'dir', 'dl', 'em', 'font',
        'form', 'hr', 'i', 'label', 'menu', 'ol', 'pre', 'small', 'span',
        'strong', 'sub', 'sup', 'table', 'tbody', 'tfoot', 'thead', 'topic',
        'u', 'ul', 'wbr'
    ])
    drop_tags_and_contents = set([
        'applet', 'area', 'button', 'embed', 'img', 'iframe', 'head', 'input',
        'link', 'map', 'meta', 'noscript', 'object', 'option', 'script',
        'select', 'spacer', 'style', 'textarea', 'title'
    ])
    layout_tags = set([
        'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'td', 'th', 'tr'
    ])
    is_open_tag = re.compile('^<[^/][^>]+>$').search
    is_close_tag = re.compile('^</[^>]+>$').search
    ignored_paragraphs = set([
        'del.icio.us', 'digg', 'email', 'e-mail editor', 'e-mail story',
        'no comments', 'print', 'print article', 'printer-friendly',
        'printer version', 'reprints'
    ])

    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring): # If it's a comment...
            element.drop_tag()
            continue
        if element.text and '\n' in element.text:
            element.text = element.text.replace('\n', ' ')
        if element.tail and '\n' in element.tail:
            element.tail = element.tail.replace('\n', ' ')
        if element.tag in block_tags:
            element.text = '\n' + (element.text or '')
            element.tail = '\n' + (element.tail or '')
        elif element.tag == 'br':
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            elements_to_drop.append(element)
        elif element.tag not in ('html', 'body'): # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()

    for element in tree.getiterator():
        if element.tag in block_tags:
            if element.tag in layout_tags:
                element.text = '\n<%s>\n%s\n' % (element.tag, (element.text or ''))
                element.tail = '\n</%s>\n%s\n' % (element.tag, (element.tail or ''))
            element.drop_tag()

    try:
        tree.body
    except IndexError:
        # In some cases, the article is missing a <body> tag, and tree.body
        # will result in an IndexError. Just skip these.
        return []

    new_html = etree.tostring(tree.body, method='html')
    new_html = convert_entities(new_html)
    lines = re.split(r'\s*\n+\s*', new_html.strip())
    result = []
    sections = []
    for line in lines:
        if is_open_tag(line):
            result.append([])
        elif is_close_tag(line):
            last_bit = result.pop()
            if len(last_bit) >= MIN_NUM_PARAGRAPHS:
                sections.append(last_bit)
        else: # It's text, not a tag.
            try:
                result[-1].append(line)
            except IndexError: # No tags seen yet.
                result.append([line])

    # Cut out the sections that don't contain enough punctuated sentences.
    final_sections = []
    for section in sections:
        count = 0
        to_delete = []
        for i, paragraph in enumerate(section):
            if paragraph.lower() in ignored_paragraphs:
                to_delete.append(i)
            elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
                count += 1
        percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (count >= MIN_NUM_PUNCTUATED and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
            for i in reversed(to_delete): # Delete in reverse so that index order is preserved.
                del section[i]
            final_sections.append(section)
    return final_sections
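A minimal usage sketch for the function above; the fetch helper below is hypothetical and assumes lxml.html is available alongside the module's own convert_entities and is_punctuated helpers:

import urllib2

from lxml import html


def paragraphs_from_url(url):
    # Build the lxml tree that article_text_sections expects, then flatten
    # its list-of-sections result into a single list of paragraph strings.
    tree = html.document_fromstring(urllib2.urlopen(url).read())
    sections = article_text_sections(tree)
    return [paragraph for section in sections for paragraph in section]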
Example #28
0
    def update(self):
        logger.info("Starting LocalNewsScraper update %s" % self.url)

        try:
            schema = Schema.objects.get(slug=self.schema_slug)
        except Schema.DoesNotExist:
            logger.error("Schema (%s): DoesNotExist" % self.schema_slug)
            return 1

        response, content = self.http.request(self.url)
        if response.fromcache:
            logger.info("Feed is unchanged since last update (cached)")
            return

        f = feedparser.parse(content)
        addcount = updatecount = 0
        for entry in f.entries:
            title = convert_entities(entry.title)
            description = convert_entities(entry.description)

            if entry.id.startswith('http'):
                item_url = entry.id
            else:
                item_url = entry.link
            try:
                item = NewsItem.objects.get(schema__id=schema.id,
                                            title=title,
                                            description=description)
                #url=item_url)
                status = 'updated'
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = 'added'
            except NewsItem.MultipleObjectsReturned:
                # Seen some where we get the same story with multiple URLs. Why?
                logger.warn(
                    "Multiple entries matched title %r and description %r. Expected unique!"
                    % (title, description))
                continue
            try:
                item.title = title
                item.schema = schema
                item.description = description
                item.url = item_url
                # Support both georss and xcal for getting the location name.
                # TODO: should also support ev:location per http://web.resource.org/rss/1.0/modules/event/
                item.location_name = entry.get(
                    'xCal_x-calconnect-street') or entry.get(
                        'x-calconnect-street') or entry.get(
                            'georss_featurename') or entry.get('featurename')
                item.item_date = datetime.datetime(*entry.updated_parsed[:6])
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                _short_title = item.title[:30] + '...'

                # feedparser bug: depending on which parser it magically uses,
                # we either get the xml namespace in the key name, or we don't.
                point = entry.get('georss_point') or entry.get('point')
                x, y = None, None
                if point:
                    # GeoRSS puts latitude (Y) first.
                    y, x = point.split(' ')
                else:
                    if item.location_name:
                        text = item.location_name
                    else:
                        # Geocode whatever we can find.
                        text = item.title + ' ' + item.description
                    logger.debug("...Falling back on geocoding from %r..." %
                                 text[:50])
                    addrs = parse_addresses(text)
                    for addr, unused in addrs:
                        try:
                            result = SmartGeocoder().geocode(addr)
                            point = result['point']
                            logger.debug("internally geocoded %r" % addr)
                            x, y = point.x, point.y
                            if not item.location_name:
                                item.location_name = result['address']
                            break
                        except GeocodingException:
                            logger.debug("Geocoding exception on %r:" % text)
                            log_exception(level=logging.DEBUG)
                            continue
                        except:
                            logger.error(
                                'uncaught geocoder exception on %r\n' % addr)
                            log_exception()
                    if None in (x, y):
                        logger.debug(
                            "Skip, couldn't geocode any addresses in item '%s...'"
                            % _short_title)
                        continue
                item.location = Point((float(x), float(y)))
                if not intersects_metro_bbox(item.location):
                    reversed_loc = Point((float(y), float(x)))
                    if intersects_metro_bbox(reversed_loc):
                        logger.info(
                            "Got points in apparently reverse order, flipping them"
                        )
                        item.location = reversed_loc
                    else:
                        logger.info("Skipping %r as %s,%s is out of bounds" %
                                    (_short_title, y, x))
                        continue
                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.debug(" Reverse-geocoded point to %r" %
                                     block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.info(
                            " Skip, failed to reverse geocode %s for %r" %
                            (item.location.wkt, _short_title))
                        continue
                item.save()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, _short_title))
            except:
                logger.error("Warning: couldn't save %r. Traceback:" %
                             _short_title)
                log_exception()
        logger.info("Finished LocalNewsScraper update: %d added, %d updated" %
                    (addcount, updatecount))
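The address-parsing fallback inside the scraper above can be read as a small helper of its own; a hedged sketch using the same ebpub calls that already appear in the code (the helper name is an assumption):

def geocode_first_address(text):
    # Return (point, address) for the first address in `text` that the
    # SmartGeocoder can resolve, or (None, None) if nothing geocodes.
    for addr, unused in parse_addresses(text):
        try:
            result = SmartGeocoder().geocode(addr)
            return result['point'], result['address']
        except GeocodingException:
            logger.debug("Geocoding exception on %r" % addr)
            continue
    return None, None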
Example #29
0
	def update(self):

		#
		#
		# Download Calendar RSS feed and update database
		#
		#

		logger.info("Starting KSUStudentProgrammingScraper")

		feed = feedparser.parse(self.url)
		seencount = addcount = updatecount = 0
		for entry in feed.entries:

			seencount += 1
			title = convert_entities(entry.title)
			title = foo(title, '', ' (')
			try:
				item = NewsItem.objects.get(title=title,
											schema__id=self.schema.id)
				status = "updated"
			except NewsItem.DoesNotExist:
				item = NewsItem()
				status = "added"
			except NewsItem.MultipleObjectsReturned:
				logger.warn("Multiple entries matched title %r, event titles are not unique?" % title)
				continue
			try:

				#
				#
				# The actual rss feed elements are grabbed here
				#
				#

				itm_description = entry.description

				soup = BeautifulSoup(foo(itm_description,"</table><br />","<br /><br />"))
				locations = soup.findAll(text=True)
				location = locations[0].strip()
				place_grabber = places.place_grabber()
				grab_results = place_grabber(location)
				try:
					item.location = Place.objects.get(pretty_name=grab_results[0][2]).location
					item.location_name = Place.objects.get(pretty_name=grab_results[0][2]).pretty_name
				except:
					item.location = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place.location
					item.location_name = PlaceSynonym.objects.get(pretty_name=grab_results[0][2]).place.pretty_name

				try:
					item.attributes['room'] = locations[1].strip().replace("Room: ","")
				except Exception as e:
					logger.info("Tried saving item.room, error: %s" % e)

				item.schema = self.schema
				item.title = title

				soup = BeautifulSoup(foo(itm_description,"<br /><br />","</td></tr>"))
				item.description = soup.findAll(text=True)
				item.description = item.description[0].strip()

				item.url = entry.link

				start_t = foo(itm_description,"Start Time:</b>&nbsp;</td><td>","</td>")
				start_t = dateutil.parser.parse(start_t)

				end_t = foo(itm_description,"End Time:</b>&nbsp;</td><td>","</td>")
				end_t = dateutil.parser.parse(end_t)

				end_dt = foo(itm_description,"End Date:</b>&nbsp;</td><td>","</td>")
				end_dt = dateutil.parser.parse(end_dt)

				item.item_date = dateutil.parser.parse(entry.category)
				item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

				item.attributes['start-time'] = start_t.time()
				item.attributes['end-time'] = end_t.time()

				item.save()

				if status == 'added':
					addcount += 1
				else:
					updatecount += 1
				logger.info("%s: %s" % (status, item.title))
			except Exception as e:
				logger.exception("unexpected error: %s" % e)
		logger.info("KSUStudentProgrammingScraper finished: %d added, %d updated of %s total" % (addcount, updatecount, seencount))
Example #30
0
    def update(self):
        """ Download Calendar RSS feed and update database """
        logger.info("Starting EventsCalendarScraper")

        feed = feedparser.parse(self.url)
        seencount = addcount = updatecount = 0
        for entry in feed.entries:

            def ns_get(element):
                # work around feedparser unpredictability.
                namespace, element = element.split(':')
                result = entry.get('%s_%s' % (namespace, element))
                if result is None:
                    result = entry.get(element)
                return result

            seencount += 1
            title = convert_entities(entry.title)
            try:
                item = NewsItem.objects.get(title=title,
                                            schema__id=self.schema.id)
                status = "updated"
            except NewsItem.DoesNotExist:
                item = NewsItem()
                status = "added"
            except NewsItem.MultipleObjectsReturned:
                logger.warn(
                    "Multiple entries matched title %r, event titles are not unique?"
                    % title)
                continue
            try:
                item.location_name = '%s %s' % (
                    ns_get('xcal:x-calconnect-venue-name'),
                    ns_get('xcal:x-calconnect-street'))
                item.location_name = item.location_name.strip()
                item.schema = self.schema
                item.title = title
                item.description = convert_entities(entry.description)
                item.url = entry.link
                start_dt = ns_get('xcal:dtstart')
                start_dt = dateutil.parser.parse(start_dt)
                # Upstream bug: They provide a UTC offset of +0000 which
                # means times in UTC, but they're actually times in
                # US/Eastern, so do *not* fix the zone.
                #start_dt = start_dt.astimezone(local_tz)
                item.item_date = start_dt.date()
                item.pub_date = datetime.datetime(*entry.updated_parsed[:6])
                item.location = Point(
                    (float(ns_get('geo:long')), float(ns_get('geo:lat'))))
                if (item.location.x, item.location.y) == (0.0, 0.0):
                    logger.warn("Skipping %r, bad location 0,0" % item.title)
                    continue

                if not item.location_name:
                    # Fall back to reverse-geocoding.
                    from ebpub.geocoder import reverse
                    try:
                        block, distance = reverse.reverse_geocode(
                            item.location)
                        logger.info(" Reverse-geocoded point to %r" %
                                    block.pretty_name)
                        item.location_name = block.pretty_name
                    except reverse.ReverseGeocodeError:
                        logger.debug(" Failed to reverse geocode %s for %r" %
                                     (item.location.wkt, item.title))
                        item.location_name = u''

                item.save()
                item.attributes['start_time'] = start_dt.time()
                end_dt = ns_get('xcal:dtend') or u''
                if end_dt.strip():
                    end_dt = dateutil.parser.parse(end_dt.strip())
                    #end_dt = end_dt.astimezone(local_tz)
                    item.attributes['end_time'] = end_dt.time()
                if status == 'added':
                    addcount += 1
                else:
                    updatecount += 1
                logger.info("%s: %s" % (status, item.title))
            except Exception as e:
                logger.exception("unexpected error: %s" % e)
        logger.info(
            "EventsCalendarScraper finished: %d added, %d updated of %s total"
            % (addcount, updatecount, seencount))
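The ns_get closure above works around feedparser sometimes including the XML namespace in entry keys and sometimes not, the same quirk noted in the other scrapers here. The same idea as a standalone helper, sketched with an assumed name:

def entry_get_ns(entry, qualified_name):
    # 'xcal:dtstart' -> try 'xcal_dtstart' first, then plain 'dtstart'.
    namespace, element = qualified_name.split(':')
    value = entry.get('%s_%s' % (namespace, element))
    if value is None:
        value = entry.get(element)
    return value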
Example #31
0
def main(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error("Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn(
                "Multiple entries matched title %r and description %r. Expected unique!"
                % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get(
                'georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            if point:
                x, y = point.split(' ')
            else:
                # Fall back on geocoding.
                text = item.title + ' ' + item.description
                try:
                    x, y = quick_dirty_fallback_geocode(text, parse=True)
                except GeocodingException:
                    logger.debug("Geocoding exception on %r:" % text)
                    log_exception(level=logging.DEBUG)
                    continue
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(y), float(x)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" %
                            item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" %
                                 block.pretty_name)
                    item.location_name = block.pretty_name
                    item.block = block
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" %
                                 (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.error("Warning: couldn't save %r. Traceback:" % item.title)
            log_exception()
    logger.info("Finished add_news: %d added, %d updated" %
                (addcount, updatecount))
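One recurring subtlety in these scrapers: GeoRSS point strings are "<latitude> <longitude>", while Point() is built as (x, y), i.e. (longitude, latitude), so the order has to be swapped exactly once. A small sketch of that conversion (the helper name is an assumption):

def point_from_georss(georss_point):
    # e.g. "42.35 -71.06" -> Point((-71.06, 42.35))
    lat, lon = georss_point.split(' ')
    return Point((float(lon), float(lat)))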
Example #32
0
def update(argv=None):
    logger.info("Starting add_news")
    if argv:
        url = argv[0]
    else:
        url = 'http://search.boston.com/search/api?q=*&sort=-articleprintpublicationdate&subject=massachusetts&scope=bonzai'
    schema_slug = 'local-news'

    try:
        schema = Schema.objects.get(slug=schema_slug)
    except Schema.DoesNotExist:
        logger.error( "Schema (%s): DoesNotExist" % schema_slug)
        sys.exit(1)

    f = feedparser.parse(url)
    addcount = updatecount = 0
    for entry in f.entries:
        title = convert_entities(entry.title)
        description = convert_entities(entry.description)

        if entry.id.startswith('http'):
            item_url = entry.id
        else:
            item_url = entry.link
        try:
            item = NewsItem.objects.get(schema__id=schema.id,
                                        title=title,
                                        description=description)
            #url=item_url)
            status = 'updated'
        except NewsItem.DoesNotExist:
            item = NewsItem()
            status = 'added'
        except NewsItem.MultipleObjectsReturned:
            # Seen some where we get the same story with multiple URLs. Why?
            logger.warn("Multiple entries matched title %r and description %r. Expected unique!" % (title, description))
            continue
        try:
            item.title = title
            item.schema = schema
            item.description = description
            item.url = item_url
            item.location_name = entry.get('x-calconnect-street') or entry.get('georss_featurename')
            item.item_date = datetime.datetime(*entry.updated_parsed[:6])
            item.pub_date = datetime.datetime(*entry.updated_parsed[:6])

            # feedparser bug: depending on which parser it magically uses,
            # we either get the xml namespace in the key name, or we don't.
            point = entry.get('georss_point') or entry.get('point')
            x, y = None, None
            if point:
                # GeoRSS puts latitude (Y) first.
                y, x = point.split(' ')
            if True:
                # Fall back on geocoding. Note that with `if True:` this runs
                # even when the feed provided a point; a successful geocode
                # overrides the feed's coordinates.
                text = item.title + ' ' + item.description
                addrs = parse_addresses(text)
                for addr, unused in addrs:
                    try:
                        result = SmartGeocoder().geocode(addr)
                        point = result['point']
                        logger.debug("internally geocoded %r" % addr)
                        x, y = point.x, point.y
                        break
                    except (GeocodingException, ParsingError):
                        logger.debug("Geocoding exception on %r:" % text,
                                     exc_info=True)
                        continue
                    except:
                        logger.exception('uncaught geocoder exception on %r\n' % addr)
                if None in (x, y):
                    logger.info("couldn't geocode '%s...'" % item.title[:30])
                    continue
            item.location = Point((float(x), float(y)))
            if item.location.x == 0.0 and item.location.y == 0.0:
                # There's a lot of these. Maybe attempt to
                # parse and geocode if we haven't already?
                logger.info("Skipping %r as it has bad location 0,0" % item.title)
                continue
            if not item.location_name:
                # Fall back to reverse-geocoding.
                from ebpub.geocoder import reverse
                try:
                    block, distance = reverse.reverse_geocode(item.location)
                    logger.debug(" Reverse-geocoded point to %r" % block.pretty_name)
                    item.location_name = block.pretty_name
                except reverse.ReverseGeocodeError:
                    logger.debug(" Failed to reverse geocode %s for %r" % (item.location.wkt, item.title))
                    item.location_name = u''
            item.save()
            if status == 'added':
                addcount += 1
            else:
                updatecount += 1
            logger.info("%s: %s" % (status, item.title))
        except:
            logger.exception("Warning: couldn't save %r. Traceback:" % item.title)

    logger.info("Finished add_news: %d added, %d updated" % (addcount, updatecount))
Example #33
0
def article_text_sections(tree):
    """
    Given an HTML tree of a news article (or blog entry permalink), deduces
    which part of it is text and returns a list of lists of strings, with each
    string representing a paragraph and each list of strings representing a
    "section" of the page.
    """

    # The basic algorithm here is to combine all text within the same block
    # (e.g., a <div>).

    MIN_NUM_PARAGRAPHS = 3
    MIN_NUM_PUNCTUATED = 3

    # In order for a paragraph to be counted toward MIN_NUM_PUNCTUATED, it must
    # have this number of characters.
    MIN_CHARS_IN_PARAGRAPH = 30

    # If this many paragraphs with MIN_CHARS_IN_PARAGRAPH are included in the
    # section, then the section will be included, regardless of failing
    # MIN_PERCENTAGE_PUNCTUATED.
    NUM_PARAGRAPHS_SAFE_GUESS = 6

    # In order for a section to be included in the result, at least this
    # percentage of paragraphs in the section must be punctuated.
    MIN_PERCENTAGE_PUNCTUATED = decimal.Decimal('.5')

    block_tags = set([
        'blockquote', 'dd', 'div', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'h7', 'h8', 'li', 'p', 'td', 'th', 'tr'
    ])
    drop_tags_only = set([
        'a', 'abbr', 'acronym', 'b', 'center', 'dir', 'dl', 'em', 'font',
        'form', 'hr', 'i', 'label', 'menu', 'ol', 'pre', 'small', 'span',
        'strong', 'sub', 'sup', 'table', 'tbody', 'tfoot', 'thead', 'topic',
        'u', 'ul', 'wbr'
    ])
    drop_tags_and_contents = set([
        'applet', 'area', 'button', 'embed', 'img', 'iframe', 'head', 'input',
        'link', 'map', 'meta', 'noscript', 'object', 'option', 'script',
        'select', 'spacer', 'style', 'textarea', 'title'
    ])
    layout_tags = set([
        'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8', 'td', 'th', 'tr'
    ])
    is_open_tag = re.compile('^<[^/][^>]+>$').search
    is_close_tag = re.compile('^</[^>]+>$').search
    ignored_paragraphs = set([
        'del.icio.us', 'digg', 'email', 'e-mail editor', 'e-mail story',
        'no comments', 'print', 'print article', 'printer-friendly',
        'printer version', 'reprints'
    ])

    elements_to_drop = []
    for element in tree.getiterator():
        if not isinstance(element.tag, basestring):  # If it's a comment...
            element.drop_tag()
            continue
        if element.text and '\n' in element.text:
            element.text = element.text.replace('\n', ' ')
        if element.tail and '\n' in element.tail:
            element.tail = element.tail.replace('\n', ' ')
        if element.tag in block_tags:
            element.text = '\n' + (element.text or '')
            element.tail = '\n' + (element.tail or '')
        elif element.tag == 'br':
            element.tail = '\n' + (element.tail or '')
            element.drop_tag()
        elif element.tag in drop_tags_only:
            element.drop_tag()
        elif element.tag in drop_tags_and_contents:
            elements_to_drop.append(element)
        elif element.tag not in ('html', 'body'):  # Unknown tag!
            element.drop_tag()
    for e in elements_to_drop:
        e.drop_tree()

    for element in tree.getiterator():
        if element.tag in block_tags:
            if element.tag in layout_tags:
                element.text = '\n<%s>\n%s\n' % (element.tag,
                                                 (element.text or ''))
                element.tail = '\n</%s>\n%s\n' % (element.tag,
                                                  (element.tail or ''))
            element.drop_tag()

    try:
        tree.body
    except IndexError:
        # In some cases, the article is missing a <body> tag, and tree.body
        # will result in an IndexError. Just skip these.
        return []

    new_html = etree.tostring(tree.body, method='html')
    new_html = convert_entities(new_html)
    lines = re.split(r'\s*\n+\s*', new_html.strip())
    result = []
    sections = []
    for line in lines:
        if is_open_tag(line):
            result.append([])
        elif is_close_tag(line):
            last_bit = result.pop()
            if len(last_bit) >= MIN_NUM_PARAGRAPHS:
                sections.append(last_bit)
        else:  # It's text, not a tag.
            try:
                result[-1].append(line)
            except IndexError:  # No tags seen yet.
                result.append([line])

    # Cut out the sections that don't contain enough punctuated sentences.
    final_sections = []

    for section in sections:
        count = 0
        to_delete = []
        for i, paragraph in enumerate(section):

            # Quotes seem to cause is_punctuated to fail, so strip them out.
            paragraph = paragraph.replace('"', '').strip()
            paragraph = paragraph.replace("'", '').strip()

            if paragraph.lower() in ignored_paragraphs:
                to_delete.append(i)
            elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
                count += 1

        percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))

        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (
                count >= MIN_NUM_PUNCTUATED
                and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
            # Delete in reverse so that index order is preserved.
            for i in reversed(to_delete):
                del section[i]
            final_sections.append(section)
    return final_sections
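is_punctuated itself is not shown in any of these snippets; judging from how it is used (deciding whether a paragraph reads like a prose sentence), a hypothetical minimal version might look like this. It is an assumption, not the original implementation:

def is_punctuated(paragraph):
    # Hypothetical: count a paragraph as punctuated if it ends with
    # sentence-final punctuation. The real helper may be more elaborate.
    return paragraph.rstrip().endswith(('.', '!', '?'))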