示例#1
0
    def clean_list_record(self, record):
        """Normalize one raw Meetup-style record into the cleaned dict form.

        Raises SkipRecord when the record carries no venue information.
        """
        venue = record.get('venue', {})
        if not venue:
            raise SkipRecord("No venue")
        # Build a readable location name from whichever address pieces exist.
        address_keys = ('address_1', 'address_2', 'city', 'state', 'zip')
        parts = (venue.get(key, '').strip() for key in address_keys)
        location_name = ', '.join(part for part in parts if part)
        # The feed reports event times in epoch milliseconds.
        event_time = datetime.datetime.fromtimestamp(
            record['time'] / 1000.0, local_tz)

        cleaned = {}
        cleaned['title'] = text_from_html(record['name'])
        cleaned['description'] = text_from_html(record.get('description', ''))
        cleaned['location_name'] = location_name
        cleaned['location'] = Point(venue['lon'], venue['lat'])
        cleaned['url'] = record['event_url']
        cleaned['item_date'] = event_time.date()
        # Extra schema attributes ride along under the '_attributes' key.
        cleaned['_attributes'] = {
            'venue_phone': venue.get('phone', ''),
            'venue_name': text_from_html(venue.get('name', '')),
            'start_time': event_time.time(),
            'group_name': record['group']['name'],
        }
        return cleaned
示例#2
0
    def clean_list_record(self, record):
        """Return a cleaned-up dict built from the raw *record*.

        Raises SkipRecord when the record has no venue.
        """
        venue = record.get('venue', {})
        if not venue:
            raise SkipRecord("No venue")
        # Join the non-empty, stripped address components into one name.
        pieces = []
        for key in ('address_1', 'address_2', 'city', 'state', 'zip'):
            piece = venue.get(key, '').strip()
            if piece:
                pieces.append(piece)
        location_name = ', '.join(pieces)
        # Timestamps arrive as epoch milliseconds; convert to local time.
        event_time = datetime.datetime.fromtimestamp(record['time'] / 1000.0,
                                                     local_tz)
        attributes = dict(
            venue_phone=venue.get('phone', ''),
            venue_name=text_from_html(venue.get('name', '')),
            start_time=event_time.time(),
            group_name=record['group']['name'],
        )
        return {
            'title': text_from_html(record['name']),
            'description': text_from_html(record.get('description', '')),
            'location_name': location_name,
            'location': Point(venue['lon'], venue['lat']),
            'url': record['event_url'],
            'item_date': event_time.date(),
            '_attributes': attributes,
        }
    def clean_list_record(self, record):
        """Clean one feed entry; raise SkipRecord when no location is found."""
        date = datetime.date(*record['updated_parsed'][:3])
        description = record['summary']

        # The feed carries no geographic data, so mine the full entry text
        # for addresses, keeping the first that geocodes.  Strip HTML tags
        # first; get_point_and_location_name() (on RssListDetailScraper)
        # does the actual extraction and geocoding.
        full_description = text_from_html(record['content'][0]['value'])
        location, location_name = self.get_point_and_location_name(
            record, address_text=full_description)

        # Fall back to the place grabber + Location table when geocoding
        # produced nothing.
        if not (location or location_name):
            locs = self.grabber(full_description)
            if locs:
                location_name = locs[0][2]
                location = Location.objects.get(name=location_name).location

        if not (location or location_name):
            raise SkipRecord("No location or location_name")

        return {
            'item_date': date,
            'location': location,
            'location_name': location_name,
            'title': record['title'],
            'description': description,
            'url': record['link'],
        }
    def save(self, old_record, list_record, detail_record):
        """Resolve a location for one feed entry and persist it.

        Called once all parsing and cleanup is done.  detail_record is
        ignored since has_detail is False.
        """
        date = datetime.date(*list_record['updated_parsed'][:3])
        description = text_from_html(list_record['summary'])

        # This feed doesn't provide geographic data; we'll try to
        # extract candidate place names from the text and stop on the
        # first one we can resolve against the Location table.
        grabber = places.location_grabber()
        addrs = grabber(description)
        # Fall back to the title when the body yields no candidates.
        if not addrs:
            addrs = grabber(list_record['title'])
            if not addrs:
                self.logger.info("no addresses found")
                return

        location = None
        location_name = u''
        # Ready to geocode. If we had one location_name to try,
        # this could be done automatically in create_or_update(), but
        # we have multiple possible location_names.
        for l, r, name in addrs:
            try:
                locationSyn = LocationSynonym.objects.get(pretty_name=name)
                location = Location.objects.get(name=locationSyn.location).location
            except (GeocodingException,
                    LocationSynonym.DoesNotExist,
                    Location.DoesNotExist):
                # BUG FIX: the ORM .get() calls raise DoesNotExist, which the
                # original `except GeocodingException` did not catch -- an
                # unknown name aborted the whole save instead of moving on
                # to the next candidate.
                log_exception(level=logging.DEBUG)
                continue
            location_name = name
            break
        if location is None:
            self.logger.info("no addresses geocoded in %r" % list_record['title'])
            return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      description=description,
                      title=list_record['title'],
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
示例#5
0
    def clean_list_record(self, record):
        """Clean one Boston crime-feed entry into a save-ready dict.

        Raises SkipRecord for aggregate "Boston 24" posts, for entries
        with no resolvable location, and for entries with no precinct tag.
        """
        if record['title'].startswith(u'Boston 24'):
            # We don't include the summary posts.
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  A separate schema (like the NYC crime
            # aggregates) might fit, but these are citywide rather than
            # per-precinct, so it's unclear what the Location would be.
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")
            raise SkipRecord

        date = datetime.date(*record['updated_parsed'][:3])
        description = record['summary']

        # No geographic data in the feed; mine the entry text for an
        # address and keep the first one that geocodes.
        full_description = text_from_html(record['content'][0]['value'])
        location, location_name = self.get_point_and_location_name(
            record, address_text=full_description)

        if not (location or location_name):
            raise SkipRecord("No location or location_name")

        # Pull the police precinct out of the entry's tags.
        precincts = ('A1', 'A15', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5')
        tags = [t['term'] for t in record['tags']]
        precinct = None
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes;
                # then newsitem.location_object could point at the precinct
                # Location.  For now it is just saved as an attribute.
                precinct = tag
                break

        if precinct is None:
            raise SkipRecord("no precinct found in tags %r" % tags)
        lookup = self.get_or_create_lookup('precinct', precinct, precinct)
        attributes = {'precinct': lookup.id}

        return dict(item_date=date,
                    location=location,
                    location_name=location_name,
                    title=record['title'],
                    description=description,
                    url=record['link'],
                    attributes=attributes,
                    )
    def clean_detail_record(self, detail_record):
        """Return a normalized copy of *detail_record* for saving.

        Renames feedparser fields to our schema and drops the
        bookkeeping keys we don't store.
        """
        cleaned = detail_record.copy()
        cleaned['description'] = text_from_html(cleaned['description'])
        cleaned['url'] = cleaned.pop('link')
        cleaned['item_date'] = datetime.date(*cleaned.pop('updated_parsed')[:3])
        # feedparser metadata we have no use for downstream.
        unwanted = ('updated', 'title_detail', 'summary_detail', 'guidislink',
                    'summary', 'links', 'id')
        for key in unwanted:
            del cleaned[key]

        return cleaned
 def parse_list(self, page):
     """
     yields a dictionary of data for each record on the page.
     See list_detail.py.

     Reviews are located via their RDFa markup (typeof='v:Review').
     Raises StopScraping once a review older than self.start_date
     is encountered.
     """
     tree = self.parse_html(page)
     for review in tree.findall("//*[@typeof='v:Review']"):
         url = review.find(".//a[@class='permalink']").attrib['href']
         # Relative permalinks need the site prefix prepended.
         if url.startswith('/'):
             url = self.base_url + url
         # The rating text looks like "<label>: <value>"; keep the value.
         rating = review.find(".//*[@property='v:rating']").text
         rating = rating.split(':')[-1].strip()
         reviewer = review.find(".//*[@property='v:reviewer']").text
         reviewed = review.find(".//*[@property='v:itemreviewed']").text
         # The description node may be absent; getattr covers the None case.
         description = review.find(".//*[@property='v:description']/p")
         description = getattr(description, 'text', '')
         if description:
             description = text_from_html(description)
         else:
             self.logger.info("Blank description, skipping...")
             continue
         date = review.find(".//*[@property='v:dtreviewed']").text
         date = parse_date(date, '%B %d, %Y')
         # NOTE(review): presumably pages list newest-first, since one old
         # date aborts the entire scrape rather than this page -- confirm.
         if date < self.start_date:
             raise StopScraping("Reached cutoff date %s" % self.start_date)
         data = {
             'url': url,
             'title': reviewed,
             'description': description,
             'item_date': date,
             '_attributes': {
                 'rating': rating,
                 'reviewer': reviewer,
                 'business_name': reviewed,
                 },
             }
         yield data
    def save(self, old_record, list_record, detail_record):
        """Geocode one feed entry and persist it via create_or_update().

        Extracts addresses from the entry body (the feed carries no
        geographic data) and saves the first one that geocodes.
        """
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record['updated_parsed'][:3])

        # Get the precinct from the tags.
        precincts = ['A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                     'E13', 'E18', 'E5']
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return

        # BUG FIX: the original looped `for precinct in tags`, so after a
        # loop with no match `precinct` held the last tag (truthy) and the
        # "no precinct found" log below could never fire.  Use a separate
        # loop variable so `precinct` stays None unless a tag really matches.
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record['summary']

        # Strip HTML so the address parser sees plain text.
        full_description = text_from_html(list_record['content'][0]['value'])

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" % (list_record['title'],
                                                              list_record['link']))
            return

        location = None
        location_name = u''

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except (GeocodingException, ParsingError):
                log_exception(level=logging.DEBUG)
                continue
            location_name = location['address']
            location = location['point']
            break
        else:
            self.logger.info("no addresses geocoded in %r" % list_record['title'])
            return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      title=list_record['title'],
                      description=description,
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
示例#9
0
    def save(self, old_record, list_record, detail_record):
        """Geocode one feed entry and persist it via create_or_update().

        Extracts addresses from the entry body (the feed carries no
        geographic data) and saves the first one that geocodes.
        """
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record["updated_parsed"][:3])

        # Get the precinct from the tags.
        precincts = ["A1", "A7", "B2", "B3", "C11", "C6", "D14", "D4", "E13", "E18", "E5"]
        precinct = None
        tags = [t["term"] for t in list_record["tags"]]
        if not tags:
            return

        # BUG FIX: the original looped `for precinct in tags`, so after a
        # loop with no match `precinct` held the last tag (truthy) and the
        # "no precinct found" log below could never fire.  Use a separate
        # loop variable so `precinct` stays None unless a tag really matches.
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record["summary"]

        # Strip HTML so the address parser sees plain text.
        full_description = text_from_html(list_record["content"][0]["value"])

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" % (list_record["title"], list_record["link"]))
            return

        location = None
        location_name = u""

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.  (The unused `block` assignment
        # from the original has been dropped.)
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except GeocodingException:
                log_exception(level=logging.DEBUG)
                continue
            location_name = location["address"]
            location = location["point"]
            break
        else:
            self.logger.info("no addresses geocoded in %r" % list_record["title"])
            return

        kwargs = dict(
            item_date=date,
            location=location,
            location_name=location_name,
            title=list_record["title"],
            description=description,
            url=list_record["link"],
        )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
示例#10
0
    def save(self, old_record, list_record, detail_record):
        """Geocode one feed entry and persist it via create_or_update().

        Extracts addresses from the entry body (the feed carries no
        geographic data) and saves the first one that geocodes.
        """
        # TODO: move some of this to clean_list_record?
        date = datetime.date(*list_record['updated_parsed'][:3])

        # Get the precinct from the tags.
        precincts = [
            'A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13', 'E18',
            'E5'
        ]
        precinct = None
        tags = [t['term'] for t in list_record['tags']]
        if not tags:
            return

        # BUG FIX: the original looped `for precinct in tags`, so after a
        # loop with no match `precinct` held the last tag (truthy) and the
        # "no precinct found" log below could never fire.  Use a separate
        # loop variable so `precinct` stays None unless a tag really matches.
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct.
                precinct = tag
                break

        if not precinct:
            self.logger.debug("no precinct found in tags %r" % tags)

        description = list_record['summary']

        # Strip HTML so the address parser sees plain text.
        full_description = text_from_html(list_record['content'][0]['value'])

        addrs = parse_addresses(full_description)
        if not addrs:
            self.logger.info("no addresses found in %r %r" %
                             (list_record['title'], list_record['link']))
            return

        location = None
        location_name = u''

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.  (The unused `block` assignment
        # from the original has been dropped.)
        for addr, unused in addrs:
            addr = addr.strip()
            try:
                location = SmartGeocoder().geocode(addr)
            except GeocodingException:
                log_exception(level=logging.DEBUG)
                continue
            location_name = location['address']
            location = location['point']
            break
        else:
            self.logger.info("no addresses geocoded in %r" %
                             list_record['title'])
            return

        kwargs = dict(
            item_date=date,
            location=location,
            location_name=location_name,
            title=list_record['title'],
            description=description,
            url=list_record['link'],
        )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
示例#11
0
    def save(self, old_record, list_record, detail_record):
        """Resolve a place name for one feed entry and persist it.

        Called once all parsing and cleanup is done.  detail_record is
        ignored since has_detail is False.  Entries whose text matches no
        known Place or PlaceSynonym fall back to the "Kent State" default
        location.

        Fixes over the original: the mixed tab/space indentation (a
        SyntaxError under Python 3) is normalized, the Python-2-only
        `print name` debug statement is removed, and the dead initial
        `places.location_grabber()` pass (whose result was immediately
        overwritten) is dropped.
        """
        date = datetime.date(*list_record['updated_parsed'][:3])
        description = text_from_html(list_record['summary'])

        # The feed has no geographic data; grab candidate place names
        # from the description text and match them against the database.
        grabber = places.place_grabber()
        addrs = grabber(description)

        location = None
        location_name = u''

        # If no match is found the article is assigned the Kent State location.
        if not addrs:
            location_name = "Kent State"
            locationSyn = LocationSynonym.objects.get(pretty_name=location_name)
            location = Location.objects.get(name=locationSyn.location).location
            self.logger.info("no matches for place found. Using Kent State default")
        else:
            # Check the grabber's results against the database: first
            # Places are checked, then PlaceSynonyms.  Keep the first match.
            for l, r, name in addrs:
                try:
                    place = Place.objects.get(pretty_name=name)
                    location = place.location
                except Place.DoesNotExist:
                    try:
                        place = PlaceSynonym.objects.get(pretty_name=name)
                        location = place.place.location
                    except PlaceSynonym.DoesNotExist:
                        self.logger.info("no addresses geocoded in %r" % list_record['title'])
                        continue
                location_name = name
                break
            if location is None:
                self.logger.info("no addresses geocoded in %r" % list_record['title'])
                return

        kwargs = dict(item_date=date,
                      location=location,
                      location_name=location_name,
                      description=description,
                      title=list_record['title'],
                      url=list_record['link'],
                      )
        attributes = None
        self.create_or_update(old_record, attributes, **kwargs)
示例#12
0
    def clean_list_record(self, record):
        """Clean one Boston crime-feed entry into a save-ready dict.

        Raises SkipRecord for aggregate "Boston 24" posts, for entries
        with no resolvable location, and for entries with no recognized
        precinct tag.
        """
        if record['title'].startswith(u'Boston 24'):
            # We don't include the summary posts.
            # TODO: the 'Boston 24' tag indicates posts with aggregate
            # daily stats.  Make a separate schema for the aggregates,
            # with attributes like those used in
            # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
            # Or maybe not: these are citywide, not by precinct.
            # So what would be the Location?  Whole city??
            self.logger.info("boston daily crime stats, we don't know how to "
                             "handle these yet")
            raise SkipRecord

        date = datetime.date(*record['updated_parsed'][:3])
        description = record['summary']

        # This feed doesn't provide geographic data; we'll try to
        # extract addresses from the text, and stop on the first
        # one that successfully geocodes.
        full_description = record['content'][0]['value']
        full_description = text_from_html(full_description)
        location, location_name = self.get_point_and_location_name(
            record, address_text=full_description)

        if not (location or location_name):
            raise SkipRecord("No location or location_name")

        # Get the precinct from the tags.
        precincts = [
            'A1', 'A15', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4', 'E13',
            'E18', 'E5'
        ]
        tags = [t['term'] for t in record['tags']]
        precinct = None
        for tag in tags:
            if tag in precincts:
                # TODO: we need a LocationType for precincts, and shapes; and
                # then we could set newsitem.location_object to the Location
                # for this precinct. For now we just save it as an attribute.
                precinct = tag
                break

        attributes = {}
        if precinct:
            # Store the precinct as a Lookup id under the 'precinct' attribute.
            precinct = self.get_or_create_lookup('precinct', precinct,
                                                 precinct)
            attributes['precinct'] = precinct.id
        else:
            raise SkipRecord("no precinct found in tags %r" % tags)

        cleaned = dict(
            item_date=date,
            location=location,
            location_name=location_name,
            title=record['title'],
            description=description,
            url=record['link'],
            attributes=attributes,
        )

        return cleaned