def save(self, old_record, list_record, detail_record):
    """Create or update the news item described by one feed entry.

    The base fields come from ``self.unique_fields(list_record)``; the
    description and location computed here then override the ones in that
    dict.  Entries are skipped (nothing is saved, ``None`` is returned) when
    ``unique_fields()`` returns ``None`` or when the entry's coordinates
    are exactly 0,0.

    Parameters:
        old_record: an existing record for this entry, or a false value if
            there is none; decides between update and create.
        list_record: mapping for the feed entry.  This method reads the
            keys ``'geo_long'``, ``'geo_lat'``, ``'title'`` and
            ``'summary'``.
        detail_record: unused by this method.
    """
    kwargs = self.unique_fields(list_record)
    if kwargs is None:
        # unique_fields() bails out with a bare ``return`` for some entries;
        # without this guard kwargs.update() below raises AttributeError.
        print("skipping %r as unique_fields() returned nothing"
              % list_record['title'])
        return

    longitude = float(list_record['geo_long'])
    latitude = float(list_record['geo_lat'])
    # Compare against a real 2-tuple: the previous ``(0,0, 0.0)`` was a
    # 3-tuple, so this check could never succeed.
    if (longitude, latitude) == (0.0, 0.0):
        print("skipping %r as it has bad location 0,0" % list_record['title'])
        return
    location = Point((longitude, latitude))

    # Remove address and rating from summary.
    content = address_re.sub('', list_record['summary'])
    attributes = None
    rating_match = rating_re.search(content)
    if rating_match:
        # The first regex group is stored as an integer 'rating' attribute.
        attributes = {'rating': int(rating_match.group(1))}
        content = rating_re.sub('', content)
    content = preprocess_to_string(content, drop_tags=('p', 'br', 'b'))

    kwargs.update(description=content, location=location)
    if old_record:
        self.update_existing(old_record, kwargs, attributes)
    else:
        self.create_newsitem(attributes=attributes, **kwargs)
def unique_fields(self, list_record):
    """Build the fields that together identify one article.

    These are not necessarily a primary key, but for this script's
    purposes they are the fields that in combination uniquely identify
    an article.

    Parameters:
        list_record: mapping for the feed entry.  This method reads the
            keys ``'updated_parsed'``, ``'tags'`` (a sequence of mappings
            with a ``'term'`` key), ``'content'`` and ``'title'``.

    Returns:
        A dict with the keys ``item_date``, ``location``,
        ``location_name``, ``title`` and ``description``, or ``None`` when
        the entry has no tags.  ``location`` is ``None`` and
        ``location_name`` is empty when no address could be geocoded.
    """
    date = datetime.date(*list_record['updated_parsed'][:3])
    precincts = frozenset(('A1', 'A7', 'B2', 'B3', 'C11', 'C6', 'D14', 'D4',
                           'E13', 'E18', 'E5'))
    tags = [t['term'] for t in list_record['tags']]
    if not tags:
        return

    # TODO: we need a LocationType for precincts, and shapes; and
    # then we can set newsitem.location_object to the Location
    # for this precinct.
    #
    # next() with a default yields None when no tag is a precinct.  The
    # previous for/break loop left the last tag in ``precinct`` in that
    # case, so the message below was never logged.
    precinct = next((tag for tag in tags if tag in precincts), None)
    if precinct is None:
        self.logger.debug("no precinct found in tags %r" % tags)

    if 'Boston 24' in tags:
        # TODO: the 'Boston 24' tag indicates posts with aggregate
        # daily stats.  Make a separate schema for aggregates,
        # with attributes like those used in
        # everyblock/everyblock/cities/nyc/crime_aggregate/retrieval.py.
        # These are citywide though, not by precinct.
        # So what would be the Location?  Whole city??
        self.logger.info("boston daily crime stats, we don't know how to "
                         "handle these yet")

    description = list_record['content'][0]['value']
    # TODO: we should have a stock 'clean up html' function.
    description = preprocess_to_string(
        description,
        drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img',
                   'input', 'p', 'strong', 'map', 'small', 'span', 'sub',
                   'sup', 'topic', 'u'),
        drop_trees=('applet', 'button', 'embed', 'iframe', 'object',
                    'select', 'textarea'),
        drop_attrs=('background', 'border', 'cellpadding', 'cellspacing',
                    'class', 'clear', 'id', 'rel', 'style', 'target'))
    from ebdata.retrieval.utils import convert_entities
    description = convert_entities(description)

    addrs = parse_addresses(description)
    if not addrs:
        self.logger.info("no addresses found in %r" % list_record['title'])

    location = None
    location_name = u''
    # Every address is tried; a later successful geocode overwrites an
    # earlier one, so the last geocodable address wins.
    for addr, unused in addrs:
        addr = addr.strip()
        try:
            # Imported inside the try so that a missing geocoder_hack
            # module stays a logged, best-effort failure.
            from geocoder_hack import quick_dirty_fallback_geocode
            x, y = quick_dirty_fallback_geocode(addr)
            if (x, y) != (None, None):
                location = Point((float(x), float(y)))
                location_name = addr.title()
        except Exception:
            # Was a bare ``except:`` that printed to stdout; that form
            # also swallowed KeyboardInterrupt and SystemExit.
            self.logger.exception("geocoding failed for %r" % addr)

    return dict(item_date=date,
                location=location,
                location_name=location_name,
                title=list_record['title'],
                description=description,
                )