예제 #1
0
    def clean_list_record(self, list_record):
        """
        Given a dict, prepare it for saving as a newsitem.

        The result is a dictionary holding every entry of list_record
        whose key is the name of a field of the NewsItem model, plus:

         - 'location': the point found for the record (see below).
         - 'location_name': the name returned by geocode_if_needed().
         - 'attributes': a sub-dictionary of every entry whose key is
           the name of a SchemaField of self.schema.

        Unrecognized keys are ignored (and logged at debug level).

        NOTE: list_record is modified in place; every key that gets
        used is popped from it.

        Locations are found heuristically:
         - If there's a 'location' key, try to split the value into (lat, lon)
           points.  A value that is not a string, or that does not split
           into exactly two parts, is dropped.
         - If there's keys like 'latitude'/'lat' and 'longitude'/'lon'/'long'/'lng', use those
         - If there's a 'location_name', geocode if needed
         - If there's no 'location_name', reverse-geocode if needed

        """
        from ebpub.db.models import NewsItem
        from ebdata.retrieval.utils import get_point

        if 'location' in list_record:
            # If there's a comma- or space-separated location in the
            # original, this gives us a way to use it by mapping it to
            # "lat" and "lon".
            raw_location = list_record.pop('location')
            try:
                lat, lon = re.split(r'[\s,]+', raw_location)
            except (TypeError, ValueError):
                # TypeError: the value was not a string at all.
                # ValueError: it did not split into exactly two parts.
                # Either way this step is best-effort, so carry on.
                pass
            else:
                # Explicit 'lat'/'lon' keys, if present, take precedence.
                list_record.setdefault('lat', lat)
                list_record.setdefault('lon', lon)

        # Now try all the field names recognized by get_point(), eg
        # lat, latitude, lon, long, lng, georss_point, etc.
        point = get_point(list_record)

        core_fields = {}
        for field in NewsItem._meta.fields:
            if field.name in list_record:
                # TODO: coerce types? Or maybe Django's implicit conversion is OK.
                core_fields[field.name] = list_record.pop(field.name)

        # Try to ensure we have both point and location_name;
        # fall back to address extraction from *all* remaining fields.
        address_text = core_fields.get('location_name') or '\n'.join(
            [unicode(s) for s in list_record.values()])
        point, location_name = self.geocode_if_needed(point, address_text)
        core_fields['location'] = point
        core_fields['location_name'] = location_name

        # Attributes.
        attributes = {}
        for sf in self.schema.schemafield_set.all():
            if sf.name in list_record:
                # TODO: coerce types? Or maybe Django's implicit conversion is OK.
                attributes[sf.name] = list_record.pop(sf.name)
        core_fields['attributes'] = attributes

        if list_record:
            # Pass the dict as a logging argument so it is only
            # formatted when debug logging is actually enabled.
            self.logger.debug("Unused stuff from list_record: %s",
                              list_record)
        return core_fields
예제 #2
0
    def clean_list_record(self, list_record):
        """
        Given a dict, prepare it for saving as a newsitem.

        Result will be a dictionary of anything from list_record
        that looks like a known field of the NewsItem model.
        Anything that looks like a known SchemaField of the item's Schema
        will be set in an 'attributes' sub-dictionary.  The result also
        always carries 'location' and 'location_name', as returned by
        geocode_if_needed().

        Unrecognized keys will be ignored (and logged).

        The passed-in list_record is consumed: keys are popped from it
        as they are used.

        Locations are found heuristically:
         - If there's a 'location' key, try to split the value into (lat, lon) points;
           values that are not strings or do not yield exactly two
           parts are silently dropped.
         - If there's keys like 'latitude'/'lat' and 'longitude'/'lon'/'long'/'lng', use those
         - If there's a 'location_name', geocode if needed
         - If there's no 'location_name', reverse-geocode if needed

        """
        from ebpub.db.models import NewsItem
        from ebdata.retrieval.utils import get_point

        fieldnames = [f.name for f in NewsItem._meta.fields]

        if 'location' in list_record:
            # If there's a comma- or space-separated location in the
            # original, this gives us a way to use it by mapping it to
            # "lat" and "lon".
            location_value = list_record.pop('location')
            try:
                lat, lon = re.split(r'[\s,]+', location_value)
            except (TypeError, ValueError):
                # re.split() raises TypeError for a non-string value;
                # the unpacking raises ValueError unless there are
                # exactly two parts.  Parsing is best-effort, so both
                # cases just mean "no usable location here".
                pass
            else:
                # Never override explicitly supplied 'lat'/'lon' keys.
                list_record.setdefault('lat', lat)
                list_record.setdefault('lon', lon)

        # Now try all the field names recognized by get_point(), eg
        # lat, latitude, lon, long, lng, georss_point, etc.
        point = get_point(list_record)

        core_fields = {}
        for fieldname in fieldnames:
            if fieldname in list_record:
                # TODO: coerce types? Or maybe Django's implicit conversion is OK.
                core_fields[fieldname] = list_record.pop(fieldname)

        # Try to ensure we have both point and location_name;
        # fall back to address extraction from *all* fields.
        address_text = core_fields.get('location_name')
        if not address_text:
            address_text = '\n'.join([unicode(s) for s in list_record.values()])
        point, location_name = self.geocode_if_needed(point, address_text)
        core_fields['location'] = point
        core_fields['location_name'] = location_name

        # Attributes.
        attributes = {}
        for sf in self.schema.schemafield_set.all():
            if sf.name in list_record:
                # TODO: coerce types? Or maybe Django's implicit conversion is OK.
                attributes[sf.name] = list_record.pop(sf.name)
        core_fields['attributes'] = attributes

        if list_record:
            # Lazy %-argument: the dict is only rendered when the debug
            # level is enabled.
            self.logger.debug("Unused stuff from list_record: %s", list_record)
        return core_fields
예제 #3
0
    def get_location(self, record):
        """Look for a point in ``record`` by delegating to ``get_point()``.

        Georss, geo, and some non-standard conventions are tried.

        Returns a Point or None.

        Nothing calls this automatically.  A scraper that wants to use
        it should run ``newsitem.location = self.get_location(record)``
        at some point before ``self.save()``.
        """
        from ebdata.retrieval.utils import get_point as find_point
        found = find_point(record)
        return found
예제 #4
0
    def clean_list_record(self, list_record):
        """
        Given a dict, prepare it for saving as a newsitem.

        Result will be a dictionary of anything from list_record
        that looks like a known field of the NewsItem model, plus
        'location', 'location_name' and 'attributes' entries.

        Anything that looks like a known SchemaField of the item's Schema
        will be set as an item in the 'attributes' sub-dictionary.  If
        list_record already has an 'attributes' dict, it is used as the
        starting point and extended.  Attribute values are converted:

         - many-to-many lookup fields: a string or list of strings
           becomes a comma-separated string of Lookup ids;
         - lookup fields: the value becomes the id of the Lookup;
         - anything else: the value is converted with unicode().

        Lookups are fetched with Lookup.objects.get_or_create_lookup().

        Unrecognized keys will be ignored (and logged).

        list_record is modified in place: every key that gets used is
        popped from it.

        Locations are found heuristically:
         - If there's a 'location' key, try to split the value into (lat, lon) points
         - If there's keys like 'latitude'/'lat' and 'longitude'/'lon'/'long'/'lng', use those
         - If there's a 'location_name', geocode if needed
         - If there's no 'location_name', reverse-geocode if possible

        """
        from ebpub.db.models import NewsItem
        from ebdata.retrieval.utils import get_point

        fieldnames = [f.name for f in NewsItem._meta.fields]

        if 'location' in list_record:
            # If there's a comma- or space-separated location in the
            # original, this gives us a way to use it by mapping it to
            # "lat" and "lon".
            try:
                lat, lon = re.split(r'[\s,]+', str(list_record.pop('location')))
                list_record.setdefault('lat', lat)
                list_record.setdefault('lon', lon)
            except ValueError:
                # Did not split into exactly two parts; best-effort only.
                pass

        # Now try all the field names recognized by get_point(), eg
        # lat, latitude, lon, long, lng, georss_point, etc.
        point = get_point(list_record)

        core_fields = {}
        for fieldname in fieldnames:
            if fieldname in list_record:
                # TODO: coerce types? Or maybe Django's implicit conversion is OK.
                core_fields[fieldname] = list_record.pop(fieldname)

        # Try to ensure we have both point and location_name;
        # optionally fall back to address extraction from *all* fields.
        address_text = core_fields.get('location_name')
        if self.get_location_name_from_all_fields and not address_text:
            address_text = '\n'.join([unicode(s) for s in list_record.values()])
        point, location_name = self.geocode_if_needed(point, address_text)
        core_fields['location'] = point
        core_fields['location_name'] = location_name

        # Attributes.  Pop (rather than get) any caller-supplied dict so
        # that it is not reported below as unused.
        attributes = list_record.pop('attributes', {})
        for sf in self.schema.schemafield_set.all():
            if sf.name not in list_record:
                continue
            # TODO: coerce types? Or maybe Django's implicit conversion is OK.
            value = list_record.pop(sf.name)
            if sf.is_many_to_many_lookup():
                # Passed value needs to be a list of strings.
                if isinstance(value, basestring):
                    value = [value]
                lookups = [
                    Lookup.objects.get_or_create_lookup(
                        sf, name=v, code=v, make_text_slug=False
                    )
                    for v in value]
                # Stored as a comma-separated string of Lookup ids.
                value = ','.join([str(lookup.id) for lookup in lookups])
            elif sf.is_lookup:
                # Need an int id.
                value = unicode(value)
                lookup = Lookup.objects.get_or_create_lookup(
                    sf, name=value, code=value, make_text_slug=False)
                value = lookup.id
            else:
                # TODO: handle other types?
                value = unicode(value)
            attributes[sf.name] = value
        core_fields['attributes'] = attributes

        if list_record:
            # Lazy %-argument: only formatted when debug logging is on.
            self.logger.debug("Unused stuff from list_record: %s", list_record)
        return core_fields