def import_into_xapian():
    client = Client(settings.XAPIAN_BASE_URL, settings.XAPIAN_SPECIES_DB)
    client.newdb([{
        'field_name': 'common_name',
        'store': True,
        'freetext': {'language': 'en'} # language used for stemming
    }, {
        'field_name': 'scientific_name',
        'store': True,
        'freetext': {'language': 'en'} # Remove when stemming bug is fixed
    }, {
        'field_name': 'freebase_id',
        'store': True,
        'freetext': {'language': 'en'}
    }], overwrite=True) # replaces existing index if there is one
    # We have a database!
    # Now we create documents
    queue = []
    count = 0
    for row in import_from_file():
        if not row['scientific_name']:
            continue
        count += 1
        doc = Document()
        # doc.id = 'X' will over-ride auto ID /AND/ cause replace if exists
        doc.extend([
            ('common_name', row['name']),
            ('scientific_name', row['scientific_name']),
            ('freebase_id', row['id']),
        ])
        # client.add(doc) - would work here
        queue.append(doc)
        if len(queue) >= 1000:
            client.bulkadd(queue)
            queue = []
            print "Imported %d" % count
    # Catch the remainder
    if queue:
        client.bulkadd(queue)
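
# For reference, a minimal sketch of the import_from_file() generator the
# importer above assumes. The source format here is an assumption (a
# tab-separated dump with id, name and scientific_name columns); only the
# dict keys ('id', 'name', 'scientific_name') are taken from the code above.
import csv

def import_from_file(filename='species.tsv'): # filename is hypothetical
    reader = csv.DictReader(open(filename), delimiter='\t')
    for row in reader:
        # Yield plain dicts shaped the way the importer expects
        yield {
            'id': row.get('id', ''),
            'name': row.get('name', ''),
            'scientific_name': row.get('scientific_name', ''),
        }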
def search_split(request, what, near):
    if near.lower() == 'me':
        (current_location, (lat, lon)) = location_from_request(request)
        if current_location:
            near = current_location
    # Look up location using Google geocoder (more likely to return results)
    name, (lat, lon) = google_geocode(near)
    if not name:
        # Treat as a lat/long pair instead, see if that works
        c = Client(settings.XAPIAN_BASE_URL, settings.XAPIAN_PERSONAL_PREFIX)
        result = c.parse_latlong(near)
        if result['ok']:
            lat, lon = result['latitude'], result['longitude']
        else:
            return search_single(
                request, '%s near %s' % (what, near), bypass=True
            )
    results, results_info, results_corrected_q = search_places(
        what or SEARCH_ALL, details=True, latlon=(lat, lon), num=PER_PAGE
    )
    species_results, species_results_info, species_results_corrected_q = \
        search_known_species(
            what, details=True, default_op=Query.OP_OR,
        )
    users_results, users_results_info, users_results_corrected_q = \
        search_users(
            what, details=True, default_op=Query.OP_OR,
        )
    results = [
        place for place in results if not place.is_unlisted
    ]
    for result in results:
        result.species_list = result.get_species()
        for species in result.species_list:
            if species in species_results:
                species.matches_search = True
        # If we got back a distance, bung that on there too
        # (dividing by 1609.344 converts metres to miles)
        try:
            result.distance = ([
                d for d in results_info['items']
                if d['id'] == 'places.Place:%s' % result.id
            ][0]['geo_distance']['latlon'] / 1609.344)
        except (KeyError, IndexError):
            pass
    return render(request, 'search/search_split.html', {
        'what': what,
        'near': near,
        'location_name': name,
        'results': results,
        'results_info': pformat(results_info),
        'results_corrected_q': results_corrected_q,
        'species_results': (species_results or [])[:5],
        'species_results_info': pformat(species_results_info),
        'species_results_corrected_q': species_results_corrected_q,
        'species_results_more': len(species_results or []) > 5,
        'users_results': users_results,
        'users_results_info': pformat(users_results_info),
        'users_results_corrected_q': users_results_corrected_q,
    })
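
# The c.parse_latlong(near) call above asks the search service whether
# "near" looks like a raw co-ordinate pair. A local sketch of roughly
# equivalent parsing follows; the regex and the accepted formats are
# assumptions, not the service's actual rules - only the shape of the
# returned dict ('ok', 'latitude', 'longitude') comes from the code above.
import re

_LATLONG_RE = re.compile(
    r'^\s*(-?\d+(?:\.\d+)?)\s*[, ]\s*(-?\d+(?:\.\d+)?)\s*$'
)

def parse_latlong(s):
    # Accepts e.g. "51.5, -0.12" or "51.5 -0.12"
    match = _LATLONG_RE.match(s)
    if not match:
        return {'ok': False}
    lat, lon = float(match.group(1)), float(match.group(2))
    # Reject values outside the valid co-ordinate ranges
    if not (-90 <= lat <= 90 and -180 <= lon <= 180):
        return {'ok': False}
    return {'ok': True, 'latitude': lat, 'longitude': lon}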
def import_into_xapian():
    client = Client(
        settings.XAPIAN_BASE_URL, settings.XAPIAN_LOCATION_DB
    )
    try:
        client.deldb()
    except Exception: # e.g. the database may not exist yet
        pass
    client.newdb([{
        'field_name': 'place_name',
        'store': True,
        'freetext': {'language': 'en'} # language used for stemming
    }, {
        'field_name': 'county', # Maps to admin_name2
        'store': True,
        'freetext': {}
    }, {
        'field_name': 'country_code',
        'store': True,
        'freetext': {} # TODO: Use exact match here, not yet implemented
    }, {
        'field_name': 'postal_code',
        'store': True,
        'freetext': {} # TODO: Can we do prefix search only?
    }, {
        'field_name': 'description',
        'store': True, # stored but not indexed
    }, {
        'field_name': 'latlon',
        'store': True,
        'type': 'geo',
        'geo': {}, # no options yet
    }])
    # We have a database!
    # We throw away anything that results in a description that we have
    # already used for something else. There are only 213 (out of 27,000)
    # where a duplicate description has more than one lat/lon pair - so
    # we've chosen to just discard those.
    seen_descriptions = set()
    # Now we create documents
    queue = []
    count = 0
    for row in import_from_file():
        # Some (3) of them don't have lat or lon - ignore those
        if not (row['latitude'] and row['longitude']):
            continue
        description = make_description(row)
        if description in seen_descriptions:
            continue
        seen_descriptions.add(description)
        count += 1
        doc = Document()
        # doc.id = 'X' will over-ride auto ID /AND/ cause replace if exists
        doc.extend([
            ('place_name', row['place_name']),
            ('county', row['admin_name2']),
            ('postal_code', row['postal_code']),
            ('country_code', row['country_code']),
            ('description', description),
            ('latlon', '%s %s' % (
                row['latitude'], row['longitude'],
            )),
            # TODO: Ignoring accuracy field for the moment
        ])
        # client.add(doc) - would work here
        queue.append(doc)
        if len(queue) >= 1000:
            client.bulkadd(queue)
            queue = []
            print "Imported %d" % count
    # Catch the remainder
    if queue:
        client.bulkadd(queue)
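
# make_description() is used above but not shown. A guess at its shape,
# based on the fields available on each row; the exact ordering and
# punctuation of the description string are assumptions.

def make_description(row):
    # e.g. "Brighton, East Sussex, BN1, GB" - skipping any empty parts
    parts = [
        row['place_name'], row['admin_name2'],
        row['postal_code'], row['country_code'],
    ]
    return ', '.join(part for part in parts if part)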