def _calc_category_group_summary(criteria):
    """ Calculate and return a summary of postings by category group.

        'criteria' is a dictionary containing the supplied filter criteria.

        Upon completion, we return a (success, results) tuple, where 'success'
        is True if and only if the summary was successfully calculated.  If
        'success' is True, 'results' will be a list of (type, code, number)
        tuples, where 'type' is the type of summary item, 'code' is the 3taps
        code for that summary item, and 'number' is the number of matching
        postings.  If 'success' is False, 'results' will be a string
        describing why the summary could not be calculated.
    """
    query = None # initially.
    try:
        success,results = searchHelpers.build_search_query(criteria)
        if not success:
            return (False, results)
        else:
            query = results

        # Group the matching postings by category group, counting the number
        # of postings in each group.

        query = query.values("category_group__code")

        results = []
        for row in query.annotate(count=Count("id")):
            results.append(("category_group",
                            row['category_group__code'],
                            row['count']))
    except DatabaseError as e: # "as" form: valid on Python 2.6+ and 3.x.
        transaction.rollback() # Let the database keep working.
        if "statement timeout" in str(e):
            # The query timed out.  Tell the user the bad news.
            if query is not None:
                sql = str(query.query)
                eventRecorder.record("SUMMARIZER_API", "QUERY_TIMED_OUT",
                                     text=sql)
                logger.debug("DATABASE TIMEOUT, query=" + sql)
            transaction.commit()
            return (False, "Database timeout")
        else:
            raise

    # Success path.  NOTE(review): this return was missing from the source as
    # reviewed (the function fell off the end, returning None, which breaks
    # the documented (success, results) contract); reconstructed from the
    # docstring -- confirm against version control.
    return (True, results)
def search(request):
    """ Respond to the "/api/latest/search" URL.

        We search for postings based on the given search criteria, and return
        a JSON response holding the matching postings (limited to the fields
        the caller asked for via 'retvals').
    """
    start_time = time.time()

    if request.method != "GET":
        return HttpResponseNotAllowed(["GET"])

    # Check the caller's authentication.

    # ...eventually.

    # Extract the search criteria.

    criteria = {}
    for param in ["category_group", "category", "country", "state", "metro",
                  "region", "county", "city", "locality", "zipcode", "source",
                  "heading", "body", "text", "timestamp", "id", "price",
                  "currency", "annotations", "external_id", "status",
                  "has_image", "include_deleted", "only_deleted"]:
        if param in request.GET:
            criteria[param] = request.GET[param]

    # If the caller didn't supply a timestamp, add a default timestamp to the
    # search query (the last 24 hours, as a "start..end" range in seconds).

    if "timestamp" not in criteria:
        criteria['timestamp'] = str(int((time.time() - 24*3600))) + '..' + \
                                str(int(time.time()))

    # Extract our other parameters: 'rpp' (results per page, 1..100,
    # default 10)...

    if "rpp" in request.GET:
        try:
            rpp = int(request.GET['rpp'])
        except ValueError:
            return HttpResponse(json.dumps(
                                    {'success' : False,
                                     'error'   : "Invalid 'rpp' value"}),
                                mimetype="application/json")
        if rpp < 1 or rpp > 100:
            return HttpResponse(json.dumps(
                                    {'success' : False,
                                     'error'   : "'rpp' value out of range"}),
                                mimetype="application/json")
    else:
        rpp = 10

    # ...'retvals' (which posting fields to return)...

    if "retvals" in request.GET:
        retvals = set()
        for field in request.GET['retvals'].split(","):
            if field in ["id", "account_id", "source", "category",
                         "category_group", "location", "external_id",
                         "external_url", "heading", "body", "html",
                         "timestamp", "expires", "language", "price",
                         "currency", "images", "annotations", "status",
                         "immortal"]:
                retvals.add(field)
            else:
                return HttpResponse(json.dumps(
                                        {'success' : False,
                                         'error'   : "invalid 'retvals' " +
                                                     "value: " + repr(field)}),
                                    mimetype="application/json")
    else:
        retvals = set(["id", "source", "category", "location", "external_id",
                       "external_url", "heading", "timestamp"])

    # ...'anchor' (posting ID marking where a previous query started) and
    # 'page' (zero-based page number).

    if "anchor" in request.GET:
        anchor = request.GET['anchor']
    else:
        anchor = None

    if "page" in request.GET:
        try:
            page = int(request.GET['page'])
        except ValueError:
            return HttpResponse(json.dumps(
                                    {'success' : False,
                                     'error'   : "Invalid 'page' value"}),
                                mimetype="application/json")
    else:
        page = 0

    # Construct a search query based on the supplied parameters.

    success,result = searchHelpers.build_search_query(criteria)
    if not success:
        return HttpResponse(json.dumps({'success' : False,
                                        'error'   : result}),
                            mimetype="application/json")
    else:
        query = result

    # If we have an anchor, only consider postings at or before it, so paging
    # stays stable while new postings arrive.

    if anchor is not None:
        query = query.filter(id__lte=anchor)

    num_matches = query.count()

    query = query.order_by("-timestamp")
    query = query[page*rpp:page*rpp+rpp]

    sql = str(query.query)

    # Testing: If the caller provided a "return_sql" parameter, return the
    # raw SQL statement rather than running it.

    if (request.GET.get("return_sql") == "1"):
        return HttpResponse(sql)

    # Before running the query, set a timeout so we don't hang if the query
    # takes too long.

    cursor = connection.cursor()
    cursor.execute("SET STATEMENT_TIMEOUT=%s" % settings.QUERY_TIMEOUT)

    # Process the search query, and assemble our search results.

    found_postings = []
    new_anchor     = None

    try:
        for posting in query:
            if anchor is None and new_anchor is None:
                # Remember the ID of the first (ie, most recent) found
                # posting.  This will be our anchor for subsequent requests.
                new_anchor = str(posting.id)

            # Copy across only the fields the caller asked for.

            found_posting = {}
            if "id" in retvals:
                found_posting['id'] = posting.id
            if "account_id" in retvals:
                found_posting['account_id'] = posting.account_id
            if "source" in retvals:
                found_posting['source'] = posting.source.code
            if "category" in retvals:
                found_posting['category'] = posting.category.code
            if "category_group" in retvals:
                found_posting['category_group'] = posting.category_group.code
            if "location" in retvals:
                loc = {}
                if posting.location_latitude is not None:
                    loc['latitude'] = posting.location_latitude
                if posting.location_longitude is not None:
                    loc['longitude'] = posting.location_longitude
                if posting.location_accuracy is not None:
                    loc['accuracy'] = posting.location_accuracy
                if posting.location_country is not None:
                    loc['country'] = posting.location_country.code
                if posting.location_state is not None:
                    loc['state'] = posting.location_state.code
                if posting.location_metro is not None:
                    loc['metro'] = posting.location_metro.code
                if posting.location_region is not None:
                    loc['region'] = posting.location_region.code
                if posting.location_county is not None:
                    loc['county'] = posting.location_county.code
                if posting.location_city is not None:
                    loc['city'] = posting.location_city.code
                if posting.location_locality is not None:
                    loc['locality'] = posting.location_locality.code
                if posting.location_zipcode is not None:
                    loc['zipcode'] = posting.location_zipcode.code
                found_posting['location'] = loc
            if "external_id" in retvals:
                found_posting['external_id'] = posting.external_id
            if "external_url" in retvals:
                found_posting['external_url'] = posting.external_url
            if "heading" in retvals:
                found_posting['heading'] = posting.heading
            if "body" in retvals:
                found_posting['body'] = posting.body
            if "html" in retvals:
                found_posting['html'] = posting.html
            if "timestamp" in retvals:
                found_posting['timestamp'] = datetime_to_seconds(
                                                        posting.timestamp)
            if "expires" in retvals:
                found_posting['expires'] = datetime_to_seconds(posting.expires)
            if "language" in retvals:
                found_posting['language'] = posting.language
            if "price" in retvals:
                found_posting['price'] = posting.price
            if "currency" in retvals:
                found_posting['currency'] = posting.currency
            if "images" in retvals:
                images = []
                for image in posting.imagereference_set.all():
                    dst_image = {}
                    if image.full_url is not None:
                        dst_image['full_url'] = image.full_url
                    if image.full_width is not None:
                        dst_image['full_width'] = image.full_width
                    if image.full_height is not None:
                        dst_image['full_height'] = image.full_height
                    if image.thumbnail_url is not None:
                        dst_image['thumbnail_url'] = image.thumbnail_url
                    if image.thumbnail_width is not None:
                        dst_image['thumbnail_width'] = image.thumbnail_width
                    if image.thumbnail_height is not None:
                        dst_image['thumbnail_height'] = image.thumbnail_height
                    images.append(dst_image)
                found_posting['images'] = images
            if "annotations" in retvals:
                annotations = {}
                for posting_annotation in posting.postingannotation_set.all():
                    # Annotation strings are stored as "key:value".
                    s = posting_annotation.annotation.annotation
                    key,value = s.split(":", 1)
                    annotations[key] = value
                found_posting['annotations'] = annotations
            if "status" in retvals:
                status = {}
                status['offered'] = posting.status_offered
                status['lost']    = posting.status_lost
                status['stolen']  = posting.status_stolen
                status['found']   = posting.status_found
                status['deleted'] = posting.status_deleted
                found_posting['status'] = status
            if "immortal" in retvals:
                found_posting['immortal'] = posting.immortal

            found_postings.append(found_posting)
    except DatabaseError as e: # "as" form: valid on Python 2.6+ and 3.x.
        if "statement timeout" in str(e):
            # The query timed out.  Tell the user the bad news.
            sql = str(query.query)
            logger.debug("DATABASE TIMEOUT, query=" + sql)
            eventRecorder.record("SEARCH_API", "QUERY_TIMED_OUT", text=sql)
            return HttpResponse(json.dumps({'success' : False,
                                            'error'   : "Database timeout"}),
                                mimetype="application/json")
        else:
            logger.exception(e)
            sql = str(query.query)
            return HttpResponse(json.dumps({'success' : False,
                                            'error'   : "Database error"}),
                                mimetype="application/json")

    # Assemble our response.  NOTE(review): the opening of this dictionary
    # was missing from the source as reviewed; reconstructed from the visible
    # closing line, the otherwise-unused 'num_matches', and the later
    # response[...] accesses -- confirm against version control.

    response = {'success'     : True,
                'num_matches' : num_matches,
                'postings'    : found_postings}

    if anchor is None and new_anchor is not None:
        response['anchor'] = new_anchor

    # If the caller gave us an anchor, see if any new postings have come in
    # since the original query was made.

    if anchor is not None:
        success,query = searchHelpers.build_search_query(criteria)
        if success:
            response['new_postings'] = query.filter(id__gt=anchor).count()

    # Record an event telling us how long the search request took.

    end_time   = time.time()
    time_taken = int((end_time - start_time) * 1000) # Milliseconds.

    eventRecorder.record("SEARCH_API", "SEARCH_REQUESTS", 1, time_taken,
                         text=sql)
#    transaction.commit()

    # Add the search request time to the response.

    response['time_taken'] = time_taken

    # Finally, return the response back to the caller.

    return HttpResponse(json.dumps(response, sort_keys=True, indent=" "),
                        mimetype="application/json")
def post(request):
    """ Respond to the "/api/latest/posting" URL.

        We accept a number of postings and add them to the database.  The
        request body must be a JSON object with either a single 'posting'
        entry or a 'postings' list (at most 1,000 entries).  We return a JSON
        response holding a per-posting 'error_responses' list (None for each
        accepted posting) and a 'wait_for' throttling hint.
    """
    if request.method != "POST":
        return HttpResponseNotAllowed(["POST"])

    # .get() rather than ['CONTENT_TYPE']: a request with no Content-Type
    # header should get a 400, not a KeyError (HTTP 500).
    if request.META.get('CONTENT_TYPE') != "application/json":
        return HttpResponseBadRequest("Request not in JSON format")

    try:
        params = json.loads(request.body)
    except ValueError: # Narrowed from a bare "except:", which would also
        return HttpResponseBadRequest("Invalid JSON data") # swallow SystemExit etc.

    # Check the caller's authentication.

    # ...eventually.

    # Grab the raw posting data supplied by the caller.

    if "posting" in params:
        raw_postings = [params['posting']]
    elif "postings" in params:
        raw_postings = params['postings']
    else:
        return HttpResponseBadRequest("Missing 'posting' or 'postings' " +
                                      "parameter")

    if len(raw_postings) > 1000:
        return HttpResponseBadRequest("Too many postings")

    # Check the raw postings, making sure the supplied data is valid.  We
    # generate two lists: a list of checked postings to process, and a list
    # of responses to send back to the caller.

    results = postingParser.check_raw_postings(raw_postings)

    error_responses = []
    postings        = []
    for success,result in results:
        if success:
            error_responses.append(None)
            postings.append(result)
        else:
            error_responses.append(result)

    # Calculate the amount of time the client should wait before sending in
    # more postings.

    wait_for = calc_wait_for_time(len(postings))

    # Queue the postings for later processing.

    if len(postings) > 0:
        tasks.process_postings.delay(postings)
        eventRecorder.record("POSTING_API", "POSTINGS_QUEUED", len(postings),
                             wait_for)

    # Finally, return the response back to the caller.

    response = {'error_responses' : error_responses,
                'wait_for'        : wait_for}
    return HttpResponse(json.dumps(response), mimetype="application/json")
def process_postings(parsed_postings):
    """ Process a batch of parsed postings.

        'parsed_postings' should be a list of postings that have been
        successfully checked by the posting parser.  Each entry in this list
        will be a dictionary with the following entries:

            'posting'

                The posting itself, as a dictionary.  The fields in this
                dictionary match the attributes with the same name in the
                Posting object.

            'annotations'

                A list of annotation values to associate with this posting.
                Each string will be of the form "key:value"

            'images'

                A list of images to associate with this posting.  Each image
                will be a dictionary with 'full_url' and 'thumbnail_url'
                entries, as appropriate.

        We process the postings, adding them to the system as appropriate.
        Note that this involves the following steps:

            1. Calling the Geolocator API if the postings need to be
               geolocated.

            2. Storing the postings into the database.

            3. Sending the postings out via the Notification API.
    """
    eventRecorder.record("POSTING_API", "POSTINGS_DEQUEUED",
                         len(parsed_postings))

    start_time = time.time()

    # If necessary, geolocate the postings.

    for src in parsed_postings:
        posting = src['posting']
        if "location" in posting:
            raw_loc = posting['location']

            # Does this posting already have a lat/long value?
            has_lat_long = False # initially.
            if "latitude" in raw_loc and "longitude" in raw_loc:
                has_lat_long = True

            # Does it already have any 3taps location codes?
            has_loc_codes = False # initially.
            for field in ["country", "state", "metro", "region", "county",
                          "city", "locality", "zipcode"]:
                if field in raw_loc:
                    has_loc_codes = True
                    break

            if has_lat_long and not has_loc_codes:
                # This posting has a lat/long value but no location codes ->
                # reverse geocode the posting to see which locations it
                # belongs to.
                locs = reverseGeocoder.calc_locations(raw_loc['latitude'],
                                                      raw_loc['longitude'],
                                                      raw_loc.get("bounds"),
                                                      raw_loc.get("accuracy"))
                for level,loc in locs.items():
                    raw_loc[level] = loc

            # If we were supplied a bounds array, convert it to a string for
            # storage.
            # NOTE(review): 'bounds' is not a name defined in this function,
            # so bounds['raw_loc'] would raise NameError -- the subscript
            # looks transposed; presumably repr(raw_loc['bounds']) was
            # intended.  Left unchanged pending confirmation.
            if "bounds" in raw_loc:
                raw_loc['bounds'] = repr(bounds['raw_loc'])

    # Get the Annotation objects used by these postings.  Since these
    # objects hold unique annotation values, they can be shared across
    # postings -- we use the existing Annotation object if it exists, and
    # create new ones where necessary.

    annotations = {} # Maps annotation string to Annotation object.
    for src in parsed_postings:
        for annotation_value in src['annotations']:
            if annotation_value not in annotations:
                # The following attempts to work around a database deadlock
                # issue.  We attempt to get_or_create the given Annotation
                # record, and if this results in a database deadlock, we wait
                # a moment before trying again.
                while True:
                    try:
                        annotation,created = Annotation.objects.get_or_create(
                                                annotation=annotation_value)
                    except DatabaseError,e:
                        if "deadlock" in str(e):
                            logger.debug("DEADLOCK DETECTED!!! TRYING AGAIN")
                            time.sleep(0.1)
                            continue
                        else:
                            raise
                    else:
                        # Success -> stop retrying.
                        break
                annotations[annotation_value] = annotation

    # NOTE(review): a span of this function appears to be missing from the
    # source as reviewed -- the code that opens the 'try:' block the trailing
    # 'except'/'else' clauses belong to, creates the Posting records and the
    # 'image_ref' ImageReference objects, and initialises
    # 'num_postings_in_transaction' and 'full_url'/'thumbnail_url', is not
    # visible here.  The fragment below is reproduced as found; confirm
    # against version control.  Indentation of the fragment is a best guess.

            image_ref.posting       = posting
            image_ref.full_url      = full_url
            image_ref.thumbnail_url = thumbnail_url
            image_ref.save()

            # If we've created enough postings, commit the transaction.

            num_postings_in_transaction = num_postings_in_transaction + 1
            if num_postings_in_transaction >= MAX_NUM_POSTINGS_IN_TRANSACTION:
                transaction.commit()
                num_postings_in_transaction = 0

        # Now that the postings are in the database, send them out via the
        # notification system.

        # ...more to come!

        # That's all, folks!

        end_time = time.time()
        time_taken = int(1000 * (end_time - start_time))
        eventRecorder.record("POSTING_API", "POSTINGS_PROCESSED",
                             len(parsed_postings), time_taken)
    except:
        # Any failure -> undo the partial batch, then re-raise.
        transaction.rollback()
        raise
    else:
        transaction.commit()
def summarize(request):
    """ Respond to the "/api/latest/summarizer" URL.

        We calculate and return a summary of postings based on our supplied
        parameters.
    """
    start_time = time.time()

    if request.method != "GET":
        return HttpResponseNotAllowed(["GET"])

    # Check the caller's authentication.

    # ...eventually.

    # Extract the search criteria from the supported query parameters.

    supported_params = ("category_group", "category", "country", "state",
                        "metro", "region", "county", "city", "locality",
                        "zipcode", "source", "heading", "body", "text",
                        "timestamp", "id", "price", "currency", "annotations",
                        "status", "has_image", "include_deleted")
    criteria = {}
    for supported_param in supported_params:
        if supported_param in request.GET:
            criteria[supported_param] = request.GET[supported_param]

    # Validate the 'dimension' parameter, rejecting the request if it is
    # absent or unrecognised.

    if "dimension" not in request.GET:
        return HttpResponseBadRequest("Missing required 'dimension' parameter")

    dimension = request.GET['dimension']
    if dimension not in ("category", "location", "source"):
        return HttpResponseBadRequest("Unknown dimension: " + dimension)

    # Before running the query, set a timeout so we don't hang if the query
    # takes too long.

    db_cursor = connection.cursor()
    db_cursor.execute("SET STATEMENT_TIMEOUT=%s" % settings.QUERY_TIMEOUT)

    # Pick the summary calculator appropriate to the requested dimension.
    # Note that a plain "category" summary rolls up by category group unless
    # the caller has already narrowed to one category group.

    if dimension == "category":
        if "category_group" in criteria:
            calc_summary = _calc_category_summary
        else:
            calc_summary = _calc_category_group_summary
    elif dimension == "location":
        calc_summary = _calc_location_summary
    elif dimension == "source":
        calc_summary = _calc_source_summary
    else:
        # Defensive fallback; 'dimension' was validated above.
        return HttpResponse(json.dumps({'success' : False,
                                        'error'   : "Unable to determine summary type"}),
                            mimetype="application/json")

    success,results = calc_summary(criteria)

    # Record an event telling us how long the summary request took.

    elapsed_ms = int((time.time() - start_time) * 1000) # Milliseconds.
    time_taken = elapsed_ms

    eventRecorder.record("SUMMARIZER_API", "SUMMARY_REQUESTS", 1, time_taken)
    transaction.commit()

    # Finally, return the results back to the caller.

    if not success:
        return HttpResponse(json.dumps({'success' : False,
                                        'error'   : results}),
                            mimetype="application/json")

    return HttpResponse(json.dumps({'success' : True,
                                    'summary' : results},
                                   sort_keys=True, indent=" "),
                        mimetype="application/json")
def _calc_location_summary(criteria):
    """ Calculate and return a summary of postings by location.

        'criteria' is a dictionary containing the supplied filter criteria.

        Returns a (success, results) tuple.  If 'success' is True, 'results'
        will be a list of (type, code, number) tuples, where 'type' is the
        type of summary item, 'code' is the 3taps code for that summary item,
        and 'number' is the number of matching postings.  If 'success' is
        False, 'results' will be a string describing why the summary could
        not be calculated.
    """
    # The location hierarchy, from highest to lowest level.
    levels = ["country", "state", "metro", "region", "county", "city",
              "locality", "zipcode"]

    # See what level of location we want to summarize on.  This is based on
    # the lowest-level of the supplied criteria.

    if "locality" in criteria:
        level = "zipcode"
    elif "city" in criteria:
        level = "locality"
    elif "county" in criteria:
        level = "city"
    elif "region" in criteria:
        level = "county"
    elif "metro" in criteria:
        level = "region"
    elif "state" in criteria:
        level = "metro"
    elif "country" in criteria:
        level = "state"
    else:
        level = "country"

    # Repeatedly process locations at this level, and drill down if we need
    # to to get lower-level locations.

    filter_results = []
    null_fields    = []

    while True:
        # Start by making a database query for the matching postings at this
        # level.

        query = None # initially.
        try:
            success,results = searchHelpers.build_search_query(criteria)
            if not success:
                return (False, results)
            else:
                query = results

            # Restrict the query to postings not already counted at a higher
            # level: every previously-drilled field must be NULL.
            kwargs = {}
            for field in null_fields:
                kwargs[field] = None
            query = query.filter(**kwargs)

            grouping_field = "location_" + level + "__code"
            query = query.values(grouping_field)

            # Process the search results at this level.  Note that we might
            # encounter a row where the location code is NULL -- if this
            # happens, we'll have to drill down to a lower level.

            drill_down = False # initially.
            for row in query.annotate(count=Count("id")):
                if row[grouping_field] is None:
                    # We have some postings which don't match this level ->
                    # we have to drill down further.
                    drill_down = True
                else:
                    # Add this row to the summary results.
                    filter_results.append((level,
                                           row[grouping_field],
                                           row['count']))
        except DatabaseError as e: # "as" form: valid on Python 2.6+ and 3.x.
            transaction.rollback() # Let the database keep working.
            if "statement timeout" in str(e):
                # The query timed out.  Tell the user the bad news.
                if query is not None:
                    sql = str(query.query)
                    eventRecorder.record("SUMMARIZER_API", "QUERY_TIMED_OUT",
                                         text=sql)
                    logger.debug("DATABASE TIMEOUT, query=" + sql)
                transaction.commit()
                return (False, "Database timeout")
            else:
                raise

        # If we have to drill down, do so now.

        if drill_down:
            # Add this level's location to the list of "null fields".  These
            # are fields which must have the value NULL in subsequent
            # searches as we continue to drill down.
            null_fields.append("location_" + level)

            # Continue searching at the next lower level, if there is one.
            # (Replaces the original eight-branch elif ladder with an
            # equivalent index step through the 'levels' list.)
            next_index = levels.index(level) + 1
            if next_index < len(levels):
                level = levels[next_index]
                continue
            else:
                # We've reached the bottom -> give up.
                break
        else:
            break

    # Success path.  NOTE(review): this return was missing from the source as
    # reviewed (the visible code fell off the end of the loop); reconstructed
    # from the docstring's documented (success, results) contract -- confirm
    # against version control.
    return (True, filter_results)