Exemplo n.º 1
0
def _calc_category_group_summary(criteria):
    """ Calculate and return a summary of postings by category group.

        'criteria' is a dictionary containing the supplied filter criteria.

        Upon completion, we return a (success, results) tuple, where 'success'
        is True if and only if the summary was successfully calculated.

        If 'success' is True, 'results' will be a list of (type, code, number)
        tuples, where 'type' is the type of summary item, 'code' is the 3taps
        code for that summary item, and 'number' is the number of matching
        postings.

        If 'success' is False, 'results' will be a string describing why the
        summary could not be calculated.
    """
    query = None # initially; tells the except handler whether a query exists.

    try:
        success,results = searchHelpers.build_search_query(criteria)
        if not success:
            return (False, results)
        else:
            query = results

        # Group the matching postings by their category group code, and
        # count the number of postings in each group.

        query = query.values("category_group__code")

        results = []
        for row in query.annotate(count=Count("id")):
            results.append(("category_group",
                            row['category_group__code'],
                            row['count']))

        # Bug fix: the calculated summary must be returned to the caller, as
        # documented above.  Previously the function fell off the end here
        # and implicitly returned None on success.
        return (True, results)
    except DatabaseError as e:
        transaction.rollback() # Let the database keep working.
        if "statement timeout" in str(e):
            # The query timed out.  Tell the user the bad news.
            if query != None:
                sql = str(query.query)
                eventRecorder.record("SUMMARIZER_API", "QUERY_TIMED_OUT",
                                     text=sql)
                logger.debug("DATABASE TIMEOUT, query=" + sql)
                transaction.commit()
            return (False, "Database timeout")
        else:
            raise
Exemplo n.º 2
0
def search(request):
    """ Respond to the "/api/latest/search" URL.

        We search for postings based on the given search criteria.

        The search criteria are taken from the HTTP GET parameters, along
        with the following optional parameters:

            'rpp'     -> results per page (1..100 inclusive, default 10).
            'retvals' -> comma-separated list of posting fields to return.
            'anchor'  -> a posting ID; only postings with an ID <= this
                         value are returned, keeping pages stable while new
                         postings arrive.
            'page'    -> zero-based page number (default 0).

        On failure we return a JSON object with 'success' (False) and
        'error' entries.

        NOTE(review): the success-path response assembly is not visible in
        this chunk of the file; the loop below builds 'found_postings' and
        'new_anchor' for it.
    """
    start_time = time.time()

    # Only HTTP GET is supported.

    if request.method != "GET":
        return HttpResponseNotAllowed(["GET"])

    # Check the caller's authentication.

    # ...eventually.

    # Extract the search criteria.

    criteria = {}

    for param in ["category_group", "category", "country", "state", "metro",
                  "region", "county", "city", "locality", "zipcode", "source",
                  "heading", "body", "text", "timestamp", "id", "price",
                  "currency", "annotations", "external_id", "status",
                  "has_image", "include_deleted", "only_deleted"]:
        if param in request.GET:
            criteria[param] = request.GET[param]

    # If the caller didn't supply a timestamp, add a default timestamp to the
    # search query.  The default range covers the last 24 hours.

    if "timestamp" not in criteria:
        criteria['timestamp'] = str(int((time.time() - 24*3600))) + '..' + \
                                str(int(time.time()))

    # Extract our other parameters.

    if "rpp" in request.GET:
        try:
            rpp = int(request.GET['rpp'])
        except ValueError:
            return HttpResponse(json.dumps(
                                    {'success' : False,
                                     'error'   : "Invalid 'rpp' value"}),
                                mimetype="application/json")
        if rpp < 1 or rpp > 100:
            return HttpResponse(json.dumps(
                                    {'success' : False,
                                     'error'   : "'rpp' value out of range"}),
                                mimetype="application/json")
    else:
        rpp = 10

    if "retvals" in request.GET:
        retvals = set()
        for field in request.GET['retvals'].split(","):
            if field in ["id", "account_id", "source", "category",
                         "category_group", "location", "external_id",
                         "external_url", "heading", "body", "html",
                         "timestamp", "expires", "language", "price",
                         "currency", "images", "annotations", "status",
                         "immortal"]:
                retvals.add(field)
            else:
                return HttpResponse(json.dumps(
                                        {'success' : False,
                                         'error'   : "invalid 'retvals' " +
                                                     "value: " + repr(field)}),
                                    mimetype="application/json")
    else:
        retvals = set(["id", "source", "category", "location", "external_id",
                       "external_url", "heading", "timestamp"])

    # 'anchor' (if given) is the posting ID returned by an earlier search;
    # it is used below to exclude postings newer than the original query.

    if "anchor" in request.GET:
        anchor = request.GET['anchor']
    else:
        anchor = None

    if "page" in request.GET:
        try:
            page = int(request.GET['page'])
        except ValueError:
            return HttpResponse(json.dumps(
                                    {'success' : False,
                                     'error'   : "Invalid 'page' value"}),
                                mimetype="application/json")
    else:
        page = 0

    # Construct a search query based on the supplied parameters.

    success,result = searchHelpers.build_search_query(criteria)

    if not success:
        return HttpResponse(json.dumps({'success' : False,
                                        'error'   : result}),
                            mimetype="application/json")
    else:
        query = result

    if anchor != None:
        query = query.filter(id__lte=anchor)

    num_matches = query.count()

    # Order newest-first and slice out the requested page, then capture the
    # generated SQL for logging/event recording below.

    query = query.order_by("-timestamp")
    query = query[page*rpp:page*rpp+rpp]
    sql = str(query.query)

    # Testing: If the caller provided a "return_sql" parameter, return the raw
    # SQL statement rather than running it.

    if (request.GET.get("return_sql") == "1"):
        return HttpResponse(sql)

    # Before running the query, set a timeout so we don't hang if the query
    # takes too long.
    # NOTE(review): assumes a backend that understands "SET
    # STATEMENT_TIMEOUT" (eg, PostgreSQL) -- confirm against the configured
    # database.

    cursor = connection.cursor()
    cursor.execute("SET STATEMENT_TIMEOUT=%s" % settings.QUERY_TIMEOUT)

    # Process the search query, and assemble our search results.

    found_postings = []
    new_anchor = None

    try:
        for posting in query:
            if anchor == None and new_anchor == None:
                # Remember the ID of the first (ie, most recent) found posting.
                # This will be our anchor for subsequent requests.
                new_anchor = str(posting.id)

            # Copy the requested fields ('retvals') across into a dictionary
            # describing this posting.

            found_posting = {}
            if "id" in retvals:
                found_posting['id'] = posting.id
            if "account_id" in retvals:
                found_posting['account_id'] = posting.account_id
            if "source" in retvals:
                found_posting['source'] = posting.source.code
            if "category" in retvals:
                found_posting['category'] = posting.category.code
            if "category_group" in retvals:
                found_posting['category_group'] = posting.category_group.code
            if "location" in retvals:
                # Only include the location components which are present.
                loc = {}
                if posting.location_latitude != None:
                    loc['latitude'] = posting.location_latitude
                if posting.location_longitude != None:
                    loc['longitude'] = posting.location_longitude
                if posting.location_accuracy != None:
                    loc['accuracy'] = posting.location_accuracy
                if posting.location_country != None:
                    loc['country'] = posting.location_country.code
                if posting.location_state != None:
                    loc['state'] = posting.location_state.code
                if posting.location_metro != None:
                    loc['metro'] = posting.location_metro.code
                if posting.location_region != None:
                    loc['region'] = posting.location_region.code
                if posting.location_county != None:
                    loc['county'] = posting.location_county.code
                if posting.location_city != None:
                    loc['city'] = posting.location_city.code
                if posting.location_locality != None:
                    loc['locality'] = posting.location_locality.code
                if posting.location_zipcode != None:
                    loc['zipcode'] = posting.location_zipcode.code
                found_posting['location'] = loc
            if "external_id" in retvals:
                found_posting['external_id'] = posting.external_id
            if "external_url" in retvals:
                found_posting['external_url'] = posting.external_url
            if "heading" in retvals:
                found_posting['heading'] = posting.heading
            if "body" in retvals:
                found_posting['body'] = posting.body
            if "html" in retvals:
                found_posting['html'] = posting.html
            if "timestamp" in retvals:
                found_posting['timestamp'] = datetime_to_seconds(
                                                        posting.timestamp)
            if "expires" in retvals:
                found_posting['expires'] = datetime_to_seconds(posting.expires)
            if "language" in retvals:
                found_posting['language'] = posting.language
            if "price" in retvals:
                found_posting['price'] = posting.price
            if "currency" in retvals:
                found_posting['currency'] = posting.currency
            if "images" in retvals:
                # Copy across each associated image, again only including
                # the attributes which are present.
                images = []
                for image in posting.imagereference_set.all():
                    dst_image = {}
                    if image.full_url != None:
                        dst_image['full_url'] = image.full_url
                    if image.full_width != None:
                        dst_image['full_width'] = image.full_width
                    if image.full_height != None:
                        dst_image['full_height'] = image.full_height
                    if image.thumbnail_url != None:
                        dst_image['thumbnail_url'] = image.thumbnail_url
                    if image.thumbnail_width != None:
                        dst_image['thumbnail_width'] = image.thumbnail_width
                    if image.thumbnail_height != None:
                        dst_image['thumbnail_height'] = image.thumbnail_height
                    images.append(dst_image)
                found_posting['images'] = images
            if "annotations" in retvals:
                # Each stored annotation string is of the form "key:value";
                # split on the first ":" only so values may contain colons.
                annotations = {}
                for posting_annotation in posting.postingannotation_set.all():
                    s = posting_annotation.annotation.annotation
                    key,value = s.split(":", 1)
                    annotations[key] = value
                found_posting['annotations'] = annotations
            if "status" in retvals:
                status = {}
                status['offered'] = posting.status_offered
                status['lost']    = posting.status_lost
                status['stolen']  = posting.status_stolen
                status['found']   = posting.status_found
                status['deleted'] = posting.status_deleted
                found_posting['status'] = status
            if "immortal" in retvals:
                found_posting['immortal'] = posting.immortal

            found_postings.append(found_posting)
    except DatabaseError, e:
        if "statement timeout" in str(e):
            # The query timed out.  Tell the user the bad news.
            sql = str(query.query)
            logger.debug("DATABASE TIMEOUT, query=" + sql)
            eventRecorder.record("SEARCH_API", "QUERY_TIMED_OUT", text=sql)
            return HttpResponse(json.dumps({'success' : False,
                                            'error'   : "Database timeout"}),
                                mimetype="application/json")
        else:
            # Some other database error -> log it, and report a generic
            # failure to the caller.
            logger.exception(e)
            sql = str(query.query)
            return HttpResponse(json.dumps({'success' : False,
                                            'error'   : "Database error"}),
                                mimetype="application/json")
Exemplo n.º 3
0
                'postings'    : found_postings}

    if anchor == None and new_anchor != None:
        response['anchor'] = new_anchor

    # If the caller gave us an anchor, see if any new postings have come in
    # since the original query was made.

    if anchor != None:
        success,query = searchHelpers.build_search_query(criteria)
        if success:
            response['new_postings'] = query.filter(id__gt=anchor).count()

    # Record an event telling us how long the search request took.

    end_time   = time.time()
    time_taken = int((end_time - start_time) * 1000) # Milliseconds.
    eventRecorder.record("SEARCH_API", "SEARCH_REQUESTS", 1, time_taken,
                         text=sql)
    # transaction.commit()

    # Add the search request time to the response.

    response['time_taken'] = time_taken

    # Finally, return the response back to the caller.

    return HttpResponse(json.dumps(response, sort_keys=True, indent="    "),
                        mimetype="application/json")

Exemplo n.º 4
0
def post(request):
    """ Respond to the "/api/latest/posting" URL.

        We accept a number of postings and add them to the database.

        The request body must be JSON containing either a 'posting' entry (a
        single raw posting) or a 'postings' entry (a list of up to 1,000 raw
        postings).  We validate the raw postings, queue the valid ones for
        background processing, and return a JSON response containing a list
        of per-posting error responses (None for accepted postings) and a
        'wait_for' value telling the client how long to wait before sending
        more postings.
    """
    if request.method != "POST":
        return HttpResponseNotAllowed(["POST"])

    # Use .get() so a request without any Content-Type header yields a 400
    # response rather than an uncaught KeyError (HTTP 500).

    if request.META.get('CONTENT_TYPE') != "application/json":
        return HttpResponseBadRequest("Request not in JSON format")

    try:
        params = json.loads(request.body)
    except ValueError:
        # json.loads() raises ValueError for malformed JSON.  Catching just
        # that (rather than a bare "except:") avoids masking unrelated bugs.
        return HttpResponseBadRequest("Invalid JSON data")

    if not isinstance(params, dict):
        # A valid JSON scalar (eg, a bare number) would otherwise raise a
        # TypeError on the "in" tests below.
        return HttpResponseBadRequest("Invalid JSON data")

    # Check the caller's authentication.

    # ...eventually.

    # Grab the raw posting data supplied by the caller.

    if "posting" in params:
        raw_postings = [params['posting']]
    elif "postings" in params:
        raw_postings = params['postings']
    else:
        return HttpResponseBadRequest("Missing 'posting' or 'postings' " +
                                      "parameter")

    if len(raw_postings) > 1000:
        return HttpResponseBadRequest("Too many postings")

    # Check the raw postings, making sure the supplied data is valid.  We
    # generate two lists: a list of checked postings to process, and a list of
    # responses to send back to the caller.

    results = postingParser.check_raw_postings(raw_postings)

    error_responses = []
    postings        = []

    for success,result in results:
        if success:
            error_responses.append(None)
            postings.append(result)
        else:
            error_responses.append(result)

    # Calculate the amount of time the client should wait before sending in
    # more postings.

    wait_for = calc_wait_for_time(len(postings))

    # Queue the postings for later processing.

    if len(postings) > 0:
        tasks.process_postings.delay(postings)
        eventRecorder.record("POSTING_API", "POSTINGS_QUEUED",
                             len(postings), wait_for)

    # Finally, return the response back to the caller.

    response = {'error_responses' : error_responses,
                'wait_for'        : wait_for}

    return HttpResponse(json.dumps(response), mimetype="application/json")
Exemplo n.º 5
0
def process_postings(parsed_postings):
    """ Process a batch of parsed postings.

        'parsed_postings' should be a list of postings that have been
        successfully checked by the posting parser.  Each entry in this list
        will be a dictionary with the following entries:

            'posting'

                The posting itself, as a dictionary.  The fields in this
                dictionary match the attributes with the same name in the
                Posting object.

            'annotations'

                A list of annotation values to associate with this posting.
                Each string will be of the form "key:value"

            'images'

                A list of images to associate with this posting.  Each image
                will be a dictionary with 'full_url' and 'thumbnail_url'
                entries, as appropriate.

        We process the postings, adding them to the system as appropriate.
        Note that this involves the following steps:

            1. Calling the Geolocator API if the postings need to be
               geolocated.

            2. Storing the postings into the database.

            3. Sending the postings out via the Notification API.

        NOTE(review): only step 1 and the start of step 2 (collecting the
        shared Annotation objects) are visible in this chunk of the file.
    """
    eventRecorder.record("POSTING_API", "POSTINGS_DEQUEUED",
                         len(parsed_postings))

    start_time = time.time()

    # If necessary, geolocate the postings.

    for src in parsed_postings:
        posting = src['posting']
        if "location" in posting:
            raw_loc = posting['location']

            has_lat_long = False # initially.
            if "latitude" in raw_loc and "longitude" in raw_loc:
                has_lat_long = True

            has_loc_codes = False # initially.
            for field in ["country", "state", "metro", "region", "county",
                          "city", "locality", "zipcode"]:
                if field in raw_loc:
                    has_loc_codes = True
                    break

            if has_lat_long and not has_loc_codes:
                # This posting has a lat/long value but no location codes ->
                # reverse geocode the posting to see which locations it belongs
                # to.

                locs = reverseGeocoder.calc_locations(raw_loc['latitude'],
                                                      raw_loc['longitude'],
                                                      raw_loc.get("bounds"),
                                                      raw_loc.get("accuracy"))

                for level,loc in locs.items():
                    raw_loc[level] = loc

            # If we were supplied a bounds array, convert it to a string for
            # storage.

            if "bounds" in raw_loc:
                # Bug fix: this previously read repr(bounds['raw_loc']),
                # which raised a NameError ('bounds' is not defined) -- the
                # variable and key were transposed.
                raw_loc['bounds'] = repr(raw_loc['bounds'])

    # Get the Annotation objects used by these postings.  Since these
    # objects hold unique annotation values, they can be shared across
    # postings -- we use the existing Annotation object if it exists, and
    # create new ones where necessary.

    annotations = {} # Maps annotation string to Annotation object.

    for src in parsed_postings:
        for annotation_value in src['annotations']:
            if annotation_value not in annotations:
                # The following attempts to work around a database deadlock
                # issue.  We attempt to get_or_create the given Annotation
                # record, and if this results in a database deadlock, we wait a
                # moment before trying again.

                while True:
                    try:
                        annotation,created = Annotation.objects.get_or_create(
                                                annotation=annotation_value)
                    except DatabaseError as e:
                        if "deadlock" in str(e):
                            logger.debug("DEADLOCK DETECTED!!!  TRYING AGAIN")
                            time.sleep(0.1)
                            continue
                        else:
                            raise
                    else:
                        break

                annotations[annotation_value] = annotation
Exemplo n.º 6
0
                    image_ref.posting       = posting
                    image_ref.full_url      = full_url
                    image_ref.thumbnail_url = thumbnail_url
                    image_ref.save()

            # If we've created enough postings, commit the transaction.

            num_postings_in_transaction = num_postings_in_transaction + 1
            if num_postings_in_transaction >= MAX_NUM_POSTINGS_IN_TRANSACTION:
                transaction.commit()
                num_postings_in_transaction = 0

        # Now that the postings are in the database, send them out via the
        # notification system.

        # ...more to come!

        # That's all, folks!

        end_time   = time.time()
        time_taken = int(1000 * (end_time - start_time))

        eventRecorder.record("POSTING_API", "POSTINGS_PROCESSED",
                             len(parsed_postings), time_taken)
    except:
        transaction.rollback()
        raise
    else:
        transaction.commit()

Exemplo n.º 7
0
def summarize(request):
    """ Respond to the "/api/latest/summarizer" URL.

        We calculate and return a summary of postings based on our supplied
        parameters.
    """
    t_started = time.time()

    # Only HTTP GET requests are accepted.

    if request.method != "GET":
        return HttpResponseNotAllowed(["GET"])

    # Check the caller's authentication.

    # ...eventually.

    # Collect whichever of the supported filter parameters the caller sent.

    supported_params = ("category_group", "category", "country", "state",
                        "metro", "region", "county", "city", "locality",
                        "zipcode", "source", "heading", "body", "text",
                        "timestamp", "id", "price", "currency", "annotations",
                        "status", "has_image", "include_deleted")

    criteria = {}
    for name in supported_params:
        if name in request.GET:
            criteria[name] = request.GET[name]

    # The 'dimension' parameter is required and must be one of a fixed set
    # of values.

    if "dimension" not in request.GET:
        return HttpResponseBadRequest("Missing required 'dimension' parameter")

    dimension = request.GET['dimension']
    if dimension not in ("category", "location", "source"):
        return HttpResponseBadRequest("Unknown dimension: " + dimension)

    # Before running the query, set a timeout so we don't hang if the query
    # takes too long.

    cursor = connection.cursor()
    cursor.execute("SET STATEMENT_TIMEOUT=%s" % settings.QUERY_TIMEOUT)

    # Dispatch to the appropriate summary calculator.  A "category" summary
    # is grouped by category when a category group was supplied, and by
    # category group otherwise.

    if dimension == "location":
        success,results = _calc_location_summary(criteria)
    elif dimension == "source":
        success,results = _calc_source_summary(criteria)
    elif dimension == "category":
        if "category_group" in criteria:
            success,results = _calc_category_summary(criteria)
        else:
            success,results = _calc_category_group_summary(criteria)
    else:
        # Defensive fallback; unreachable given the validation above.
        return HttpResponse(json.dumps({'success' : False,
                                        'error'   : "Unable to determine " +
                                                    "summary type"}),
                            mimetype="application/json")

    # Record an event telling us how long the summary request took.

    elapsed_ms = int((time.time() - t_started) * 1000)
    eventRecorder.record("SUMMARIZER_API", "SUMMARY_REQUESTS", 1, elapsed_ms)
    transaction.commit()

    # Finally, return the results back to the caller.

    if success:
        body = json.dumps({'success' : True,
                           'summary' : results},
                          sort_keys=True, indent="    ")
    else:
        body = json.dumps({'success' : False,
                           'error'   : results})
    return HttpResponse(body, mimetype="application/json")
Exemplo n.º 8
0
def _calc_location_summary(criteria):
    """ Calculate and return a summary of postings by location.

        'criteria' is a dictionary containing the supplied filter criteria.

        Upon completion, we return a (success, results) tuple, where 'success'
        is True if and only if the summary was successfully calculated.

        If 'success' is True, 'results' will be a list of (type, code, number)
        tuples, where 'type' is the type of summary item, 'code' is the 3taps
        code for that summary item, and 'number' is the number of matching
        postings.

        If 'success' is False, 'results' will be a string describing why the
        summary could not be calculated.

        NOTE(review): the tail of this function (presumably returning the
        accumulated 'filter_results') is not visible in this chunk of the
        file.
    """
    # See what level of location we want to summarize on.  This is based on the
    # lowest-level of the supplied criteria.

    if "locality" in criteria:
        level = "zipcode"
    elif "city" in criteria:
        level = "locality"
    elif "county" in criteria:
        level = "city"
    elif "region" in criteria:
        level = "county"
    elif "metro" in criteria:
        level = "region"
    elif "state" in criteria:
        level = "metro"
    elif "country" in criteria:
        level = "state"
    else:
        level = "country"

    # Repeatedly process locations at this level, and drill down if we need to
    # to get lower-level locations.

    filter_results = []
    null_fields    = []

    while True:

        # Start by making a database query for the matching postings at this
        # level.

        query = None # initially; tells the except handler if a query exists.

        try:
            success,results = searchHelpers.build_search_query(criteria)

            if not success:
                return (False, results)
            else:
                query = results

            # Restrict the query to postings whose previously-drilled-down
            # location fields are NULL, so each posting is only counted once.

            kwargs = {}
            for field in null_fields:
                kwargs[field] = None
            query = query.filter(**kwargs)

            grouping_field = "location_" + level + "__code"

            query = query.values(grouping_field)

            # Process the search results at this level.  Note that we might
            # encounter a row where the location code is NULL -- if this
            # happens, we'll have to drill down to a lower level.

            drill_down = False # initially.
            for row in query.annotate(count=Count("id")):
                if row[grouping_field] == None:
                    # We have some postings which don't match this level -> we
                    # have to drill down further.
                    drill_down = True
                else:
                    # Add this row to the summary results.
                    filter_results.append((level,
                                           row[grouping_field],
                                           row['count']))
        except DatabaseError,e:
            transaction.rollback() # Let the database keep working.
            if "statement timeout" in str(e):
                # The query timed out.  Tell the user the bad news.
                if query != None:
                    sql = str(query.query)
                    eventRecorder.record("SUMMARIZER_API", "QUERY_TIMED_OUT",
                                         text=sql)
                    logger.debug("DATABASE TIMEOUT, query=" + sql)
                    transaction.commit()
                return (False, "Database timeout")
            else:
                raise

        # If we have to drill down, do so now.

        if drill_down:

            # Add this level's location to the list of "null fields".  These
            # are fields which must have the value NULL in subsequent searches
            # as we continue to drill down.

            null_fields.append("location_" + level)

            # Continue searching at the next lower level.

            if level == "country":
                level = "state"
            elif level == "state":
                level = "metro"
            elif level == "metro":
                level = "region"
            elif level == "region":
                level = "county"
            elif level == "county":
                level = "city"
            elif level == "city":
                level = "locality"
            elif level == "locality":
                level = "zipcode"
            elif level == "zipcode":
                # We've reached the bottom -> give up.
                break

            continue
        else:
            break