Example #1
def v0_highlighter_post(request, response, tfidf, cid):
    '''Obtain highlights for a document POSTed as the body, which is the
    pre-design-thinking structure of the highlights API.  See v1 below.

    NB: This endpoint will soon be deleted.

    The route for this endpoint is:
    ``POST /dossier/v0/highlighter/<cid>``.

    ``cid`` identifies the document to highlight. The document's HTML
    should be in the request body with a ``Content-type`` of
    ``text/html``.

    '''
    logger.info('got %r', cid)
    tfidf = tfidf or None
    content_type = request.headers.get('content-type', '')
    if not content_type.startswith('text/html'):
        logger.critical('content-type=%r', content_type)
        response.status = 415
        return {
            'error': {
                'code': 0,
                'message':
                'content_type=%r and should be text/html' % content_type
            }
        }

    url = urllib.unquote(cid.split('|', 1)[1])
    body = request.body.read()
    if len(body) == 0:
        response.status = 420
        return {'error': {'code': 1, 'message': 'empty body'}}
    logger.info('parsing %d bytes for url: %r', len(body), url)
    fc = etl.create_fc_from_html(url, body, tfidf=tfidf)
    if fc is None:
        logger.critical('failed to get FC using %d bytes from %r', len(body),
                        url)
        response.status = 506
        return {
            'error': {
                'code': 2,
                'message': 'FC not generated for that content'
            }
        }
    highlights = dict()
    for feature_name, pretty_name in feature_pretty_names:
        # Each feature is a bag of phrases with counts; normalize each
        # count by the feature's total so each score is a relative weight.
        if feature_name not in fc:
            continue
        total = sum(fc[feature_name].values())
        highlights[pretty_name] = [
            # float() keeps this true division under Python 2
            (phrase, float(count) / total, [], []) for phrase, count in sorted(
                fc[feature_name].items(), key=itemgetter(1), reverse=True)
        ]
        logger.info('%r and %d keys', feature_name,
                    len(highlights[pretty_name]))
    return {'highlights': highlights}
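
For context, here is a minimal client-side sketch of calling this endpoint with the `requests` library. The host, port, and content id are assumptions made up for illustration; the response shape follows the `highlights` payload built above.

# Hypothetical client call for POST /dossier/v0/highlighter/<cid>.
import requests

cid = 'web|http%3A%2F%2Fexample.com%2Fpage'   # made-up content id
html = '<html><body><p>Example page text.</p></body></html>'
resp = requests.post(
    'http://localhost:8080/dossier/v0/highlighter/%s' % cid,  # assumed host/port
    data=html,
    headers={'Content-Type': 'text/html'},
)
payload = resp.json()
# payload['highlights'] maps a pretty feature name to a list of
# [phrase, score, [], []] entries, sorted by descending count.
for pretty_name, entries in payload.get('highlights', {}).items():
    for phrase, score, _, _ in entries:
        print('%s %s %.3f' % (pretty_name, phrase, score))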
Example #2
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    Otherwise, this endpoint returns status ``201`` upon
    successful storage. An existing feature collection with id
    ``content_id`` is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    keywords.update(cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
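
A minimal client-side sketch of the two request shapes this handler accepts, using `requests`. The host, port, content ids, and feature-collection payload are illustrative assumptions; the actual FC JSON schema is whatever `FeatureCollection.from_dict` expects.

# Hypothetical client calls for PUT /dossier/v1/feature-collections/<content_id>.
import json
import requests

base = 'http://localhost:8080/dossier/v1/feature-collections'  # assumed host/port

# 1) Body is a feature collection serialized as JSON; expect 201 on success.
fc_payload = {'NAME': {'Alice': 2, 'Bob': 1}}  # placeholder payload only
resp = requests.put('%s/%s' % (base, 'web|doc-1'),
                    data=json.dumps(fc_payload),
                    headers={'Content-Type': 'application/json'})
assert resp.status_code == 201

# 2) Body is raw HTML; the server builds the FC itself and returns it as JSON.
resp = requests.put('%s/%s' % (base, 'web|http%3A%2F%2Fexample.com%2Fpage'),
                    data='<html><body>Example page</body></html>',
                    headers={'Content-Type': 'text/html'})
generated_fc = resp.json()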
Example #3
def create_highlights(data, tfidf):
    '''compute highlights for `data`, store it in the store using
    `kvlclient`, and return a `highlights` response payload.

    '''
    try:
        fc = etl.create_fc_from_html(
            data['content-location'], data['body'], tfidf=tfidf, encoding=None)
    except Exception:
        logger.critical('failed to build FC', exc_info=True)
        return {
            'state': ERROR,
            'error': {
                'code': 7,
                # format_exc() takes no exception argument; it formats the
                # exception currently being handled
                'message': 'internal error: %s' % traceback.format_exc(),
            },
        }
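
Only the failure branch is shown in this excerpt; on success the function continues past the ``try`` block. A sketch of how a caller might invoke it and check for the error payload, assuming ``ERROR`` is the module's error-state constant and ``data`` carries an already-fetched page:

# Hypothetical caller; the keys of `data` mirror what create_highlights reads above.
data = {
    'content-location': 'http://example.com/page',  # made-up URL
    'body': '<html><body>Example page</body></html>',
}
result = create_highlights(data, tfidf=None)
if result is not None and result.get('state') == ERROR:
    # code 7 marks an internal failure while building the feature collection
    logger.error('highlighting failed: %s', result['error']['message'])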
Example #4
    def callback(si, link):
        if si is None: return
        cid_url = hashlib.md5(str(link)).hexdigest()
        cid = etl.interface.mk_content_id(cid_url)
        content_ids.append(cid)

        # hack alert!
        # We currently use FCs to store subtopic text data, which
        # means we cannot overwrite existing FCs with reckless
        # abandon. So we adopt a heuristic: check if an FC already
        # exists, and if it does, check if it is being used to store
        # user data. If so, don't overwrite it and move on.
        fc = config.store.get(cid)
        if fc is not None and any(k.startswith('subtopic|')
                                  for k in fc.iterkeys()):
            logger.info('skipping ingest for %r (abs url: %r) because '
                        'an FC with user data already exists.',
                        cid, link)
            return

        other_features = {
            u'keywords': StringCounter(keywords),
        }

        try:
            fc = etl.create_fc_from_html(
                link, si.body.raw,
                encoding=si.body.encoding or 'utf-8', tfidf=tfidf,
                other_features=other_features,
            )
            if not fc:
                logger.info('failed to get an FC, moving on')
                return
            logger.info('created FC for %r (abs url: %r)',
                        cid, link)
            config.store.put([(cid, fc)])
        except Exception:
            logger.info('trapped ingest failure on %r (abs url: %r)',
                        cid, link, exc_info=True)
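
The "don't clobber user data" heuristic above can also be read in isolation. A stand-alone sketch, with a plain dict standing in for `config.store` and a made-up content-id scheme in place of `etl.interface.mk_content_id`:

# Illustrative version of the skip check; not the real store or cid scheme.
import hashlib

def has_user_data(store, link):
    '''Return True if the FC for `link` already carries subtopic features.'''
    cid = 'web|' + hashlib.md5(str(link)).hexdigest()  # hypothetical cid format
    fc = store.get(cid)
    return fc is not None and any(k.startswith('subtopic|') for k in fc)

store = {'web|' + hashlib.md5('http://example.com').hexdigest():
         {'subtopic|text|sub1': 'user selected passage'}}
print(has_user_data(store, 'http://example.com'))    # True: skip re-ingest
print(has_user_data(store, 'http://example.org/x'))  # False: safe to overwrite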