Example #1
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    This endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id
    ``content_id`` is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    keywords.update(cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
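
The docstring above fully specifies the wire contract, so the endpoint can be exercised with any HTTP client. Below is a minimal client-side sketch of both request shapes; the host, port, and content id are hypothetical, and the JSON body assumes the plain feature-name-to-counter mapping accepted by ``FeatureCollection.from_dict``:

import json
import requests

BASE = 'http://localhost:8080/dossier/v1/feature-collections'  # hypothetical host/port
# The text/html branch above unquotes everything after the first '|'
# to recover the source URL, so the id embeds a percent-encoded URL.
cid = 'web|http%3A%2F%2Fexample.com%2Fpage'  # hypothetical content id

# Shape 1: store a feature collection serialized as JSON.
fc = {u'keywords': {u'example': 1}}
r = requests.put('%s/%s' % (BASE, cid), data=json.dumps(fc))
assert r.status_code == 201

# Shape 2: send raw HTML; the server generates the FC and returns it
# as a JSON payload.
html = '<html><body><p>example</p></body></html>'
r = requests.put('%s/%s' % (BASE, cid), data=html,
                 headers={'Content-type': 'text/html'})
generated_fc = r.json()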
Example #2
def process(self, fc, context=None):
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Named-entity subtrees have a ``label`` method; plain
            # (token, tag) tuples do not.
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                names[label][name] += 1
    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
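
The method above is tied to dossier's FeatureCollection and StringCounter types, but the extraction pipeline itself is plain NLTK. Here is a self-contained sketch of the same sentence-split, POS-tag, NE-chunk loop, with collections.Counter standing in for StringCounter and the cleanse() normalization omitted:

from collections import Counter, defaultdict

import nltk

# Requires the usual NLTK models: punkt, averaged_perceptron_tagger,
# maxent_ne_chunker, and words.
text = 'Barack Obama visited Paris.'
names = defaultdict(Counter)
for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            name = ' '.join(tok for tok, tag in chunk.leaves())
            names[chunk.label()][name] += 1

# names now maps entity labels such as 'PERSON' or 'GPE' to Counters
# of surface forms.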
Example #3
def traverse_extract_fetch(config, wukey, stop_after_extraction=False):
    '''Given a config and a
    `wukey=cbor.dumps((folder_name,subfolder_name))`, traverse the
    folders to generate queries, issue them to Google, fetch the
    results, and ingest them.

    '''

    config.kvlclient.setup_namespace({'openquery': (str,)})
    try:
        data = list(config.kvlclient.get('openquery', (wukey,)))
        if data:
            if data[0][1]:
                logger.info('found existing query results: %r', data)
                return
            else:
                config.kvlclient.delete('openquery', (wukey,))
    except Exception:
        logger.error('failed to get data from existing table', exc_info=True)

    fid, sid = cbor.loads(wukey)
    tfidf = config.tfidf
    folders = Folders(config.kvlclient)
    fetcher = Fetcher()

    ## To disable the keyword extractor model, uncomment the next
    ## three lines (`get_subfolder_queries`) and comment out the two
    ## lines that follow them (`extract_keyword_queries`).
    #keyword_feature_keys = []
    #queries = get_subfolder_queries(
    #    config.store, config.label_store, folders, fid, sid)

    queries, keyword_feature_keys, has_observations = extract_keyword_queries(
        config.store, config.label_store, folders, fid, sid)

    logger.info('Model found %d queries: %r', len(queries), queries)

    if stop_after_extraction:
        return

    keywords = set()
    for key in keyword_feature_keys:
        ckey = cleanse(key.decode('utf8'))
        keywords.add(ckey)
        for part in ckey.split():
            keywords.add(part)

    links = set()
    logger.info('searching google for: %r', queries)
    for q in queries:
        for result in config.google.web_search_with_paging(q, limit=10):
            links.add(result['link'])
            logger.info('discovered %r', result['link'])

    result = None

    logger.info('got %d URLs from %d queries', len(links), len(queries))

    # content_ids gets modified within the 'callback' closure
    content_ids = []

    def callback(si, link):
        if si is None: return
        cid_url = hashlib.md5(str(link)).hexdigest()
        cid = etl.interface.mk_content_id(cid_url)
        content_ids.append(cid)

        # hack alert!
        # We currently use FCs to store subtopic text data, which
        # means we cannot overwrite existing FCs with reckless
        # abandon. So we adopt a heuristic: check if an FC already
        # exists, and if it does, check if it is being used to store
        # user data. If so, don't overwrite it and move on.
        fc = config.store.get(cid)
        if fc is not None and any(k.startswith('subtopic|')
                                  for k in fc.iterkeys()):
            logger.info('skipping ingest for %r (abs url: %r) because '
                        'an FC with user data already exists.',
                        cid, link)
            return

        other_features = {
            u'keywords': StringCounter(keywords),
        }

        try:
            fc = etl.create_fc_from_html(
                link, si.body.raw,
                encoding=si.body.encoding or 'utf-8', tfidf=tfidf,
                other_features=other_features,
            )
            if not fc:
                logger.info('failed to get an FC, moving on')
                return
            logger.info('created FC for %r (abs url: %r)',
                        cid, link)
            config.store.put([(cid, fc)])
        except Exception:
            logger.info('trapped ingest failure on %r (abs url: %r)',
                        cid, link, exc_info=True)

    logger.info('FETCHING using ASYNC')
    fetcher.get_async(islice(links, None), callback)

    data = json.dumps({'content_ids': content_ids})
    logger.info('saving %d content_ids in %d bytes on wukey %r',
                len(content_ids), len(data), wukey)
    config.kvlclient.put('openquery', ((wukey,), data))
    logger.info('done saving for %r', wukey)
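
As the docstring notes, the work-unit key is just a CBOR-encoded (folder_name, subfolder_name) pair, so driving the function looks roughly like the sketch below. Here config is assumed to be an already-built object exposing the attributes used above (kvlclient, tfidf, store, label_store, google), and the folder names are hypothetical:

import cbor

wukey = cbor.dumps(('my_folder', 'my_subfolder'))  # hypothetical names

# Dry run: extract and log the queries, then stop before searching.
traverse_extract_fetch(config, wukey, stop_after_extraction=True)

# Full run: query Google, fetch the result pages, and ingest them.
traverse_extract_fetch(config, wukey)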
Example #4
            logger.error(
                'got other than list of length at least two from service: %r --> %r',
                url, results)
            continue
        query_ack = results[0]
        query_suggestions = results[1]
        if not isinstance(query_suggestions, list):
            logger.error('got other than list of query suggestions: %r --> %r',
                         url, results)
            continue
        suggestions += query_suggestions
        logger.info('%d suggestions from %r', len(query_suggestions), url)

    logger.info('found %d suggestions for %r', len(suggestions), query)

    cleansed_query = cleanse(query)
    if cleansed_query not in suggestions:
        suggestions.insert(0, query)
    return [query, suggestions]
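
The loop above relies on only the first two elements of each service response: an echo of the query and a list of suggestion strings. A sketch of parsing one such response body under that assumption (the raw payload here is hypothetical sample data):

import json

raw = '["ford truck", ["ford truck parts", "ford truck dealers"]]'
results = json.loads(raw)
query_ack, query_suggestions = results[0], results[1]
assert isinstance(query_suggestions, list)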


feature_pretty_names = [
    ('ORGANIZATION', 'Organizations'),
    ('PERSON', 'Persons'),
    ('FACILITY', 'Facilities'),
    ('GPE', 'Geo-political Entities'),
    ('LOCATION', 'Locations'),
    ('skype', 'Skype Handles'),
    ('phone', 'Phone Numbers'),
    ('email', 'Email Addresses'),
    ('bowNP_unnorm', 'Noun Phrases'),