Example No. 1
def make_feature(fc):
    '''Builds a new `StringCounter` from the many `StringCounters` in the
    input `fc`.  This StringCounter will define one of the targets for
    the `MultinomialNB` classifier.

    This crucial function decides the relative importance of features
    extracted by the ETL pipeline.  This is essentially a form of
    domain fitting that allows us to tune the extraction to the fields
    that are important to a domain.  However, if the NER for a domain
    is inadequate, then the primary purpose of these relative
    weightings is to remove bogus NER extractions.

    '''
    feat = StringCounter()
    rejects = set()
    keepers = set()
    #keepers_keys = ['GPE', 'PERSON', 'ORGANIZATION', 'usernames']
    keepers_keys = ['phone', 'email'] #['usernames', 'phone', 'email', 'ORGANIZATION', 'PERSON']
    rejects_keys = ['keywords', 'usernames', 'ORGANIZATION', 'PERSON']
    # The features used to pull the keys for the classifier
    for f, strength in [('keywords', 10**4), ('GPE', 1), ('bow', 1), ('bowNP_sip', 10**8),
                        ('phone', 10**12), ('email', 10**12),
                        ('bowNP', 10**3), ('PERSON', 10**8), ('ORGANIZATION', 10**6), ('usernames', 10**12)]:
        if strength == 1:
            feat += fc[f]
        else:
            feat += StringCounter({key: strength * count
                                   for key, count in fc[f].items()})
        if f in rejects_keys:
            rejects.update(fc[f])
        if f in keepers_keys:
            keepers.update(fc[f])
        if u'' in feat: feat.pop(u'')
    return feat, rejects, keepers
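A minimal sketch of how the weighted counters produced by make_feature might feed the MultinomialNB classifier named in its docstring, assuming scikit-learn is available; the list `labeled_fcs` of (feature collection, 0/1 label) pairs and the helper name are hypothetical, so this is an illustration rather than the project's actual training code.

from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

def train_from_fcs(labeled_fcs):
    # `labeled_fcs` is a hypothetical list of (fc, label) pairs; each fc maps
    # feature names to StringCounters as in the examples on this page.
    feats, labels = [], []
    for fc, label in labeled_fcs:
        feat, rejects, keepers = make_feature(fc)
        feats.append(feat)
        labels.append(label)
    # DictVectorizer accepts the counter-like dicts returned by make_feature
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(feats)
    clf = MultinomialNB()
    clf.fit(X, labels)
    return clf, v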
Example No. 2
def test_usernames(url_or_path, username, count):
    urls = StringCounter()
    urls[url_or_path] += count

    if username is not None:
        results = usernames(urls)
        assert results == StringCounter({username: count})
Example No. 3
def test_image_urls():
    html = '''
<img src="http://ExAmPle.com/My Image.jpg">
<img src="http://example.com/My%20Image.jpg">
'''
    assert StringCounter(features.image_urls(html)) == StringCounter({
        'http://example.com/My Image.jpg': 2,
    })
Example No. 4
def test_extract_emails():
    txt = '''
email: [email protected]
email: [email protected]
'''
    assert StringCounter(features.emails(txt)) == StringCounter({
        '*****@*****.**': 2,
    })
Example No. 5
def test_a_urls():
    html = '''
<a href="http://ExAmPle.com/My Page.html">
<a href="http://example.com/My%20Page.html">
'''
    assert StringCounter(features.a_urls(html)) == StringCounter({
        'http://example.com/My Page.html': 2,
    })
Example No. 6
def test_host_names():
    urls = StringCounter()
    urls['http://www.example.com/folder1'] = 3
    urls['http://www.example.com/folder2'] = 2
    urls['http://www.different.com/folder2'] = 7

    assert features.host_names(urls) == StringCounter({
        'www.example.com': 5,
        'www.different.com': 7,
    })
Example No. 7
def test_path_dirs():
    urls = StringCounter()
    urls['http://www.example.com/folder1/folder3/index.html?source=dummy'] = 3
    urls['http://www.example.com/folder2/folder1'] = 2
    urls['http://www.different.com/folder2'] = 7

    assert features.path_dirs(urls) == StringCounter({
        'folder1': 5,
        'folder2': 9,
        'folder3': 3,
        'index.html': 3,
    })
Example No. 8
def test_string_counter():
    hc = StringCounter('this sentence has been parsed')
    h1 = (id(hc), hc.generation)

    cache = {h1: 'hi'}
    assert len(cache) == 1

    hc.update('more text')
    h2 = (id(hc), hc.generation)

    cache[h2] = 'hi again?'
    assert len(cache) == 2
    assert h1 != h2
Example No. 9
def test_string_counter():
    hc = StringCounter("this sentence has been parsed")
    h1 = (id(hc), hc.generation)

    cache = {h1: "hi"}
    assert len(cache) == 1

    hc.update("more text")
    h2 = (id(hc), hc.generation)

    cache[h2] = "hi again?"
    assert len(cache) == 2
    assert h1 != h2
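The two tests above exercise StringCounter's `generation` counter, which changes when the counter mutates (hence h1 != h2 after the update). A minimal sketch of using (id, generation) as a memoization key; `expensive_summary` is a hypothetical stand-in for a costly computation.

_cache = {}

def expensive_summary(counter):
    # stand-in for a costly computation over the counter's contents
    return sum(counter.values())

def cached_summary(counter):
    # keyed on (id, generation): mutating the counter (e.g. via update())
    # changes `generation`, so a stale cache entry is never reused
    key = (id(counter), counter.generation)
    if key not in _cache:
        _cache[key] = expensive_summary(counter)
    return _cache[key]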
Example No. 10
def test_extract_phones():
    txt = '''
Phone: 111-222-3333
Phone: 1112223333
Phone: 1-111-222-3333
Phone: 11112223333
Phone: 222-3333
Phone: 2223333
'''
    assert StringCounter(features.phones(txt)) == StringCounter({
        '1112223333': 2,
        '11112223333': 2,
        '2223333': 2,
    })
Example No. 11
def test_add_facets():
    cid1 = 'cid1'
    fc1 = FeatureCollection()
    fc1['bowNP_sip'] = StringCounter([u'elephant', u'car'])
    cid2 = 'cid2'
    fc2 = FeatureCollection()
    fc2['bowNP_sip'] = StringCounter([u'car', u'green'])
    fake_results = {'results': [(cid1, fc1), (cid2, fc2)]}

    new_results = mod_pairwise.add_facets(fake_results)

    assert 'facets' in new_results

    assert new_results['facets'] == {
        'elephant': [cid1],
        'car': [cid1, cid2],
        'green': [cid2],
    }
Example No. 12
def add_sip_to_fc(fc, tfidf, limit=40):
    '''add "bowNP_sip" to `fc` using `tfidf` data
    '''
    if 'bowNP' not in fc:
        return
    if tfidf is None:
        return
    sips = features.sip_noun_phrases(tfidf, fc['bowNP'].keys(), limit=limit)
    fc[u'bowNP_sip'] = StringCounter(sips)
Example No. 13
def path_dirs(urls):
    '''
    Takes a StringCounter of normalized URLs and parses them into a
    StringCounter of path directory components. The file name is
    included in the path directory counts.
    '''
    path_dirs = StringCounter()
    for url in urls:
        for path_dir in filter(None, urlparse(url).path.split('/')):
            path_dirs[path_dir] += urls[url]
    return path_dirs
Example No. 14
def create_fc_from_html(url,
                        html,
                        encoding='utf-8',
                        tfidf=None,
                        other_features=None):
    if encoding is not None:
        html = unicode(html, encoding)
    soup = BeautifulSoup(html, "lxml")
    title = soup_get(soup, 'title', lambda v: v.get_text())
    body = soup_get(soup, 'body', lambda v: v.prettify())
    if other_features is None:
        other_features = {}
    other_features.update({
        u'title': StringCounter([title]),
        u'titleBow': StringCounter(title.split()),
    })
    fc = html_to_fc(body, url=url, other_features=other_features)
    if fc is None:
        return None
    if tfidf is not None:
        add_sip_to_fc(fc, tfidf)
    return fc
Example No. 15
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    Otherwise, this endpoint returns status ``201`` upon successful
    storage.  An existing feature collection with id ``content_id``
    is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    keywords.update(cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
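A hedged client-side sketch of calling the route documented above with the `requests` library; the host, port, and the '<prefix>|<quoted url>' content id layout are assumptions inferred from the handler, which splits the cid on '|' to recover the source URL.

import urllib
import requests

page_url = 'http://example.com/page.html'
# hypothetical content id; the handler only requires a '|' before the quoted URL
cid = 'web|' + urllib.quote(page_url, safe='')
resp = requests.put(
    'http://localhost:8080/dossier/v1/feature-collections/' + cid,
    data='<html><body>Phone: 111-222-3333</body></html>',
    headers={'Content-type': 'text/html'},
)
# per the docstring, the text/html branch responds with the generated
# feature collection serialized as JSON
print resp.status_code, resp.text[:200]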
Example No. 16
def forum_post_features(row):
    fc = FeatureCollection()
    for k in row['author']:
        fc['post_author_' + k] = row['author'][k]

    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for image_url in row['image_urls']:
            fc['image_url'][image_url] += 1

    others = ['parent_id', 'thread_id', 'thread_link', 'thread_name', 'title']
    for k in others:
        if k in row:
            fc['post_' + k] = uni(row[k])
    return fc
Example No. 17
def host_names(urls):
    '''
    Takes a StringCounter of normalized URLs and returns a
    StringCounter of their host names.

    N.B. this assumes that absolute URLs begin with http:// in order
    to accurately resolve the host name.  Relative URLs will not have
    host names.
    '''
    host_names = StringCounter()
    for url in urls:
        host_names[urlparse(url).netloc] += urls[url]
    return host_names
Example No. 18
def usernames(urls):
    '''Takes a StringCounter `urls` mapping normalized URLs or file
    paths to counts and attempts to extract usernames.  Returns a
    StringCounter.

    '''
    usernames = StringCounter()
    for url, count in urls.items():
        uparse = urlparse(url)
        path = uparse.path
        hostname = uparse.hostname
        m = username_re.match(path)
        if m:
            usernames[m.group('username')] += count
        elif hostname in ['twitter.com', 'www.facebook.com']:
            usernames[path.lstrip('/')] += count
    return usernames
Example No. 19
    def callback(si, link):
        if si is None: return
        cid_url = hashlib.md5(str(link)).hexdigest()
        cid = etl.interface.mk_content_id(cid_url)
        content_ids.append(cid)

        # hack alert!
        # We currently use FCs to store subtopic text data, which
        # means we cannot overwrite existing FCs with reckless
        # abandon. So we adopt a heuristic: check if an FC already
        # exists, and if it does, check if it is being used to store
        # user data. If so, don't overwrite it and move on.
        fc = config.store.get(cid)
        if fc is not None and any(k.startswith('subtopic|')
                                  for k in fc.iterkeys()):
            logger.info('skipping ingest for %r (abs url: %r) because '
                        'an FC with user data already exists.',
                        cid, link)
            return

        other_features = {
            u'keywords': StringCounter(keywords), #list(queries)),
        }

        try:
            fc = etl.create_fc_from_html(
                link, si.body.raw,
                encoding=si.body.encoding or 'utf-8', tfidf=tfidf,
                other_features=other_features,
            )
            if not fc:
                logger.info('failed to get an FC, moving on')
                return
            logger.info('created FC for %r (abs url: %r)',
                        cid, link)
            config.store.put([(cid, fc)])
        except Exception:
            logger.info('trapped ingest failure on %r (abs url: %r)',
                        cid, link, exc_info=True)
Example No. 20
def get_feat(fc, name):
    if len(fc[name]) == 0:
        return StringCounter({'': 0})
    else:
        return fc[name]
Example No. 21
# !!! IMPORTANT !!!
# Define features that you want to index. This will let you quickly scan
# for feature collections in the database with matching values.
#
# You don't have to index everything, but it's probably a good idea to index
# the most prominent features. e.g., phone or email or website.
#
# These should match the names of the corresponding features.
feature_indexes = [u'phone', u'email', u'website', u'rate']

# Create a "store," which knows how to store and index feature collections.
store = Store(conn, feature_indexes=feature_indexes)

# Create a fresh feature collection and add a 'rate' feature.
fc = FeatureCollection()
fc['rate'] = StringCounter({
    u'5per30': 5,
    u'5per60': 1,
    u'10per20': 2,
})

# Content ids are the unique identifier for each feature collection.
# It's probably sufficient to use whatever you have for "ad id."
content_id = 'some_unique_value'
store.put([(content_id, fc)])
print store.get(content_id)

# Use the index scan!
print list(store.index_scan_prefix(u'rate', '10'))
Example No. 22
def extract(positive_fcs, negative_fcs, features=None):
    '''Takes a labeled set of feature collections (positive and negative)
    and the features wanted, and trains a Naive Bayes classifier on
    the underlying keys of the selected features.  If no features are
    selected, all are used.

    Returns two lists of (keyword, strength) tuples ordered by
    strength.  The first contains feature keys that were predictive
    of the positive label and the second contains feature keys that
    were predictive of the negative label.

    ``*_fcs`` are the lists of feature collections with the positive
    and negative labels respectively.

    ``features`` designates which specific features get vectorized;
    the other features are ignored.

    '''

    # Vector of labels
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()

        if not fc:
            logger.warn('how did we get an empty fc? %r', fc)

        else:
            # The features used to pull the keys for the classifier
            for f in features:
                feat += fc[f]

        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifier
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned features that are predictive of the positive
    # and negative class
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[1])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[0])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    ## make a list ordered by their weight
    pos_ordered = sorted(pos_words.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    neg_ordered = sorted(neg_words.items(),
                         key=operator.itemgetter(1),
                         reverse=True)

    return pos_ordered, neg_ordered
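A small usage sketch for extract(), with two hand-built feature collections per class; the 'bow' feature name follows the conventions used elsewhere on this page, and the word lists are made up purely for illustration.

pos_fcs, neg_fcs = [], []
for words in (['cheap', 'rates', 'call'], ['cheap', 'call', 'now']):
    fc = FeatureCollection()
    fc['bow'] = StringCounter(words)
    pos_fcs.append(fc)
for words in (['weather', 'forecast'], ['weather', 'report']):
    fc = FeatureCollection()
    fc['bow'] = StringCounter(words)
    neg_fcs.append(fc)

pos_ordered, neg_ordered = extract(pos_fcs, neg_fcs, features=['bow'])
# strongest keywords for the positive label come first
print pos_ordered[:3]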
Example No. 23
def add_feature(name, xs):
    if name not in fc:
        fc[name] = StringCounter()
    fc[name] += StringCounter(xs)
Example No. 24
def make_fc(text):
    nhash = nilsimsa_hash(text)
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = StringCounter([nhash])
    return fc
Example No. 25
def html_to_fc(html=None,
               clean_html=None,
               clean_visible=None,
               encoding=None,
               url=None,
               timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''

    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(cleanse(clean_visible),
                                                  included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])

    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
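A short usage sketch of html_to_fc on a tiny document, reusing the kind of markup the tests above exercise; the exact counts depend on the extractors in `features`, so only the resulting feature names are printed.

html = u'''
<html><body>
  <a href="http://example.com/My%20Page.html">profile</a>
  Phone: 111-222-3333
</body></html>
'''
fc = html_to_fc(html=html, url=u'http://example.com/')
if fc is not None:
    # non-meta features added by the add_feature() calls above,
    # e.g. a_url, a_url_hostnames, a_url_path_dirs, phone, usernames, ...
    print sorted(k for k in fc if not k.startswith(u'meta_'))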