Example #1
def test_usernames(url_or_path, username, count):
    urls = StringCounter()
    urls[url_or_path] += count

    if username is not None:
        results = usernames(urls)
        assert results == StringCounter({username: count})
Example #2
def test_a_urls():
    html = '''
<a href="http://ExAmPle.com/My Page.html">
<a href="http://example.com/My%20Page.html">
'''
    assert StringCounter(features.a_urls(html)) == StringCounter({
        'http://example.com/My Page.html': 2,
    })
Example #3
def test_image_urls():
    html = '''
<img src="http://ExAmPle.com/My Image.jpg">
<img src="http://example.com/My%20Image.jpg">
'''
    assert StringCounter(features.image_urls(html)) == StringCounter({
        'http://example.com/My Image.jpg': 2,
    })
Example #4
def test_extract_emails():
    txt = '''
email: [email protected]
email: [email protected]
'''
    assert StringCounter(features.emails(txt)) == StringCounter({
        '*****@*****.**': 2,
    })
Example #5
def test_host_names():
    urls = StringCounter()
    urls['http://www.example.com/folder1'] = 3
    urls['http://www.example.com/folder2'] = 2
    urls['http://www.different.com/folder2'] = 7

    assert features.host_names(urls) == StringCounter({
        'www.example.com': 5,
        'www.different.com': 7,
    })
Example #6
def test_path_dirs():
    urls = StringCounter()
    urls['http://www.example.com/folder1/folder3/index.html?source=dummy'] = 3
    urls['http://www.example.com/folder2/folder1'] = 2
    urls['http://www.different.com/folder2'] = 7

    assert features.path_dirs(urls) == StringCounter({
        'folder1': 5,
        'folder2': 9,
        'folder3': 3,
        'index.html': 3,
    })
Example #7
def test_string_counter():
    hc = StringCounter('this sentence has been parsed')
    h1 = (id(hc), hc.generation)

    cache = {h1: 'hi'}
    assert len(cache) == 1

    hc.update('more text')
    h2 = (id(hc), hc.generation)
    cache[h2] = 'hi again?'

    assert len(cache) == 2
    assert h1 != h2
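
The pattern above works because StringCounter exposes a generation attribute that changes whenever the counter is mutated, so the pair (id(counter), generation) is a hashable cache key for an otherwise mutable, unhashable object. A minimal sketch of that idea, assuming only the standard-library Counter (the real StringCounter may differ in detail):

from collections import Counter

class GenerationCounter(Counter):
    '''Hypothetical illustration: bump ``generation`` on each mutation
    so that (id(self), self.generation) can serve as a cache key.'''
    def __init__(self, *args, **kwargs):
        self.generation = 0
        super(GenerationCounter, self).__init__(*args, **kwargs)

    def __setitem__(self, key, value):
        self.generation += 1
        super(GenerationCounter, self).__setitem__(key, value)

    def update(self, *args, **kwargs):
        self.generation += 1
        super(GenerationCounter, self).update(*args, **kwargs)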
Example #8
def test_extract_phones():
    txt = '''
Phone: 111-222-3333
Phone: 1112223333
Phone: 1-111-222-3333
Phone: 11112223333
Phone: 222-3333
Phone: 2223333
'''
    assert StringCounter(features.phones(txt)) == StringCounter({
        '1112223333': 2,
        '11112223333': 2,
        '2223333': 2,
    })
Example #9
def test_json_serializer():
    with registry:
        registry.add('StringCounter', JsonSerializer)

        fc = FeatureCollection()
        fc['thing2'] = StringCounter(dict(hello='people'))
        fc['thing2']['another'] = 5
        fc['thing3'] = StringCounter(dict(hello='people2'))
        fc_str = fc.dumps()

        fc2 = FeatureCollection.loads(fc_str)

        assert fc2['thing2']['another'] == 5
        assert fc2['thing2']['hello'] == 'people'
        assert fc2['thing3']['hello'] == 'people2'
Example #10
def test_readonly(counter_type):
    fc = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))
    })
    fc2 = FeatureCollection({
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye'))
    })

    fc.read_only = True
    with pytest.raises(ReadOnlyException):
        fc += fc2

    with pytest.raises(ReadOnlyException):
        fc -= fc2

    with pytest.raises(ReadOnlyException):
        fc *= 2

    with pytest.raises(ReadOnlyException):
        fc['woof'] = StringCounter()

    if hasattr(counter_type, 'read_only'):
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] = 3
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] += 3

    fc.read_only = False
    fc += fc2
    assert Counter(map(abs, fc['hello'].values())) == Counter({2: 3, 4: 1})
    fc -= fc2
    fc -= fc2
    assert Counter(map(abs, fc['hello'].values())) == Counter()
Example #11
def test_string_counter_serialize():
    fc = FeatureCollection()
    fc['thing1'] = StringCounter()
    fc['thing1']['foo'] += 1
    fc_str = fc.dumps()

    fc2 = FeatureCollection.loads(fc_str)
    assert fc2['thing1']['foo'] == 1
Example #12
def build_feature(self, si):
    usernames = StringCounter()
    for sentence in si.body.sentences['nltk_tokenizer']:
        for token in sentence.tokens:
            # ignore very short tokens; they are unlikely to be usernames
            if len(token.token) <= 3:
                continue
            if self.classify(token.token.decode('utf8')):
                usernames[token.token] += 1
    return usernames
Example #13
def add_sip_to_fc(fc, tfidf, limit=40):
    '''add "bowNP_sip" to `fc` using `tfidf` data
    '''
    if 'bowNP' not in fc:
        return
    if tfidf is None:
        return
    sips = features.sip_noun_phrases(tfidf, fc['bowNP'].keys(), limit=limit)
    fc[u'bowNP_sip'] = StringCounter(sips)
Example #14
def path_dirs(urls):
    '''
    Takes a StringCounter of normalized URLs and parses each one
    into a list of path directories. The file name is included in
    the path directory list.
    '''
    path_dirs = StringCounter()
    for url in urls:
        for path_dir in filter(None, urlparse(url).path.split('/')):
            path_dirs[path_dir] += urls[url]
    return path_dirs
Example #15
def test_read_only_features():
    fc = FeatureCollection({'feat': StringCounter({'foo': 1})})
    fc['feat']['foo'] += 1
    fc.read_only = True

    with pytest.raises(ReadOnlyException):
        fc['feat']['foo'] += 1
    with pytest.raises(ReadOnlyException):
        fc['feat'].pop('foo')
    with pytest.raises(ReadOnlyException):
        del fc['feat']['foo']
Example #16
def create_fc_from_html(url,
                        html,
                        encoding='utf-8',
                        tfidf=None,
                        other_features=None):
    if encoding is not None:
        html = unicode(html, encoding)
    soup = BeautifulSoup(html, "lxml")
    title = soup_get(soup, 'title', lambda v: v.get_text())
    body = soup_get(soup, 'body', lambda v: v.prettify())
    if other_features is None:
        other_features = {}
    other_features.update({
        u'title': StringCounter([title]),
        u'titleBow': StringCounter(title.split()),
    })
    fc = html_to_fc(body, url=url, other_features=other_features)
    if fc is None:
        return None
    if tfidf is not None:
        add_sip_to_fc(fc, tfidf)
    return fc
Example #17
class JsonSerializer(StringCounter):
    def __init__(self):
        raise NotImplementedError

    @staticmethod
    def loads(bytes):
        return StringCounter(json.loads(bytes))

    @staticmethod
    def dumps(sc):
        return json.dumps(sc)

    constructor = staticmethod(lambda: StringCounter())
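
A quick round trip through the serializer above (a sketch; json handles the dict subclass directly):

sc = StringCounter({'hello': 2, 'world': 1})
payload = JsonSerializer.dumps(sc)
assert JsonSerializer.loads(payload) == sc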
Example #18
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then returned as a
    JSON payload.

    This endpoint returns status ``201`` upon successful
    storage. An existing feature collection with id
    ``content_id`` is overwritten.
    '''
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)
    else:
        fc = FeatureCollection.from_dict(json.load(request.body))
        keywords = set()
        for subid in fc:
            if subid.startswith('subtopic'):
                ty = subtopic_type(subid)
                if ty in ('text', 'manual'):
                    # get the user selected string
                    data = typed_subtopic_data(fc, subid)
                    map(keywords.add, cleanse(data).split())
                    keywords.add(cleanse(data))

        folders = Folders(kvlclient)
        for fid, sid in folders.parent_subfolders(cid):
            if not isinstance(fid, unicode):
                fid = fid.decode('utf8')
            if not isinstance(sid, unicode):
                sid = sid.decode('utf8')
            keywords.add(cleanse(fid))
            keywords.add(cleanse(sid))

        fc[u'keywords'] = StringCounter(keywords)
        store.put([(cid, fc)])
        response.status = 201
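
For the ``text/html`` branch above, a minimal client-side sketch, assuming the service is mounted on a hypothetical localhost port and using the third-party requests library (not part of the code above):

import requests

# hypothetical content id whose suffix carries the percent-encoded URL
cid = 'web|http%3A%2F%2Fexample.com%2Fpage'
resp = requests.put(
    'http://localhost:8080/dossier/v1/feature-collections/' + cid,
    data='<html><body>hello</body></html>',
    headers={'Content-type': 'text/html'},
)
# with a text/html body, the server builds the feature collection itself
# and returns it serialized as JSON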
Example #19
def host_names(urls):
    '''
    Takes a StringCounter of normalized URLs and parses out their
    host names.

    N.B. this assumes that absolute URLs begin with

    http://

    in order to accurately resolve the host name.
    Relative URLs will not have host names.
    '''
    host_names = StringCounter()
    for url in urls:
        host_names[urlparse(url).netloc] += urls[url]
    return host_names
Example #20
def forum_post_features(row):
    fc = FeatureCollection()
    for k in row['author']:
        fc['post_author_' + k] = row['author'][k]

    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for image_url in row['image_urls']:
            fc['image_url'][image_url] += 1

    others = ['parent_id', 'thread_id', 'thread_link', 'thread_name', 'title']
    for k in others:
        if k in row:
            fc['post_' + k] = uni(row[k])
    return fc
Example #21
def test_string_counter2():
    sc = StringCounter()
    sc['hi'] += 1
    assert isinstance(sc, StringCounter)
    sc += Counter('hi')
    assert isinstance(sc, StringCounter)
Example #22
def loads(bytes):
    return StringCounter(json.loads(bytes))
Example #23
def make_fc(text):
    nhash = nilsimsa_hash(text)
    fc = FeatureCollection()
    fc['#nilsimsa_all'] = StringCounter([nhash])
    return fc
Example #24
def extract(positive_fcs, negative_fcs, features=None):
    '''Takes a labeled set of feature collections (positive and
       negative) and the desired features, and trains a Naive Bayes
       classifier on the underlying keys of the selected features.
       If no features are selected, all are used.

       Returns two lists of (keyword, strength) tuples ordered by
       strength. The first contains feature keys that were predictive
       of the positive label and the second contains feature keys that
       were predictive of the negative label.

    ``*_fcs`` are the lists of feature collections, positive label and
            negative label respectively.

    ``features`` designates which specific features get vectorized;
               the other features are ignored.

    '''

    # Vector of labels
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()

        if not fc:
            logger.warn('how did we get an empty fc? %r', fc)

        else:
            # The features used to pull the keys for the classifier;
            # per the docstring, fall back to all features when none
            # are specified.
            for f in (features if features is not None else fc):
                feat += fc[f]

        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifier
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned features that are predictive of the positive
    # and negative class
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[1])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[0])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    ## make a list ordered by their weight
    pos_ordered = sorted(pos_words.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    neg_ordered = sorted(neg_words.items(),
                         key=operator.itemgetter(1),
                         reverse=True)

    return pos_ordered, neg_ordered
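
A minimal usage sketch for extract, with two tiny hand-built collections; the feature name and counts here are illustrative only:

pos = [FeatureCollection({u'bowNP': StringCounter({'cheap': 3, 'sale': 1})})]
neg = [FeatureCollection({u'bowNP': StringCounter({'news': 2})})]

pos_kw, neg_kw = extract(pos, neg, features=[u'bowNP'])
# each result is a list of (keyword, strength) pairs, sorted so the
# keywords most predictive of that label come first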
Example #25
def html_to_fc(html=None,
               clean_html=None,
               clean_visible=None,
               encoding=None,
               url=None,
               timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''

    fc[u'meta_url'] = uni(url)

    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(cleanse(clean_visible),
                                                  included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])

    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
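
A minimal end-to-end sketch for html_to_fc, assuming a small HTML fragment; the exact counter contents depend on the cleaning functions above:

raw = '<html><body><a href="http://example.com/a.html">x</a></body></html>'
fc = html_to_fc(html=raw, encoding='utf-8', url=u'http://example.com')
if fc is not None:
    # the single anchor URL should contribute one host name count
    assert fc[u'a_url_hostnames']['example.com'] == 1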
Example #26
def add_feature(name, xs):
    if name not in fc:
        fc[name] = StringCounter()
    fc[name] += StringCounter(xs)