예제 #1
0
def test_non_counter_features_bad_serialize():
    '''A plain string feature must be rejected with ``SerializationError``.

    The error is expected either when the value is handed to the
    constructor, or at ``dumps()`` time when it was assigned afterwards.
    '''
    bad_value = 'foobaz'

    # Supplying the value through the constructor must raise.
    with pytest.raises(SerializationError):
        FeatureCollection({'NAME': bad_value})

    # Item assignment is expected to succeed; dumping must raise.
    collection = FeatureCollection()
    collection['NAME'] = bad_value
    with pytest.raises(SerializationError):
        collection.dumps()
def test_readonly(counter_type):
    '''Mutations must raise while ``read_only`` is set and work once cleared.

    :param counter_type: counter class used to build the feature values
    '''
    def make_fc():
        # Build a fresh collection on every call so that ``fc`` and
        # ``fc2`` never share counter objects.
        return FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter('goodbye')),
        })

    fc = make_fc()
    fc2 = make_fc()

    fc.read_only = True

    # In-place arithmetic on the collection itself.
    with pytest.raises(ReadOnlyException):
        fc += fc2
    with pytest.raises(ReadOnlyException):
        fc -= fc2
    with pytest.raises(ReadOnlyException):
        fc *= 2

    # Assigning a brand new feature.
    with pytest.raises(ReadOnlyException):
        fc['woof'] = StringCounter()

    # Feature-level mutation is only checked for counter types that
    # expose a ``read_only`` attribute.
    counter_is_lockable = hasattr(counter_type, 'read_only')
    if counter_is_lockable:
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] = 3
        with pytest.raises(ReadOnlyException):
            fc['hello']['l'] += 3

    fc.read_only = False

    fc += fc2
    doubled = Counter(abs(count) for count in fc['hello'].values())
    assert doubled == Counter({2: 3, 4: 1})

    fc -= fc2
    fc -= fc2
    emptied = Counter(abs(count) for count in fc['hello'].values())
    assert emptied == Counter()
예제 #3
0
def test_ft_roundtrip():
    '''The ``@NAME`` feature must compare equal after dumps/loads.'''
    original = FeatureCollection()
    entry = [('nltk', 5, 2)]
    original['@NAME']['foo'].append(entry)

    blob = original.dumps()
    restored = FeatureCollection.loads(blob)

    assert original['@NAME'] == restored['@NAME']
def test_read_only_preserved_after_serialized():
    '''The ``read_only`` flag must survive a dumps/loads round trip.'''
    source = FeatureCollection({'NAME': {'foo': 1, 'baz': 2}})
    source.read_only = True

    restored = FeatureCollection.loads(source.dumps())

    assert restored.read_only
    # The features of the restored collection must reject mutation too.
    with pytest.raises(ReadOnlyException):
        restored['NAME']['foo'] += 1
예제 #5
0
def test_string_counter_serialize():
    '''A ``StringCounter`` feature keeps its count through dumps/loads.'''
    original = FeatureCollection()
    original['thing1'] = StringCounter()
    original['thing1']['foo'] += 1

    serialized = original.dumps()
    restored = FeatureCollection.loads(serialized)

    assert restored['thing1']['foo'] == 1
예제 #6
0
def test_serialize_deserialize(counter_type):
    '''Build a collection, round-trip it, and compare it with the original.

    :param counter_type: counter class used to build the feature values
    '''
    # Two batches are accumulated into 'bow'; the second is a plain
    # string, so ``Counter`` counts its individual characters.
    bow_batches = (['big', 'dog'], 'tall building')
    bon_names = ['Super Cat', 'Small Cat', 'Tiger Fish']

    ent1 = FeatureCollection()
    for batch in bow_batches:
        ent1['bow'] += counter_type(Counter(batch))
    ent1['bon'] += counter_type(Counter(bon_names))

    ent2 = FeatureCollection.loads(ent1.dumps())
    assert_same_fc(ent1, ent2)
예제 #7
0
def test_type(counter_type):
    '''The collection's exact type must be ``FeatureCollection`` after ``+=``.

    :param counter_type: counter class used to build the feature values
    '''
    target = FeatureCollection()
    target['bow'] += counter_type(Counter(['big', 'dog']))
    # Exact type comparison, not ``isinstance``: a subclass would fail here.
    assert type(target) == FeatureCollection

    addend = FeatureCollection()
    addend['bow'] += counter_type(Counter(['cat']))

    target += addend
    assert type(target) == FeatureCollection
def test_read_only_features():
    '''Features of a read-only collection must reject every mutation.'''
    collection = FeatureCollection({'feat': StringCounter({'foo': 1})})

    # Mutation is allowed as long as the flag has not been set.
    collection['feat']['foo'] += 1
    collection.read_only = True

    # Augmented assignment on a key.
    with pytest.raises(ReadOnlyException):
        collection['feat']['foo'] += 1

    # Removing a key via ``pop``.
    with pytest.raises(ReadOnlyException):
        collection['feat'].pop('foo')

    # Removing a key via ``del``.
    with pytest.raises(ReadOnlyException):
        del collection['feat']['foo']
예제 #9
0
def test_no_bytes_allowed():
    '''A ``u''`` value dumps cleanly; an unprefixed literal must raise.

    NOTE(review): this presumably relies on Python 2, where an
    unprefixed string literal is a byte string -- confirm.
    '''
    unicode_fc = FeatureCollection({'foo': u'bar'})
    unicode_fc.dumps()  # OK!

    # Handing the unprefixed literal to the constructor must raise.
    with pytest.raises(SerializationError):
        FeatureCollection({'foo': 'bar'})

    # Assigning it afterwards is expected to work; dumping must raise.
    late_fc = FeatureCollection()
    late_fc['foo'] = 'bar'
    with pytest.raises(SerializationError):
        late_fc.dumps()
예제 #10
0
def test_type(counter_type):
    '''A collection remains a ``FeatureCollection`` instance after ``+=``.

    :param counter_type: counter class used to build the feature values
    '''
    target = FeatureCollection()
    target['bow'] += counter_type(Counter(['big', 'dog']))
    # Extra per-key increment, exercised only for ``StringCounter``.
    uses_string_counter = counter_type.__name__ == 'StringCounter'
    if uses_string_counter:
        target['bow']['a'] += 1
    assert isinstance(target, FeatureCollection)

    addend = FeatureCollection()
    addend['bow'] += counter_type(Counter(['cat']))

    target += addend
    assert isinstance(target, FeatureCollection)
예제 #11
0
def test_binop_no_share():
    '''``+`` leaves both operands untouched; ``+=`` changes only the left.'''
    left = FeatureCollection({'NAME': {'foo': 1, 'bar': 1}})
    right = FeatureCollection({'NAME': {'foo': 2, 'bar': 2}})

    total = left + right

    # Neither operand may have been modified by the binary ``+``.
    assert left['NAME']['foo'] == 1
    assert right['NAME']['foo'] == 2

    left += right

    # The in-place result matches the binary one ...
    assert left == total
    assert left['NAME']['foo'] == 3
    # ... and the right-hand operand is still unchanged.
    assert right['NAME']['foo'] == 2
예제 #12
0
def test_binop_different_no_share():
    '''Adding collections with disjoint features must copy, not share.'''
    foo_only = FeatureCollection({'FOO': {'foo': 1}})
    bar_only = FeatureCollection({'BAR': {'bar': 1}})

    combined = foo_only + bar_only
    assert combined == FeatureCollection({
        'FOO': {'foo': 1},
        'BAR': {'bar': 1},
    })

    # Writing into the sum must not leak back into either operand.
    combined['BAR']['bar'] = 2
    assert bar_only['BAR']['bar'] == 1

    combined['FOO']['foo'] = 2
    assert foo_only['FOO']['foo'] == 1
예제 #13
0
def test_fc_eq(counter_type):
    '''Equal features compare equal; a differing feature compares unequal.

    :param counter_type: counter class used to build the feature values
    '''
    def build(goodbye_text):
        # Only the 'goodbye' feature varies between the collections.
        return FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter(goodbye_text)),
        })

    fc1 = build('goodbye')
    fc2 = build('goodbye')
    fc3 = build('goodbye2')

    assert fc1 == fc2
    assert fc1 != fc3
예제 #14
0
def test_json_serializer():
    '''Round-trip features with ``JsonSerializer`` registered.

    ``JsonSerializer`` is registered under the ``'StringCounter'`` name
    inside the ``registry`` context.
    '''
    with registry:
        registry.add('StringCounter', JsonSerializer)

        fc = FeatureCollection()
        fc['thing2'] = StringCounter({'hello': 'people'})
        fc['thing2']['another'] = 5
        fc['thing3'] = StringCounter({'hello': 'people2'})

        restored = FeatureCollection.loads(fc.dumps())

        # (feature name, key, expected value) after the round trip.
        expectations = (
            ('thing2', 'another', 5),
            ('thing2', 'hello', 'people'),
            ('thing3', 'hello', 'people2'),
        )
        for feature, key, value in expectations:
            assert restored[feature][key] == value
예제 #15
0
def test_thing_serializer():
    '''Round-trip a ``Thing`` feature with ``ThingSerializer`` registered.

    ``ThingSerializer`` is registered under the ``'StringCounter'`` name
    inside the ``registry`` context.
    '''
    with registry:
        registry.add('StringCounter', ThingSerializer)

        initial_json = json.dumps({'hello': 'people'})
        fc = FeatureCollection()
        fc['thing1'] = Thing(initial_json)
        fc['thing1']['another'] = 'more'
        fc['thing1'].do_more_things()

        restored = FeatureCollection.loads(fc.dumps())

        # Every key is expected to survive, including 'doing', which is
        # not set explicitly anywhere in this test.
        expectations = (
            ('another', 'more'),
            ('hello', 'people'),
            ('doing', 'something'),
        )
        for key, value in expectations:
            assert restored['thing1'][key] == value
예제 #16
0
def perftest_throughput_feature_collection():
    '''Time repeated loads/dumps of a collection holding a ~1 MB payload.

    Prints the iteration count, elapsed seconds and resulting rate.
    '''
    with registry:
        registry.add('StringCounter', ThingSerializer)

        # A JSON document whose single value is 2**20 spaces.
        payload = json.dumps({'one_mb': ' ' * 2**20})
        fc = FeatureCollection()
        fc['thing1'] = Thing(payload)
        fc_str = fc.dumps()

        num = 1000
        start_time = time.time()
        for _ in range(num):
            FeatureCollection.loads(fc_str).dumps()
        elapsed = time.time() - start_time

        rate = float(num) / elapsed
        print('%d MB in %.1f sec --> %.1f MB per sec' % (num, elapsed, rate))
예제 #17
0
def test_eq(counter_type):
    '''Equal features compare equal; a differing feature compares unequal.

    :param counter_type: counter class used to build the feature values
    '''
    # The first two collections are built identically; the third
    # differs in its 'goodbye' feature.
    mc1, mc2, mc3 = [
        FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter(farewell)),
        })
        for farewell in ('goodbye', 'goodbye', 'goodbye2')
    ]

    assert mc1 == mc2
    assert mc1 != mc3
예제 #18
0
def test_default(counter_type):
    '''Check that a FC creates default counters that add properly.

    A missing feature must come back as a ``counter_type`` instance, and
    it must keep that type through ``+=``, ``-=`` and, where the counter
    provides it, ``subtract``.

    :param counter_type: counter class the default feature should be
    '''
    mc = FeatureCollection()
    assert isinstance(mc['foo'], counter_type)

    mc['foo'] += counter_type(Counter('dog'))
    assert isinstance(mc['foo'], counter_type), \
        'failed and made %s' % type(mc['foo'])

    mc['foo'] -= counter_type(Counter('dog'))
    assert isinstance(mc['foo'], counter_type), \
        'failed and made %s' % type(mc['foo'])

    # Guard on the real method name, 'subtract', so this branch actually
    # runs for counter types that provide it.
    if hasattr(mc['foo'], 'subtract'):
        mc['foo'].subtract(counter_type(Counter('dog')))
        assert isinstance(mc['foo'], counter_type), \
            'failed and made %s' % type(mc['foo'])
        # Add the same counts back to cancel the subtraction.
        mc['foo'] += counter_type(Counter('dog'))

    mc['foo'] += counter_type(Counter('dog'))
    assert isinstance(mc['foo'], counter_type), \
        'failed and made %s' % type(mc['foo'])

    mc['foo'] += counter_type(Counter('dog'))
    mc['foo'] += counter_type(Counter('dog cat'))
    # 'd', 'o', 'g' have been added three times; ' ', 'c', 'a', 't' once.
    assert Counter(map(abs, mc['foo'].values())) == Counter({1: 4, 3: 3})
예제 #19
0
def test_multiset_change(counter_type):
    '''Exercise popping, replacing and incrementing a single feature.

    :param counter_type: counter class used to build the feature values
    '''
    ent1 = FeatureCollection()
    ent1['bow'] += counter_type(Counter(['big', 'dog']))
    ent1.pop('bow')
    assert dict(ent1.items()) == {}

    # NOTE: popping a feature that was never set (e.g. 'foo') is left
    # unexercised; the original author noted that it fails.

    # Replace the feature wholesale.
    words = ['big2', 'dog2']
    ent1['bow'] = counter_type(Counter(words))
    assert [abs(count) for count in ent1['bow'].values()] == [1, 1]

    # Adding the same words again doubles each count.
    ent1['bow'] += counter_type(Counter(words))
    assert [abs(count) for count in ent1['bow'].values()] == [2, 2]
예제 #20
0
def test_ignored():
    '''A string feature breaks ``dumps`` unless its name starts with ``_``.'''
    regular = FeatureCollection()
    regular['foo'] = 'bar'
    with pytest.raises(SerializationError):
        regular.dumps()

    underscored = FeatureCollection()
    underscored['_foo'] = 'bar'
    # Must not raise: _foo is ignored!
    underscored.dumps()
예제 #21
0
def test_fc_chunk():
    '''Write two collections to an in-memory chunk and read them back.'''
    written = [
        FeatureCollection({'NAME': {'foo': 2, 'baz': 1}}),
        FeatureCollection({'NAME': {'foo': 4, 'baz': 2}}),
    ]

    out_fh = StringIO()
    writer = FeatureCollectionChunk(file_obj=out_fh, mode='wb')
    for fc in written:
        writer.add(fc)
    writer.flush()

    blob = out_fh.getvalue()
    assert blob

    # Re-open the serialized bytes for reading; exactly two collections
    # are expected, in the order they were added.
    reader = FeatureCollectionChunk(file_obj=StringIO(blob), mode='rb')
    rfc1, rfc2 = list(reader)
    assert written[0] == rfc1
    assert written[1] == rfc2
예제 #22
0
def test_meta_adding(counter_type):
    '''Adding a collection to itself doubles every count.

    :param counter_type: counter class used to build the feature values
    '''
    features = {
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye')),
    }
    mc = FeatureCollection(features)

    mc2 = mc + mc

    # 'hello' has three letters seen once and one seen twice; doubled.
    histogram = Counter(abs(count) for count in mc2['hello'].values())
    assert histogram == Counter({2: 3, 4: 1})
예제 #23
0
def test_build_from_dict(counter_type):
    '''A collection built from a dict keeps the counts and counter type.

    :param counter_type: counter class used to build the feature values
    '''
    features = {
        'hello': counter_type(Counter('hello')),
        'goodbye': counter_type(Counter('goodbye')),
    }
    mc = FeatureCollection(features)

    # 'hello' has three letters seen once and one ('l') seen twice.
    histogram = Counter(abs(count) for count in mc['hello'].values())
    assert histogram == Counter({1: 3, 2: 1})
    assert isinstance(mc['hello'], counter_type)
예제 #24
0
def test_fc_meta_adding_complex(counter_type):
    '''Check ``+``, ``+=`` and repeated ``-=`` on equal collections.

    :param counter_type: counter class used to build the feature values
    '''
    def build():
        return FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter('goodbye')),
        })

    def hello_histogram(collection):
        # Histogram of the absolute counts stored under 'hello'.
        return Counter(abs(n) for n in collection['hello'].values())

    fc = build()
    fc2 = build()
    fc3 = fc + fc2

    assert hello_histogram(fc3) == Counter({2: 3, 4: 1})
    fc += fc2
    assert hello_histogram(fc) == Counter({2: 3, 4: 1})

    # Subtracting once returns to the single counts ...
    fc3 -= fc2
    assert hello_histogram(fc3) == Counter({1: 3, 2: 1})

    # ... and subtracting again leaves no values at all.
    fc3 -= fc2
    assert hello_histogram(fc3) == Counter()
예제 #25
0
def test_meta_adding_complex(counter_type):
    '''Check ``+``, ``+=`` and repeated ``-=`` on equal collections.

    :param counter_type: counter class used to build the feature values
    '''
    single = Counter({1: 3, 2: 1})
    double = Counter({2: 3, 4: 1})

    def abs_counts(collection):
        # Histogram of the absolute counts stored under 'hello'.
        return Counter(map(abs, collection['hello'].values()))

    mc, mc2 = [
        FeatureCollection({
            'hello': counter_type(Counter('hello')),
            'goodbye': counter_type(Counter('goodbye')),
        })
        for _ in range(2)
    ]
    mc3 = mc + mc2

    assert abs_counts(mc3) == double
    mc += mc2
    assert abs_counts(mc) == double

    ## isub tests
    mc3 -= mc2
    assert abs_counts(mc3) == single

    mc3 -= mc2
    assert abs_counts(mc3) == Counter()
def test_read_only_binop():
    '''``+`` on two read-only collections yields a writable sum.'''
    left = FeatureCollection({'NAME': {'foo': 1, 'bar': 1}})
    right = FeatureCollection({'NAME': {'foo': 2, 'bar': 2}})
    for operand in (left, right):
        operand.read_only = True

    total = left + right

    assert total == FeatureCollection({'NAME': {'foo': 3, 'bar': 3}})
    # The sum must not inherit the operands' read-only flag.
    assert not total.read_only
예제 #27
0
def v1_fc_put(request, response, store, kvlclient, tfidf, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    Alternatively, if the request's ``Content-type`` is
    ``text/html``, then a feature collection is generated from the
    HTML. The generated feature collection is then stored and
    returned as a JSON payload.

    For a JSON request body, this endpoint sets status ``201`` after
    storing the collection. An existing feature collection with id
    ``content_id`` is overwritten.

    :param request: incoming request; its headers and body are read
    :param response: outgoing response; its ``status`` is set to 201
      on the JSON path
    :param store: feature collection store, written via ``put``
    :param kvlclient: client handed to ``Folders`` to look up the
      parent folder/subfolder names of ``cid``
    :param tfidf: optional model forwarded to the HTML ETL; any falsy
      value is passed on as ``None``
    :param cid: content id to store the feature collection under
    :return: JSON form of the generated collection on the
      ``text/html`` path, otherwise ``None``
    '''
    # Normalize any falsy placeholder to None for the ETL call.
    tfidf = tfidf or None
    if request.headers.get('content-type', '').startswith('text/html'):
        # The URL is taken from the part of the content id after the
        # first '|', percent-decoded.
        url = urllib.unquote(cid.split('|', 1)[1])
        fc = etl.create_fc_from_html(url, request.body.read(), tfidf=tfidf)
        logger.info('created FC for %r', cid)
        store.put([(cid, fc)])
        return fc_to_json(fc)

    fc = FeatureCollection.from_dict(json.load(request.body))
    keywords = set()
    for subid in fc:
        if not subid.startswith('subtopic'):
            continue
        if subtopic_type(subid) not in ('text', 'manual'):
            continue
        # get the user selected string
        cleansed = cleanse(typed_subtopic_data(fc, subid))
        # Record each word and the whole phrase. ``set.update`` is used
        # instead of ``map(keywords.add, ...)``, which only has an
        # effect where ``map`` is eager.
        keywords.update(cleansed.split())
        keywords.add(cleansed)

    # Folder and subfolder names containing this item become keywords
    # as well; byte strings are decoded as UTF-8 first.
    folders = Folders(kvlclient)
    for fid, sid in folders.parent_subfolders(cid):
        if not isinstance(fid, unicode):
            fid = fid.decode('utf8')
        if not isinstance(sid, unicode):
            sid = sid.decode('utf8')
        keywords.add(cleanse(fid))
        keywords.add(cleanse(sid))

    fc[u'keywords'] = StringCounter(keywords)
    store.put([(cid, fc)])
    response.status = 201
    def add_folder(self, folder_id, ann_id=None):
        '''Create a folder backed by an empty feature collection.

        If ``ann_id`` is set, then the folder is owned by the given user.
        Otherwise, the folder is owned and viewable by all anonymous
        users.

        :param str folder_id: Folder id
        :param str ann_id: Username
        '''
        self.assert_valid_folder_id(folder_id)

        owner = self._annotator(ann_id)
        content_id = self.wrap_folder_content_id(owner, folder_id)

        # The folder is stored as an empty collection under its
        # wrapped content id.
        empty_folder = FeatureCollection()
        self.store.put([(content_id, empty_folder)])
        logger.info('Added folder %r with content id %r',
                    folder_id, content_id)
예제 #29
0
def forum_post_features(row):
    '''Build a feature collection describing one forum post.

    Every key of ``row['author']`` is copied to a ``post_author_<key>``
    feature, the optional ``image_urls`` are counted under
    ``image_url``, and a fixed set of optional post fields is copied to
    ``post_<field>`` after conversion with ``uni``.

    :param row: mapping describing the post; must contain ``'author'``
    :return: the populated ``FeatureCollection``
    '''
    fc = FeatureCollection()

    author = row['author']
    for field in author:
        fc['post_author_' + field] = author[field]

    if 'image_urls' in row:
        fc['image_url'] = StringCounter()
        for link in row['image_urls']:
            fc['image_url'][link] += 1

    optional_fields = (
        'parent_id', 'thread_id', 'thread_link', 'thread_name', 'title',
    )
    for field in optional_fields:
        if field not in row:
            continue
        fc['post_' + field] = uni(row[field])

    return fc
예제 #30
0
def v1_fc_put(request, response, visid_to_dbid, store, cid):
    '''Store a single feature collection.

    The route for this endpoint is:
    ``PUT /dossier/v1/feature-collections/<content_id>``.

    ``content_id`` is the id to associate with the given feature
    collection. The feature collection should be in the request
    body serialized as JSON.

    The id is passed through ``visid_to_dbid`` before it is used as
    the storage key.

    This endpoint returns status ``201`` upon successful storage.
    An existing feature collection with id ``content_id`` is
    overwritten.
    '''
    payload = json.load(request.body)
    fc = FeatureCollection.from_dict(payload)

    db_id = visid_to_dbid(cid)
    store.put([(db_id, fc)])

    response.status = 201