예제 #1
0
    def test_metadata(self):
        # Checks the timestamp bounds reported for the 'testtrail.tdb' fixture.
        tdb = TrailDB('testtrail.tdb')
        full_range = (1, 3)

        self.assertEqual(1, tdb.min_timestamp())
        self.assertEqual(3, tdb.max_timestamp())
        self.assertEqual(full_range, tdb.time_range())

        # The same tuple is expected when parsetime=False is passed.
        self.assertEqual(full_range, tdb.time_range(parsetime=False))
예제 #2
0
 def test_simple_disjunction(self):
     # Exercises the shorthand filter form: a flat list of terms rather
     # than a list of lists.
     db = TrailDB('testtrail')
     shorthand_filter = [('field1', 'a'), ('field2', '4')]
     matched = list(db.trail(0, event_filter=shorthand_filter))

     self.assertEqual(len(matched), 2)
     first, second = matched
     self.assertEqual((first.field1, first.field2), ('a', '1'))
     self.assertEqual((second.field1, second.field2), ('d', '4'))
예제 #3
0
 def test_negation(self):
     # NOTE(review): the trailing True presumably marks the
     # ('field3', 'x') term as negated -- confirm against the filter API.
     db = TrailDB('testtrail')
     negated_term = ('field3', 'x', True)
     matched = list(db.trail(0, event_filter=[negated_term]))

     self.assertEqual(len(matched), 1)
     only = matched[0]
     self.assertEqual((only.field1, only.field2, only.field3),
                      ('c', '3', 'y'))
예제 #4
0
    def test_apply_blacklist(self):
        # Builds a five-trail database, blacklists two of the uuids and
        # checks that only the blacklisted trails come back empty.
        uuids = [
            "02345678123456781234567812345678",
            "12345678123456781234567812345678",
            "22345678123456781234567812345678",
            "32345678123456781234567812345678",
            "42345678123456781234567812345678"
        ]
        # Every uuid receives the same three events.
        sample_events = ((1, ('a', '1')), (2, ('b', '2')), (3, ('c', '3')))
        builder = TrailDBConstructor('blacklist_testtrail',
                                     ['field1', 'field2'])
        for uuid in uuids:
            for timestamp, values in sample_events:
                builder.add(uuid, timestamp, list(values))
        builder.finalize()

        tdb = TrailDB('blacklist_testtrail')
        blacklist = uuids[1:3]
        tdb.apply_blacklist(blacklist)
        found_trails = list(tdb.trails(parsetime=False))

        for trail_uuid, cursor in found_trails:
            # Blacklisted trails must yield no events; the rest keep all three.
            expected_length = 0 if trail_uuid in blacklist else 3
            remaining = list(cursor)
            self.assertEqual(len(remaining), expected_length)
예제 #5
0
 def test_simple_disjunction(self):
     # Shorthand notation: the filter is a flat list of terms, not a
     # list of lists.
     database = TrailDB('testtrail')
     terms = [('field1', 'a'), ('field2', '4')]
     hits = list(database.trail(0, event_filter=terms))
     self.assertEqual(len(hits), 2)

     expected_pairs = [('a', '1'), ('d', '4')]
     for hit, expected in zip(hits, expected_pairs):
         self.assertEqual((hit.field1, hit.field2), expected)
예제 #6
0
 def test_conjunction(self):
     # The filter is a list of two clauses (a list of lists); exactly one
     # event of trail 0 is expected to match it.
     db = TrailDB('testtrail')
     field1_clause = [('field1', 'e'), ('field1', 'c')]
     field3_clause = [('field3', 'y', True)]
     matched = list(db.trail(0, event_filter=[field1_clause, field3_clause]))

     self.assertEqual(len(matched), 1)
     hit = matched[0]
     self.assertEqual((hit.field1, hit.field2), ('e', '5'))
예제 #7
0
 def test_time_range(self):
     # Filters trail 0 with a single (2, 4) term and raw timestamps
     # (parsetime=False); two events, at times 2 and 3, are expected.
     # NOTE(review): (2, 4) looks like a start/end time pair -- confirm the
     # bound semantics against the filter API.
     tdb = TrailDB('testtrail')
     events = list(tdb.trail(0,
                             event_filter=[[(2, 4)]],
                             parsetime=False))
     self.assertEqual(len(events), 2)
     # Plain int literals: the ``L`` long suffix is a SyntaxError on
     # Python 3, and on Python 2 a long compares equal to the same int.
     self.assertEqual(events[0].time, 2)
     self.assertEqual(events[1].time, 3)
예제 #8
0
    def test_silly_open(self):
        # Only the path with the '.tdb' suffix exists on disk.
        suffixed_exists = os.path.exists('testtrail.tdb')
        self.assertTrue(suffixed_exists)
        bare_exists = os.path.exists('testtrail')
        self.assertFalse(bare_exists)

        # Both spellings are opened; the test fails if either call raises.
        with_suffix = TrailDB('testtrail.tdb')
        without_suffix = TrailDB('testtrail')

        # Opening 'foo.tdb' must raise TrailDBError.
        with self.assertRaises(TrailDBError):
            TrailDB('foo.tdb')
예제 #9
0
 def test_filter_object(self):
     # Creates one filter object and runs the identical query twice with it.
     db = TrailDB('testtrail')
     clauses = [[('field1', 'e'), ('field1', 'c')],
                [('field3', 'y', True)]]
     reusable_filter = db.create_filter(clauses)

     for _attempt in range(2):
         matched = list(db.trail(0, event_filter=reusable_filter))
         self.assertEqual(len(matched), 1)
         self.assertEqual((matched[0].field1, matched[0].field2), ('e', '5'))
예제 #10
0
 def test_filter_object(self):
     # The same filter object is passed to two consecutive queries, and
     # both must return the single ('e', '5') event.
     database = TrailDB('testtrail')
     filter_obj = database.create_filter(
         [[('field1', 'e'), ('field1', 'c')],
          [('field3', 'y', True)]])
     expected = ('e', '5')

     first_pass = list(database.trail(0, event_filter=filter_obj))
     self.assertEqual(len(first_pass), 1)
     self.assertEqual((first_pass[0].field1, first_pass[0].field2), expected)

     second_pass = list(database.trail(0, event_filter=filter_obj))
     self.assertEqual(len(second_pass), 1)
     self.assertEqual((second_pass[0].field1, second_pass[0].field2), expected)
예제 #11
0
    def test_crumbs(self):
        # Walks every trail of 'testtrail.tdb' and checks its uuid, the
        # cursor type and the number of events.
        tdb = TrailDB('testtrail.tdb')

        seen = 0
        for trail_uuid, cursor in tdb.trails():
            seen += 1
            self.assertEqual(self.uuid, trail_uuid)
            self.assertIsInstance(cursor, TrailDBCursor)
            events = list(cursor)
            self.assertEqual(3, len(events))

        # Exactly one trail must have been yielded.
        self.assertEqual(1, seen)
예제 #12
0
    def test_crumbs(self):
        # Each yielded pair must carry the fixture uuid and a TrailDBCursor
        # holding three events; one pair is expected in total.
        database = TrailDB('testtrail.tdb')
        trail_count = 0

        for found_uuid, found_trail in database.trails():
            trail_count = trail_count + 1
            self.assertEqual(self.uuid, found_uuid)
            self.assertIsInstance(found_trail, TrailDBCursor)
            event_total = len(list(found_trail))
            self.assertEqual(3, event_total)

        self.assertEqual(1, trail_count)
예제 #13
0
    def test_trails(self):
        # Fetches trail 0 and checks that each of its three events exposes
        # the 'time', 'field1' and 'field2' attributes.
        tdb = TrailDB('testtrail')
        self.assertEqual(1, tdb.num_trails)

        cursor = tdb.trail(0)
        self.assertIsInstance(cursor, TrailDBCursor)

        # Materialise the cursor so the events can be counted and revisited.
        events = list(cursor)
        self.assertEqual(3, len(events))
        for event in events:
            for attribute in ('time', 'field1', 'field2'):
                self.assertTrue(hasattr(event, attribute))
예제 #14
0
    def test_trails(self):
        # One trail is expected in 'testtrail'; its cursor must yield three
        # events that all carry time, field1 and field2.
        database = TrailDB('testtrail')
        self.assertEqual(1, database.num_trails)

        first_trail = database.trail(0)
        self.assertIsInstance(first_trail, TrailDBCursor)

        collected = list(first_trail)  # consume the cursor up front
        self.assertEqual(3, len(collected))

        required_attrs = ['time', 'field1', 'field2']
        for item in collected:
            for name in required_attrs:
                self.assertTrue(hasattr(item, name))
예제 #15
0
def loading():
    """Count and print user edits and IP edits in the Wikipedia history TrailDB.

    An event with a non-empty ``user`` counts as a user edit; otherwise an
    event with a non-empty ``ip`` counts as an IP edit.
    """
    db = TrailDB("/mnt/data/wikipedia-history-small.tdb")
    user_total = 0
    ip_total = 0

    for _uuid, events in db.trails():
        for ev in events:
            if ev.user != "":
                user_total += 1
                continue
            # Only consulted when the event has no user name.
            if ev.ip != "":
                ip_total += 1

    print("User edits: {}".format(user_total))
    print("IP edits: {}".format(ip_total))
예제 #16
0
def get_dataframe():
    """Build a DataFrame with one row per event of 'pydata-tutorial.tdb'.

    Columns: ``x`` is the event time relative to the database's minimum
    timestamp divided by ``24 * 3600`` (via ``old_div``), ``y`` is the
    index of the trail as yielded by ``get_events``, and ``type`` is a
    categorical 'user' / 'anon' label derived from ``event.user``.
    """
    tdb = TrailDB('pydata-tutorial.tdb')
    base = tdb.min_timestamp()
    day_length = 24 * 3600
    day_offsets, row_numbers, labels = [], [], []

    # Variation to try: iterate over
    # sorted(get_events(tdb), reverse=True) instead.
    for row, (first_ts, events) in enumerate(get_events(tdb)):
        for event in events:
            day_offsets.append(old_div(int(event.time - base), day_length))
            row_numbers.append(row)
            labels.append('user' if event.user else 'anon')

    frame = pd.DataFrame({'x': day_offsets, 'y': row_numbers})
    frame['type'] = pd.Series(labels, dtype='category')
    return frame
def get_dataframe():
    """Build a DataFrame with one row per event of 'pydata-tutorial.tdb'.

    Columns: ``x`` is the whole number of ``24 * 3600``-sized units between
    the event time and the database's minimum timestamp, ``y`` is the index
    of the trail as yielded by ``get_events``, and ``type`` is a categorical
    'user' / 'anon' label derived from ``event.user``.
    """
    tdb = TrailDB('pydata-tutorial.tdb')
    base = tdb.min_timestamp()
    types = []
    xs = []
    ys = []
    # Variation to try: iterate over
    # sorted(get_events(tdb), reverse=True) instead.
    for y, (first_ts, events) in enumerate(get_events(tdb)):
        for event in events:
            # Explicit floor division keeps ``x`` an integer on Python 3 as
            # well; a bare ``/`` between ints yields a float there.
            xs.append(int(event.time - base) // (24 * 3600))
            ys.append(y)
            types.append('user' if event.user else 'anon')
    data = pd.DataFrame({'x': xs, 'y': ys})
    data['type'] = pd.Series(types, dtype='category')
    return data
예제 #18
0
    def test_metadata(self):
        # Minimum timestamp 1, maximum 3, and a matching (1, 3) time range
        # are expected from 'testtrail.tdb'.
        database = TrailDB('testtrail.tdb')
        lowest, highest = 1, 3

        self.assertEqual(lowest, database.min_timestamp())
        self.assertEqual(highest, database.max_timestamp())
        self.assertEqual((lowest, highest), database.time_range())

        # Repeated with parsetime=False; the expected tuple is unchanged.
        self.assertEqual((lowest, highest),
                         database.time_range(parsetime=False))
예제 #19
0
    def test_trails_selected_uuids(self):
        # Builds a five-trail database and iterates only over a whitelist of
        # three uuids; every yielded trail must contain all three events.
        uuids = [
            "02345678123456781234567812345678",
            "12345678123456781234567812345678",
            "22345678123456781234567812345678",
            "32345678123456781234567812345678",
            "42345678123456781234567812345678"
        ]
        # Every uuid receives the same three events.
        sample_events = ((1, ('a', '1')), (2, ('b', '2')), (3, ('c', '3')))
        builder = TrailDBConstructor('whitelist_testtrail',
                                     ['field1', 'field2'])
        for uuid in uuids:
            for timestamp, values in sample_events:
                builder.add(uuid, timestamp, list(values))
        builder.finalize()

        tdb = TrailDB('whitelist_testtrail')
        whitelist = [uuids[index] for index in (0, 3, 4)]

        expected_length = 3
        for trail_uuid, cursor in tdb.trails(selected_uuids=whitelist):
            selected_events = list(cursor)
            self.assertEqual(len(selected_events), expected_length)
예제 #20
0
    def test_lexicons(self):
        tdb = TrailDB('testtrail')

        # Field 1.
        # NOTE(review): the size is expected to be 4 although three values
        # are listed -- presumably the size counts an extra empty entry;
        # confirm against the TrailDB lexicon documentation.
        self.assertEqual(4, tdb.lexicon_size(1))
        first_values = list(tdb.lexicon(1))
        self.assertEqual(['a', 'b', 'c'], first_values)

        # Field 2.
        second_values = list(tdb.lexicon(2))
        self.assertEqual(['1', '2', '3'], second_values)

        # Field 3 is out of bounds and must raise.
        with self.assertRaises(TrailDBError):
            tdb.lexicon(3)
예제 #21
0
def traildb_to_coo(db, fieldname):
    """Build a sparse COO matrix for ``fieldname`` from the TrailDB at ``db``.

    Args:
        db: Path of the TrailDB, as a string (it is ``.encode()``-d before
            being handed to ``traildb_coo_repr_func``).
        fieldname: Name of the field, as a string (also encoded).

    Returns:
        A 4-tuple ``(uuids, timestamps, cols, matrix)`` where ``uuids`` is a
        ``(num_events, 16)`` uint8 array, ``timestamps`` a uint64 array,
        ``cols`` the return value of ``traildb_coo_repr_func`` and
        ``matrix`` a ``scipy.sparse.coo_matrix`` of ones.

    Raises:
        ImportError: If ``TrailDB`` is falsy (traildb is unavailable).
    """
    if not TrailDB:
        raise ImportError("Could not find traildb")

    # Open the database only long enough to read its event count.
    handle = TrailDB(db)
    event_count = handle.num_events
    del handle

    # Buffers passed to traildb_coo_repr_func.
    # NOTE(review): presumably filled in place by that helper -- confirm.
    row_indices = np.zeros(event_count, dtype=np.uint64)
    col_indices = np.zeros(event_count, dtype=np.uint64)
    uuid_bytes = np.zeros((event_count, 16), dtype=np.uint8)
    event_times = np.zeros(event_count, dtype=np.uint64)

    cols = traildb_coo_repr_func(db.encode(), fieldname.encode(),
                                 row_indices, col_indices,
                                 uuid_bytes, event_times)
    matrix = sparse.coo_matrix(
        (np.ones(event_count), (row_indices, col_indices)))
    return uuid_bytes, event_times, cols, matrix
예제 #22
0
    def test_lexicons(self):
        database = TrailDB('testtrail')
        expected_first = ['a', 'b', 'c']
        expected_second = ['1', '2', '3']

        # First field: check the reported size, then the listed values.
        self.assertEqual(4, database.lexicon_size(1))
        self.assertEqual(expected_first, list(database.lexicon(1)))

        # Second field: only the listed values are checked.
        self.assertEqual(expected_second, list(database.lexicon(2)))

        # A third field does not exist, so the lookup must raise.
        with self.assertRaises(TrailDBError):
            database.lexicon(3)
예제 #23
0
 def test_fields(self):
     # The fixture must expose exactly these field names, in this order.
     expected_fields = ['time', 'field1', 'field2']
     tdb = TrailDB('testtrail')
     self.assertEqual(expected_fields, tdb.fields)
예제 #24
0
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from past.utils import old_div
from random import random
import sys

from traildb import TrailDB, TrailDBConstructor


def extract(tdb, cons, sample_size):
    """Copy a random sample of the trails in ``tdb`` into ``cons``.

    Each trail is kept when ``random() < sample_size``; a kept trail has
    every event re-added to ``cons`` under the same uuid and time, with the
    event's values taken from position 1 onwards.

    Args:
        tdb: Source object whose ``trails()`` yields ``(uuid, trail)`` pairs.
        cons: Constructor-like object providing ``add`` and ``finalize``.
        sample_size: Threshold compared against ``random()``.

    Returns:
        Whatever ``cons.finalize()`` returns.
    """
    for trail_uuid, events in tdb.trails():
        if not random() < sample_size:
            continue
        for ev in events:
            # Drop the leading element of the event; the time is passed
            # separately as ``ev.time``.
            values = list(ev)[1:]
            cons.add(trail_uuid, ev.time, values)
    return cons.finalize()


if __name__ == '__main__':
    # Three positional arguments are required, so argv needs at least four
    # entries. Checking for fewer than three let a missing percentage fall
    # through to an IndexError on sys.argv[3].
    if len(sys.argv) < 4:
        print(
            'Usage: extract_sample source_tdb destination_tdb sample_percentage'
        )
        sys.exit(1)
    tdb = TrailDB(sys.argv[1])
    # The destination is created with every source field except the first.
    cons = TrailDBConstructor(sys.argv[2], tdb.fields[1:])
    # The percentage argument is divided by 100 before being used as the
    # sampling threshold.
    num = extract(tdb, cons, old_div(float(sys.argv[3]), 100.)).num_trails
    print('Extracted %d trails to %s' % (num, sys.argv[2]))
예제 #25
0
from traildb import TrailDBConstructor, TrailDB
from uuid import uuid4
from datetime import datetime

# Build a small TrailDB named 'tiny' with two fields.
cons = TrailDBConstructor('tiny', ['username', 'action'])

# Three trails, one per generated uuid, each with three events.
for i in range(3):
    uuid = uuid4().hex
    # The format string needs a conversion specifier; applying ``% i`` to a
    # string without one raises TypeError.
    username = 'user%d' % i
    for day, action in enumerate(['open', 'save', 'close']):
        cons.add(uuid, datetime(2016, i + 1, day + 1), (username, action))

cons.finalize()

# Single-argument print() produces the same output on Python 2 and 3.
for uuid, trail in TrailDB('tiny').trails():
    print('%s %s' % (uuid, list(trail)))
예제 #26
0
import sys
from traildb import TrailDB

# Largest gap between consecutive timestamps that still belongs to the same
# session. NOTE(review): presumably 30 minutes expressed in seconds --
# depends on the timestamp unit of the database.
SESSION_LIMIT = 30 * 60


def sessions(tdb):
    """Print the number of sessions and events of every trail in ``tdb``.

    Trails are read with ``only_timestamp=True``. A new session is counted
    whenever two consecutive timestamps are more than ``SESSION_LIMIT``
    apart.

    Args:
        tdb: Object whose ``trails(only_timestamp=True)`` yields
            ``(uuid, trail)`` pairs, ``trail`` being an iterator of
            timestamps with at least one element.

    Raises:
        StopIteration: If a trail yields no timestamp at all.
    """
    for i, (uuid, trail) in enumerate(tdb.trails(only_timestamp=True)):
        # The built-in next() works on Python 2.6+ and 3; the ``.next()``
        # method only exists on Python 2 iterators.
        prev_time = next(trail)
        num_events = 1
        num_sessions = 1
        for timestamp in trail:
            if timestamp - prev_time > SESSION_LIMIT:
                num_sessions += 1
            prev_time = timestamp
            num_events += 1
        # Single-argument print() behaves identically on Python 2 and 3.
        print('Trail[%d] Number of Sessions: %d Number of Events: %d' %
              (i, num_sessions, num_events))


if __name__ == '__main__':
    # The TrailDB path is the only argument.
    if len(sys.argv) < 2:
        # Single-argument print() works on both Python 2 and Python 3; the
        # print statement is a SyntaxError on Python 3.
        print('Usage: tutorial_wikipedia_sessions <wikipedia-history.tdb>')
    else:
        sessions(TrailDB(sys.argv[1]))
예제 #27
0
 def test_negation(self):
     # A single three-element term is used as the filter; one event of
     # trail 0 is expected back.
     # NOTE(review): the True flag presumably negates the term -- confirm.
     database = TrailDB('testtrail')
     hits = list(database.trail(0, event_filter=[('field3', 'x', True)]))
     self.assertEqual(len(hits), 1)

     observed = (hits[0].field1, hits[0].field2, hits[0].field3)
     self.assertEqual(observed, ('c', '3', 'y'))
예제 #28
0
def item_top():
    """Return the five most common ``title`` items of 'pydata-tutorial'.

    Events are read with ``rawitems=True`` and counted by ``event.title``;
    each of the top five items is then passed through
    ``tdb.get_item_value``. The result is a list of ``(value, count)``
    tuples.
    """
    tdb = TrailDB('pydata-tutorial')
    stats = Counter()
    for _uuid, trail in tdb.trails(rawitems=True):
        stats.update(event.title for event in trail)

    top = []
    for item, frequency in stats.most_common(5):
        top.append((tdb.get_item_value(item), frequency))
    return top
def string_top():
    """Return the five most common ``event.title`` values of 'pydata-tutorial'.

    The result is the ``Counter.most_common(5)`` list of ``(title, count)``
    tuples over all events of all trails.
    """
    tdb = TrailDB('pydata-tutorial')
    title_counts = Counter()
    for _uuid, trail in tdb.trails():
        for event in trail:
            title_counts[event.title] += 1
    return title_counts.most_common(5)
예제 #30
0
 def test_uuids(self):
     # The fixture uuid must map to trail id 0 and back, and must be
     # reported as contained in the database.
     tdb = TrailDB('testtrail')
     trail_id = tdb.get_trail_id(self.uuid)
     self.assertEqual(0, trail_id)
     self.assertEqual(self.uuid, tdb.get_uuid(0))
     self.assertIn(self.uuid, tdb)
def item_top():
    """Return the five most frequent raw ``title`` items with their values.

    Trails of 'pydata-tutorial' are read with ``rawitems=True``; the five
    most common ``event.title`` items are each passed through
    ``get_item_value`` and returned as ``(value, count)`` tuples.
    """
    database = TrailDB('pydata-tutorial')
    frequencies = Counter()
    for _uuid, events in database.trails(rawitems=True):
        for event in events:
            frequencies[event.title] += 1
    leaders = frequencies.most_common(5)
    return [(database.get_item_value(raw_item), count)
            for raw_item, count in leaders]
예제 #32
0
 def test_uuids(self):
     # uuid -> trail id, trail id -> uuid, and membership are all checked
     # against the fixture uuid stored on the test case.
     database = TrailDB('testtrail')
     expected_uuid = self.uuid
     self.assertEqual(0, database.get_trail_id(expected_uuid))
     self.assertEqual(expected_uuid, database.get_uuid(0))
     is_member = expected_uuid in database
     self.assertTrue(is_member)
예제 #33
0
def string_top():
    """Return the top five ``(title, count)`` pairs of 'pydata-tutorial'.

    Every event of every trail contributes its ``title`` to a ``Counter``.
    """
    database = TrailDB('pydata-tutorial')
    frequencies = Counter()
    for _uuid, events in database.trails():
        frequencies.update(event.title for event in events)
    return frequencies.most_common(5)
예제 #34
0
 def test_conjunction(self):
     # Two clauses are combined in one list-of-lists filter; the single
     # matching event must be ('e', '5').
     database = TrailDB('testtrail')
     combined_filter = [
         [('field1', 'e'), ('field1', 'c')],
         [('field3', 'y', True)],
     ]
     hits = list(database.trail(0, event_filter=combined_filter))
     self.assertEqual(len(hits), 1)
     self.assertEqual((hits[0].field1, hits[0].field2), ('e', '5'))