Example #1
def do_test(self, m_class, db, args, i):
    self.i = i
    # create tokenizer
    tzer = u.class_by_name(args.tokenizer)(args.ngram)
    # load training & testing tweets from database
    exu = None if args.dup_users else set()
    (tr_tweets, tr_users) = self.fetch(db, args.srid, 'training', tzer,
                                       args.fields, args.unify_fields, exu)
    exu = None if args.dup_users else tr_users
    (te_tweets, _) = self.fetch(db, args.srid, 'testing', tzer,
                                args.fields, args.unify_fields, exu)
    if (not args.skip_small_tests or
         self.enough_data_p(len(tr_tweets), len(te_tweets))):
       self.attempted = True
    else:
       l.info('insufficient data, skipping test %s' % (self))
       self.attempted = False
       self.results = []
       return
    # tokenize training tweets
    tr_tokens = self.group_tokens(tr_tweets,
                                  args.trim_head, args.min_instances)
    self.train_tweet_ct = len(tr_tweets)
    self.train_token_ct = len(tr_tokens)
    # downsample test tweets
    if (len(te_tweets) > args.test_tweet_limit):
       te_tweets = u.rand.sample(te_tweets, args.test_tweet_limit)
       l.info('sampled %d test tweets per --test-tweet-limit'
              % (args.test_tweet_limit))
    self.test_tweet_ct = len(te_tweets)
    # build model
    self.model = m_class(tr_tokens, args.srid, tr_tweets)
    l.debug('starting model build')
    t_start = time.time()
    self.model.build()
    l.info('built model in %s' % (u.fmt_seconds(time.time() - t_start)))
    t_start = time.time()
    # test 'em
    self.results = multicore.do(test_tweet,
                                (self.model, args.fields), te_tweets)
    l.info('tested tweets in %s' % (u.fmt_seconds(time.time() - t_start)))
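
The final multicore.do call fans test_tweet out across cores. The quac sources are not reproduced here, so the following is a rough sketch only (the names do and _apply, and the fn(*static_args, item) calling convention, are assumptions about the real multicore module): a helper of this shape maps the function over the test tweets with a standard process pool.

import multiprocessing

def _apply(packed):
    # Runs in a worker process, so it must live at module top level
    # (everything crossing the process boundary has to be picklable).
    (fn, static_args, item) = packed
    return fn(*(tuple(static_args) + (item,)))

def do(fn, static_args, items, ncores=None):
    # Evaluate fn(*static_args, item) for each item on a process
    # pool; results come back in the same order as items.
    pool = multiprocessing.Pool(ncores)
    try:
        return pool.map(_apply, [(fn, static_args, it) for it in items])
    finally:
        pool.close()
        pool.join()

Under those assumptions, the call in do_test would compute test_tweet(self.model, args.fields, tweet) for every test tweet, in parallel and in input order.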
Example #2
def main(self):
    u.memory_use_log()
    t_start = time.time()
    db = db_glue.DB(self.args.database_file)
    l.info('opened database %s' % (self.args.database_file))
    assert (db.metadata_get('schema_version') == '5')
    # normalize start and end times
    if (self.args.start is None):
       sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
       self.args.start = db.sql(sql)[0]['st']
    if (self.args.end is None):
       sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
       # add one second because end time is exclusive
       self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
    self.args.start = time_.as_utc(self.args.start)
    self.args.end = time_.as_utc(self.args.end)
    # print test sequence parameters
    self.log_parameters()
    # set up model parameters
    model_class = u.class_by_name(self.args.model)
    model_class.parms_init(self.args.model_parms, log_parms=True)
    # build schedule
    self.schedule_build(self.args.limit)
    l.info('scheduled %s tests (%s left over)'
           % (len(self.schedule), self.args.end - self.schedule[-1].end))
    if (not os.path.exists(self.args.output_dir)):
       os.mkdir(self.args.output_dir)
    l.info('results in %s' % (self.args.output_dir))
    # testing loop
    for (i, t) in enumerate(self.schedule):
       if (i+1 < self.args.start_test):
          l.info('using saved test %d per --start-test' % (i+1))
          l.warning('token and tweet counts will be incorrect')
          # FIXME: hack.....
          try:
             t.model = u.Deleted_To_Save_Memory()
             t.results = u.Deleted_To_Save_Memory()
             t.i = i
             t.train_tweet_ct = -1e6
             t.train_token_ct = -1e6
             t.test_tweet_ct = -1e6
             t.unshrink_from_disk(self.args.output_dir, results=True)
             t.attempted = True
          except IOError as x:
             # errno 2 (ENOENT): saved results are missing, not corrupt.
             if (x.errno != 2):
                raise
             t.attempted = False
       else:
          l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
          t.do_test(model_class, db, self.args, i)
       t.summarize()
       if (t.attempted):
          if (self.args.profile_memory):
             # We dump a memory profile here because it's the high water
             # mark; we're about to reduce usage significantly.
             import meliae.scanner as ms
             filename = 'memory.%d.json' % (i)
             l.info('dumping memory profile %s' % (filename))
             ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
          t.shrink_to_disk(self.args.output_dir)
       l.debug('result: %s' % (t.summary))
       u.memory_use_log()
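
Both examples turn command-line strings into classes via u.class_by_name (args.tokenizer in Example #1, self.args.model here). A minimal version of that idiom, assuming the helper takes a dotted 'module.Class' path (the real u.class_by_name may accept other forms), looks like this:

import importlib

def class_by_name(name):
    # Split 'package.module.Class' into module path and class name,
    # import the module, and fetch the class attribute from it.
    (module_name, _, class_name) = name.rpartition('.')
    module = importlib.import_module(module_name)
    return getattr(module, class_name)

For instance, class_by_name('collections.OrderedDict') returns the OrderedDict class, which can then be instantiated just as the tokenizer and model classes are above.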
Example #3
def main(self):
     u.memory_use_log()
     t_start = time.time()
     # Replaced with self.cur in __init__
     # db = db_glue.DB(self.args.database_file)
     # assert (db.metadata_get('schema_version') == '5')
     # normalize start and end times
     if (self.args.start is None):
         sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
         self.cur.execute(sql)
         self.args.start = self.cur.fetchone()[0]
     if (self.args.end is None):
         sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
         self.cur.execute(sql)
         # add one second because end time is exclusive
         self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
     self.args.start = time_.as_utc(self.args.start)
     self.args.end = time_.as_utc(self.args.end)
     # print test sequence parameters
     self.log_parameters()
     # set up model parameters
     model_class = u.class_by_name(self.args.model)
     model_class.parms_init(self.args.model_parms, log_parms=True)
     # build schedule
     self.schedule_build(self.args.limit)
     l.info('scheduled %s tests (%s left over)'
            % (len(self.schedule), self.args.end - self.schedule[-1].end))
     if (not os.path.exists(self.args.output_dir)):
         os.mkdir(self.args.output_dir)
     l.info('results in %s' % (self.args.output_dir))
     # testing loop
     for (i, t) in enumerate(self.schedule):
         if (i+1 < self.args.start_test):
             l.info('using saved test %d per --start-test' % (i+1))
             l.warning('token and tweet counts will be incorrect')
             # FIXME: hack.....
             try:
                 t.model = u.Deleted_To_Save_Memory()
                 t.results = u.Deleted_To_Save_Memory()
                 t.i = i
                 t.train_tweet_ct = -1e6
                 t.train_token_ct = -1e6
                 t.test_tweet_ct = -1e6
                 t.unshrink_from_disk(self.args.output_dir, results=True)
                 t.attempted = True
             except IOError as x:
                 # errno 2 (ENOENT): saved results are missing, not corrupt.
                 if (x.errno != 2):
                     raise
                 t.attempted = False
         else:
             l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
             t.do_test(model_class, self.cur, self.args, i)
         t.summarize()
         if (t.attempted):
             if (self.args.profile_memory):
                 # We dump a memory profile here because it's the high water
                 # mark; we're about to reduce usage significantly.
                 import meliae.scanner as ms
                 filename = 'memory.%d.json' % (i)
                 l.info('dumping memory profile %s' % (filename))
                 ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
             t.shrink_to_disk(self.args.output_dir)
         l.debug('result: %s' % (t.summary))
         u.memory_use_log()
     # done!
     l.debug('computing summary')
     self.summarize()
     l.debug('summary: %s' % (self.summary))
     l.debug('saving TSV results')
     test_indices = u.sl_union_fromtext(len(self.schedule), ':')
     self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                         test_indices)
     l.debug('saving pickled summary')
     self.memory_use = u.memory_use()
     self.memory_use_peak = "Not implemented"
     self.time_use = time.time() - t_start
     u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
     u.memory_use_log()
     l.info('done in %s' % (u.fmt_seconds(self.time_use)))
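
The start/end normalization at the top of main takes min(created_at) for the start and max(created_at) plus one second for the end, because the end bound is exclusive. That piece can be exercised standalone; here is a self-contained sqlite3 illustration with made-up rows (the single-column table layout is an assumption for the demo):

import sqlite3
from datetime import datetime, timedelta

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute('CREATE TABLE tweet (created_at TEXT)')
cur.executemany('INSERT INTO tweet VALUES (?)',
                [('2012-03-01 00:00:00',), ('2012-03-04 18:30:00',)])
cur.execute('SELECT min(created_at), max(created_at) FROM tweet')
(start_s, end_s) = cur.fetchone()
fmt = '%Y-%m-%d %H:%M:%S'
start = datetime.strptime(start_s, fmt)
# The end bound is exclusive, so nudge it one second past the last tweet.
end = datetime.strptime(end_s, fmt) + timedelta(seconds=1)
print(start, end)  # 2012-03-01 00:00:00  2012-03-04 18:30:01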