def get_classifications(self):
    logger.info('Loading dual cursors of dump and caesar classifications')
    cursor1 = DB().classifications.getClassifications()
    cursor2 = DB().caesar.getClassifications()
    return DualCursor(cursor1, cursor2)

def run(self, amount=None):
    def _amt(stats):
        return stats['first_classifications']

    if amount is None:
        amount = _amt(DB().classifications.get_stats())
        amount += _amt(DB().caesar._gen_stats(upload=False))
    super().run(amount=amount)

def call(self, args):
    """
    Define what to do if this interface's command was passed
    """
    if args.upload_dump:
        fname = args.upload_dump[0]
        DB().classifications.upload_project_dump(fname)

    if args.upload_golds:
        fname = args.upload_golds[0]
        DB().golds.upload_golds_csv(fname)

    if args.gen_stats:
        DB().classifications._gen_stats()

def get_cursor():
    """
    Generate a cursor with classifications

    Returns
    -------
    swap.db.Cursor
        Classifications
    """
    cursor = DB().classifications.aggregate([
        {'$match': {'gold_label': {'$ne': -1}}},
        {'$group': {
            '_id': '$subject_id',
            'gold': {'$first': '$gold_label'},
            'total': {'$sum': 1},
            'votes': {'$sum': '$annotation'}
        }}
    ])
    return cursor

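# Illustrative sketch, not part of the original module: one way to consume
# the aggregation cursor above. It assumes each document carries the keys
# produced by the $group stage ('_id', 'gold', 'total', 'votes'); the helper
# name is hypothetical.
def vote_fractions():
    fractions = {}
    for doc in get_cursor():
        # fraction of annotations that voted positive for this subject
        fractions[doc['_id']] = doc['votes'] / doc['total']
    return fractions
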
def test_batch_size(self, mock):
    DB().classifications.getClassifications(batch_size=50)
    args, kwargs = mock.call_args
    print(args, kwargs)

    assert 'batchSize' in args[1]
    assert args[1]['batchSize'] == 50

def _init_subjects(self, golds):
    subjects = {}
    for id_, gold in golds.items():
        stats = SubjectStats.from_static(id_, DB())
        subjects[id_] = self.Subject(id_, gold, stats)
    return subjects

def upload_data(data):
    db = DB()
    requests = []

    def write():
        # flush any pending update requests to the db in one bulk write
        nonlocal requests
        print('writing')
        if len(requests) > 0:
            db.subjects.bulk_write(requests)
            requests = []

    i = 0
    print(i)
    for subject, metadata in data.items():
        r = db.subjects.update_metadata(subject, metadata, False)
        requests.append(r)

        # print progress every 10000 subjects
        if i % 10000 == 0:
            sys.stdout.flush()
            sys.stdout.write('%d\r' % i)
        # cap the size of a single bulk write
        if len(requests) > 1e5:
            write()
        i += 1
    write()

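# Illustrative usage sketch: upload_data expects a mapping of subject id to
# a metadata dict and batches the updates into bulk writes. The subject ids
# and metadata keys below are hypothetical, not taken from the original source.
def example_upload():
    data = {
        1234: {'mag': 21.3, 'field': 'A'},
        1235: {'mag': 19.8, 'field': 'B'},
    }
    upload_data(data)
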
def test_get_cursor_type(self):
    DB._instances = {}
    db = DB()._db
    query = [{'$limit': 5}]
    c = Cursor(query, db.classifications)

    assert isinstance(c.getCursor(), pymongo.command_cursor.CommandCursor)

def test_length(self):
    DB._reset()
    db = DB()._db
    query = [{'$limit': 5}]
    c = Cursor(query, db.classifications)
    print(c.next())

    assert len(c) == 5

def get_classifications():
    """
    Get the cursor containing classifications from the db

    Returns
    -------
    swap.db.Cursor
        Cursor with classifications
    """
    return DB().classifications.getClassifications()

def subjects(self, subject_ids):
    """
    Get the gold labels for a set of subjects

    Parameters
    ----------
    subject_ids : list
        List of subject ids (int)
    """
    logger.debug('getting %d subjects', len(subject_ids))
    return DB().golds.get_golds(subject_ids)

def random(self, size, gold=None):
    """
    Get a random sample of gold labels

    Parameters
    ----------
    size : int
        Sample size
    gold : int, optional
        Only sample subjects with this gold label
    """
    logger.debug('Size %d gold filter %s', size, gold)
    return DB().golds.get_random_golds(size, gold)

def test_get_classifications_1(self, mock):
    query = [
        {'$sort': OrderedDict([
            ('seen_before', 1),
            ('classification_id', 1)])},
        {'$match': {'seen_before': False}},
        {'$project': {
            'user_id': 1,
            'subject_id': 1,
            'annotation': 1,
            'session_id': 1}}]

    DB().classifications.getClassifications()
    mock.assert_called_with(query, {'batchSize': 100000})

def consensus(self, size):
    """
    Get the gold labels for the subjects with the highest consensus

    Parameters
    ----------
    size : int
        Number of subjects
    """
    logger.debug('Size %d', size)
    subjects = db_cv().get_consensus(size)
    return DB().golds.get_golds(subjects)

def run(self, amount=None):
    """
    Process all classifications in the DB with SWAP

    .. note::
        Iterates through the classification collection of the database
        and processes each classification one at a time, in the order
        returned by the db. Parameters like max_batch_size are
        hard-coded.

        Prints status.
    """
    if amount is None:
        amount = DB().classifications.get_stats()
        amount = amount['first_classifications']

    self.init_swap()

    # get classifications
    cursor = self.get_classifications()

    # loop over the classification cursor to process
    # classifications one at a time
    logger.info('Start: SWAP Processing %d classifications', amount)

    count = 0
    with progressbar.ProgressBar(max_value=amount) as bar:
        bar.update(count)
        # Loop over all classifications of the query.
        # Note that the exact size of the query might be lower than
        # amount if not all classifications are being queried.
        for cl in cursor:
            # process classification in swap
            cl = Classification.generate(cl)
            self._delegate(cl)
            bar.update(count)
            count += 1

            if config.control.debug and count > config.control.amount:
                break

    if config.back_update:
        logger.info('back_update active: processing changes')
        self.swap.process_changes()

    logger.info('done')

def classify(self, raw_cl):
    # Add a classification received from caesar
    data = self.parse_raw(raw_cl)
    cl = self.gen_cl(data)

    logger.debug('Checking if classification was already received')
    if not self.cl_exists(data):
        logger.debug('Uploading classification to caesar db: %s', str(data))
        DB().caesar.insert(data)

        logger.debug('Adding classification from network: %s', str(cl))
        self.swap.classify(cl)

    subject = self.swap.subjects.get(cl.subject)
    return subject

def getClassifications(self):
    """
    Returns an iterator over all classifications
    """
    # fields to project
    fields = ['user_name', 'subject_id', 'annotation', 'gold_label']

    # if metadata is requested, project only the requested subfield
    if self.meta_data is not None:
        meta_data_field = 'metadata' + '.' + self.meta_data
        fields.append(meta_data_field)

    # define a query
    q = Query()
    q.project(fields)

    # range query on metadata; only meaningful when a metadata field
    # was requested
    if self.meta_data is not None and \
            self.meta_lower is not None and self.meta_upper is not None:
        q.match_range(meta_data_field, self.meta_lower, self.meta_upper)

    # perform query on classification data
    classifications = DB().classifications.aggregate(q.build())

    return classifications

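# Illustrative sketch, not part of the original module: one way to drive the
# method above. It assumes meta_data, meta_lower and meta_upper are plain
# instance attributes that can be set before querying; the helper name and
# the 'loader' parameter are hypothetical.
def classifications_in_range(loader, field, lower, upper):
    # restrict the classification stream to a metadata range, e.g. magnitude
    loader.meta_data = field
    loader.meta_lower = lower
    loader.meta_upper = upper
    return loader.getClassifications()
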
def __init__(self):
    self.getters = []
    self._golds = None
    self.db = DB().golds

def all(self):
    """
    Get all gold labels
    """
    return DB().golds.get_golds()

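# Illustrative usage sketch: it assumes the selection methods above belong to
# the GoldGetter class imported in the command interface, and that the chosen
# golds are read back through its `golds` attribute as done there. The sample
# size is arbitrary and the helper name is hypothetical.
def example_gold_selection():
    gg = GoldGetter()
    gg.random(1000)          # or gg.subjects([...]) / gg.consensus(n) / gg.all()
    return gg.golds
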
def db_cv():
    return DB().controversial

def cl_exists(cl):
    cid = cl['classification_id']
    return DB().caesar.exists(cid) or \
        DB().classifications.exists(cid)

def db_cl():
    return DB().classifications

def get_swap_scores():
    return DB().subjects.get_scores()

def call(self, args):
    swap = None
    scores = None

    if args.load:
        obj = self.load(args.load[0])
        if isinstance(obj, SWAP):
            swap = obj
            scores = swap.score_export()
        elif isinstance(obj, ScoreExport):
            scores = obj

    if args.scores_from_csv:
        fname = args.scores_from_csv[0]
        scores = ScoreExport.from_csv(fname)

    if args.run:
        swap = self.run_swap(args)
        scores = swap.score_export()

    if swap is not None:
        if args.save:
            manifest = self.manifest(swap, args)
            self.save(swap, self.f(args.save[0]), manifest)

        if args.log:
            fname = self.f(args.log[0])
            write_log(swap, fname)

        if args.stats:
            s = swap.stats_str()
            print(s)
            logger.debug(s)

        if args.test:
            from swap.utils.golds import GoldGetter
            gg = GoldGetter()
            logger.debug('applying new gold labels')
            swap.set_gold_labels(gg.golds)
            swap.process_changes()
            logger.debug('done')

        if args.test_reorder:
            self.reorder_classifications(swap)

        if args.export_user_scores:
            fname = self.f(args.export_user_scores[0])
            self.export_user_scores(swap, fname)

    if scores is not None:
        if args.save_scores:
            DB().subjects.save_scores(scores)

        if args.scores_to_csv:
            self.scores_to_csv(scores, args.scores_to_csv[0])

        self.plot(args, swap, scores)

    if args.shell:
        import code
        code.interact(local=locals())

    return swap