def _stats_pcts(self):
    """Compute match counts and coverage ratios for the stats overview.

    For each statistic, two queries are run against ``self.table``: one
    counting rows whose status differs from ``self.model.SKIP``, and one
    adding the ``score > 0`` restriction.

    Returns:
        An ``OrderedDict`` mapping a label to a list laid out as
        ``[matches, matches_with_score, matches_with_score / matches]``.
        For labels with a known reference total, the list is extended with
        ``[total, matches / total, matches_with_score / total]``.
        Any ratio whose denominator is zero is reported as ``0.0``.
    """
    def ratio(part, whole):
        # An empty table or an empty reference set would otherwise raise
        # ZeroDivisionError and abort the whole statistics computation.
        return part / whole if whole else 0.0

    mh = MediaHaven()
    nl_count = len(Datasources['namenlijst']['func']())
    mh_count = len(mh.search('+(workflow:GMS) +(archiveStatus:on_tape)'))

    # (label, SQL select expression, reference total or None when there is
    # nothing to compare the match count against).
    specs = (
        ('', 'COUNT(*)', None),
        ('names from IFFM namenlijst', 'COUNT(DISTINCT nmlid)', nl_count),
        ('newspaper pages', 'COUNT(DISTINCT pid)', mh_count),
    )

    data = OrderedDict()
    for label, select_expr, total in specs:
        # The interpolated values are the select expression above, the
        # table name and the integer SKIP status -- none is user input.
        args = (select_expr, self.table, self.model.SKIP)

        res = self.db.execute('SELECT %s FROM %s WHERE status != %d' % args)
        matches = int(res.scalar())

        res = self.db.execute(
            'SELECT %s FROM %s WHERE status != %d and score > 0' % args)
        matches_with_score = int(res.scalar())

        counts = [
            matches,
            matches_with_score,
            ratio(matches_with_score, matches),
        ]
        if total is not None:
            counts.extend((
                total,
                ratio(matches, total),
                ratio(matches_with_score, total),
            ))
        data[label] = counts

    return data
if args.table: table_name = args.table try: table = meta.tables[table_name] except KeyError: raise FileNotFoundError('Couldnt find table "%s"' % table_name) start = 0 if vars(args)['continue']: start = db.execute(func.max(table.c.doc_index)).scalar() + 1 elif args.continue_from: start = int(args.continue_from) else: start = 0 data = mh.search('+(workflow:GMS) +(archiveStatus:on_tape)', start) # data.set_length(500) # debugging # truncate table first if not args.debug and args.clear: logger.warning("Clearing table %s" % table_name) db.execute(table.delete()) ner = NERFactory().get() for idx, item in tqdm(enumerate(data), total=len(data) - start): text = item['description'] entities = ner.tag(text) # if args.debug: # print(list(entities)) date = item['mdProperties']['carrier_date']