示例#1
0
    def _stats_pcts(self):
        """Collect match counts and ratios for ``self.table``.

        For each of three SQL aggregate expressions, two queries are run
        against ``self.table``: one counting rows whose ``status`` differs
        from ``self.model.SKIP`` and one that additionally requires
        ``score > 0``.

        Returns:
            OrderedDict mapping a label to a list
            ``[matches, matches_with_score, matches_with_score / matches]``.
            For labels that have a reference total the list is extended with
            ``[total, matches / total, matches_with_score / total]``.
            Any ratio whose denominator is zero is reported as ``0.0``
            instead of raising ``ZeroDivisionError``.
        """

        def ratio(part, whole):
            # An empty table or an empty reference set must not crash the
            # whole report, so a zero denominator yields 0.0.
            return part / whole if whole else 0.0

        mh = MediaHaven()

        nl_count = len(Datasources['namenlijst']['func']())
        mh_count = len(mh.search('+(workflow:GMS) +(archiveStatus:on_tape)'))

        # (label, SQL aggregate expression, reference total or None)
        specs = [
            ('', 'COUNT(*)', None),
            ('names from IFFM namenlijst', 'COUNT(DISTINCT nmlid)', nl_count),
            ('newspaper pages', 'COUNT(DISTINCT pid)', mh_count),
        ]

        data = OrderedDict()
        for label, expression, total in specs:
            # The aggregate expression and table name are identifiers, which
            # cannot be passed as bound parameters; all three values come from
            # this class, not from user input.
            args = (expression, self.table, self.model.SKIP)

            res = self.db.execute(
                'SELECT %s FROM %s WHERE status != %d' % args)
            matches = int(res.scalar())

            res = self.db.execute(
                'SELECT %s FROM %s WHERE status != %d and score > 0' % args)
            matches_with_score = int(res.scalar())

            counts = [
                matches,
                matches_with_score,
                ratio(matches_with_score, matches),
            ]

            if total is not None:
                counts.append(total)
                counts.append(ratio(matches, total))
                counts.append(ratio(matches_with_score, total))

            data[label] = counts

        return data
示例#2
0
# args.table, when set, overrides table_name (which is expected to be defined
# earlier, outside this fragment).
if args.table:
    table_name = args.table
try:
    table = meta.tables[table_name]
except KeyError:
    raise FileNotFoundError('Couldnt find table "%s"' % table_name)

# Decide at which index the search below should start.
# `continue` is a Python keyword, hence the vars() lookup instead of
# attribute access.
if vars(args)['continue']:
    # MAX() over an empty table is NULL, which scalar() returns as None; in
    # that case there is nothing to continue from, so start at 0 rather than
    # failing on `None + 1`.
    last_index = db.execute(func.max(table.c.doc_index)).scalar()
    start = 0 if last_index is None else last_index + 1
elif args.continue_from:
    start = int(args.continue_from)
else:
    start = 0

data = mh.search('+(workflow:GMS) +(archiveStatus:on_tape)', start)

# data.set_length(500) # debugging

# truncate table first
if not args.debug and args.clear:
    logger.warning("Clearing table %s", table_name)
    db.execute(table.delete())

# Obtain the tagger object from the NER factory.
ner = NERFactory().get()
# Iterate over the search results with a progress bar. The bar's total is
# len(data) reduced by the start offset.
# NOTE(review): enumerate() counts from 0 here even when `start` is non-zero;
# confirm whether `idx` is meant to be offset by `start`.
for idx, item in tqdm(enumerate(data), total=len(data) - start):
    # Tag the item's description text.
    text = item['description']
    entities = ner.tag(text)
    # if args.debug:
    #     print(list(entities))
    # Read the carrier date from the item's metadata properties.
    date = item['mdProperties']['carrier_date']