def test_group_first(self):
    """group_first yields (key, rows) for consecutive rows sharing their first element."""
    from pyglottolog.util import group_first
    grouped = group_first([(1, 2), (1, 3)])
    key, items = next(iter(grouped))
    self.assertEqual(key, 1)
    self.assertEqual(len(list(items)), 2)
def show_merges(self):
    """Print each hash group spanning several refids and the refid it resolves to.

    For every group, the refid whose merged entry is closest (by ``distance``)
    to the merged entry of the whole hash group is reported as the target.
    """
    sql = ('SELECT hash, refid, filename, bibkey '
           'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
           'WHERE hash = e.hash AND refid != e.refid) '
           'ORDER BY hash, refid DESC, filename, bibkey')
    with self.connect() as conn:
        for hash, group in group_first(conn.execute(sql)):
            self.print_group(conn, group)
            # merged fields of the whole hash group
            new = self._merged_entry(self._entrygrp(conn, hash), raw=True)
            candidates = [
                (refid, self._merged_entry(self._entrygrp(conn, refid), raw=True))
                for refid in unique(row[1] for row in group)]
            old = min(candidates, key=lambda cand: distance(new, cand[1]))[0]
            print('-> %s\n' % old)
def show_splits(self):
    """Print each refid group spanning several hashes and the hash it resolves to.

    For every group, the hash whose merged entry is closest (by ``distance``)
    to the merged entry of the whole refid group is reported as the target.
    """
    sql = ('SELECT refid, hash, filename, bibkey '
           'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
           'WHERE refid = e.refid AND hash != e.hash) '
           'ORDER BY refid, hash, filename, bibkey')
    with self.connect() as conn:
        for refid, group in group_first(conn.execute(sql)):
            self.print_group(conn, group)
            # merged fields of the whole refid group
            old = self._merged_entry(self._entrygrp(conn, refid), raw=True)
            candidates = [
                (hash_, self._merged_entry(self._entrygrp(conn, hash_), raw=True))
                for hash_ in unique(row[1] for row in group)]
            new = min(candidates, key=lambda cand: distance(old, cand[1]))[0]
            print('-> %s\n' % new)
def show_merges(self):
    """Print hash groups whose members were mapped to more than one refid.

    The refid reported after ``->`` is the one whose merged entry has the
    smallest ``distance`` to the merged entry of the full hash group.
    """
    with self.connect() as conn:
        cursor = conn.execute(
            'SELECT hash, refid, filename, bibkey '
            'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
            'WHERE hash = e.hash AND refid != e.refid) '
            'ORDER BY hash, refid DESC, filename, bibkey')
        for hash, group in group_first(cursor):
            self.print_group(conn, group)
            merged = self._merged_entry(self._entrygrp(conn, hash), raw=True)
            # pick the refid closest to the merged hash group (first minimum wins)
            best_refid, best_dist = None, None
            for refid in unique(r for _, r, _, _ in group):
                fields = self._merged_entry(self._entrygrp(conn, refid), raw=True)
                d = distance(merged, fields)
                if best_dist is None or d < best_dist:
                    best_refid, best_dist = refid, d
            print('-> %s\n' % best_refid)
def show_splits(self):
    """Print refid groups whose members were hashed into more than one group.

    The hash reported after ``->`` is the one whose merged entry has the
    smallest ``distance`` to the merged entry of the full refid group.
    """
    with self.connect() as conn:
        cursor = conn.execute(
            'SELECT refid, hash, filename, bibkey '
            'FROM entry AS e WHERE EXISTS (SELECT 1 FROM entry '
            'WHERE refid = e.refid AND hash != e.hash) '
            'ORDER BY refid, hash, filename, bibkey')
        for refid, group in group_first(cursor):
            self.print_group(conn, group)
            merged = self._merged_entry(self._entrygrp(conn, refid), raw=True)
            # pick the hash closest to the merged refid group (first minimum wins)
            best_hash, best_dist = None, None
            for hash_ in unique(h for _, h, _, _ in group):
                fields = self._merged_entry(self._entrygrp(conn, hash_), raw=True)
                d = distance(merged, fields)
                if best_dist is None or d < best_dist:
                    best_hash, best_dist = hash_, d
            print('-> %s\n' % best_hash)
def assign_ids(conn, verbose=False):
    """Assign a stable id to every entry, resolving refid splits and hash merges.

    Precondition: every entry row already has a non-NULL hash (asserted below).
    Mutates the ``entry`` table in place (columns ``id`` and ``srefid``) and
    prints progress statistics.  Order of the phases below matters: splits are
    resolved before merges, and only then are remaining entries identified or
    given fresh ids.

    :param conn: open DB connection to a database with an ``entry`` table
        (columns include hash, refid, srefid, id, filename, bibkey).
    :param verbose: if true, print the member rows of every split/merge group.
    """
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp
    # sanity check: hashing must have run over all entries first
    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
                            'WHERE hash IS NULL)').fetchone()
    assert allhash
    # reset: drop any previous ids, seed srefid from refid
    print('%d entries' % conn.execute('UPDATE entry '
                                      'SET id = NULL, srefid = refid').rowcount)
    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute('SELECT refid, hash, filename, bibkey FROM entry AS e '
                          'WHERE EXISTS (SELECT 1 FROM entry '
                          'WHERE refid = e.refid AND hash != e.hash) '
                          'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        # candidate hash groups, each with its merged raw fields
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
                for hs in unique(hs for ri, hs, fn, bk in group)]
        # keep the hash group closest to the old refid group
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        # detach all other members of the split from the refid
        separated = conn.execute('UPDATE entry SET srefid = NULL '
                                 'WHERE refid = ? AND hash != ?',
                                 (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d splitted' % nsplit)
    # invariant: after split resolution no srefid spans more than one hash
    nosplits, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
                             'WHERE EXISTS (SELECT 1 FROM entry '
                             'WHERE srefid = e.srefid AND hash != e.hash))').fetchone()
    assert nosplits
    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute('SELECT hash, srefid, filename, bibkey FROM entry AS e '
                          'WHERE EXISTS (SELECT 1 FROM entry '
                          'WHERE hash = e.hash AND srefid != e.srefid) '
                          'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        # candidate srefid groups, each with its merged raw fields
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
                for ri in unique(ri for hs, ri, fn, bk in group)]
        # keep the srefid whose group is closest to the merged hash group
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute('UPDATE entry SET id = ? '
                              'WHERE hash = ? AND srefid != ?',
                              (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)
    # unchanged entries: neither split off nor merged away keep their srefid
    print('%d unchanged' % conn.execute('UPDATE entry SET id = srefid '
                                        'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)
    # invariant: after merge resolution no hash spans more than one id
    nomerges, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
                             'WHERE EXISTS (SELECT 1 FROM entry '
                             'WHERE hash = e.hash AND id != e.id))').fetchone()
    assert nomerges
    # identified: new/separated entries adopt the id of an identified hash-mate
    print('%d identified (new/separated)' % conn.execute('UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)').rowcount)
    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute('SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute('SELECT hash FROM entry WHERE id IS NULL '
                          'GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' % conn.executemany(
        'UPDATE entry SET id = ? WHERE hash = ?',
        ((id, hash) for id, (hash,) in enumerate(cursor, nextid))).rowcount)
    assert allid(conn)
    assert onetoone(conn)
    # supersede relation: entries whose final id differs from their srefid
    superseded, = conn.execute('SELECT count(*) FROM entry '
                               'WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
def _show(self, sql):
    """Execute *sql* and print every resulting hash group, blank-line separated."""
    with self.connect() as conn:
        for _, group in group_first(conn.execute(sql)):
            self.print_group(conn, group)
            print()
def assign_ids(conn, verbose=False):
    """Assign a stable id to every entry, resolving refid splits and hash merges.

    Precondition: every entry row already has a non-NULL hash (asserted below).
    Mutates the ``entry`` table in place (columns ``id`` and ``srefid``) and
    prints progress statistics.  Phase order matters: splits are resolved
    before merges, then remaining entries are identified or given fresh ids.

    :param conn: open DB connection to a database with an ``entry`` table
        (columns include hash, refid, srefid, id, filename, bibkey).
    :param verbose: if true, print the member rows of every split/merge group.

    Fix: the ``min(..., key=lambda (hs, fields): ...)`` lambdas used Python 2
    tuple parameter unpacking, removed in Python 3 (PEP 3113) — a SyntaxError
    there.  Replaced with subscripting lambdas; selection behavior unchanged.
    """
    merged_entry, entrygrp = Database._merged_entry, Database._entrygrp
    # sanity check: hashing must have run over all entries first
    allhash, = conn.execute('SELECT NOT EXISTS (SELECT 1 FROM entry '
                            'WHERE hash IS NULL)').fetchone()
    assert allhash
    # reset: drop any previous ids, seed srefid from refid
    print('%d entries' % conn.execute('UPDATE entry '
                                      'SET id = NULL, srefid = refid').rowcount)
    # resolve splits: srefid = refid only for entries from the most similar hash group
    nsplit = 0
    cursor = conn.execute(
        'SELECT refid, hash, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE refid = e.refid AND hash != e.hash) '
        'ORDER BY refid, hash, filename, bibkey')
    for refid, group in group_first(cursor):
        old = merged_entry(entrygrp(conn, refid), raw=True)
        nsplit += len(group)
        # candidate hash groups, each with its merged raw fields
        cand = [(hs, merged_entry(entrygrp(conn, hs), raw=True))
                for hs in unique(hs for ri, hs, fn, bk in group)]
        # keep the hash group closest to the old refid group
        # (was: lambda (hs, fields): distance(old, fields) -- py2-only syntax)
        new = min(cand, key=lambda p: distance(old, p[1]))[0]
        # detach all other members of the split from the refid
        separated = conn.execute(
            'UPDATE entry SET srefid = NULL WHERE refid = ? AND hash != ?',
            (refid, new)).rowcount
        if verbose:
            for row in group:
                print(row)
            for ri, hs, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % new)
            print('%d: %d separated from %s\n' % (refid, separated, new))
    print('%d splitted' % nsplit)
    # invariant: after split resolution no srefid spans more than one hash
    nosplits, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE srefid = e.srefid AND hash != e.hash))'
    ).fetchone()
    assert nosplits
    # resolve merges: id = srefid of the most similar srefid group
    nmerge = 0
    cursor = conn.execute(
        'SELECT hash, srefid, filename, bibkey FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND srefid != e.srefid) '
        'ORDER BY hash, srefid DESC, filename, bibkey')
    for hash, group in group_first(cursor):
        new = merged_entry(entrygrp(conn, hash), raw=True)
        nmerge += len(group)
        # candidate srefid groups, each with its merged raw fields
        cand = [(ri, merged_entry(entrygrp(conn, ri), raw=True))
                for ri in unique(ri for hs, ri, fn, bk in group)]
        # keep the srefid whose group is closest to the merged hash group
        # (was: lambda (ri, fields): distance(new, fields) -- py2-only syntax)
        old = min(cand, key=lambda p: distance(new, p[1]))[0]
        merged = conn.execute(
            'UPDATE entry SET id = ? WHERE hash = ? AND srefid != ?',
            (old, hash, old)).rowcount
        if verbose:
            for row in group:
                print(row)
            for hs, ri, fn, bk in group:
                print('\t%r, %r, %r, %r' % hashfields(conn, fn, bk))
            print('-> %s' % old)
            print('%s: %d merged into %d\n' % (hash, merged, old))
    print('%d merged' % nmerge)
    # unchanged entries: neither split off nor merged away keep their srefid
    print('%d unchanged' % conn.execute('UPDATE entry SET id = srefid '
                                        'WHERE id IS NULL AND srefid IS NOT NULL').rowcount)
    # invariant: after merge resolution no hash spans more than one id
    nomerges, = conn.execute(
        'SELECT NOT EXISTS (SELECT 1 FROM entry AS e '
        'WHERE EXISTS (SELECT 1 FROM entry WHERE hash = e.hash AND id != e.id))'
    ).fetchone()
    assert nomerges
    # identified: new/separated entries adopt the id of an identified hash-mate
    print('%d identified (new/separated)' % conn.execute(
        'UPDATE entry '
        'SET id = (SELECT id FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL) '
        'WHERE refid IS NULL AND id IS NULL AND EXISTS '
        '(SELECT 1 FROM entry AS e WHERE e.hash = entry.hash AND e.id IS NOT NULL)'
    ).rowcount)
    # assign new ids to hash groups of separated/new entries
    nextid, = conn.execute(
        'SELECT coalesce(max(refid), 0) + 1 FROM entry').fetchone()
    cursor = conn.execute(
        'SELECT hash FROM entry WHERE id IS NULL GROUP BY hash ORDER BY hash')
    print('%d new ids (new/separated)' % conn.executemany(
        'UPDATE entry SET id = ? WHERE hash = ?',
        ((id, hash) for id, (hash, ) in enumerate(cursor, nextid))).rowcount)
    assert allid(conn)
    assert onetoone(conn)
    # supersede relation: entries whose final id differs from their srefid
    superseded, = conn.execute(
        'SELECT count(*) FROM entry WHERE id != srefid').fetchone()
    print('%d supersede pairs' % superseded)
def test_group_first():
    """group_first groups consecutive rows by their first element.

    Fix: the original ``assert key, list(items) == (...)`` parsed as
    ``assert key`` with the comparison used as the assertion *message*, so
    the grouping result was never actually checked.  Compare the full
    ``(key, rows)`` pair instead.
    """
    key, items = next(util.group_first([(1, 2), (1, 3)]))
    assert (key, list(items)) == (1, [(1, 2), (1, 3)])