from io import StringIO
from os.path import join

from psycopg2 import sql, extras

# NOTE: the project-internal names used below (WorkerJob, JobLL, Cls, hasher,
# SimpleLinkClustering, db_conn, CLUSTER_SERIALISATION_DIR,
# CSV_ASSOCIATIONS_DIR) are assumed to be imported from elsewhere in the
# code base; their module paths are not shown in this section.


class ReconciliationJob(WorkerJob):
    def __init__(self, job_id, id, type, association_file=None):
        self._job_id = job_id
        self._id = id
        self._type = type
        self._association_file = association_file
        self._job = JobLL(job_id)
        self._result = None

        super().__init__(self.start_reconciliation)

    def start_reconciliation(self):
        # Extend the previously serialised clusters of this spec with the
        # links from the given CSV association file.
        filename = f'Reconciled_{hasher(self._job_id)}_{self._id}_{hasher(self._association_file)}'
        serialised = f'Cluster_{hasher(self._job_id)}_{self._id}'

        self._result = Cls.extend_cluster(
            serialisation_dir=CLUSTER_SERIALISATION_DIR,
            serialized_cluster_name=serialised,
            csv_association_file=join(CSV_ASSOCIATIONS_DIR, self._association_file),
            save_in=CLUSTER_SERIALISATION_DIR,
            reconciled_name=filename,
            condition_30=True,
            activated=True)

    def watch_process(self):
        pass

    def watch_kill(self):
        # Honour a manual kill request recorded on the clustering spec.
        clustering_job = self._job.clustering(self._id, self._type)
        if clustering_job['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        job_data = {'status': 'waiting'} if reset else \
            {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_clustering(self._id, self._type, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_clustering(self._id, self._type, {
            'status': 'failed',
            'status_message': err_message
        })

    def on_finish(self):
        # Persist the reconciliation counts and mark the clustering as done.
        with db_conn() as conn, conn.cursor() as cur:
            cur.execute('''
                UPDATE clusterings
                SET extended_count = %s, cycles_count = %s, status = %s, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            ''', (self._result['extended_clusters_count'], self._result['cycles_count'],
                  'done', self._job_id, self._id, self._type))
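
# A minimal usage sketch, assuming the WorkerJob base class exposes a run()
# entry point (an assumption; this section does not show the base class).
# The job id, spec id and file name below are hypothetical:
#
#     job = ReconciliationJob('abc123', 1, 'linkset',
#                             association_file='sameAs_links.csv')
#     job.run()  # executes start_reconciliation, then on_finish
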
class ClusteringJob(WorkerJob):
    def __init__(self, job_id, id, type):
        self._job_id = job_id
        self._id = id
        self._type = type
        self._job = JobLL(job_id)
        self._worker = None

        super().__init__(self.start_clustering)

    def start_clustering(self):
        links = self._job.get_links(self._id, self._type)
        self._worker = SimpleLinkClustering(links)

        # Serialise the clusters as tab-separated (cluster id, node) pairs
        # so they can be bulk-loaded with COPY.
        data = StringIO()
        for cluster in self._worker.get_clusters():
            for node in cluster['nodes']:
                data.write(f"{cluster['id']}\t{node}\n")
        data.seek(0)

        if not self._killed:
            schema = 'linksets' if self._type == 'linkset' else 'lenses'
            linkset_table_name = self._job.table_name(self._id)
            clusters_table_name = linkset_table_name + '_clusters'
            cluster_hashes_table_name = linkset_table_name + '_cluster_hashes'
            linkset_index_name = linkset_table_name + '_cluster_id_idx'

            with self._db_conn.cursor() as cur:
                cur.execute(sql.SQL('SET search_path TO {}').format(
                    sql.Identifier(schema)))
                cur.execute(sql.SQL('DROP INDEX IF EXISTS {}').format(
                    sql.Identifier(linkset_index_name)))

                # Load the (cluster id, node) pairs into a temporary table.
                cur.execute(sql.SQL('''
                    CREATE TEMPORARY TABLE IF NOT EXISTS {} (
                        id integer NOT NULL,
                        node text NOT NULL
                    ) ON COMMIT DROP
                ''').format(sql.Identifier(clusters_table_name)))

                cur.copy_from(data, clusters_table_name)

                # Derive a stable 15-character hash per cluster from the
                # sorted, distinct set of its nodes.
                cur.execute(sql.SQL('''
                    CREATE TEMPORARY TABLE IF NOT EXISTS {} ON COMMIT DROP AS
                    SELECT id, substring(md5(array_to_string(ARRAY(
                        SELECT DISTINCT unnest(array_agg(node)) AS x ORDER BY x
                    ), '')) FOR 15) AS hash_id
                    FROM {}
                    GROUP BY id
                ''').format(sql.Identifier(cluster_hashes_table_name),
                            sql.Identifier(clusters_table_name)))

                # Write the cluster ids and cluster hashes back onto the
                # linkset/lens table.
                cur.execute(sql.SQL('''
                    UPDATE {} AS linkset
                    SET cluster_id = clusters.id
                    FROM {} AS clusters
                    WHERE linkset.source_uri = clusters.node
                ''').format(sql.Identifier(linkset_table_name),
                            sql.Identifier(clusters_table_name)))

                cur.execute(sql.SQL('''
                    UPDATE {} AS linkset
                    SET cluster_hash_id = cluster_hashes.hash_id
                    FROM {} AS cluster_hashes
                    WHERE linkset.cluster_id = cluster_hashes.id
                ''').format(sql.Identifier(linkset_table_name),
                            sql.Identifier(cluster_hashes_table_name)))

                cur.execute(sql.SQL(
                    'CREATE INDEX ON {} USING btree (cluster_id); ANALYZE {};'
                ).format(sql.Identifier(linkset_table_name),
                         sql.Identifier(linkset_table_name)))

    def watch_process(self):
        if not self._worker:
            return

        self._job.update_clustering(self._id, self._type, {
            'status_message': 'Processing found clusters'
                              if self._worker.links_processed else 'Processing links',
            'links_count': self._worker.links_processed,
            'clusters_count': len(self._worker.clusters)
        })

    def watch_kill(self):
        # Honour a manual kill request recorded on the clustering spec.
        clustering_job = self._job.clustering(self._id, self._type)
        if clustering_job['kill']:
            self.kill(reset=False)

    def on_kill(self, reset):
        if self._worker:
            self._worker.stop_clustering()

        job_data = {'status': 'waiting'} if reset else \
            {'status': 'failed', 'status_message': 'Killed manually'}
        self._job.update_clustering(self._id, self._type, job_data)

    def on_exception(self):
        err_message = str(self._exception)
        self._job.update_clustering(self._id, self._type, {
            'status': 'failed',
            'status_message': err_message
        })

    def on_finish(self):
        if len(self._worker.clusters) == 0:
            return

        # Gather size statistics over the clustered links, then persist them
        # and mark the clustering as done.
        with db_conn() as conn, conn.cursor(cursor_factory=extras.RealDictCursor) as cur:
            cur.execute(sql.SQL('''
                SELECT (SELECT count(DISTINCT uri) AS size
                        FROM {schema}.{table_name},
                        LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)) AS resources_size,
                       (SELECT size FROM (
                           SELECT count(DISTINCT uri) AS size
                           FROM {schema}.{table_name},
                           LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                           GROUP BY cluster_id
                       ) AS x ORDER BY size ASC LIMIT 1) AS smallest_size,
                       (SELECT size FROM (
                           SELECT count(DISTINCT uri) AS size
                           FROM {schema}.{table_name},
                           LATERAL (VALUES (source_uri), (target_uri)) AS nodes(uri)
                           GROUP BY cluster_id
                       ) AS x ORDER BY size DESC LIMIT 1) AS largest_size,
                       (SELECT count FROM (
                           SELECT count(cluster_id) AS count
                           FROM {schema}.{table_name}
                           GROUP BY cluster_id
                       ) AS x ORDER BY count ASC LIMIT 1) AS smallest_count,
                       (SELECT count FROM (
                           SELECT count(cluster_id) AS count
                           FROM {schema}.{table_name}
                           GROUP BY cluster_id
                       ) AS x ORDER BY count DESC LIMIT 1) AS largest_count
            ''').format(
                schema=sql.Identifier('linksets' if self._type == 'linkset' else 'lenses'),
                table_name=sql.Identifier(self._job.table_name(self._id)),
            ))

            result = cur.fetchone()

            cur.execute('''
                UPDATE clusterings
                SET links_count = %s, clusters_count = %s, resources_size = %s,
                    smallest_size = %s, largest_size = %s,
                    smallest_count = %s, largest_count = %s,
                    status = %s, status_message = NULL, finished_at = now()
                WHERE job_id = %s AND spec_id = %s AND spec_type = %s
            ''', (self._worker.links_processed, len(self._worker.clusters),
                  result['resources_size'], result['smallest_size'], result['largest_size'],
                  result['smallest_count'], result['largest_count'],
                  'done', self._job_id, self._id, self._type))
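
# For reference, a minimal sketch of the WorkerJob lifecycle that both classes
# above rely on, inferred from the hooks they implement (watch_process,
# watch_kill, on_kill, on_exception, on_finish) and the attributes they read
# (_killed, _exception, _db_conn). This is an assumption, not the actual base
# class, which presumably also runs the work function in its own
# thread/process and polls the watch hooks on a timer:
#
# class WorkerJob:
#     def __init__(self, func):
#         self._func = func
#         self._killed = False
#         self._exception = None
#         self._db_conn = db_conn()  # assumed: one connection per job
#
#     def run(self):
#         try:
#             self._func()              # e.g. start_clustering
#             if not self._killed:
#                 self.on_finish()
#         except Exception as e:
#             self._exception = e
#             self.on_exception()
#
#     def kill(self, reset):
#         self._killed = True
#         self.on_kill(reset)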