def db_calculate_filedistribution(resource_id):
    """Compute the per-category share of a resource's files and store it.

    Counts all rows in ``Files`` for ``resource_id``, then counts the rows
    per entry of ``FileCategories().data`` (matched on ``file_format``) and
    writes the resulting mapping, JSON-encoded, to
    ``ResourceMeta.file_distribution``.

    Written as a generator that yields the results of ``tx_pool`` calls;
    presumably driven by Twisted's ``inlineCallbacks`` (the decorator is not
    visible here -- confirm).

    :param resource_id: id used to select rows in ``Files`` and the row in
        ``ResourceMeta`` to update.
    """
    log.msg("[%s] Calculating file distributions" % resource_id)
    file_distribution = {}

    query = (select([func.count()])
             .select_from(Files)
             .where(Files.c.resource_id == resource_id))
    total_rows = yield tx_pool.runQuery(query)
    total_file_count = int(total_rows[0].count_1)

    for k, v in FileCategories().data.iteritems():
        if not total_file_count:
            # No files at all: a percentage is undefined (division by zero),
            # so record 0 for every category and skip the pointless query.
            file_distribution[k] = 0
            continue

        query = (select([func.count()])
                 .select_from(Files)
                 .where(Files.c.file_format == v)
                 .where(Files.c.resource_id == resource_id))
        rows = yield tx_pool.runQuery(query)

        if rows:
            count = int(rows[0].count_1)
            pct = 100 * float(count) / float(total_file_count)
            # Stored as a string with one decimal place, e.g. "12.5".
            file_distribution[k] = "%.1f" % pct
        else:
            file_distribution[k] = 0

    # NOTE(review): this matches ResourceMeta.id against the *resource* id,
    # while db_finalize addresses ResourceMeta via
    # self.data['resource_meta_id'] -- confirm the two ids coincide.
    query = (ResourceMeta.update()
             .where(ResourceMeta.c.id == resource_id)
             .values(file_distribution=json.dumps(file_distribution)))
    yield tx_pool.runOperation(query)

    log.msg("[%s] Calculating file distributions DONE" % resource_id)
def db_finalize(self):
    """Finish a crawl: stamp the end time and swap in the new file rows.

    Steps, in order:

    1. Set ``Resources.date_crawl_end`` to the current time.
    2. Delete the ``Files`` rows stored under the resource id.
    3. Re-point the ``Files`` rows stored under the negated id
       (``'-<resource_id>'``) at the real resource id.
    4. Store ``self.resource.db_files_inserts`` as ``ResourceMeta.file_count``.
    5. If ``'recursive_foldersizes'`` is in ``self.data['options']``,
       calculate folder sizes.
    6. Calculate the file distribution, then toggle the busy flag.

    Written as a generator that yields the results of asynchronous calls;
    presumably driven by Twisted's ``inlineCallbacks`` (the decorator is not
    visible here -- confirm).
    """
    resource_id = self.data['resource_id']

    query = (Resources.update()
             .where(Resources.c.id == resource_id)
             .values(date_crawl_end=datetime.now()))
    yield tx_pool.runOperation(query)

    # Drop the rows currently filed under the real id ...
    query = Files.delete().where(Files.c.resource_id == resource_id)
    yield tx_pool.runOperation(query)

    # ... and move the rows filed under the negated id onto the real id.
    # (Presumably those are the rows inserted by this crawl -- confirm.)
    query = (Files.update()
             .where(Files.c.resource_id == '-%s' % str(resource_id))
             .values(resource_id=resource_id))
    yield tx_pool.runOperation(query)

    query = (ResourceMeta.update()
             .where(ResourceMeta.c.id == self.data['resource_meta_id'])
             .values(file_count=self.resource.db_files_inserts))
    yield tx_pool.runOperation(query)

    if 'recursive_foldersizes' in self.data['options']:
        yield self.db_calculate_foldersizes()

    # These two were previously called without ``yield``, so their results
    # were dropped: finalisation reported completion before they finished
    # and any failure inside them never reached the caller. Waiting on them
    # also guarantees the busy flag is flipped only after the distribution
    # has been written.
    yield self.db_calculate_filedistribution(resource_id=resource_id)
    yield self.db_busy_crawling_toggle()
def db_busy_crawling_toggle(self):
    """Flip the ``busy`` flag of this crawl's ``ResourceMeta`` row.

    Reads the row whose id is ``self.data['resource_meta_id']`` and writes
    back ``0`` if its ``busy`` value is truthy, ``1`` otherwise.

    Written as a generator that yields the results of ``tx_pool`` calls;
    presumably driven by Twisted's ``inlineCallbacks`` (the decorator is not
    visible here -- confirm).
    """
    lookup = select([ResourceMeta]).where(
        ResourceMeta.c.id == self.data['resource_meta_id'])
    rows = yield tx_pool.runQuery(lookup)
    current = rows[0]

    # Invert the stored flag: truthy -> 0, falsy -> 1.
    if current.busy:
        flipped = 0
    else:
        flipped = 1

    statement = ResourceMeta.update().where(
        ResourceMeta.c.id == self.data['resource_meta_id'])
    statement = statement.values(busy=flipped)
    yield tx_pool.runOperation(statement)