def refresh_unpublished_videos():
    session = db_session()

    # videos that belong to a channel, were uploaded within the last 30 days,
    # and have not been marked as published yet
    a_month_ago = dt.datetime.utcnow() - dt.timedelta(days=30)
    unpublished_videos = \
        session.query(Video).filter(
            Video.channel_id.isnot(None),
            Video.published_at.is_(None),
            Video.uploaded_at > a_month_ago,
        )

    trusted_block_num = confirmed_block_num(5)

    def is_published(video_id):
        addr = get_publisher_address(video_id, trusted_block_num)
        return addr, video_id

    # look up publisher addresses in parallel; lkeep drops falsy results
    # (e.g. lookups that failed, since errors are not re-raised)
    publish_results = lkeep(
        thread_multi(
            fn=is_published,
            fn_args=[None],
            dep_args=[video.id for video in unpublished_videos],
            max_workers=10,
            re_raise_errors=False,
        ))

    # a non-null publisher address means the video has been published on-chain
    for publisher_addr, video_id in publish_results:
        if publisher_addr != null_address:
            video = session.query(Video).filter_by(id=video_id).one()
            video.published_at = dt.datetime.utcnow()
            video.eth_address = publisher_addr
            session.add(video)
            session.commit()
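For context, lkeep in these snippets comes from the funcy library: with a single argument it returns a list of the truthy items, and with a function plus a sequence it maps first and then drops falsy results. A minimal illustration with invented sample values:

    from funcy import lkeep, silent

    assert lkeep([1, None, 0, 'a']) == [1, 'a']              # drop falsy items
    assert lkeep(silent(int), ['1', 'oops', '2']) == [1, 2]  # map, then drop falsy results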
def identify_placement(tiles: Dict[int, HashTile]) -> Placement:
    # figure out the placement and orientation of each tile in the final image
    # fix one corner tile in its original orientation
    start = corners(tiles)[0]
    # the position and orientation of each tile in the picture
    img_positions = {start: (0, 0)}
    img_orients = {start: 0}
    # and initialize the fringe with its non-None neighbouring tiles
    unplaced = set(lkeep(tiles[start].neighbours))
    # assemble all the tiles
    while len(unplaced) > 0:
        # pop any tile from the fringe (and also grab its neighbouring tiles)
        candidate = unplaced.pop()
        neighbours = set(lkeep(tiles[candidate].neighbours))
        # identify any of the already fixed tiles
        fixed = (neighbours & img_positions.keys()).pop()
        # figure out the placement of the candidate tile
        fixed_tile = orient(tiles[fixed].tile, k=img_orients[fixed])
        placement = identify_tile_placement(fixed_tile, tiles[candidate].tile)
        # place the candidate relative to the fixed tile
        x, y = img_positions[fixed]
        img_positions[candidate] = (x + placement.dx, y + placement.dy)
        img_orients[candidate] = placement.orient
        # add the candidate's neighbours for the next iterations
        new_neighbours = neighbours - img_positions.keys()
        unplaced.update(new_neighbours)

    return Placement(positions=img_positions, orients=img_orients)
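The assembly routine above references container types that are not defined in this excerpt. A hypothetical sketch of the shapes it appears to rely on, with field names inferred purely from the usage above rather than taken from the original source:

    from typing import Dict, List, NamedTuple, Optional, Tuple

    import numpy as np


    class HashTile(NamedTuple):
        # hypothetical: a tile's pixel grid plus the ids of its adjacent tiles,
        # with None in slots where no neighbour exists (hence the lkeep calls)
        tile: np.ndarray
        neighbours: List[Optional[int]]


    class Placement(NamedTuple):
        # hypothetical: per-tile grid positions and orientation indices
        positions: Dict[int, Tuple[int, int]]
        orients: Dict[int, int]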
def delete_video_files(video_id: str, file_mapper_obj: dict):
    # delete upload files
    s3 = boto3.client(
        's3',
        region_name=S3_UPLOADS_REGION,
        aws_access_key_id=AWS_MANAGER_PUBLIC_KEY,
        aws_secret_access_key=AWS_MANAGER_PRIVATE_KEY,
    )
    keys_to_delete = lkeep([
        file_mapper_obj['s3_upload_video_key'],
        file_mapper_obj['s3_upload_thumbnail_key'],
    ])
    s3.delete_objects(
        Bucket=file_mapper_obj['s3_upload_bucket'],
        Delete={
            'Objects': [{'Key': x} for x in keys_to_delete],
        },
    )

    # delete video files
    s3t = S3Transfer(
        region_name=S3_VIDEOS_REGION,
        bucket_name=S3_VIDEOS_BUCKET,
    )
    keys_to_delete = [
        *s3t.ls(f'snapshots/{video_id}'),
        *s3t.ls(f'thumbnails/{video_id}'),
        *s3t.ls(f'v1/{video_id}'),
    ]
    if keys_to_delete:
        s3t.client.delete_objects(
            Bucket=S3_VIDEOS_BUCKET,
            Delete={
                'Objects': [{'Key': x['Key']} for x in keys_to_delete],
            },
        )
def scrape_comments(mongo, batch_size=250, max_workers=50):
    """ Parse operations and post-process for comment/post extraction. """
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('comments')

    query = {
        "type": "comment",
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'block_num': 1,
        'author': 1,
        'permlink': 1,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    identifiers = set(f"{x['author']}/{x['permlink']}" for x in results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # get Post.export() results in parallel
    raw_comments = thread_multi(
        fn=get_comment,
        fn_args=[None],
        dep_args=list(identifiers),
        max_workers=max_workers,
        yield_results=True)
    raw_comments = lkeep(raw_comments)

    # split into root posts and comments
    posts = lfilter(lambda x: x['depth'] == 0, raw_comments)
    comments = lfilter(lambda x: x['depth'] > 0, raw_comments)

    # Mongo upsert many
    log_output = ''
    if posts:
        r = mongo.Posts.bulk_write(
            [UpdateOne({'identifier': x['identifier']},
                       {'$set': {**x, 'updatedAt': dt.datetime.utcnow()}},
                       upsert=True)
             for x in posts],
            ordered=False,
        )
        log_output += \
            f'(Posts: {r.upserted_count} upserted, {r.modified_count} modified) '
    if comments:
        r = mongo.Comments.bulk_write(
            [UpdateOne({'identifier': x['identifier']},
                       {'$set': {**x, 'updatedAt': dt.datetime.utcnow()}},
                       upsert=True)
             for x in comments],
            ordered=False,
        )
        log_output += \
            f'(Comments: {r.upserted_count} upserted, {r.modified_count} modified) '

    # We are only querying {type: 'comment'} blocks and sometimes
    # the gaps are larger than the batch_size.
    index = silent(max)(lpluck('block_num', results)) or (start_block + batch_size)
    indexer.set_checkpoint('comments', index)

    log.info(f'Checkpoint: {index} {log_output}')
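The batched upsert in scrape_comments uses pymongo's UpdateOne together with bulk_write. A standalone sketch of the same pattern, assuming a MongoDB instance on localhost; the database, collection name, and sample document are invented for illustration:

    import datetime as dt

    from pymongo import MongoClient, UpdateOne

    docs = [{'identifier': 'alice/hello-world', 'depth': 0}]  # invented sample data
    ops = [
        UpdateOne({'identifier': d['identifier']},
                  {'$set': {**d, 'updatedAt': dt.datetime.utcnow()}},
                  upsert=True)
        for d in docs
    ]
    result = MongoClient().test_db.Posts.bulk_write(ops, ordered=False)
    print(f'{result.upserted_count} upserted, {result.modified_count} modified')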
def fill_probes(platform_id):
    platform = Platform.objects.get(pk=platform_id)
    gpl_name = platform.gpl_name
    cprint('%s %s %s' % (platform.pk, platform.gpl_name, platform.specie),
           attrs=['bold'])
    assert platform.specie

    platform.verdict = ''
    platform.probes_total = None
    platform.probes_matched = None
    platform.stats = {}
    platform.last_filled = timezone.now()

    annot_file = '/pub/geo/DATA/annotation/platforms/%s.annot.gz' % gpl_name
    family_file = '/pub/geo/DATA/SOFT/by_platform/%s/%s_family.soft.gz' % (
        gpl_name, gpl_name)
    files = [annot_file, family_file]
    tables = list(map(peek_platform, files))
    # Skip empty
    files = list(compress(files, tables))
    tables = lkeep(tables)

    # TODO: check other supplementary files formats
    supplementary_dir = '/pub/geo/DATA/supplementary/platforms/%s/' % gpl_name
    _, supplementary_files = listdir(supplementary_dir)
    supplementary_files = [
        f for f in supplementary_files
        if f.endswith('.txt.gz') and not re_test(r'\.cdf\.', f, re.I)
    ]
    files.extend(supplementary_files)
    tables.extend(
        decompress(download('%s%s' % (supplementary_dir, f)))
        for f in supplementary_files)

    platform.stats['files'] = lkeep(files)

    if not any(tables):
        cprint('No data for %s' % gpl_name, 'red')
        platform.verdict = 'no data'
        platform.save()
        return

    # Read tables in
    df = pd.concat(
        read_table(table, file) for table, file in zip(tables, files))
    del tables  # free memory

    platform.probes_total = len(set(df.index))
    cprint('Found %d probes to match' % platform.probes_total, 'yellow')

    # Try to resolve probes starting from best scopes
    mygene_probes = []
    platform.stats['matches'] = []
    platform.verdict = 'no clue'
    for scopes, cols in SCOPE_COLUMNS:
        cols = list(set(cols) & set(df.columns))
        if not cols:
            continue
        cprint('> Looking into %s' % ', '.join(sorted(cols)), 'cyan')
        platform.verdict = 'nothing matched'

        probes = pd.concat(df[col].dropna() for col in cols)
        new_matches = mygene_fetch(platform, probes, scopes)
        mygene_probes.extend(new_matches)

        # Drop matched probes
        if new_matches:
            platform.stats['matches'].append({
                'scopes': scopes,
                'cols': cols,
                'found': len(new_matches),
            })
            df = df.drop(lpluck('probe', new_matches))
            if df.empty:
                break

    # Update stats and history
    platform.probes_matched = len(mygene_probes)
    platform.history.append({
        'time': timezone.now().strftime('%Y-%m-%d %T'),
        'probes_total': platform.probes_total,
        'probes_matched': platform.probes_matched,
    })

    # Insert found genes
    if mygene_probes:
        with transaction.atomic():
            platform.verdict = 'ok'
            platform.save()
            platform.probes.all().delete()
            PlatformProbe.objects.bulk_create([
                PlatformProbe(platform=platform, **probe_info)
                for probe_info in mygene_probes
            ])
        cprint('Inserted %d probes for %s' % (len(mygene_probes), gpl_name),
               'green')
    else:
        cprint('Nothing matched for %s' % gpl_name, 'red')
        platform.save()
def search(request):
    # Save last specie in session
    specie = request.GET.get('specie')
    if specie != request.session.get('specie'):
        request.session['specie'] = specie

    q = request.GET.get('q')
    if not q:
        return {'series': None}

    exclude_tags = lkeep(silent(int), request.GET.getlist('exclude_tags'))
    series_tags, tag_series, tag_ids = series_tags_data()

    # Parse query
    q_string, q_tags = _parse_query(q)
    q_tags, wrong_tags = lsplit(lambda t: t.lower() in tag_ids, q_tags)
    if wrong_tags:
        message = 'Unknown tag%s %s.' % ('s' if len(wrong_tags) > 1 else '',
                                         ', '.join(wrong_tags))
        messages.warning(request, message)

    if not q_string and not q_tags:
        return {'series': None}

    # Build qs
    qs = search_series_qs(q_string)
    if specie:
        qs = qs.filter(specie=specie)

    if q_tags:
        q_tag_ids = lkeep(tag_ids.get(t.lower()) for t in q_tags)
        include_series = reduce(set.intersection,
                                (tag_series[t] for t in q_tag_ids))
        if include_series:
            qs = qs.filter(id__in=include_series)
        else:
            message = 'No series annotated with %s.' \
                % (q_tags[0] if len(q_tags) == 1
                   else 'all these tags simultaneously')
            messages.warning(request, message)
            return {'series': []}

    series_ids = qs.values_list('id', flat=True).order_by()
    tags = ldistinct(mapcat(series_tags, series_ids), key=itemgetter('id'))

    if exclude_tags:
        exclude_series = join(tag_series[t] for t in exclude_tags)
        qs = qs.exclude(id__in=exclude_series)

    series = paginate(request, qs, 10)

    # Get annotations statuses
    annos_qs = SeriesAnnotation.objects.filter(series__in=series) \
        .values_list('series_id', 'tag_id', 'best_cohens_kappa')
    tags_validated = {(s, t): k == 1 for s, t, k in annos_qs}

    return dict(
        {
            'series': series,
            'page': series,
            'tags_validated': tags_validated,
            'tags': tags,
            'series_tags': series_tags,
        },
        **_search_stats(qs))