def find_near_assets(self, asset_id: str, data: dict) -> dict:
    """Find nearby assets based on the centroid of the shape."""
    query = {
        'center': {
            '$near': {
                '$geometry': data['center'],
                '$maxDistance': 500
            }
        }
    }
    qs = self.collection.find(query, {
        'asset_id': 1,
        'year': 1,
        'iiif_identifier': 1
    })
    # Merge with similar and set distance as 0
    similar = data.get('similar', [])
    similar = similar + [
        py_.pick(i, 'asset_id', 'year', 'iiif_identifier') for i in qs
        if i.get('year', None) is not None
    ]
    out = {'similar': similar}
    return out
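# Note on the query above: MongoDB's $near with a $geometry argument only
# works against a 2dsphere index on the queried field. A minimal sketch of
# creating one with pymongo (assuming self.collection is a pymongo
# Collection, which the find() call suggests):
#
#     import pymongo
#     self.collection.create_index([('center', pymongo.GEOSPHERE)])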
def from_dict(d):
    # Fall back to the legacy *_id keys when the *_uuid keys are absent.
    call_uuid = (d.get("call_uuid")
                 if d.get("call_uuid") is not None else d.get("call_id"))
    conversation_uuid = (d.get("conversation_uuid")
                         if d.get("conversation_uuid") is not None else
                         d.get("conversation_id"))

    if call_uuid is None or conversation_uuid is None:
        raise ValueError(f"No reference for call or conversation. {d.keys()}")

    if d.get("alternatives"):
        d["alternatives"] = json.dumps(d["alternatives"], ensure_ascii=False)

    return ConversationTask(
        **{
            **py_.pick(
                d,
                [
                    "alternatives",
                    "audio_url",
                    "state",
                    "reftime",
                    "prediction",
                ],
            ),
            "data_id": str(conversation_uuid),
            "raw": d,
            "call_uuid": str(call_uuid),
            "conversation_uuid": str(conversation_uuid),
        })
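# py_.pick() copies only the requested keys and silently skips any that are
# missing, which is why from_dict() can list optional fields such as
# "prediction" without guarding them. A minimal, runnable illustration with
# made-up data:
#
#     import pydash as py_
#     d = {'audio_url': 'http://example.com/a.wav', 'state': 'done', 'x': 1}
#     assert py_.pick(d, ['audio_url', 'state', 'prediction']) == {
#         'audio_url': 'http://example.com/a.wav', 'state': 'done'}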
def parse_klokan_hidden(self, asset_id: str) -> dict:
    """Parse data extracted from the Georeference website."""
    mongo = self.get_collection(collection=KlokanHiddenDataLoader.collection)
    doc = mongo.find_one({'id': asset_id})
    data = {}
    if doc:
        # data = py_.pick(doc, *['bbox', 'control_points', 'cutline'])
        data = py_.pick(doc, *['cutline'])
        # # Convert data to GeoJSON
        # bbox = data.get('bbox', [])
        # if len(bbox) > 0:
        #     bbox = Polygon(partition(2, bbox))
        #     data.update({'bbox': bbox})
        # else:
        #     data['bbox'] = None
        # # Convert control points into geolocations
        # lpoints = data.get('control_points', [])
        # npoints = []
        # if len(lpoints) > 0:
        #     for point in lpoints:
        #         metadata = py_.pick(point, *['map', 'scan_zoom', 'map_zoom',
        #                                      'address', 'pixel_x', 'pixel_y'])
        #         lon = point['longitude']
        #         lat = point['latitude']
        #         point = Point([lon, lat])
        #         npoints.append({'point': point, 'metadata': metadata})
        #     data['control_points'] = npoints
        #     data['control_points_count'] = len(npoints)

        # Load the world file
        wld = self.get_world_file(asset_id)
        # Convert the cutline pixels into a geographic polygon
        cutline = data.get('cutline', [])
        if wld and cutline:
            polygon = self.pixels_to_geo_polygon(cutline, wld)
            data.update({'cutline': polygon})
            data.update({'cutline_centroid': self.centroid(polygon)})
        # bbox uses the following format:
        # [[left, bottom], [left, top], [right, top], [right, bottom], [left, bottom]]
        # [[west, south], [west, north], [east, north], [east, south], [west, south]]
        w = py_.get(doc, 'pyramid.width')
        h = py_.get(doc, 'pyramid.height')
        bbox_coord = [[0, h], [0, 0], [w, 0], [w, h]]
        if wld:
            polygon = self.pixels_to_geo_polygon(bbox_coord, wld,
                                                 validated=False)
            data.update({'bbox_coord': polygon})
    return data
def update_targets(self, items):
    target = self.targets[0]
    xas_averaged = target.collection
    valids, invalids = py_.partition(
        mark_lu(py_.flatten(items), target.lu_field, self.dt_fetch), 'valid')
    # Remove documents flagging now-valid data as invalid.
    xas_averaged.delete_many(
        mark_invalid({"mp_id": {
            "$in": py_.pluck(valids, 'mp_id')
        }}))
    for doc in valids:
        xas_averaged.update_one(py_.pick(doc, 'mp_id', 'element'),
                                {'$set': doc}, upsert=True)
    for doc in invalids:
        xas_averaged.update_one(mark_invalid(py_.pick(doc, 'mp_id')),
                                {'$set': doc}, upsert=True)
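# py_.partition() accepts a property name as its predicate, so the call above
# splits the flattened items into docs with a truthy 'valid' field and the
# rest. A minimal, runnable illustration with made-up data:
#
#     import pydash as py_
#     docs = [{'mp_id': 'a', 'valid': True}, {'mp_id': 'b', 'valid': False}]
#     valids, invalids = py_.partition(docs, 'valid')
#     assert py_.pluck(valids, 'mp_id') == ['a']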
def update_targets(self, items):
    xas_averaged = self.targets[0]
    xas_averaged.ensure_index([("valid", 1), ("mp_id", 1)])
    xas_averaged.ensure_index([("mp_id", 1), ("element", 1)])
    xas_averaged.ensure_index([("chemsys", 1), ("element", 1)])
    valids, invalids = py_.partition(
        mark_lu(py_.flatten(items), xas_averaged.lu_field, self.dt_fetch),
        'valid')
    # Remove documents flagging now-valid data as invalid.
    xas_averaged.collection.delete_many(
        mark_invalid({"mp_id": {
            "$in": py_.pluck(valids, 'mp_id')
        }}))
    bulk = xas_averaged.collection.initialize_ordered_bulk_op()
    for doc in valids:
        bulk.find(py_.pick(doc, 'mp_id', 'element')).upsert().replace_one(doc)
    for doc in invalids:
        (bulk.find(mark_invalid(py_.pick(doc, 'mp_id')))
         .upsert().replace_one(doc))
    bulk.execute()
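# initialize_ordered_bulk_op() was deprecated in pymongo 3.x and removed in
# 4.0. If this builder were ported to a current driver, an equivalent sketch
# (not part of the original code) would use bulk_write with upserting
# ReplaceOne operations:
#
#     from pymongo import ReplaceOne
#     ops = [ReplaceOne(py_.pick(doc, 'mp_id', 'element'), doc, upsert=True)
#            for doc in valids]
#     ops += [ReplaceOne(mark_invalid(py_.pick(doc, 'mp_id')), doc,
#                        upsert=True) for doc in invalids]
#     xas_averaged.collection.bulk_write(ops, ordered=True)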
def parse_slnsw_title_links(self, asset_id: str, data: dict) -> dict:
    query = {'asset_id': asset_id}
    mongo = self.get_collection(collection=SLNSWLinkTitlesLoader.collection)
    doc = mongo.find_one(query)
    out = {}
    if doc:
        out = py_.pick(doc, 'collection_title', 'url_id')
    return out
def load_objects(self, *args, **kwargs):
    qs = self.queryset(SLNSWLinkTitlesLoader.collection, {})
    for doc in qs:
        # Select only the basic data
        data = py_.pick(doc, 'asset_id', 'url_id')
        # Build the URL
        url = self.SLNSW_COLLECTION.format(**data)
        # Fetch the remote data and merge it in
        data.update(self.load_data(url))
        yield data
def parse_slnsw_subdivision(self, asset_id: str) -> dict:
    """Parse subdivision data extracted from data collected by the search
    engine.

    Data here is mainly loaded from the SLNSW website.
    """
    data = {}
    mongo = self.get_collection(
        collection=SearchEngineSubdivisionLoader.collection)
    doc = mongo.find_one({'dig_id': asset_id})
    if doc:
        data = py_.pick(doc, 'year', 'location_name', 'boundaries',
                        'call_no', 'date')
        # Parse year
        try:
            data['year'] = int(data.get('year'))
        except (TypeError, ValueError):
            data['year'] = None
        # Parse date (guard against rows without a 'date' field)
        date_str = data.get('date', '')
        date = None
        if date_str and self.RE_DATE_HAS_YEAR.match(date_str):
            # Clean the date string
            date_str = date_str.replace('[', '')
            date_str = date_str.replace(']', '')
            date_str = date_str.strip()
            if self.RE_DATE_JUST_YEAR.match(date_str):
                date_str = '{}-1-1'.format(date_str)
            date = dateparser.parse(date_str,
                                    settings={'STRICT_PARSING': True})
        data['date'] = date
        # Use the date's year if no year was found but the row has a date
        if date and not data['year']:
            data['year'] = date.year
        data['date_str'] = date_str
        # Drop empty or None values
        data = {k: v for k, v in data.items() if v}
        # Add a suffix for now
        data = {'{}_subdivision'.format(k): v for k, v in data.items()}
    return data
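# dateparser's STRICT_PARSING setting makes parse() return None unless day,
# month, and year can all be resolved, which is why bare years are padded to
# '<year>-1-1' before parsing. A quick self-contained check:
#
#     import dateparser
#     assert dateparser.parse('1893',
#                             settings={'STRICT_PARSING': True}) is None
#     assert dateparser.parse('1893-1-1',
#                             settings={'STRICT_PARSING': True}) is not None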
def load_objects(self, *args, **kwargs):
    qs = self.queryset(ComasterImagesLoader.collection)
    for i, doc in enumerate(qs):
        try:
            logger.debug('{} - {asset_id}'.format(i, **doc))
            # Create cutline crop
            doc['has_cutline_crop'] = self.crop_image(**doc)
            sizes = [128, 256, 512, 768, 1024]
            params = py_.pick(doc, ['filename', 'asset_id'])
            self.derivatives(suffix='uncrop',
                             process=self.process_thumbnail,
                             sizes=sizes,
                             **params)
            params.update({'filename': '{asset_id}_crop.png'.format(**doc)})
            self.derivatives(suffix='crop',
                             process=self.process_thumbnail,
                             sizes=sizes,
                             **params)
            # # Create derivatives of the crop
            # self.derivative_cutline_crop(size=800, **doc)
            # self.derivative_cutline_crop(size=1600, **doc)
            # # Edge derivatives
            # self.derivate_edge_detection(**doc)
            # self.derivate_edge_detection(**doc, size=800)
            # self.derivate_edge_detection(**doc, size=1600)
            # # Create uncrop derivatives
            # self.derivative_uncrop(size=800, **doc)
            # self.derivative_uncrop(size=1600, **doc)
            doc.update(self.get_image_info(**doc))
            yield doc
        except Exception as e:
            logger.exception(f'Failed cropping images: {e}')
def load_objects(self):
    """Generator that yields klokan map dictionaries."""
    # Clean the collection
    self.collection.remove({})
    # Re-create the data
    qs = self.queryset(DXMapsData.collection,
                       query={'valid': True, 'active': True})
    for doc in qs:
        logger.debug('DXMap creating GeoJSON for {asset_id}'.format(**doc))
        geometry = py_.get(doc, 'cutline.coordinates', None)
        # If the cutline exists, this is a valid map
        if geometry:
            poly = Polygon(geometry)
            # Build feature properties
            properties = py_.pick(doc, 'year', 'collection_title', 'asset_id',
                                  'url_id', 'colorfulness', 'iiif_identifier',
                                  'colored', 'cutline_centroid', 'similar',
                                  'bbox_coord', 'location_name', 'width',
                                  'height')
            properties = py_.rename_keys(
                properties, {
                    'cutline_centroid': 'centroid',
                    'bbox_coord': 'image_bounds',
                    'collection_title': 'title',
                    'url_id': 'collection_id'
                })
            # Build the feature
            feature = Feature(geometry=poly, properties=properties)
            yield feature
    self.export_to_json()
    return []
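# py_.rename_keys() maps old key names to new ones and leaves unlisted keys
# untouched, which keeps the GeoJSON property names decoupled from the Mongo
# field names. A minimal, runnable illustration:
#
#     import pydash as py_
#     props = py_.rename_keys({'url_id': 7, 'year': 1893},
#                             {'url_id': 'collection_id'})
#     assert props == {'collection_id': 7, 'year': 1893}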
def repos_to_csv(repos_by_lang, page_num):
    repo_issue_content_list = []
    for index, repo in enumerate(repos_by_lang):
        # Keep only the basic numerical data for each repo
        repos_by_lang[index] = py_.pick(repo, 'full_name', 'forks_count',
                                        'open_issues_count', 'watchers_count')
        # Split the full name into ['owner', 'repository name']
        repo_name = repo['full_name']
        repo_owner_name_list = repo_name.split('/')
        issue_list = GetIssueContent(
            repo_owner_name_list[0],
            repo_owner_name_list[1]).get_issue_content()[0:2]
        clean_issue_list = '[[[[[Next]]]]]'.join(map(str, issue_list))
        repo_issue_content_list.append(clean_issue_list)
        # Add the star count and merge it into the existing dictionary
        star_count = {
            "star_count": GetStarCountsByRepo(repo['full_name']).get()
        }
        repos_by_lang[index] = py_.merge(repos_by_lang[index], star_count)
    pd_format_dic = {
        'full_name': py_.pluck(repos_by_lang, 'full_name'),
        'forks_count': py_.pluck(repos_by_lang, 'forks_count'),
        'open_issues_count': py_.pluck(repos_by_lang, 'open_issues_count'),
        'watchers_count': py_.pluck(repos_by_lang, 'watchers_count'),
        # Note: 'comment_count' is not among the picked fields above, so this
        # pluck yields None for every repo unless it is set elsewhere.
        'comment_count': py_.pluck(repos_by_lang, 'comment_count'),
        'star_count': py_.pluck(repos_by_lang, 'star_count'),
        'issue_content': repo_issue_content_list
    }
    # print(pd_format_dic)
    df = pd.DataFrame.from_dict(pd_format_dic)
    file_name = Config().get_search_setting()['lang'].split(':')[1]
    df.to_csv(f'../data/{file_name}_github_{page_num}.csv')
    print(f'Saving {file_name}_github_{page_num} to csv finished!!')
def parse_klokan(self, obj):
    # Pick only the fields we are interested in
    data = py_.pick(
        obj, *['title', 'thumbnail', 'center', 'north_east', 'south_west'])
    # Process data
    # 1. Convert strings to geo coordinates
    data.update({
        'center': self.str_longlat_to_geo_point(data['center']),
        'north_east': self.str_longlat_to_geo_point(data['north_east']),
        'south_west': self.str_longlat_to_geo_point(data['south_west']),
    })
    # Hack for Metabase: flatten each point into longitude/latitude fields
    fields = ['center', 'north_east', 'south_west']
    for f in fields:
        point = data[f]
        if point:
            coordinates = point.coordinates
            data['{}_longitude'.format(f)] = coordinates[0]
            data['{}_latitude'.format(f)] = coordinates[1]
    # 2. Extract the city/suburb name from the title
    title = py_.get(data, 'title', '')
    data['location_name'] = py_.get(title.split(','), 0)
    # 3. Extract the year from the title as a fallback option
    match = self.RE_TITLE_YEAR.match(title)
    if match:
        year = match.group('year')
        data['year_title'] = int(year)
    return data
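# RE_TITLE_YEAR is defined elsewhere; for match.group('year') to work it must
# contain a named group. A hypothetical pattern for titles such as
# 'Paddington, 1893' (an assumption, not the original definition):
#
#     import re
#     RE_TITLE_YEAR = re.compile(r'.*\b(?P<year>\d{4})\b')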
def getLatestComment(convo):
    r = requests.get(convo['_links']['related']['comments'], headers=HEADERS)
    comments = r.json()['_results']
    if not py_.is_empty(comments):
        # Get the latest comment
        return py_.pick(comments[0], 'body', 'posted_at')
def from_dict(d):
    it = py_.pick(d, ["conversation_id", "audio_url"])
    return AudioSegmentTask(**it)