Example #1
    def find_near_assets(self, asset_id: str, data: dict) -> dict:
        """Find near assets base on centroid of the shape."""

        query = {
            'center': {
                '$near': {
                    '$geometry': data['center'],
                    '$maxDistance': 500
                }
            }
        }
        qs = self.collection.find(query, {
            'asset_id': 1,
            'year': 1,
            'iiif_identifier': 1
        })

        # Merge with similar and set distance as 0
        similar = data.get('similar', [])
        similar = similar + [
            py_.pick(i, 'asset_id', 'year', 'iiif_identifier')
            for i in qs if i.get('year', None) is not None
        ]
        out = {'similar': similar}

        return out
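A note on this example: the $near / $geometry query only works if the queried field has a 2dsphere index. A minimal sketch of creating one with pymongo (the database and collection names are hypothetical):

from pymongo import MongoClient, GEOSPHERE

# Hypothetical connection and collection names, for illustration only.
collection = MongoClient()['assets']['shapes']
collection.create_index([('center', GEOSPHERE)])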
Example #2
 def from_dict(d):
     call_uuid = (d.get("call_uuid")
                  if d.get("call_uuid") is not None else d.get("call_id"))
     conversation_uuid = (d.get("conversation_uuid")
                          if d.get("conversation_uuid") is not None else
                          d.get("conversation_id"))
     if call_uuid is None or conversation_uuid is None:
         raise ValueError(
             f"No reference for call or conversation. {d.keys()}")
     if d.get("alternatives"):
         d["alternatives"] = json.dumps(d["alternatives"],
                                        ensure_ascii=False)
     return ConversationTask(**{
         **py_.pick(d, [
             "alternatives",
             "audio_url",
             "state",
             "reftime",
             "prediction",
         ]),
         "data_id": str(conversation_uuid),
         "raw": d,
         "call_uuid": str(call_uuid),
         "conversation_uuid": str(conversation_uuid),
     })
Example #3
    def parse_klokan_hidden(self, asset_id: str) -> dict:
        """Parse data extracted from Georeference website."""
        mongo = self.get_collection(
            collection=KlokanHiddenDataLoader.collection)
        doc = mongo.find_one({'id': asset_id})

        data = {}
        if doc:
            # data = py_.pick(doc, *['bbox', 'control_points', 'cutline'])
            data = py_.pick(doc, *['cutline'])

        # # Convert data to GeoJson
        # bbox = data.get('bbox', [])
        # if len(bbox) > 0:
        #     bbox = Polygon(partition(2, bbox))
        #     data.update({'bbox': bbox})
        # else:
        #     data['bbox'] = None

        # # Convert control points into geolocations
        # lpoints = data.get('control_points', [])
        # npoints = []

        # if len(lpoints) > 0:
        #     for point in lpoints:
        #         metadata = py_.pick(point, *['map', 'scan_zoom', 'map_zoom', 'address', 'pixel_x', 'pixel_y'])
        #         lon = point['longitude']
        #         lat = point['latitude']
        #         point = Point([lon, lat])
        #         npoints.append({'point': point, 'metadata': metadata})

        # data['control_points'] = npoints
        # data['control_points_count'] = len(npoints)

        # load world file
        wld = self.get_world_file(asset_id)

        # cutline to polygon
        cutline = data.get('cutline', [])
        if wld and cutline:
            polygon = self.pixels_to_geo_polygon(cutline, wld)
            data.update({'cutline': polygon})
            data.update({'cutline_centroid': self.centroid(polygon)})

        # bbox using the following format
        # [[left, bottom], [left, top], [right, top], [right, bottom], [left, bottom]]
        # [[west, south], [west, north], [east, north], [east, south], [west, south]]
        w = py_.get(doc, 'pyramid.width')
        h = py_.get(doc, 'pyramid.height')
        bbox_coord = [[0, h], [0, 0], [w, 0], [w, h]]

        if wld:
            polygon = self.pixels_to_geo_polygon(bbox_coord,
                                                 wld,
                                                 validated=False)
            data.update({'bbox_coord': polygon})

        return data
Example #4
    def update_targets(self, items):
        target = self.targets[0]
        xas_averaged = target.collection
        valids, invalids = py_.partition(
            mark_lu(py_.flatten(items), target.lu_field, self.dt_fetch),
            'valid')
        # Remove documents flagging now-valid data as invalid.
        xas_averaged.delete_many(
            mark_invalid({"mp_id": {
                "$in": py_.pluck(valids, 'mp_id')
            }}))

        for doc in valids:
            xas_averaged.update_one(py_.pick(doc, 'mp_id', 'element'),
                                    {'$set': doc},
                                    upsert=True)
        for doc in invalids:
            xas_averaged.update_one(mark_invalid(py_.pick(doc, 'mp_id')),
                                    {'$set': doc},
                                    upsert=True)
Example #5
 def update_targets(self, items):
     xas_averaged = self.targets[0]
     xas_averaged.ensure_index([("valid", 1), ("mp_id", 1)])
     xas_averaged.ensure_index([("mp_id", 1), ("element", 1)])
     xas_averaged.ensure_index([("chemsys", 1), ("element", 1)])
     valids, invalids = py_.partition(
         mark_lu(py_.flatten(items), xas_averaged.lu_field, self.dt_fetch),
         'valid')
     # Remove documents flagging now-valid data as invalid.
     xas_averaged.collection.delete_many(
         mark_invalid({"mp_id": {
             "$in": py_.pluck(valids, 'mp_id')
         }}))
     bulk = xas_averaged.collection.initialize_ordered_bulk_op()
     for doc in valids:
         (bulk.find(py_.pick(doc, 'mp_id',
                             'element')).upsert().replace_one(doc))
     for doc in invalids:
         (bulk.find(mark_invalid(py_.pick(
             doc, 'mp_id'))).upsert().replace_one(doc))
     bulk.execute()
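initialize_ordered_bulk_op no longer exists in pymongo 4.x. A minimal sketch of the same ordered upserts with Collection.bulk_write, reusing the valids, invalids, mark_invalid and xas_averaged names from the example above:

from pymongo import ReplaceOne

requests = [
    ReplaceOne(py_.pick(doc, 'mp_id', 'element'), doc, upsert=True)
    for doc in valids
] + [
    ReplaceOne(mark_invalid(py_.pick(doc, 'mp_id')), doc, upsert=True)
    for doc in invalids
]
if requests:
    xas_averaged.collection.bulk_write(requests, ordered=True)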
Example #6
    def parse_slnsw_title_links(self, asset_id: str, data: dict) -> dict:

        query = {'asset_id': asset_id}
        mongo = self.get_collection(
            collection=SLNSWLinkTitlesLoader.collection)
        doc = mongo.find_one(query)
        out = {}

        if doc:
            out = py_.pick(doc, 'collection_title', 'url_id')

        return out
Example #7
    def load_objects(self, *args, **kwargs):
        qs = self.queryset(SLNSWLinkTitlesLoader.collection, {})
        for doc in qs:
            # Select only the basic data
            data = py_.pick(doc, 'asset_id', 'url_id')

            # Build url
            url = self.SLNSW_COLLECTION.format(**data)

            # Get data and merge
            data.update(self.load_data(url))

            yield data
Example #8
    def parse_slnsw_subdivision(self, asset_id: str) -> dict:
        """Parse subdivision data extracted from data collected from the search engine.
        Data here are mainly loaded fom the SLNSW website.
        """
        data = {}
        mongo = self.get_collection(
            collection=SearchEngineSubdivisionLoader.collection)
        doc = mongo.find_one({'dig_id': asset_id})

        if doc:
            data = py_.pick(doc, 'year', 'location_name', 'boundaries',
                            'call_no', 'date')

            # parse year
            try:
                data['year'] = int(data.get('year'))
            except (TypeError, ValueError):
                data['year'] = None

            # parse date
            date_str = data.get('date')
            date = None
            if date_str and self.RE_DATE_HAS_YEAR.match(date_str):
                # clean date string
                date_str = date_str.replace('[', '')
                date_str = date_str.replace(']', '')
                date_str = date_str.strip()

                if self.RE_DATE_JUST_YEAR.match(date_str):
                    date_str = '{}-1-1'.format(date_str)

                date = dateparser.parse(date_str,
                                        settings={'STRICT_PARSING': True})

            data['date'] = date

            # Use the date's year if no year was found and the row has a date
            if date and not data['year']:
                data['year'] = date.year

            data['date_str'] = date_str

            # Drop empty or none data
            data = {k: v for k, v in data.items() if v}

            # Add suffix for now
            data = {'{}_subdivision'.format(k): v for k, v in data.items()}

        return data
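The '{}-1-1' formatting above exists because of STRICT_PARSING: with that setting, dateparser.parse returns None unless day, month and year are all present in the string. A tiny sketch of the call used (the input string is made up):

import dateparser

# Returns None for '1923' under strict parsing; succeeds once '-1-1' is appended.
dateparser.parse('1923-1-1', settings={'STRICT_PARSING': True})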
Example #9
    def load_objects(self, *args, **kwargs):
        qs = self.queryset(ComasterImagesLoader.collection)
        for i, doc in enumerate(qs):
            try:
                logger.debug('{} - {asset_id}'.format(i, **doc))
                # Create cutline crop
                doc['has_cutline_crop'] = self.crop_image(**doc)

                sizes = [128, 256, 512, 768, 1024]

                params = py_.pick(doc, ['filename', 'asset_id'])
                self.derivatives(suffix='uncrop',
                                 process=self.process_thumbnail,
                                 sizes=sizes,
                                 **params)

                params.update(
                    {'filename': '{asset_id}_crop.png'.format(**doc)})
                self.derivatives(suffix='crop',
                                 process=self.process_thumbnail,
                                 sizes=sizes,
                                 **params)

                # # Create derivative of the crop
                # self.derivative_cutline_crop(size=800, **doc)
                # self.derivative_cutline_crop(size=1600, **doc)

                # # Edge derivative
                # self.derivate_edge_detection(**doc)
                # self.derivate_edge_detection(**doc, size=800)
                # self.derivate_edge_detection(**doc, size=1600)

                # # Create uncrop derivatives
                # self.derivative_uncrop(size=800, **doc)
                # self.derivative_uncrop(size=1600, **doc)

                doc.update(self.get_image_info(**doc))

                yield doc
            except Exception as e:
                logger.exception(f'Failed cropping images: {e}')
Example #10
    def load_objects(self):
        """Generator that yield klokan map dictionaries."""
        # Clean collection
        self.collection.remove({})

        # Re-create data
        qs = self.queryset(DXMapsData.collection,
                           query={
                               'valid': True,
                               'active': True
                           })
        for doc in qs:
            logger.debug('DXMap creating GEOJson for {asset_id}'.format(**doc))

            geometry = py_.get(doc, 'cutline.coordinates', None)
            # If a cutline exists, this is a valid map
            if geometry:
                poly = Polygon(geometry)

                # Build feature properties
                properties = py_.pick(doc, 'year', 'collection_title',
                                      'asset_id', 'url_id', 'colorfulness',
                                      'iiif_identifier', 'colored',
                                      'cutline_centroid', 'similar',
                                      'bbox_coord', 'location_name', 'width',
                                      'height')
                properties = py_.rename_keys(
                    properties, {
                        'cutline_centroid': 'centroid',
                        'bbox_coord': 'image_bounds',
                        'collection_title': 'title',
                        'url_id': 'collection_id'
                    })

                # build feature
                feature = Feature(geometry=poly, properties=properties)
                yield feature

        self.export_to_json()
        return []
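Polygon and Feature in this example most likely come from the geojson package; a minimal sketch of building a feature the same way (the ring coordinates are made up):

from geojson import Feature, Polygon

# A GeoJSON Polygon takes a list of linear rings of [lon, lat] pairs.
ring = [[151.20, -33.86], [151.21, -33.86], [151.21, -33.87], [151.20, -33.86]]
feature = Feature(geometry=Polygon([ring]), properties={'title': 'example'})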
Example #11
def repos_to_csv(repos_by_lang, page_num):
    repo_issue_content_list = []
    for index, repo in enumerate(repos_by_lang):
        # keep only the repo's basic numerical data
        repos_by_lang[index] = py_.pick(repo, 'full_name', 'forks_count',
                                        'open_issues_count', 'watchers_count')

        # separate full name to list ['owner', 'repository name']
        repo_name = repo['full_name']
        repo_owner_name_list = repo_name.split('/')

        issue_list = GetIssueContent(
            repo_owner_name_list[0],
            repo_owner_name_list[1]).get_issue_content()[0:2]
        clean_issue_list = '[[[[[Next]]]]]'.join(map(str, issue_list))
        repo_issue_content_list.append(clean_issue_list)

        # add star count and merge to existing dictionary
        star_count = {
            "star_count": GetStarCountsByRepo(repo['full_name']).get()
        }
        repos_by_lang[index] = py_.merge(repos_by_lang[index], star_count)

    pd_format_dic = {
        'full_name': py_.pluck(repos_by_lang, 'full_name'),
        'forks_count': py_.pluck(repos_by_lang, 'forks_count'),
        'open_issues_count': py_.pluck(repos_by_lang, 'open_issues_count'),
        'watchers_count': py_.pluck(repos_by_lang, 'watchers_count'),
        'comment_count': py_.pluck(repos_by_lang, 'comment_count'),
        'star_count': py_.pluck(repos_by_lang, 'star_count'),
        'issue_content': repo_issue_content_list
    }

    # print(pd_format_dic)

    df = pd.DataFrame.from_dict(pd_format_dic)
    file_name = Config().get_search_setting()['lang'].split(':')[1]
    df.to_csv(f'../data/{file_name}_github_{page_num}.csv')
    print(f'Saving {file_name}_github_{page_num} to csv finished!!')
Example #12
    def parse_klokan(self, obj):
        # Get the data we are interested in
        data = py_.pick(
            obj, *['title', 'thumbnail', 'center', 'north_east', 'south_west'])

        # process data
        # 1. Convert strings to GeoCoordinates
        data.update({
            'center': self.str_longlat_to_geo_point(data['center']),
            'north_east': self.str_longlat_to_geo_point(data['north_east']),
            'south_west': self.str_longlat_to_geo_point(data['south_west']),
        })

        # hack for metabase
        fields = ['center', 'north_east', 'south_west']
        for f in fields:
            point = data[f]
            if point:
                coordinates = point.coordinates
                data['{}_longitude'.format(f)] = coordinates[0]
                data['{}_latitude'.format(f)] = coordinates[1]

        # 2. Extract city suburb name from title
        title = py_.get(data, 'title', '')
        data['location_name'] = py_.get(title.split(','), 0)

        # 3. Extract year from the title as a fallback option
        match = self.RE_TITLE_YEAR.match(title)
        if match:
            year = match.group('year')
            data['year_title'] = int(year)

        return data
Example #13
def getLatestComment(convo):
    r = requests.get(convo['_links']['related']['comments'], headers=HEADERS)
    comments = r.json()['_results']
    if not py_.is_empty(comments):
        # get latest
        return py_.pick(comments[0], 'body', 'posted_at')
Example #14
 def from_dict(d):
     it = py_.pick(d, ["conversation_id", "audio_url"])
     return AudioSegmentTask(**it)
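All of the examples above lean on a small set of pydash helpers. A minimal, self-contained sketch of the most common ones (the sample data below is made up):

from pydash import py_

doc = {'asset_id': 'a1', 'year': 1901, 'pyramid': {'width': 10, 'height': 20}}

# pick: keep only the listed keys
py_.pick(doc, 'asset_id', 'year')            # {'asset_id': 'a1', 'year': 1901}

# get: safe nested access with an optional default
py_.get(doc, 'pyramid.width')                # 10
py_.get(doc, 'pyramid.depth', None)          # None

# partition: split a collection by a predicate ('valid' is a property shorthand)
rows = [{'id': 1, 'valid': True}, {'id': 2, 'valid': False}]
valids, invalids = py_.partition(rows, 'valid')

# rename_keys: remap dictionary keys
py_.rename_keys({'url_id': 7}, {'url_id': 'collection_id'})  # {'collection_id': 7}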