class IndeedType(Document):
    # Search suggestions
    suggest = Completion(analyzer=ik_analyzer)  # in theory this works

    # The data fields stored in Elasticsearch
    job_title = Text(analyzer="ik_max_word")
    job_location = Text(analyzer="ik_max_word")
    job_summary = Text(analyzer="ik_max_word")
    job_salary = Text(analyzer="ik_max_word")
    company_name = Text(analyzer="ik_max_word")
    job_href = Text(analyzer="ik_max_word")
    job_star = Text(analyzer="ik_max_word")
    job_review = Text(analyzer="ik_max_word")

    class Meta:
        index = "indeed"  # index names must be lowercase
        doc_type = "data_science"

    class Index:
        name = "indeed"
        doc_type = "data_science"


# Display cluster health
print(connections.get_connection().cluster.health())

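# A minimal usage sketch for the mapping above (assumptions: a reachable
# Elasticsearch host with the ik analyzer plugin installed; host and field
# values are illustrative): register a default connection, create the index
# from the mapping, then save one document with a weighted suggest input.
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])
IndeedType.init()  # creates the index and mapping if missing

job = IndeedType(
    job_title="Data Scientist",
    job_location="Remote",
    company_name="Acme Corp",
)
job.suggest = {"input": ["Data Scientist", "Acme Corp"], "weight": 10}
job.save()
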
class ChengdeType(DocType):
    # 58.com Chengde rental-listing type
    suggest = Completion(analyzer=ik_analyzer)

    # Fields that should be tokenized for search
    title = Text(analyzer="ik_max_word")

    # Fields that must not be tokenized
    url = Keyword()
    url_object_id = Keyword()
    rent_type = Keyword()
    price = Integer()
    house_type = Keyword()
    area = Text(analyzer="ik_max_word")
    community = Text(analyzer="ik_max_word")
    detail = Text(analyzer="ik_max_word")
    telephone = Keyword()

    # Target index and type
    class Meta:
        index = "zufang"
        doc_type = "chengde58"

class ArticleType(DocType):
    # Jobbole article type
    # Search-suggest autocomplete: a Completion field; for now a custom
    # CustomAnalyzer is needed to avoid a mapping error
    suggest = Completion(analyzer=ik_analyzer)

    # Fields that should be tokenized for search
    title = Text(analyzer="ik_max_word")
    create_date = Date()

    # Fields that must not be tokenized
    url = Keyword()
    url_object_id = Keyword()
    comment_nums = Integer()
    average_score = Keyword()
    tags = Text(analyzer="ik_max_word")
    # Note: in liuli1, content is a keyword
    content = Text(analyzer="ik_max_word")

    # Target index and type
    class Meta:
        index = "liuli1"
        doc_type = "article"

class LaGou(DocType):
    # Lagou job posting type
    suggest = Completion(analyzer=ik_analyzer)

    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    salary = Text(analyzer="ik_max_word")
    job_city = Text(analyzer="ik_max_word")
    work_years = Text(analyzer="ik_max_word")
    degree_need = Text(analyzer="ik_max_word")
    job_type = Text(analyzer="ik_max_word")
    publish_time = Date()
    tags = Text(analyzer="ik_max_word")
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_url = Keyword()
    company_name = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "lagoujob"

class LagouType(DocType):
    # Lagou job type
    suggest = Completion(analyzer='ik_max_word')
    title = Text(analyzer='ik_max_word')
    url = Keyword()
    url_object_id = Keyword()
    salary = Keyword()
    job_city = Keyword()
    work_years = Keyword()
    degree_need = Text(analyzer='ik_max_word')
    job_type = Text(analyzer='ik_max_word')
    publish_time = Keyword()
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Keyword()
    company_name = Text(analyzer='ik_max_word')
    company_url = Keyword()
    tags = Text(analyzer='ik_max_word')
    crawl_time = Date()

    class Meta:
        index = 'lagou'
        doc_type = 'position'

class LagouJobIndex(Document):
    suggest = Completion(analyzer=my_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Integer()
    salary_max = Integer()
    job_city = Keyword()
    work_years_min = Integer()
    work_years_max = Integer()
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_smart")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Index:
        name = 'lagou_job'

class Lagou(Document):
    suggest = Completion(analyzer='ik_max_word')
    title = Text(analyzer='ik_max_word')
    url = Keyword()
    url_object_id = Keyword()
    salary = Keyword()
    job_city = Text(analyzer='ik_max_word')
    work_years = Keyword()
    degree_need = Keyword()
    job_type = Text(analyzer='ik_max_word')
    publish_time = Keyword()
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_address = Text(analyzer='ik_max_word')
    company_url = Keyword()
    company_name = Text(analyzer='ik_max_word')
    crawl_time = Date()
    crawl_update_time = Date()
    tags = Text(analyzer='ik_max_word')

    class Index:
        name = 'lagou'
        settings = {"number_of_shards": 2, "number_of_replicas": 0}

class ZhiHuQuestionType(Document):
    """Zhihu question."""

    suggest = Completion(analyzer=my_analyzer)

    # Fields of the Zhihu question item
    zhihu_id = Keyword()
    topics = Text(analyzer="ik_max_word")
    url = Keyword()
    title = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")
    answer_num = Integer()
    comments_num = Integer()
    watch_user_num = Integer()
    click_num = Integer()
    crawl_time = Date()

    # The corresponding index in Elasticsearch
    class Index:
        name = 'zhihu'
        doc_type = "question"

    class Meta:
        doc_type = "question"

class Lagou(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_obj_id = Keyword()
    title = Text(analyzer='ik_max_word')
    min_salary = Integer()
    max_salary = Integer()
    min_work_year = Integer()
    max_work_year = Integer()
    job_city = Keyword()
    degree_need = Keyword()
    job_type = Keyword()
    publish_time = Date()
    tags = Text(analyzer='ik_max_word')
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Text(analyzer='ik_max_word')
    company_name = Text(analyzer='ik_max_word')

    class Meta:
        index = 'lagou'
        doc_type = 'job'

class LagouItem(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer='ik_max_word')
    salary = Keyword()
    job_city = Keyword()
    work_years = Keyword()
    degree_need = Keyword()
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Keyword()
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer='ik_max_word')
    crawl_time = Date()
    crawl_update_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "job"

class ArticleType(DocType):  # inherits the customized DocType
    suggest = Completion(analyzer=ik_analyzer)  # search-suggest autocomplete
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    link_url = Keyword()  # not analyzed
    url_object_id = Keyword()
    front_image_url = Keyword()  # Keyword is a plain string type, not tokenized
    front_image_path = Keyword()
    # Number of likes
    praise_num = Integer()
    # Number of comments
    comment_num = Integer()
    # Number of favorites
    fav_num = Integer()
    # Tags
    tags = Text(analyzer="ik_max_word")  # Text is tokenized and gets an inverted index
    # Article body
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"     # index name
        doc_type = 'article'  # type ("table") name

class ArticleType(Document):
    # Jobbole article type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        # name = "jobbole"
        index = "jobbole"
        doc_type = "article"

    class Index:
        name = 'jobbole'
        doc_type = 'article'

class LagouType(DocType):
    # Lagou job posting type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word')
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    salary = Text(analyzer='ik_max_word')
    job_city = Text(analyzer='ik_max_word')
    work_years = Text(analyzer='ik_max_word')
    degree_need = Text(analyzer='ik_max_word')
    job_type = Text(analyzer='ik_max_word')
    tags = Text(analyzer='ik_max_word')
    publish_time = Text(analyzer='ik_max_word')
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Text(analyzer='ik_max_word')
    company_name = Text(analyzer='ik_max_word')
    company_url = Keyword()

    class Meta:
        index = "lagou2_linux"
        doc_type = 'lagou'

class Lagou(DocType):
    # Lagou job listing
    title_suggest = Completion(analyzer=ik_analyzer, search_analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word', search_analyzer="ik_max_word",
                 fields={'title': Keyword()})
    id = Text()
    url = Text()
    salary = Text()
    job_city = Text()
    work_years = Text()
    degree_need = Text()
    job_type = Text()
    publish_time = Text()
    job_advantage = Text()
    job_desc = Text()
    job_addr = Text()
    company_name = Text()
    company_url = Text()
    tags = Text(analyzer='ik_max_word', fields={'tags': Keyword()})
    crawl_time = Date()

    class Meta:
        index = 'jobbole'
        doc_type = 'lagou_job'

class LagouJob(Document):
    # Search suggestions
    suggestion = Completion(analyzer=analyzer('ik_smart'))
    job_id = Keyword()
    # Job title
    title = Text(analyzer="ik_max_word")
    # URL
    url = Keyword()
    # Salary range
    salary = FloatRange()
    # # Lower salary bound
    # salary_min = Float()
    # Work-experience range
    work_years = FloatRange()
    # # Minimum years
    # work_year_min = Integer()
    # Education requirement
    degree_need = Float()
    # Employment type: internship / part-time / full-time
    job_type = Keyword()
    # Publish time
    publish_time = Date()
    # Job perks
    job_advantage = Text(analyzer="ik_max_word")
    # Job description
    job_desc = Text(analyzer="ik_max_word")
    # City
    job_city = Keyword()
    # Work address
    job_addr = Text(analyzer="ik_max_word")
    # Company URL
    company_url = Keyword()
    # Company name
    company_name = Keyword()

    class Index:
        name = 'a51job'

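# A hedged indexing sketch for the range fields above: FloatRange values are
# stored as gte/lte bounds. Assumes a default connection is already
# registered; all field values are illustrative.
LagouJob.init()  # create the a51job index and mapping if missing
job = LagouJob(
    job_id="12345",
    title="数据工程师",
    salary={"gte": 15000.0, "lte": 25000.0},
    work_years={"gte": 1.0, "lte": 3.0},
    job_city="北京",
)
job.save()
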
class VideoType(Document):
    class Index:
        name = 'video'
        # settings = {
        #     "number_of_shards": 2,
        #     "number_of_replicas": 1
        # }

    # Fields shared by articles and videos
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    source = Text(analyzer="ik_max_word")
    date = Date()

    # Video-specific field
    img_url = Keyword()

    suggest = Completion(analyzer=ik_analyzer)  # search suggestions

    def __init__(self, item):
        super(VideoType, self).__init__()
        self.assign(item)

    def assign(self, item):
        keys = ["url", "title", "source", "date", "url_object_id", "img_url"]
        for key in keys:
            # Fill missing keys with an empty string
            if key not in item:
                item[key] = ""
        self.url = item["url"]
        self.title = item["title"]
        self.source = item["source"]
        self.date = item["date"]
        self.meta.id = item["url_object_id"]
        self.img_url = item["img_url"]
        self.suggest = gen_suggests(VideoType.Index.name,
                                    ((self.title, 10), (self.source, 2)))

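# gen_suggests is referenced above but not defined in this snippet. Below is
# a minimal sketch of the usual helper (an assumption, modeled on the common
# Scrapy + elasticsearch-dsl pattern): tokenize each (text, weight) pair with
# the server-side ik_max_word analyzer and emit weighted completion inputs,
# skipping words already claimed at a higher weight.
from elasticsearch_dsl.connections import connections

es = connections.create_connection()

def gen_suggests(index, info_tuple):
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Ask Elasticsearch to analyze the text with ik_max_word
            words = es.indices.analyze(
                index=index, body={"analyzer": "ik_max_word", "text": text}
            )
            analyzed = {t["token"] for t in words["tokens"] if len(t["token"]) > 1}
            new_words = analyzed - used_words
            used_words.update(new_words)
        else:
            new_words = set()
        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests
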
class GameType(DocType):
    # The suggest field is defined with the "completion" type.
    #
    # String types: text and keyword.
    #   text: tokenized, stemmed, and indexed into an inverted index
    #   keyword: a plain string; only exact matches are found
    # Numeric types: long, integer, short, byte, double, float
    # Date type: date
    # Boolean type: boolean
    # Binary type: binary
    # Complex types: object, nested
    # Geo types: geo_point, geo_shape
    # Specialized types: ip, completion
    url = Keyword()
    suggest = Completion(analyzer=ik_analyzer)

    # analyzer="ik_max_word" selects one of the IK tokenization modes
    gameName = Text(analyzer="ik_max_word")
    # Genre, e.g. action
    gameType = Keyword()
    # Developer
    developer = Text(analyzer="ik_max_word")
    # Publisher
    publisher = Text(analyzer="ik_max_word")
    # Release date
    publishDate = Keyword()
    gameLanguage = Keyword()
    # Tags, e.g. fantasy or time-travel
    gameTitle = Text(analyzer="ik_max_word")
    gamePlatform = Keyword()
    # Game description / story background
    gameContext = Text(analyzer="ik_max_word")

    class Meta:
        index = "test"
        doc_type = "game"

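# A short illustration (hypothetical queries, assuming a default connection)
# of the text/keyword distinction documented above: match goes through the
# analyzer of a Text field, while term requires an exact Keyword value.
from elasticsearch_dsl import Search

s = (
    Search(index="test")
    .query("match", gameName="角色扮演")    # analyzed full-text match
    .filter("term", gamePlatform="PC")      # exact match on a keyword
)
for hit in s:
    print(hit.gameName, hit.gamePlatform)
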
class Person(DocType, RangeRelevantEntitiesMixin):
    """Person document."""

    full_name_suggest = Completion(preserve_separators=False)
    translated_first_name = TranslatedField("first_name", "first_name_en")
    translated_last_name = TranslatedField("last_name", "last_name_en")
    translated_patronymic = TranslatedField("patronymic", "patronymic_en")
    translated_last_workplace = TranslatedField("last_workplace", "last_workplace_en")
    translated_last_job_title = TranslatedField("last_job_title", "last_job_title_en")

    @classmethod
    @cached(timeout=25 * 60 * 60)
    def get_all_persons(cls):
        return [
            blacklist(
                p.to_dict(),
                [
                    "full_name_suggest_en",
                    "dob_details",
                    "dob",
                    "full_name_suggest",
                    "last_job_id",
                    "risk_category",
                    "photo_path",
                    "terminated",
                    "last_modified",
                    "inn",
                    "inn_source",
                    "passport",
                    "passport_source",
                ],
            )
            for p in cls.search().scan()
        ]

class VideoType(DocType):
    """Learning-video type."""

    # Mapping for search suggestions
    suggest = Completion(analyzer=ik_analyzer)
    url_object_id = Keyword()
    url = Keyword()
    class_name = Text(analyzer='ik_max_word')
    price = Float()
    abstract = Text(analyzer='ik_max_word')
    data_source = Keyword()
    second_classify = Text(analyzer='ik_max_word')
    first_classify = Text(analyzer='ik_max_word')
    # recommend_score = Float()

    # Initializes the index and type names
    class Meta:
        # Analogous to the database name in MySQL
        index = 'learning_video'
        # Analogous to the table name in MySQL
        doc_type = 'video'

class Film(Document):
    id = Keyword(required=True)
    imdb_id = Keyword()
    original_title = Text()
    characters = Nested(properties={'char_id': Keyword(), 'char_name': Text()})
    directors = Nested(properties={'director_id': Keyword(), 'director_name': Text()})
    original_language = Keyword()
    adult = Boolean()
    belongs_to_collection = Text()
    genres = Nested(properties={'genre': Keyword()})
    popularity = Float()
    release_date = Date()
    budget = Float()
    revenue = Float()
    runtime = Float()
    spoken_languages = Nested(properties={'spoken_language': Keyword()})
    poster_path = Keyword()
    vote_average = Float()
    vote_count = Integer()
    keywords = Nested(properties={'keyword': Keyword()})
    suggestion = Completion()

    class Index:
        name = ELASTIC_INDEX

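# A minimal sketch of querying the Completion field above via the suggest
# API; the suggester name "film_suggest" and the prefix are illustrative.
s = Film.search()
s = s.suggest("film_suggest", "matr", completion={"field": "suggestion"})
response = s.execute()
for option in response.suggest.film_suggest[0].options:
    print(option.text)
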
class ZhiHuAnswerType(Document):
    """Zhihu answer."""

    suggest = Completion(analyzer=my_analyzer)

    # Fields of the Zhihu answer item
    zhihu_id = Keyword()
    url = Keyword()
    question_id = Keyword()
    author_id = Keyword()
    content = Text(analyzer="ik_max_word")
    praise_num = Integer()
    comments_num = Integer()
    create_time = Date()
    update_time = Date()
    crawl_time = Date()
    author_name = Keyword()

    # The corresponding index in Elasticsearch
    class Index:
        name = 'zhihu'
        doc_type = "answer"

    class Meta:
        doc_type = "answer"

class ArticleType(DocType):
    """Technical-article type; the mapping here affects search scoring."""

    # Mapping for search suggestions
    suggest = Completion(analyzer=ik_analyzer)
    url_object_id = Keyword()
    url = Keyword()
    title = Text(analyzer='ik_max_word')
    article_type = Text(analyzer='ik_max_word')
    data_source = Keyword()
    # Reserved popularity field, combining reads, comments, likes, and favorites
    hot_score = Integer()
    publish_time = Date()
    abstract = Text(analyzer='ik_max_word')
    tags = Text(analyzer='ik_max_word')

    # Initializes the index and type names
    class Meta:
        # Analogous to the database name in MySQL
        index = 'technology_article'
        # Analogous to the table name in MySQL
        doc_type = 'article'

class DouBanIndex(Document):
    """Fields: movie id, title, alias, synopsis, url, directors, cast,
    genre, and release date."""

    suggest = Completion(analyzer=my_analyzer)
    movie_id = Keyword()
    name = Text(analyzer="ik_max_word")
    alias = Text(analyzer="ik_smart")
    introduce = Text(analyzer="ik_smart")
    url = Keyword()
    directors = Keyword()
    rate = Float()
    casts = Keyword()
    type = Keyword()
    date = Keyword()

    class Index:
        name = "movie"
        # Number of shards and replicas
        settings = {
            "number_of_shards": 2,
            "number_of_replicas": 0
        }

class LagouJobType(DocType):
    # Lagou job postings
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    salary = Keyword()
    job_city = Keyword()
    work_years = Keyword()
    degree_need = Keyword()
    job_type = Keyword()
    publish_time = Keyword()
    job_advantage = Keyword()
    job_desc = Keyword()
    job_addr = Keyword()
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "lagou"
        doc_type = "lagou"

class ItemDoc(DocType):
    """Common superclass for Document, Audio, Video, Wiki, Maps, and
    Newspaper items. These are not indexed directly on the index server."""

    id = Text()
    title = Text(fields={'keyword': Keyword()})
    title_search = Text()
    title_suggest = Completion()
    abstract = Text()
    type = Text(fields={'keyword': Keyword()})
    education_levels = Text(multi=True, fields={'keyword': Keyword()})
    communities = Text(multi=True, fields={'keyword': Keyword()})
    collections = Text(multi=True, fields={'keyword': Keyword()})
    languages = Text(multi=True, fields={'keyword': Keyword()})
    description = Text()
    license_type = Text(fields={'keyword': Keyword()})
    year_of_available = Date()
    publication_year = Date()
    created_date = Date()
    updated_date = Date()
    author_list = Keyword(fields={'keyword': Keyword()}, multi=True)
    url = Text()
    view_count = Integer()
    published = Text(fields={'keyword': Keyword()})

class LagouType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Integer()
    salary_max = Integer()
    job_city = Keyword()
    work_years_min = Integer()
    work_years_max = Integer()
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "job"

class NACPDeclaration(DocType, AbstractDeclaration):
    """NACP Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
        fields={"raw": Text(index=True)},
        term_vector="with_positions_offsets",
    )
    all = Text(analyzer="ukrainian")

    general = Object(
        properties={
            "full_name_suggest": Completion(preserve_separators=False),
            "full_name": Text(index=True, analyzer="ukrainian"),
            "full_name_for_sorting": Keyword(
                index=True, ignore_above=100
            ),  # only for sorting purposes
            "name": Text(index=True, analyzer="ukrainian"),
            "patronymic": Text(index=True, analyzer="ukrainian"),
            "last_name": Text(index=True, analyzer="ukrainian"),
            "post": Object(
                properties={
                    "actual_region": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "region": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "office": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "post_type": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "post": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                }
            ),
        }
    )
    declaration = Object(properties={"date": NoneAwareDate()})
    estate = Object(
        properties={
            "region": Text(
                index=True, analyzer="ukrainian", fields={"raw": Keyword(index=True)}
            )
        }
    )
    intro = Object(
        properties={
            "declaration_year": Keyword(index=True),
            "declaration_year_to": NoneAwareDate(),
            "declaration_year_from": NoneAwareDate(),
            "doc_type": Keyword(index=True),
            "date": NoneAwareDate(index=True),
        }
    )
    ft_src = Text(index=True, analyzer="ukrainian", copy_to="all")
    nacp_orig = Object(include_in_all=False, enabled=False)

    # concatenated from a set of fields for regular search (not deepsearch mode)
    index_card = Text(index=True, analyzer="ukrainian")

    INDEX_CARD_FIELDS = [
        "general.last_name",
        "general.name",
        "general.patronymic",
        "general.full_name",
        "general.post.post",
        "general.post.office",
        "general.post.region",
        "general.post.actual_region",
        "intro.declaration_year",
        "intro.doc_type",
        "declaration.source",
        "declaration.url",
    ]

    def raw_html(self):
        fname = os.path.join(
            settings.NACP_DECLARATIONS_PATH,
            self.meta.id[5:7],
            os.path.basename(self.declaration.basename) + ".html",
        )
        try:
            with open(fname, "r") as fp:
                d = fp.read()
        except FileNotFoundError:
            return "<h2>Вибачте, декларація тимчасово відсутня, але ми вже працюємо над вирішенням проблеми</h2>"

        m = re.search(r"<\/style>(.*)</body>", d)
        declaration_html = m.group(1)

        # OH LORD, THAT'S NOT WHAT I'VE BEEN TAUGHT IN UNIVERSITY
        doc = declaration_html.replace(
            "</div></div></div><header><h2>", "</div></div><header><h2>"
        )
        # MY ASS IS ON FIRE
        doc = re.sub(r"</table>\s*<header>", "</table></div><header>", doc)

        companies = self._all_companies()
        codes = [c.lstrip("0") for c in companies if c.isdigit() and 4 < len(c) < 9]
        for c in codes:
            if c:
                full_code = c.rjust(8, "0")
                doc = re.sub(
                    r"\b0*{}\b".format(c),
                    ' <a href="https://ring.org.ua/edr/uk/company/{}" target="_blank">{}</a>'.format(
                        full_code, full_code
                    ),
                    doc,
                )
        return doc

    def prepare_translations(self, language):
        if language == "en":
            self.translator = HTMLTranslator(
                self.raw_html(),
                "h2 span, legend i, label strong, label, th, header, span.block b, b, p, span, td",
            )

    def raw_en_html(self):
        assert hasattr(self, "translator"), "You should call prepare_translations first"
        return self.translator.get_translated_html()

    af_paths = [
        jmespath.compile("step_7.*.emitent_ua_company_code"),
        jmespath.compile("step_7.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_8.*.corporate_rights_company_code"),
        jmespath.compile("step_8.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_9.*.beneficial_owner_company_code"),
    ]

    def _is_change_form(self):
        return self.intro.doc_type and self.intro.doc_type == "Форма змін"

    def _affiliated_companies(self, src=None):
        # For now
        if self._is_change_form():
            return []

        results = []
        if src is None:
            src = self.nacp_orig.to_dict()

        for path in self.af_paths:
            results += path.search(src) or []

        return set(filter(None, results))

    rl_paths = {
        "step_11": jmespath.compile("step_11.*"),
        "step_12": jmespath.compile("step_12.*"),
    }

    def _related_companies(self, src=None):
        # For now
        if self._is_change_form():
            return []

        results = []
        if src is None:
            src = self.nacp_orig.to_dict()

        for section in self.rl_paths["step_11"].search(src) or []:
            try:
                section = section or {}
                obj_type = section.get("objectType", "").lower()
                other_obj_type = section.get("otherObjectType", "").lower()
                if obj_type in INCOME_TYPES or other_obj_type in INCOME_TYPES:
                    results += [section.get("source_ua_company_code", "")]
            except AttributeError:
                pass

        for section in self.rl_paths["step_12"].search(src) or []:
            try:
                section = section or {}
                obj_type = section.get("objectType", "").lower()
                if obj_type in MONETARY_ASSETS_TYPES:
                    results += [section.get("organization_ua_company_code", "")]
            except AttributeError:
                pass

        return set(filter(None, results))

    ac_paths = [
        jmespath.compile("step_2.*.source_ua_company_code[]"),
        jmespath.compile("step_3.*.beneficial_owner_company_code[]"),
        jmespath.compile("step_3.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_4.*.addition_company_code[]"),
        jmespath.compile("step_4.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_4.undefined.rights[].*.ua_company_code[]"),
        jmespath.compile("step_5.*.emitent_ua_company_code[]"),
        jmespath.compile("step_5.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_6.*.corporate_rights_company_code[]"),
        jmespath.compile("step_6.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_10.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_11.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_11.*.rights[].*.ua_company_name[]"),
        jmespath.compile("step_12.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_13.*.emitent_ua_company_code[]"),
        jmespath.compile("step_13.*.emitent_ua_company_name[]"),
        jmespath.compile("step_13.*.guarantor[].*.guarantor_ua_company_code[]"),
        jmespath.compile(
            "step_13.*.guarantor_realty[].*.realty_rights_ua_company_code[]"
        ),
        jmespath.compile("step_15.*.emitent_ua_company_code[]"),
        jmespath.compile("step_16.org.*.reestrCode[]"),
        jmespath.compile("step_16.part_org.*.reestrCode[]"),
        jmespath.compile("step_7.*.emitent_ua_company_code"),
        jmespath.compile("step_7.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_8.*.corporate_rights_company_code"),
        jmespath.compile("step_8.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_9.*.beneficial_owner_company_code"),
        jmespath.compile("step_11.*.source_ua_company_code"),
        jmespath.compile("step_12.*.organization_ua_company_code"),
    ]

    def _all_companies(self, src=None):
        # For now
        if self._is_change_form():
            return []

        results = []
        if src is None:
            src = self.nacp_orig.to_dict()

        for path in self.ac_paths:
            results += path.search(src) or []

        return set(filter(None, results))

    def related_companies(self, affiliated_only=True):
        """Prepares data to use with the procurement dataset."""
        src = self.nacp_orig.to_dict()

        res = self._affiliated_companies(src)
        if not affiliated_only:
            res |= self._related_companies(src)

        res = filter(None, map(lambda x: x.strip().lstrip("0"), set(res)))
        return list(set(res) - BANK_EDRPOUS)

    def get_procurement_earnings_by_year(self, affiliated_only=True):
        # Safety valve against transactions with malformed dates
        next_year_dt = date(date.today().year + 1, 1, 1)

        return (
            Transactions.objects.select_related("seller")
            .filter(
                seller__code__in=self.related_companies(affiliated_only),
                date__lt=next_year_dt,
            )
            .annotate(year=ExtractYear("date"))
            .values("year")
            .annotate(count=Count("pk"), sum_uah=Sum("volume_uah"))
        )

    def get_procurement_earnings_by_company(self, affiliated_only=True):
        # Safety valve against transactions with malformed dates
        next_year_dt = date(date.today().year + 1, 1, 1)

        return (
            Transactions.objects.select_related("seller")
            .filter(
                seller__code__in=self.related_companies(affiliated_only),
                date__lt=next_year_dt,
            )
            .values("seller__code", "seller__pk", "seller__name")
            .annotate(count=Count("pk"), sum_uah=Sum("volume_uah"))
        )

    def infocard(self):
        return {
            "first_name": self.general.name,
            "patronymic": self.general.patronymic,
            "last_name": self.general.last_name,
            "office": self.general.post.office,
            "position": self.general.post.post,
            "source": self.declaration.source,
            "id": self.meta.id,
            "url": settings.SITE_URL
            + reverse("details", kwargs={"declaration_id": self.meta.id}),
            "document_type": self.intro.doc_type,
            "is_corrected": self.intro.corrected,
            "created_date": self.intro.date,
            "declaration_year": getattr(self.intro, "declaration_year"),
        }

    def raw_source(self):
        return {
            "url": "https://public-api.nazk.gov.ua/v1/declaration/%s"
            % self.meta.id.replace("nacp_", "")
        }

    def related_entities(self):
        src = self.nacp_orig.to_dict()
        owned_companies = self._affiliated_companies(src)
        related_companies = self._related_companies(src)
        all_companies = self._all_companies(src)

        return {
            "people": {"family": list(self.get_family_members())},
            "documents": {
                "corrected": list(getattr(self, "corrected_declarations", []) or []),
                "originals": list(getattr(self, "original_declarations", []) or []),
            },
            "companies": {
                "owned": list(owned_companies),
                "related": list(related_companies),
                "all": list(all_companies),
            },
        }

    def unified_source(self):
        return self.nacp_orig.to_dict()

    def aggregated_data(self):
        if hasattr(self, "aggregated"):
            return self.aggregated.to_dict()
        else:
            return {}

    class Meta:
        doc_type = "nacp_declaration_doctype"

class Declaration(DocType, AbstractDeclaration):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
        fields={"raw": Text(index=True)},
        term_vector="with_positions_offsets",
    )
    all = Text(analyzer="ukrainian")

    general = Object(
        properties={
            "full_name_suggest": Completion(preserve_separators=False),
            "full_name": Text(index=True, analyzer="ukrainian"),
            "full_name_for_sorting": Keyword(
                index=True, ignore_above=100
            ),  # only for sorting purposes
            "name": Text(index=True, analyzer="ukrainian"),
            "patronymic": Text(index=True, analyzer="ukrainian"),
            "last_name": Text(index=True, analyzer="ukrainian"),
            "family_raw": Text(index=True, analyzer="ukrainian"),
            "family": Nested(
                properties={
                    "name": Text(index=True, analyzer="ukrainian"),
                    "relations": Keyword(index=False),
                    "inn": Keyword(index=False),
                }
            ),
            "post_raw": Text(index=True, analyzer="ukrainian"),
            "post": Object(
                properties={
                    "region": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "office": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "post": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                }
            ),
            "addresses": Nested(
                properties={
                    "place": Text(index=False),
                    "place_hidden": Boolean(index=False),
                    "place_district": Text(index=False),
                    "place_district_hidden": Boolean(index=False),
                    "place_city": Text(index=False),
                    "place_city_hidden": Boolean(index=False),
                    "place_city_type": Keyword(index=False),
                    "place_city_type_hidden": Boolean(index=False),
                    "place_address": Text(index=False),
                    "place_address_hidden": Boolean(index=False),
                    "place_address_type": Keyword(index=False),
                }
            ),
        }
    )
    declaration = Object(
        properties={
            "date": NoneAwareDate(),
            "notfull": Boolean(index=False),
            "notfull_lostpages": Keyword(index=False),
            "additional_info": Boolean(index=False),
            "additional_info_text": Text(index=False),
            "needs_scancopy_check": Boolean(index=False),
        }
    )
    intro = Object(
        properties={
            "declaration_year": Keyword(index=True),
            "doc_type": Keyword(index=True),
            "date": NoneAwareDate(index=True),
        }
    )
    ft_src = Text(index=True, analyzer="ukrainian", copy_to="all")

    # concatenated from a set of fields for regular search (not deepsearch mode)
    index_card = Text(index=True, analyzer="ukrainian")

    INDEX_CARD_FIELDS = [
        "general.last_name",
        "general.name",
        "general.patronymic",
        "general.full_name",
        "general.post.post",
        "general.post.office",
        "general.post.region",
        "general.post.actual_region",
        "intro.declaration_year",
        "intro.doc_type",
        "declaration.source",
        "declaration.url",
    ]

    INCOME_SINGLE_PROPERTIES = {
        "value": Keyword(index=False),
        "value_unclear": Boolean(index=False),
        "comment": Text(index=False),
        "family": Keyword(index=False),
        "family_unclear": Boolean(index=False),
        "family_comment": Text(index=False),
    }
    INCOME_LIST_PROPERTIES = {
        "country": Keyword(index=False),
        "country_comment": Text(index=False),
        "cur": Keyword(index=False),
        "cur_units": Keyword(index=False),
        "uah_equal": Keyword(index=False),
    }
    income = Object(
        properties={
            "5": Object(properties=INCOME_SINGLE_PROPERTIES),
            "6": Object(properties=INCOME_SINGLE_PROPERTIES),
            "7": Object(properties=INCOME_SINGLE_PROPERTIES),
            "8": Object(properties=INCOME_SINGLE_PROPERTIES),
            "9": Object(properties=INCOME_SINGLE_PROPERTIES),
            "10": Object(properties=INCOME_SINGLE_PROPERTIES),
            "11": Object(properties=INCOME_SINGLE_PROPERTIES),
            "12": Object(properties=INCOME_SINGLE_PROPERTIES),
            "13": Object(properties=INCOME_SINGLE_PROPERTIES),
            "14": Object(properties=INCOME_SINGLE_PROPERTIES),
            "15": Object(properties=INCOME_SINGLE_PROPERTIES),
            "16": Object(properties=INCOME_SINGLE_PROPERTIES),
            "17": Object(properties=INCOME_SINGLE_PROPERTIES),
            "18": Object(properties=INCOME_SINGLE_PROPERTIES),
            "19": Object(properties=INCOME_SINGLE_PROPERTIES),
            "20": Object(properties=INCOME_SINGLE_PROPERTIES),
            "21": Nested(properties=INCOME_LIST_PROPERTIES),
            "22": Nested(properties=INCOME_LIST_PROPERTIES),
        }
    )

    ESTATE_PROPERTIES = {
        "region": Text(index=False),
        "address": Text(index=False),
        "space": Keyword(index=False),
        "space_units": Keyword(index=False),
        "space_comment": Text(index=False),
        "costs": Keyword(index=False),
        "costs_comment": Text(index=False),
        "costs_rent": Keyword(index=False),
        "costs_rent_comment": Text(index=False),
        "costs_property": Keyword(index=False),
        "costs_property_comment": Text(index=False),
    }
    estate = Object(
        properties={
            "23": Nested(properties=ESTATE_PROPERTIES),
            "24": Nested(properties=ESTATE_PROPERTIES),
            "25": Nested(properties=ESTATE_PROPERTIES),
            "26": Nested(properties=ESTATE_PROPERTIES),
            "27": Nested(properties=ESTATE_PROPERTIES),
            "28": Nested(properties=ESTATE_PROPERTIES),
            "29": Nested(properties=ESTATE_PROPERTIES),
            "30": Nested(properties=ESTATE_PROPERTIES),
            "31": Nested(properties=ESTATE_PROPERTIES),
            "32": Nested(properties=ESTATE_PROPERTIES),
            "33": Nested(properties=ESTATE_PROPERTIES),
            "34": Nested(properties=ESTATE_PROPERTIES),
        }
    )

    VEHICLE_PROPERTIES = {
        "brand": Text(index=False),
        "brand_info": Text(index=False),
        "year": Keyword(index=False),
        "sum": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_rent": Keyword(index=False),
        "sum_rent_comment": Text(index=False),
        "brand_hidden": Boolean(index=False),
        "brand_info_hidden": Boolean(index=False),
        "brand_info_unclear": Boolean(index=False),
    }
    vehicle = Object(
        properties={
            "35": Nested(properties=VEHICLE_PROPERTIES),
            "36": Nested(properties=VEHICLE_PROPERTIES),
            "37": Nested(properties=VEHICLE_PROPERTIES),
            "38": Nested(properties=VEHICLE_PROPERTIES),
            "39": Nested(properties=VEHICLE_PROPERTIES),
            "40": Nested(properties=VEHICLE_PROPERTIES),
            "41": Nested(properties=VEHICLE_PROPERTIES),
            "42": Nested(properties=VEHICLE_PROPERTIES),
            "43": Nested(properties=VEHICLE_PROPERTIES),
            "44": Nested(properties=VEHICLE_PROPERTIES),
        }
    )

    BANKS_PROPERTIES = {
        "sum": Keyword(index=False),
        "sum_hidden": Boolean(index=False),
        "sum_units": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_foreign": Keyword(index=False),
        "sum_foreign_units": Keyword(index=False),
        "sum_foreign_comment": Text(index=False),
    }
    banks = Object(
        properties={
            "45": Nested(properties=BANKS_PROPERTIES),
            "46": Nested(properties=BANKS_PROPERTIES),
            "47": Nested(properties=BANKS_PROPERTIES),
            "48": Nested(properties=BANKS_PROPERTIES),
            "49": Nested(properties=BANKS_PROPERTIES),
            "50": Nested(properties=BANKS_PROPERTIES),
            "51": Nested(properties=BANKS_PROPERTIES),
            "52": Nested(properties=BANKS_PROPERTIES),
            "53": Nested(properties=BANKS_PROPERTIES),
        }
    )

    LIABILITIES_PROPERTIES = {
        "sum": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_units": Keyword(index=False),
        "sum_foreign": Keyword(index=False),
        "sum_foreign_comment": Text(index=False),
    }
    liabilities = Object(
        properties={
            "54": Nested(properties=LIABILITIES_PROPERTIES),
            "55": Nested(properties=LIABILITIES_PROPERTIES),
            "56": Nested(properties=LIABILITIES_PROPERTIES),
            "57": Nested(properties=LIABILITIES_PROPERTIES),
            "58": Nested(properties=LIABILITIES_PROPERTIES),
            "59": Nested(properties=LIABILITIES_PROPERTIES),
            "60": Nested(properties=LIABILITIES_PROPERTIES),
            "61": Nested(properties=LIABILITIES_PROPERTIES),
            "62": Nested(properties=LIABILITIES_PROPERTIES),
            "63": Nested(properties=LIABILITIES_PROPERTIES),
            "64": Nested(properties=LIABILITIES_PROPERTIES),
        }
    )

    def raw_source(self):
        src = self.to_dict()
        return blacklist(src, ["ft_src", "index_card"])

    def infocard(self):
        return {
            "first_name": self.general.name,
            "patronymic": self.general.patronymic,
            "last_name": self.general.last_name,
            "office": self.general.post.office,
            "position": self.general.post.post,
            "source": getattr(self.declaration, "source", getattr(self, "source", "")),
            "id": self.meta.id,
            "url": settings.SITE_URL
            + reverse("details", kwargs={"declaration_id": self.meta.id}),
            "document_type": self.intro.doc_type,
            "is_corrected": False,
            "declaration_year": getattr(self.intro, "declaration_year"),
            "created_date": getattr(
                self.intro, "date", getattr(self.declaration, "date", "")
            ),
        }

    def related_entities(self):
        return {
            "people": {"family": list(self.get_family_members())},
            "documents": {"corrected": [], "originals": []},
            "companies": {"owned": [], "related": [], "all": []},
        }

    def unified_source(self):
        try:
            doc = self.to_dict()
            doc["id"] = self.meta.id
            converter = PaperToNACPConverter(doc)
            return converter.convert()
        except ConverterError:
            return None

    def _is_change_form(self):
        return False

    def aggregated_data(self):
        return self.aggregated

    # Temporary solution to provide enough aggregated data
    # to make it possible to compare old and new declarations
    # TODO: REPLACE ME
    @property
    def aggregated(self):
        if hasattr(self, "_aggregated"):
            return self._aggregated

        def to_float(doc, key):
            try:
                return float(str(getattr(doc, key, "0") or "0").replace(",", "."))
            except ValueError:
                return 0.0

        def get_exchange_rate(year, curr):
            rates = {
                "2011": {"USD": 7.98, "EUR": 10.29, "RUB": 0.250},  # as of 2011/12/30
                "2012": {"USD": 7.99, "EUR": 10.53, "RUB": 0.263},  # as of 2012/12/29
                "2013": {"USD": 7.99, "EUR": 11.04, "RUB": 0.244},  # as of 2013/12/30
                "2014": {"USD": 15.76, "EUR": 19.23, "RUB": 0.303},  # as of 2014/12/29
                "2015": {"USD": 24.00, "EUR": 26.22, "RUB": 0.329},  # as of 2015/12/31
                "2016": {  # as of 2016/12/31
                    "USD": 27.1908,
                    "EUR": 28.4226,
                    "RUB": 0.4511,
                },
                "2017": {  # as of 2017/12/31
                    "USD": 28.0672,
                    "EUR": 33.4954,
                    "RUB": 0.4870,
                },
            }

            if year not in rates:
                return None
            if curr not in rates[year]:
                return None

            return rates[year][curr]

        def to_space(space):
            areas_koef = {"га": 10000, "cоток": 100, "соток": 100, "м²": 1}
            units = getattr(space, "space_units", "")
            return to_float(space, "space") * areas_koef.get(units, 1)

        resp = {
            "incomes.presents.all": 0,
            "incomes.family": 0,
            "incomes.declarant": 0,
            "assets.cash.total": 0,
            "assets.family": 0,
            "assets.declarant": 0,
            "incomes.total": 0,
            "assets.total": 0,
            "expenses.total": 0,
            "liabilities.total": 0,
            "estate.family_land": 0,
            "estate.declarant_land": 0,
            "estate.family_other": 0,
            "estate.declarant_other": 0,
            "vehicles.all_names": "",
        }

        if hasattr(self, "income"):
            resp["incomes.declarant"] = to_float(self.income["5"], "value")
            resp["incomes.family"] = to_float(self.income["5"], "family")
            resp["incomes.presents.all"] = to_float(
                self.income["11"], "value"
            ) + to_float(self.income["11"], "family")
            resp["incomes.total"] = resp["incomes.declarant"] + resp["incomes.family"]

        if hasattr(self, "liabilities"):
            for field in [
                "54", "55", "56", "57", "58", "59", "60", "61", "62", "63", "64",
            ]:
                if hasattr(self.liabilities, field):
                    resp["liabilities.total"] += to_float(
                        getattr(self.liabilities, field), "sum"
                    )

        if hasattr(self, "banks"):
            for d_key, k in (("45", "declarant"), ("51", "family")):
                for a in getattr(self.banks, d_key, []):
                    try:
                        currency = getattr(a, "sum_units", "UAH") or "UAH"
                        amount = to_float(a, "sum")
                        if currency == "грн":
                            currency = "UAH"
                        if currency != "UAH":
                            rate = get_exchange_rate(
                                str(self.intro.declaration_year), currency
                            )
                            if rate is None:
                                continue
                            amount *= rate
                        resp["assets.{}".format(k)] += amount
                    except ValueError:
                        continue

        resp["assets.total"] = resp["assets.family"] + resp["assets.declarant"]

        vehicles = []
        if hasattr(self, "vehicle"):
            for field in [
                "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44",
            ]:
                car_infos = getattr(self.vehicle, field, [])
                for car_info in car_infos:
                    vehicles.append(
                        "{} {}".format(
                            car_info["brand"], car_info["brand_info"]
                        ).replace(";", "")
                    )

        resp["vehicles.all_names"] += ";".join(vehicles)

        if hasattr(self, "estate"):
            for d_key, k in (
                ("24", "declarant_other"),
                ("30", "family_other"),
                ("25", "declarant_other"),
                ("31", "family_other"),
                ("26", "declarant_other"),
                ("32", "family_other"),
                ("27", "declarant_other"),
                ("33", "family_other"),
                ("28", "declarant_other"),
                ("34", "family_other"),
            ):
                estate_infos = getattr(self.estate, d_key, [])
                for space in estate_infos:
                    resp["estate.{}".format(k)] += to_space(space)

            for d_key, k in (("23", "declarant_land"), ("29", "family_land")):
                estate_infos = getattr(self.estate, d_key, [])
                for space in estate_infos:
                    resp["estate.{}".format(k)] += to_space(space)

        self._aggregated = resp
        return resp

    class Meta:
        pass

class DatasetSearch(ModelSearchAdapter):
    model = Dataset
    fuzzy = True
    exclude_fields = ['spatial.geom', 'spatial.zones.geom']

    class Meta:
        doc_type = 'Dataset'

    title = String(analyzer=i18n_analyzer,
                   fields={'raw': String(index='not_analyzed')})
    description = String(analyzer=i18n_analyzer)
    license = String(index='not_analyzed')
    frequency = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    tags = String(index='not_analyzed',
                  fields={'i18n': String(index='not_analyzed')})
    badges = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    resources = Object(properties={
        'title': String(),
        'description': String(),
        'format': String(index='not_analyzed')
    })
    format_suggest = Completion(analyzer=simple,
                                search_analyzer=simple,
                                payloads=False)
    dataset_suggest = Completion(analyzer=simple,
                                 search_analyzer=simple,
                                 payloads=True)
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = metrics_mapping_for(Dataset)
    featured = Boolean()
    temporal_coverage = Nested(multi=False, properties={
        'start': Long(),
        'end': Long()
    })
    temporal_weight = Long()
    geozones = Object(properties={
        'id': String(index='not_analyzed'),
        'name': String(index='not_analyzed'),
        'keys': String(index='not_analyzed')
    })
    granularity = String(index='not_analyzed')
    spatial_weight = Long()
    from_certified = Boolean()

    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'acronym^7',
        'title^6',
        'tags.i18n^3',
        'description',
    )
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'reuses': 'metrics.reuses',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }
    facets = {
        'tag': TermsFacet(field='tags'),
        'badge': TermsFacet(field='badges', labelizer=dataset_badge_labelizer),
        'organization': ModelTermsFacet(field='organization',
                                        model=Organization),
        'owner': ModelTermsFacet(field='owner', model=User),
        'license': ModelTermsFacet(field='license', model=License),
        'geozone': ModelTermsFacet(field='geozones.id', model=GeoZone,
                                   labelizer=zone_labelizer),
        'granularity': TermsFacet(field='granularity',
                                  labelizer=granularity_labelizer),
        'format': TermsFacet(field='resources.format'),
        'reuses': RangeFacet(field='metrics.reuses',
                             ranges=[('none', (None, 1)),
                                     ('few', (1, 5)),
                                     ('quite', (5, 10)),
                                     ('many', (10, None))],
                             labels={
                                 'none': _('Never reused'),
                                 'few': _('Little reused'),
                                 'quite': _('Quite reused'),
                                 'many': _('Heavily reused'),
                             }),
        'temporal_coverage': TemporalCoverageFacet(field='temporal_coverage'),
        'featured': BoolFacet(field='featured'),
    }
    boosters = [
        BoolBooster('featured', 1.5),
        BoolBooster('from_certified', 1.2),
        ValueFactor('spatial_weight', missing=1),
        ValueFactor('temporal_weight', missing=1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay('metrics.followers', max_followers, max_followers, decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        return (dataset.deleted is None
                and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def get_suggest_weight(cls, temporal_weight, spatial_weight, featured):
        '''Compute the suggest part of the indexation payload'''
        featured_weight = 1 if not featured else FEATURED_WEIGHT
        return temporal_weight * spatial_weight * featured_weight

    @classmethod
    def serialize(cls, dataset):
        organization = None
        owner = None
        image_url = None
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        certified = organization and organization.certified
        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': getattr(dataset.license, 'id', None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest': [r.format.lower()
                               for r in dataset.resources
                               if r.format],
            'frequency': dataset.frequency,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': dataset.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            'metrics': dataset.metrics,
            'featured': dataset.featured,
            'from_certified': certified,
        }

        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {'start': start, 'end': end},
                'temporal_weight': temporal_weight,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parent zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': p} for p in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })

        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document

class ReuseSearch(ModelSearchAdapter):
    model = Reuse
    fuzzy = True

    class Meta:
        doc_type = 'Reuse'

    title = String(analyzer=i18n_analyzer,
                   fields={'raw': String(index='not_analyzed')})
    description = String(analyzer=i18n_analyzer)
    url = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    type = String(index='not_analyzed')
    tags = String(index='not_analyzed',
                  fields={'i18n': String(index='not_analyzed')})
    badges = String(index='not_analyzed')
    topic = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    datasets = Object(properties={
        'id': String(index='not_analyzed'),
        'title': String(),
    })
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = Reuse.__search_metrics__
    featured = Boolean()
    reuse_suggest = Completion(analyzer=simple,
                               search_analyzer=simple,
                               payloads=True)
    extras = Object()

    facets = {
        'tag': TermsFacet(field='tags'),
        'organization': ModelTermsFacet(field='organization',
                                        model=Organization),
        'owner': ModelTermsFacet(field='owner', model=User),
        'dataset': ModelTermsFacet(field='dataset.id', model=Dataset),
        'type': TermsFacet(field='type', labelizer=reuse_type_labelizer),
        'datasets': RangeFacet(field='metrics.datasets',
                               ranges=[('none', (None, 1)),
                                       ('few', (1, 5)),
                                       ('many', (5, None))],
                               labels={
                                   'none': _('No datasets'),
                                   'few': _('Few datasets'),
                                   'many': _('Many datasets'),
                               }),
        'followers': RangeFacet(field='metrics.followers',
                                ranges=[('none', (None, 1)),
                                        ('few', (1, 5)),
                                        ('many', (5, None))],
                                labels={
                                    'none': _('No followers'),
                                    'few': _('Few followers'),
                                    'many': _('Many followers'),
                                }),
        'badge': TermsFacet(field='badges', labelizer=reuse_badge_labelizer),
        'featured': BoolFacet(field='featured'),
        'topic': TermsFacet(field='topic', labelizer=reuse_topic_labelizer),
    }
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'datasets': 'metrics.datasets',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }
    boosters = [
        BoolBooster('featured', lazy('featured_boost')),
        GaussDecay('metrics.datasets', max_datasets,
                   decay=lazy('datasets_decay')),
        GaussDecay('metrics.followers', max_followers,
                   decay=lazy('followers_decay')),
    ]

    @classmethod
    def is_indexable(cls, reuse):
        return (reuse.deleted is None
                and len(reuse.datasets) > 0
                and not reuse.private)

    @classmethod
    def serialize(cls, reuse):
        """By default use the ``to_dict`` method
        and exclude ``_id``, ``_cls`` and ``owner`` fields.
        """
        datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
        datasets = list(datasets.only('id', 'title').no_dereference())
        organization = None
        owner = None
        if reuse.organization:
            organization = Organization.objects(
                id=reuse.organization.id).first()
        elif reuse.owner:
            owner = User.objects(id=reuse.owner.id).first()

        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'type': reuse.type,
            'topic': reuse.topic,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': to_iso_datetime(reuse.created_at),
            'last_modified': to_iso_datetime(reuse.last_modified),
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in datasets],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(500, external=True),
                },
            },
        }