Example #1
class IndeedType(Document):
    # autocomplete for search suggestions
    suggest = Completion(analyzer=ik_analyzer)  # in theory this works; see the CustomAnalyzer sketch after Example #3
    # field data types to store in Elasticsearch
    job_title = Text(analyzer="ik_max_word")
    job_location = Text(analyzer="ik_max_word")
    job_summary = Text(analyzer="ik_max_word")
    job_salary = Text(analyzer="ik_max_word")
    company_name = Text(analyzer="ik_max_word")
    job_href = Text(analyzer="ik_max_word")
    job_star = Text(analyzer="ik_max_word")
    job_review = Text(analyzer="ik_max_word")

    class Meta:
        index = "indeed"
        doc_type = "data_science"

    class Index:
        name = "indeed"
        doc_type = "data_science"

    # Display cluster health
    print(connections.get_connection().cluster.health())
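Note: connections.get_connection() above presumes a default connection was registered at startup; a minimal sketch of that setup (the host is an assumption):

from elasticsearch_dsl.connections import connections

# register the default connection once, before any Document class is used
connections.create_connection(hosts=["localhost"])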
Example #2
class ChengdeType(DocType):
    # Chengde rental-listing type
    # these fields are tokenized at search time
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    # these fields are not tokenized
    url = Keyword()
    url_object_id = Keyword()
    rent_type = Keyword()
    price = Integer()
    hourse_type = Keyword()
    area = Text(analyzer="ik_max_word")
    community = Text(analyzer="ik_max_word")
    detail = Text(analyzer="ik_max_word")
    telephone = Keyword()
    """
    liuli1的
    content是keyword
    """
    # target index and doc_type for saving
    class Meta:
        index = "zufang"
        doc_type = "chengde58"
Example #3
class ArticleType(DocType):
    # Jobbole article type
    # search-suggestion autocomplete: set the Completion type; a custom CustomAnalyzer is currently needed to avoid an error (see the sketch after this example)
    suggest = Completion(analyzer=ik_analyzer)
    # tokenized at search time
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    # these fields are not tokenized
    url = Keyword()
    url_object_id = Keyword()
    comment_nums = Integer()
    average_score = Keyword()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")
    """
    liuli1的
    content是keyword
    """

    # target index and doc_type for saving
    class Meta:
        index = "liuli1"
        doc_type = "article"
Example #4
class LaGou(DocType):
    # Lagou job-posting type
    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    salary = Text(analyzer="ik_max_word")
    job_city = Text(analyzer="ik_max_word")
    work_years = Text(analyzer="ik_max_word")
    degree_need = Text(analyzer="ik_max_word")
    job_type = Text(analyzer="ik_max_word")
    publish_time = Date()
    tags = Text(analyzer="ik_max_word")
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_url = Keyword()
    company_name = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "lagoujob"
Example #5
class LagouType(DocType):
    # Lagou job type
    suggest = Completion(analyzer='ik_max_word')
    title = Text(analyzer='ik_max_word')
    url = Keyword()
    url_object_id = Keyword()
    salary = Keyword()
    job_city = Keyword()
    work_years = Keyword()
    degree_need = Text(analyzer='ik_max_word')
    job_type = Text(analyzer='ik_max_word')
    publish_time = Keyword()
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Keyword()
    company_name = Text(analyzer='ik_max_word')
    company_url = Keyword()
    tags = Text(analyzer='ik_max_word')
    crawl_time = Date()

    class Meta:
        index = 'lagou'
        doc_type = 'possion'
Example #6
class LagouJobIndex(Document):
    suggest = Completion(analyzer=my_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Integer()
    salary_max = Integer()
    job_city = Keyword()
    work_years_min = Integer()
    work_years_max = Integer()
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_smart")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Index:
        name = 'lagou_job'
Example #7
class Lagou(Document):
    suggest = Completion(analyzer='ik_max_word')
    title = Text(analyzer='ik_max_word')
    url = Keyword()
    url_object_id = Keyword()
    salary = Keyword()
    job_city = Text(analyzer='ik_max_word')
    work_years = Keyword()
    degree_need = Keyword()
    job_type = Text(analyzer='ik_max_word')
    publish_time = Keyword()
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_address = Text(analyzer='ik_max_word')
    company_url = Keyword()
    company_name = Text(analyzer='ik_max_word')
    crawl_time = Date()
    crawl_update_time = Date()
    tags = Text(analyzer='ik_max_word')

    class Index:
        name = 'lagou'
        settings = {"number_of_shards": 2, "number_of_replicas": 0}
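A hedged usage sketch for these Document subclasses (host, id, and field values are assumptions): init() creates the index with the mapping and settings defined above, and save() indexes an instance.

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])  # assumed host

Lagou.init()  # create the 'lagou' index with the mapping and settings above
job = Lagou(meta={"id": "job-1"}, title="Python 工程师", salary="15k-25k")
job.save()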
Example #8
class ZhiHuQuestionType(Document):
    """ 知乎问题 """

    suggest = Completion(analyzer=my_analyzer)
    # Zhihu question item
    zhihu_id = Keyword()
    topics = Text(analyzer="ik_max_word")
    url = Keyword()
    title = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")
    answer_num = Integer()
    comments_num = Integer()
    watch_user_num = Integer()
    click_num = Integer()
    crawl_time = Date()

    # defines the corresponding index in ES
    class Index:
        name = 'zhihu'
        doc_type = "question"

    class Meta:
        doc_type = "question"
Example #9
class Lagou(DocType):

    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_obj_id = Keyword()
    title = Text(analyzer='ik_max_word')
    min_salary = Integer()
    max_salary = Integer()
    min_work_year = Integer()
    max_work_year = Integer()
    job_city = Keyword()
    degree_need = Keyword()
    job_type = Keyword()
    publish_time = Date()
    tags = Text(analyzer='ik_max_word')
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Text(analyzer='ik_max_word')
    company_name = Text(analyzer='ik_max_word')

    class Meta:
        index = 'lagou'
        doc_type = 'job'
Example #10
File: models.py  Project: MrHjt/down
class LagouItem(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer='ik_max_word')
    salary = Keyword()
    job_city = Keyword()
    work_years = Keyword()
    degree_need = Keyword()
    job_type = Keyword()
    publish_time = Date()
    job_advantags = Text(analyzer='ik_max_word')
    job_dec = Text(analyzer='ik_max_word')
    job_addr = Keyword()
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer='ik_max_word')
    crawl_time = Date()
    crawl_update_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "job"
Example #11
class ArticType(DocType):  # inherits the customized base class
    suggest = Completion(analyzer=ik_analyzer)  # search-suggestion autocomplete

    title = Text(analyzer="ik_max_word")
    create_date = Date()
    link_url = Keyword()  # not analyzed
    url_object_id = Keyword()
    front_image_url = Keyword()  # Keyword is a plain string field type; not tokenized
    front_image_path = Keyword()
    # number of likes
    praise_num = Integer()
    # number of comments
    comment_num = Integer()
    # number of favorites
    fav_num = Integer()
    # tags
    tags = Text(analyzer="ik_max_word")  # Text is tokenized and builds an inverted index
    # content
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"  #设置索引名称
        doc_type = 'article'  #设置表名称
Example #12
class ArticleType(Document):
    # Jobbole article type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        #     name = "jobbole"
        index = "jobbole"
        doc_type = "article"

    class Index:
        name = 'jobbole'
        doc_type = 'article'
Example #13
class LagouType(DocType):
    # Lagou job type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word')
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    salary = Text(analyzer='ik_max_word')
    job_city = Text(analyzer='ik_max_word')
    work_years = Text(analyzer='ik_max_word')
    degree_need = Text(analyzer='ik_max_word')
    job_type = Text(analyzer='ik_max_word')
    tags = Text(analyzer='ik_max_word')
    publish_time = Text(analyzer='ik_max_word')
    job_advantage = Text(analyzer='ik_max_word')
    job_desc = Text(analyzer='ik_max_word')
    job_addr = Text(analyzer='ik_max_word')
    company_name = Text(analyzer='ik_max_word')
    company_url = Keyword()

    class Meta:
        index = "lagou2_linux"
        doc_type = 'lagou'
Example #14
class Lagou(DocType):
    # Lagou job information
    title_suggest = Completion(analyzer=ik_analyzer, search_analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word', search_analyzer="ik_max_word", fields={'title': Keyword()})
    id = Text()
    url = Text()
    salary = Text()
    job_city = Text()
    work_years = Text()
    degree_need = Text()
    job_type = Text()
    publish_time = Text()
    job_advantage = Text()
    job_desc = Text()
    job_addr = Text()
    company_name = Text()
    company_url = Text()
    tags = Text(analyzer='ik_max_word', fields={'tags': Keyword()})
    crawl_time = Date()

    class Meta:
        index = 'jobbole'
        doc_type = 'lagou_job'
Example #15
class LagouJob(Document):
    # search suggestion
    suggestion = Completion(analyzer=analyzer('ik_smart'))
    job_id = Keyword()
    # job title
    title = Text(analyzer="ik_max_word")
    # url
    url = Keyword()
    # salary
    salary = FloatRange()
    # # salary lower bound
    # salary_min = Float()
    # work experience (years)
    work_years = FloatRange()
    # # minimum years
    # work_year_min = Integer()
    # education requirement
    degree_need = Float()
    # job nature: internship / part-time / full-time
    job_type = Keyword()
    # publish time
    publish_time = Date()
    # job perks
    job_advantage = Text(analyzer="ik_max_word")
    # job description
    job_desc = Text(analyzer="ik_max_word")
    # job city
    job_city = Keyword()
    # job address
    job_addr = Text(analyzer="ik_max_word")
    # company URL
    company_url = Keyword()
    # company name
    company_name = Keyword()

    class Index:
        name = 'a51job'
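The FloatRange fields above store lower and upper bounds rather than a single number; a hedged indexing sketch (values are assumptions, and a default connection is presumed registered):

job = LagouJob(job_id="1001", title="数据挖掘工程师",
               salary={"gte": 15000.0, "lte": 25000.0},   # gte/lte bounds for a range field
               work_years={"gte": 1.0, "lte": 3.0})
job.save()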
Example #16
class VideoType(Document):
    class Index:
        name = 'video'
        # settings = {
        #     "number_of_shards": 2,
        #     "number_of_replicas": 1
        # }

    # shared by articles and videos
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    source = Text(analyzer="ik_max_word")
    date = Date()
    # video-specific
    img_url = Keyword()
    suggest = Completion(analyzer=ik_analyzer)  # search suggestions

    def __init__(self, item):
        super(VideoType, self).__init__()
        self.assign(item)

    def assign(self, item):
        keys = ["url", "title", "source", "date", "url_object_id", "img_url"]
        for key in keys:
            try:
                item[key]
            except KeyError:
                item[key] = ""
        self.url = item["url"]
        self.title = item["title"]
        self.source = item["source"]
        self.date = item["date"]
        self.meta.id = item["url_object_id"]
        self.img_url = item["img_url"]
        self.suggest = gen_suggests(VideoType.Index.name,
                                    ((self.title, 10), (self.source, 2)))
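The gen_suggests helper called above is not included in the source; a minimal sketch of what it typically does (assuming elasticsearch-py 7.x and the ik analyzer), building weighted completion inputs and de-duplicating tokens across fields:

from elasticsearch_dsl.connections import connections

es = connections.get_connection()

def gen_suggests(index, info_tuple):
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # tokenize the text with the _analyze API
            words = es.indices.analyze(
                index=index, body={"analyzer": "ik_max_word", "text": text})
            analyzed = {t["token"] for t in words["tokens"] if len(t["token"]) > 1}
            new_words = analyzed - used_words
            used_words.update(analyzed)
        else:
            new_words = set()
        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests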
Example #17
class GameType(DocType):
    # the suggest field type is defined as completion
    # string types: text and keyword
    #   text: tokenized, stemmed, and indexed into an inverted index
    #   keyword: a plain string; only exact matches are searchable
    # numeric types: long, integer, short, byte, double, float
    # date type: date
    # boolean type: boolean
    # binary type: binary
    # complex types: object, nested
    # geo types: geo_point, geo_shape
    # specialized types: ip, completion
    url = Keyword()
    suggest = Completion(analyzer=ik_analyzer)
    # analyzer = "ik_max_word" 是以一种分词方式
    gameName = Text(analyzer="ik_max_word")
    # 动作啥的
    gameType = Keyword()
    # 开发商
    developer = Text(analyzer="ik_max_word")
    # 发行商
    publisher = Text(analyzer="ik_max_word")
    # 发售日期
    publishDate = Keyword()
    gameLanguage = Keyword()
    # 标签 魔幻 穿越啥的
    gameTitle = Text(analyzer="ik_max_word")
    gamePlatform = Keyword()
    # 游戏内容 对应故事背景
    gameContext = Text(analyzer="ik_max_word")

    class Meta:
        index = "test"
        doc_type = "game"
Example #18
class Person(DocType, RangeRelevantEntitiesMixin):
    """Person document."""

    full_name_suggest = Completion(preserve_separators=False)

    translated_first_name = TranslatedField("first_name", "first_name_en")
    translated_last_name = TranslatedField("last_name", "last_name_en")
    translated_patronymic = TranslatedField("patronymic", "patronymic_en")

    translated_last_workplace = TranslatedField("last_workplace",
                                                "last_workplace_en")
    translated_last_job_title = TranslatedField("last_job_title",
                                                "last_job_title_en")

    @classmethod
    @cached(timeout=25 * 60 * 60)
    def get_all_persons(cls):
        return [
            blacklist(
                p.to_dict(),
                [
                    "full_name_suggest_en",
                    "dob_details",
                    "dob",
                    "full_name_suggest",
                    "last_job_id",
                    "risk_category",
                    "photo_path",
                    "terminated",
                    "last_modified",
                    "inn",
                    "inn_source",
                    "passport",
                    "passport_source",
                ],
            ) for p in cls.search().scan()
        ]
Example #19
class VideoType(DocType):
    """
    学习视频类型
    """
    # 搜索建议的mapping设置
    suggest = Completion(analyzer=ik_analyzer)

    url_object_id = Keyword()
    url = Keyword()
    class_name = Text(analyzer='ik_max_word')
    price = Float()
    abstract = Text(analyzer='ik_max_word')
    data_source = Keyword()
    second_classify = Text(analyzer='ik_max_word')
    first_classify = Text(analyzer='ik_max_word')

    # recommend_score = Float()

    # initializes the index and doc_type names
    class Meta:
        # analogous to the database name in MySQL
        index = 'learning_video'
        # analogous to the table name in MySQL
        doc_type = 'video'
Example #20
class Film(Document):
    id = Keyword(required=True)
    imdb_id = Keyword()
    original_title = Text()
    characters = Nested(properties={'char_id': Keyword(), 'char_name': Text()})
    directors = Nested(properties={'director_id': Keyword(), 'director_name': Text()})
    original_language = Keyword()
    adult = Boolean()
    belongs_to_collection = Text()
    genres = Nested(properties={'genre': Keyword()})
    popularity = Float()
    release_date = Date()
    budget = Float()
    revenue = Float()
    runtime = Float()
    spoken_languages = Nested(properties={'spoken_language': Keyword()})
    poster_path = Keyword()
    vote_average = Float()
    vote_count = Integer()
    keywords = Nested(properties={'keyword': Keyword()})
    suggestion = Completion()

    class Index:
        name = ELASTIC_INDEX
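Querying a Completion field goes through the suggest API; a hedged sketch against this Film document (the suggester name and prefix are assumptions):

s = Film.search()
s = s.suggest("film_suggest", "matr", completion={"field": "suggestion"})
response = s.execute()
for option in response.suggest.film_suggest[0].options:
    print(option.text)  # each option is a matched suggestion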
Example #21
class ZhiHuAnswerType(Document):
    """ 知乎回答 """

    suggest = Completion(analyzer=my_analyzer)
    # Zhihu answer item
    zhihu_id = Keyword()
    url = Keyword()
    question_id = Keyword()
    author_id = Keyword()
    content = Text(analyzer="ik_max_word")
    praise_num = Integer()
    comments_num = Integer()
    create_time = Date()
    update_time = Date()
    crawl_time = Date()
    author_name = Keyword()

    # defines the corresponding index in ES
    class Index:
        name = 'zhihu'
        doc_type = "answer"

    class Meta:
        doc_type = "answer"
Example #22
class ArticleType(DocType):
    """
    技术文章类型,此处会影响搜索打分结果
    """
    # 搜索建议的mapping设置
    suggest = Completion(analyzer=ik_analyzer)

    url_object_id = Keyword()
    url = Keyword()
    title = Text(analyzer='ik_max_word')
    article_type = Text(analyzer='ik_max_word')
    data_source = Keyword()
    # reserved "hotness" field, composed of read, comment, like, and favorite counts
    hot_score = Integer()
    publish_time = Date()
    abstract = Text(analyzer='ik_max_word')
    tags = Text(analyzer='ik_max_word')

    # initializes the index and doc_type names
    class Meta:
        # analogous to the database name in MySQL
        index = 'technology_article'
        # analogous to the table name in MySQL
        doc_type = 'article'
Example #23
class DouBanIndex(Document):
    """
        :param: 电影id,电影名,别名,简介,链接,导演,演员,类型,上映时间
    """
    suggest = Completion(analyzer=my_analyzer)
    movie_id = Keyword()
    name = Text(analyzer="ik_max_word")
    alias = Text(analyzer="ik_smart")
    introduce = Text(analyzer="ik_smart")
    url = Keyword()
    directors = Keyword()
    rate = Float()
    casts = Keyword()
    type = Keyword()
    date = Keyword()

    class Index:
        name = "movie"

        # shard and replica settings
        settings = {
            "number_of_shards": 2,
            "number_of_replicas": 0
        }
Example #24
class LagouJobType(DocType):
    # Lagou job postings
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    salary = Keyword()
    job_city = Keyword()
    work_years = Keyword()
    degree_need = Keyword()
    job_type = Keyword()
    publish_time = Keyword()
    job_advantage = Keyword()
    job_desc = Keyword()
    job_addr = Keyword()
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "lagou"
        doc_type = "lagou"
Example #25
class ItemDoc(DocType):
    """
    Common superclass for Document, Audio, Video, Wiki, Maps, and Newspaper.
    Don't get index in the index server."""
    id = Text()
    title = Text(fields={'keyword': Keyword()})
    title_search = Text()
    title_suggest = Completion()
    abstract = Text()
    type = Text(fields={'keyword': Keyword()})
    education_levels = Text(multi=True, fields={'keyword': Keyword()})
    communities = Text(multi=True, fields={'keyword': Keyword()})
    collections = Text(multi=True, fields={'keyword': Keyword()})
    languages = Text(multi=True, fields={'keyword': Keyword()})
    description = Text()
    license_type = Text(fields={'keyword': Keyword()})
    year_of_available = Date()
    publication_year = Date()
    created_date = Date()
    updated_date = Date()
    author_list = Keyword(fields={'keyword': Keyword()}, multi=True)
    url = Text()
    view_count = Integer()
    published = Text(fields={'keyword': Keyword()})
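The {'keyword': Keyword()} sub-fields above keep an exact-match copy alongside the analyzed text, which enables term filters and aggregations; a hedged sketch (field values are assumptions):

s = ItemDoc.search()
s = s.filter("term", **{"languages.keyword": "English"})   # exact match on the raw sub-field
s.aggs.bucket("per_type", "terms", field="type.keyword")   # bucket counts by exact type
response = s.execute()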
Example #26
class LagouType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Integer()
    salary_max = Integer()
    job_city = Keyword()
    work_years_min = Integer()
    work_years_max = Integer()
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "job"
Example #27
class NACPDeclaration(DocType, AbstractDeclaration):
    """NACP Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
        fields={"raw": Text(index=True)},
        term_vector="with_positions_offsets",
    )

    all = Text(analyzer="ukrainian")

    general = Object(
        properties={
            "full_name_suggest": Completion(preserve_separators=False),
            "full_name": Text(index=True, analyzer="ukrainian"),
            "full_name_for_sorting": Keyword(
                index=True, ignore_above=100
            ),  # only for sorting purposes
            "name": Text(index=True, analyzer="ukrainian"),
            "patronymic": Text(index=True, analyzer="ukrainian"),
            "last_name": Text(index=True, analyzer="ukrainian"),
            "post": Object(
                properties={
                    "actual_region": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "region": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "office": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "post_type": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "post": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                }
            ),
        }
    )
    declaration = Object(properties={"date": NoneAwareDate()})
    estate = Object(
        properties={
            "region": Text(
                index=True, analyzer="ukrainian", fields={"raw": Keyword(index=True)}
            )
        }
    )
    intro = Object(
        properties={
            "declaration_year": Keyword(index=True),
            "declaration_year_to": NoneAwareDate(),
            "declaration_year_from": NoneAwareDate(),
            "doc_type": Keyword(index=True),
            "date": NoneAwareDate(index=True),
        }
    )

    ft_src = Text(index=True, analyzer="ukrainian", copy_to="all")
    nacp_orig = Object(include_in_all=False, enabled=False)

    # concatenated from a set of fields for regular search (not deepsearch mode)
    index_card = Text(index=True, analyzer="ukrainian")

    INDEX_CARD_FIELDS = [
        "general.last_name",
        "general.name",
        "general.patronymic",
        "general.full_name",
        "general.post.post",
        "general.post.office",
        "general.post.region",
        "general.post.actual_region",
        "intro.declaration_year",
        "intro.doc_type",
        "declaration.source",
        "declaration.url",
    ]

    def raw_html(self):
        fname = os.path.join(
            settings.NACP_DECLARATIONS_PATH,
            self.meta.id[5:7],
            os.path.basename(self.declaration.basename) + ".html",
        )

        try:
            with open(fname, "r") as fp:
                d = fp.read()
        except FileNotFoundError:
            return "<h2>Вибачте, декларація тимчасово відсутня, але ми вже працюємо над вирішенням проблеми</h2>"

        m = re.search(r"<\/style>(.*)</body>", d)
        declaration_html = m.group(1)

        # OH LORD, THAT'S NOT WHAT I'VE BEEN TAUGHT IN UNIVERSITY
        doc = declaration_html.replace(
            "</div></div></div><header><h2>", "</div></div><header><h2>"
        )
        # MY ASS IS ON FIRE
        doc = re.sub(r"</table>\s*<header>", "</table></div><header>", doc)

        companies = self._all_companies()

        codes = [c.lstrip("0") for c in companies if c.isdigit() and 4 < len(c) < 9]

        for c in codes:
            if c:
                full_code = c.rjust(8, "0")
                doc = re.sub(
                    r"\b0*{}\b".format(c),
                    ' <a href="https://ring.org.ua/edr/uk/company/{}" target="_blank">{}</a>'.format(
                        full_code, full_code
                    ),
                    doc,
                )

        return doc

    def prepare_translations(self, language):
        if language == "en":
            self.translator = HTMLTranslator(
                self.raw_html(),
                "h2 span, legend i, label strong, label, th, header, span.block b, b, p, span, td",
            )

    def raw_en_html(self):
        assert hasattr(self, "translator"), "You should call prepare_translations first"
        return self.translator.get_translated_html()

    af_paths = [
        jmespath.compile("step_7.*.emitent_ua_company_code"),
        jmespath.compile("step_7.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_8.*.corporate_rights_company_code"),
        jmespath.compile("step_8.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_9.*.beneficial_owner_company_code"),
    ]

    def _is_change_form(self):
        return self.intro.doc_type and self.intro.doc_type == "Форма змін"

    def _affiliated_companies(self, src=None):
        # For now
        if self._is_change_form():
            return []

        results = []
        if src is None:
            src = self.nacp_orig.to_dict()

        for path in self.af_paths:
            results += path.search(src) or []

        return set(filter(None, results))

    rl_paths = {
        "step_11": jmespath.compile("step_11.*"),
        "step_12": jmespath.compile("step_12.*"),
    }

    def _related_companies(self, src=None):
        # For now
        if self._is_change_form():
            return []

        results = []
        if src is None:
            src = self.nacp_orig.to_dict()

        for section in self.rl_paths["step_11"].search(src) or []:
            try:
                section = section or {}
                obj_type = section.get("objectType", "").lower()
                other_obj_type = section.get("otherObjectType", "").lower()

                if obj_type in INCOME_TYPES or other_obj_type in INCOME_TYPES:
                    results += [section.get("source_ua_company_code", "")]
            except AttributeError:
                pass

        for section in self.rl_paths["step_12"].search(src) or []:
            try:
                section = section or {}
                obj_type = section.get("objectType", "").lower()

                if obj_type in MONETARY_ASSETS_TYPES:
                    results += [section.get("organization_ua_company_code", "")]
            except AttributeError:
                pass

        return set(filter(None, results))

    ac_paths = [
        jmespath.compile("step_2.*.source_ua_company_code[]"),
        jmespath.compile("step_3.*.beneficial_owner_company_code[]"),
        jmespath.compile("step_3.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_4.*.addition_company_code[]"),
        jmespath.compile("step_4.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_4.undefined.rights[].*.ua_company_code[]"),
        jmespath.compile("step_5.*.emitent_ua_company_code[]"),
        jmespath.compile("step_5.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_6.*.corporate_rights_company_code[]"),
        jmespath.compile("step_6.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_10.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_11.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_11.*.rights[].*.ua_company_name[]"),
        jmespath.compile("step_12.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_13.*.emitent_ua_company_code[]"),
        jmespath.compile("step_13.*.emitent_ua_company_name[]"),
        jmespath.compile("step_13.*.guarantor[].*.guarantor_ua_company_code[]"),
        jmespath.compile(
            "step_13.*.guarantor_realty[].*.realty_rights_ua_company_code[]"
        ),
        jmespath.compile(
            "step_13.*.guarantor_realty[].*.realty_rights_ua_company_code[]"
        ),
        jmespath.compile("step_15.*.emitent_ua_company_code[]"),
        jmespath.compile("step_16.org.*.reestrCode[]"),
        jmespath.compile("step_16.part_org.*.reestrCode[]"),
        jmespath.compile("step_7.*.emitent_ua_company_code"),
        jmespath.compile("step_7.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_8.*.corporate_rights_company_code"),
        jmespath.compile("step_8.*.rights[].*.ua_company_code[]"),
        jmespath.compile("step_9.*.beneficial_owner_company_code"),
        jmespath.compile("step_11.*.source_ua_company_code"),
        jmespath.compile("step_12.*.organization_ua_company_code"),
    ]

    def _all_companies(self, src=None):
        # For now
        if self._is_change_form():
            return []

        results = []
        if src is None:
            src = self.nacp_orig.to_dict()

        for path in self.ac_paths:
            results += path.search(src) or []

        return set(filter(None, results))

    def related_companies(self, affiliated_only=True):
        """
        Prepares data to use with procurement dataset
        """
        src = self.nacp_orig.to_dict()

        res = self._affiliated_companies(src)
        if not affiliated_only:
            res += self._related_companies(src)

        res = filter(None, map(lambda x: x.strip().lstrip("0"), set(res)))

        return list(set(res) - BANK_EDRPOUS)

    def get_procurement_earnings_by_year(self, affiliated_only=True):
        # Safety valve against transactions with malformed dates
        next_year_dt = date(date.today().year + 1, 1, 1)

        return (
            Transactions.objects.select_related("seller")
            .filter(
                seller__code__in=self.related_companies(affiliated_only),
                date__lt=next_year_dt,
            )
            .annotate(year=ExtractYear("date"))
            .values("year")
            .annotate(count=Count("pk"), sum_uah=Sum("volume_uah"))
        )

    def get_procurement_earnings_by_company(self, affiliated_only=True):
        # Safety valve against transactions with malformed dates
        next_year_dt = date(date.today().year + 1, 1, 1)

        return (
            Transactions.objects.select_related("seller")
            .filter(
                seller__code__in=self.related_companies(affiliated_only),
                date__lt=next_year_dt,
            )
            .values("seller__code", "seller__pk", "seller__name")
            .annotate(count=Count("pk"), sum_uah=Sum("volume_uah"))
        )

    def infocard(self):
        return {
            "first_name": self.general.name,
            "patronymic": self.general.patronymic,
            "last_name": self.general.last_name,
            "office": self.general.post.office,
            "position": self.general.post.post,
            "source": self.declaration.source,
            "id": self.meta.id,
            "url": settings.SITE_URL
            + reverse("details", kwargs={"declaration_id": self.meta.id}),
            "document_type": self.intro.doc_type,
            "is_corrected": self.intro.corrected,
            "created_date": self.intro.date,
            "declaration_year": getattr(self.intro, "declaration_year"),
        }

    def raw_source(self):
        return {
            "url": "https://public-api.nazk.gov.ua/v1/declaration/%s"
            % self.meta.id.replace("nacp_", "")
        }

    def related_entities(self):
        src = self.nacp_orig.to_dict()
        owned_companies = self._affiliated_companies(src)
        related_companies = self._related_companies(src)
        all_companies = self._all_companies(src)

        return {
            "people": {"family": list(self.get_family_members())},
            "documents": {
                "corrected": list(getattr(self, "corrected_declarations", []) or []),
                "originals": list(getattr(self, "original_declarations", []) or []),
            },
            "companies": {
                "owned": list(owned_companies),
                "related": list(related_companies),
                "all": list(all_companies),
            },
        }

    def unified_source(self):
        return self.nacp_orig.to_dict()

    def aggregated_data(self):
        if hasattr(self, "aggregated"):
            return self.aggregated.to_dict()
        else:
            return {}

    class Meta:
        doc_type = "nacp_declaration_doctype"
Example #28
class Declaration(DocType, AbstractDeclaration):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""

    persons = Text(analyzer="ukrainian", copy_to="all")
    countries = Text(analyzer="ukrainian", copy_to="all")
    companies = Text(analyzer="ukrainian", copy_to="all")
    names_autocomplete = Text(
        analyzer="namesAutocompleteAnalyzer",
        search_analyzer="namesAutocompleteSearchAnalyzer",
        fields={"raw": Text(index=True)},
        term_vector="with_positions_offsets",
    )

    all = Text(analyzer="ukrainian")

    general = Object(
        properties={
            "full_name_suggest": Completion(preserve_separators=False),
            "full_name": Text(index=True, analyzer="ukrainian"),
            "full_name_for_sorting": Keyword(
                index=True, ignore_above=100
            ),  # only for sorting purposes
            "name": Text(index=True, analyzer="ukrainian"),
            "patronymic": Text(index=True, analyzer="ukrainian"),
            "last_name": Text(index=True, analyzer="ukrainian"),
            "family_raw": Text(index=True, analyzer="ukrainian"),
            "family": Nested(
                properties={
                    "name": Text(index=True, analyzer="ukrainian"),
                    "relations": Keyword(index=False),
                    "inn": Keyword(index=False),
                }
            ),
            "post_raw": Text(index=True, analyzer="ukrainian"),
            "post": Object(
                properties={
                    "region": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "office": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                    "post": Text(
                        index=True,
                        analyzer="ukrainian",
                        fields={"raw": Keyword(index=True)},
                    ),
                }
            ),
            "addresses": Nested(
                properties={
                    "place": Text(index=False),
                    "place_hidden": Boolean(index=False),
                    "place_district": Text(index=False),
                    "place_district_hidden": Boolean(index=False),
                    "place_city": Text(index=False),
                    "place_city_hidden": Boolean(index=False),
                    "place_city_type": Keyword(index=False),
                    "place_city_type_hidden": Boolean(index=False),
                    "place_address": Text(index=False),
                    "place_address_hidden": Boolean(index=False),
                    "place_address_type": Keyword(index=False),
                }
            ),
        }
    )
    declaration = Object(
        properties={
            "date": NoneAwareDate(),
            "notfull": Boolean(index=False),
            "notfull_lostpages": Keyword(index=False),
            "additional_info": Boolean(index=False),
            "additional_info_text": Text(index=False),
            "needs_scancopy_check": Boolean(index=False),
        }
    )
    intro = Object(
        properties={
            "declaration_year": Keyword(index=True),
            "doc_type": Keyword(index=True),
            "date": NoneAwareDate(index=True),
        }
    )
    ft_src = Text(index=True, analyzer="ukrainian", copy_to="all")

    # concatenated from a set of fields for regular search (not deepsearch mode)
    index_card = Text(index=True, analyzer="ukrainian")

    INDEX_CARD_FIELDS = [
        "general.last_name",
        "general.name",
        "general.patronymic",
        "general.full_name",
        "general.post.post",
        "general.post.office",
        "general.post.region",
        "general.post.actual_region",
        "intro.declaration_year",
        "intro.doc_type",
        "declaration.source",
        "declaration.url",
    ]

    INCOME_SINGLE_PROPERTIES = {
        "value": Keyword(index=False),
        "value_unclear": Boolean(index=False),
        "comment": Text(index=False),
        "family": Keyword(index=False),
        "family_unclear": Boolean(index=False),
        "family_comment": Text(index=False),
    }
    INCOME_LIST_PROPERTIES = {
        "country": Keyword(index=False),
        "country_comment": Text(index=False),
        "cur": Keyword(index=False),
        "cur_units": Keyword(index=False),
        "uah_equal": Keyword(index=False),
    }
    income = Object(
        properties={
            "5": Object(properties=INCOME_SINGLE_PROPERTIES),
            "6": Object(properties=INCOME_SINGLE_PROPERTIES),
            "7": Object(properties=INCOME_SINGLE_PROPERTIES),
            "8": Object(properties=INCOME_SINGLE_PROPERTIES),
            "9": Object(properties=INCOME_SINGLE_PROPERTIES),
            "10": Object(properties=INCOME_SINGLE_PROPERTIES),
            "11": Object(properties=INCOME_SINGLE_PROPERTIES),
            "12": Object(properties=INCOME_SINGLE_PROPERTIES),
            "13": Object(properties=INCOME_SINGLE_PROPERTIES),
            "14": Object(properties=INCOME_SINGLE_PROPERTIES),
            "15": Object(properties=INCOME_SINGLE_PROPERTIES),
            "16": Object(properties=INCOME_SINGLE_PROPERTIES),
            "17": Object(properties=INCOME_SINGLE_PROPERTIES),
            "18": Object(properties=INCOME_SINGLE_PROPERTIES),
            "19": Object(properties=INCOME_SINGLE_PROPERTIES),
            "20": Object(properties=INCOME_SINGLE_PROPERTIES),
            "21": Nested(properties=INCOME_LIST_PROPERTIES),
            "22": Nested(properties=INCOME_LIST_PROPERTIES),
        }
    )

    ESTATE_PROPERTIES = {
        "region": Text(index=False),
        "address": Text(index=False),
        "space": Keyword(index=False),
        "space_units": Keyword(index=False),
        "space_comment": Text(index=False),
        "costs": Keyword(index=False),
        "costs_comment": Text(index=False),
        "costs_rent": Keyword(index=False),
        "costs_rent_comment": Text(index=False),
        "costs_property": Keyword(index=False),
        "costs_property_comment": Text(index=False),
    }
    estate = Object(
        properties={
            "23": Nested(properties=ESTATE_PROPERTIES),
            "24": Nested(properties=ESTATE_PROPERTIES),
            "25": Nested(properties=ESTATE_PROPERTIES),
            "26": Nested(properties=ESTATE_PROPERTIES),
            "27": Nested(properties=ESTATE_PROPERTIES),
            "28": Nested(properties=ESTATE_PROPERTIES),
            "29": Nested(properties=ESTATE_PROPERTIES),
            "30": Nested(properties=ESTATE_PROPERTIES),
            "31": Nested(properties=ESTATE_PROPERTIES),
            "32": Nested(properties=ESTATE_PROPERTIES),
            "33": Nested(properties=ESTATE_PROPERTIES),
            "34": Nested(properties=ESTATE_PROPERTIES),
        }
    )

    VEHICLE_PROPERTIES = {
        "brand": Text(index=False),
        "brand_info": Text(index=False),
        "year": Keyword(index=False),
        "sum": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_rent": Keyword(index=False),
        "sum_rent_comment": Text(index=False),
        "brand_hidden": Boolean(index=False),
        "brand_info_hidden": Boolean(index=False),
        "brand_info_unclear": Boolean(index=False),
    }
    vehicle = Object(
        properties={
            "35": Nested(properties=VEHICLE_PROPERTIES),
            "36": Nested(properties=VEHICLE_PROPERTIES),
            "37": Nested(properties=VEHICLE_PROPERTIES),
            "38": Nested(properties=VEHICLE_PROPERTIES),
            "39": Nested(properties=VEHICLE_PROPERTIES),
            "40": Nested(properties=VEHICLE_PROPERTIES),
            "41": Nested(properties=VEHICLE_PROPERTIES),
            "42": Nested(properties=VEHICLE_PROPERTIES),
            "43": Nested(properties=VEHICLE_PROPERTIES),
            "44": Nested(properties=VEHICLE_PROPERTIES),
        }
    )

    BANKS_PROPERTIES = {
        "sum": Keyword(index=False),
        "sum_hidden": Boolean(index=False),
        "sum_units": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_foreign": Keyword(index=False),
        "sum_foreign_units": Keyword(index=False),
        "sum_foreign_comment": Text(index=False),
    }
    banks = Object(
        properties={
            "45": Nested(properties=BANKS_PROPERTIES),
            "46": Nested(properties=BANKS_PROPERTIES),
            "47": Nested(properties=BANKS_PROPERTIES),
            "48": Nested(properties=BANKS_PROPERTIES),
            "49": Nested(properties=BANKS_PROPERTIES),
            "50": Nested(properties=BANKS_PROPERTIES),
            "51": Nested(properties=BANKS_PROPERTIES),
            "52": Nested(properties=BANKS_PROPERTIES),
            "53": Nested(properties=BANKS_PROPERTIES),
        }
    )

    LIABILITIES_PROPERTIES = {
        "sum": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_units": Keyword(index=False),
        "sum_foreign": Keyword(index=False),
        "sum_foreign_comment": Text(index=False),
    }
    liabilities = Object(
        properties={
            "54": Nested(properties=LIABILITIES_PROPERTIES),
            "55": Nested(properties=LIABILITIES_PROPERTIES),
            "56": Nested(properties=LIABILITIES_PROPERTIES),
            "57": Nested(properties=LIABILITIES_PROPERTIES),
            "58": Nested(properties=LIABILITIES_PROPERTIES),
            "59": Nested(properties=LIABILITIES_PROPERTIES),
            "60": Nested(properties=LIABILITIES_PROPERTIES),
            "61": Nested(properties=LIABILITIES_PROPERTIES),
            "62": Nested(properties=LIABILITIES_PROPERTIES),
            "63": Nested(properties=LIABILITIES_PROPERTIES),
            "64": Nested(properties=LIABILITIES_PROPERTIES),
        }
    )

    def raw_source(self):
        src = self.to_dict()
        return blacklist(src, ["ft_src", "index_card"])

    def infocard(self):
        return {
            "first_name": self.general.name,
            "patronymic": self.general.patronymic,
            "last_name": self.general.last_name,
            "office": self.general.post.office,
            "position": self.general.post.post,
            "source": getattr(self.declaration, "source", getattr(self, "source", "")),
            "id": self.meta.id,
            "url": settings.SITE_URL
            + reverse("details", kwargs={"declaration_id": self.meta.id}),
            "document_type": self.intro.doc_type,
            "is_corrected": False,
            "declaration_year": getattr(self.intro, "declaration_year"),
            "created_date": getattr(
                self.intro, "date", getattr(self.declaration, "date", "")
            ),
        }

    def related_entities(self):
        return {
            "people": {"family": list(self.get_family_members())},
            "documents": {"corrected": [], "originals": []},
            "companies": {"owned": [], "related": [], "all": []},
        }

    def unified_source(self):
        try:
            doc = self.to_dict()
            doc["id"] = self.meta.id
            converter = PaperToNACPConverter(doc)
            return converter.convert()
        except ConverterError:
            return None

    def _is_change_form(self):
        return False

    def aggregated_data(self):
        return self.aggregated

    # Temporary solution to provide enough aggregated data
    # to make it possible to compare old and new declarations
    # TODO: REPLACE ME
    @property
    def aggregated(self):
        if hasattr(self, "_aggregated"):
            return self._aggregated

        def to_float(doc, key):
            try:
                return float(str(getattr(doc, key, "0") or "0").replace(",", "."))
            except ValueError:
                return 0.

        def get_exchange_rate(year, curr):
            rates = {
                "2011": {"USD": 7.98, "EUR": 10.29, "RUB": 0.250},  # As on 2011/12/30
                "2012": {"USD": 7.99, "EUR": 10.53, "RUB": 0.263},  # As on 2012/12/29
                "2013": {"USD": 7.99, "EUR": 11.04, "RUB": 0.244},  # As on 2013/12/30
                "2014": {"USD": 15.76, "EUR": 19.23, "RUB": 0.303},  # As on 2014/12/29
                "2015": {"USD": 24.00, "EUR": 26.22, "RUB": 0.329},  # As on 2015/12/31
                "2016": {  # As on 2016/12/31
                    "USD": 27.1908,
                    "EUR": 28.4226,
                    "RUB": 0.4511,
                },
                "2017": {  # As on 2017/12/31
                    "USD": 28.0672,
                    "EUR": 33.4954,
                    "RUB": 0.4870,
                },
            }

            if year not in rates:
                return

            if curr not in rates[year]:
                return

            return rates[year][curr]

        def to_space(space):
            areas_koef = {"га": 10000, "cоток": 100, "соток": 100, "м²": 1}

            units = getattr(space, "space_units", "")

            return to_float(space, "space") * areas_koef.get(units, 1)

        resp = {
            "incomes.presents.all": 0,
            "incomes.family": 0,
            "incomes.declarant": 0,
            "assets.cash.total": 0,
            "assets.family": 0,
            "assets.declarant": 0,
            "incomes.total": 0,
            "assets.total": 0,
            "expenses.total": 0,
            "liabilities.total": 0,
            "estate.family_land": 0,
            "estate.declarant_land": 0,
            "estate.family_other": 0,
            "estate.declarant_other": 0,
            "vehicles.all_names": "",
        }

        if hasattr(self, "income"):
            resp["incomes.declarant"] = to_float(self.income["5"], "value")
            resp["incomes.family"] = to_float(self.income["5"], "family")
            resp["incomes.presents.all"] = to_float(
                self.income["11"], "value"
            ) + to_float(self.income["11"], "family")

            resp["incomes.total"] = resp["incomes.declarant"] + resp["incomes.family"]

        if hasattr(self, "liabilities"):
            for field in [
                "54",
                "55",
                "56",
                "57",
                "58",
                "59",
                "60",
                "61",
                "62",
                "63",
                "64",
            ]:
                if hasattr(self.liabilities, field):
                    resp["liabilities.total"] += to_float(
                        getattr(self.liabilities, field), "sum"
                    )

        if hasattr(self, "banks"):
            for d_key, k in (("45", "declarant"), ("51", "family")):
                for a in getattr(self.banks, d_key, []):
                    try:
                        currency = getattr(a, "sum_units", "UAH") or "UAH"
                        amount = to_float(a, "sum")
                        if currency == "грн":
                            currency = "UAH"

                        if currency != "UAH":
                            rate = get_exchange_rate(
                                str(self.intro.declaration_year), currency
                            )
                            if rate is None:
                                continue

                            amount *= rate

                        resp["assets.{}".format(k)] += amount
                    except ValueError:
                        continue

            resp["assets.total"] = resp["assets.family"] + resp["assets.declarant"]

        vehicles = []
        if hasattr(self, "vehicle"):
            for field in [
                "34",
                "35",
                "36",
                "37",
                "38",
                "39",
                "40",
                "41",
                "42",
                "43",
                "44",
            ]:
                car_infos = getattr(self.vehicle, field, [])
                for car_info in car_infos:
                    vehicles.append(
                        "{} {}".format(
                            car_info["brand"], car_info["brand_info"]
                        ).replace(";", "")
                    )

            resp["vehicles.all_names"] += ";".join(vehicles)

        if hasattr(self, "estate"):
            for d_key, k in (
                ("24", "declarant_other"),
                ("30", "family_other"),
                ("25", "declarant_other"),
                ("31", "family_other"),
                ("26", "declarant_other"),
                ("32", "family_other"),
                ("27", "declarant_other"),
                ("33", "family_other"),
                ("28", "declarant_other"),
                ("34", "family_other"),
            ):

                estate_infos = getattr(self.estate, d_key, [])

                for space in estate_infos:
                    resp["estate.{}".format(k)] += to_space(space)

            for d_key, k in (("23", "declarant_land"), ("29", "family_land")):

                estate_infos = getattr(self.estate, d_key, [])

                for space in estate_infos:
                    resp["estate.{}".format(k)] += to_space(space)

        self._aggregated = resp
        return resp

    class Meta:
        pass
Example #29
class DatasetSearch(ModelSearchAdapter):
    model = Dataset
    fuzzy = True
    exclude_fields = ['spatial.geom', 'spatial.zones.geom']

    class Meta:
        doc_type = 'Dataset'

    title = String(analyzer=i18n_analyzer,
                   fields={'raw': String(index='not_analyzed')})
    description = String(analyzer=i18n_analyzer)
    license = String(index='not_analyzed')
    frequency = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    tags = String(index='not_analyzed',
                  fields={'i18n': String(index='not_analyzed')})
    badges = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    resources = Object(
        properties={
            'title': String(),
            'description': String(),
            'format': String(index='not_analyzed')
        })
    format_suggest = Completion(analyzer=simple,
                                search_analyzer=simple,
                                payloads=False)
    dataset_suggest = Completion(analyzer=simple,
                                 search_analyzer=simple,
                                 payloads=True)
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = metrics_mapping_for(Dataset)
    featured = Boolean()
    temporal_coverage = Nested(multi=False,
                               properties={
                                   'start': Long(),
                                   'end': Long()
                               })
    temporal_weight = Long()
    geozones = Object(
        properties={
            'id': String(index='not_analyzed'),
            'name': String(index='not_analyzed'),
            'keys': String(index='not_analyzed')
        })
    granularity = String(index='not_analyzed')
    spatial_weight = Long()
    from_certified = Boolean()

    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'acronym^7',
        'title^6',
        'tags.i18n^3',
        'description',
    )
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'reuses': 'metrics.reuses',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }

    facets = {
        'tag':
        TermsFacet(field='tags'),
        'badge':
        TermsFacet(field='badges', labelizer=dataset_badge_labelizer),
        'organization':
        ModelTermsFacet(field='organization', model=Organization),
        'owner':
        ModelTermsFacet(field='owner', model=User),
        'license':
        ModelTermsFacet(field='license', model=License),
        'geozone':
        ModelTermsFacet(field='geozones.id',
                        model=GeoZone,
                        labelizer=zone_labelizer),
        'granularity':
        TermsFacet(field='granularity', labelizer=granularity_labelizer),
        'format':
        TermsFacet(field='resources.format'),
        'reuses':
        RangeFacet(field='metrics.reuses',
                   ranges=[('none', (None, 1)), ('few', (1, 5)),
                           ('quite', (5, 10)), ('many', (10, None))],
                   labels={
                       'none': _('Never reused'),
                       'few': _('Little reused'),
                       'quite': _('Quite reused'),
                       'many': _('Heavily reused'),
                   }),
        'temporal_coverage':
        TemporalCoverageFacet(field='temporal_coverage'),
        'featured':
        BoolFacet(field='featured'),
    }
    boosters = [
        BoolBooster('featured', 1.5),
        BoolBooster('from_certified', 1.2),
        ValueFactor('spatial_weight', missing=1),
        ValueFactor('temporal_weight', missing=1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay('metrics.followers',
                   max_followers,
                   max_followers,
                   decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        return (dataset.deleted is None and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def get_suggest_weight(cls, temporal_weight, spatial_weight, featured):
        '''Compute the suggest part of the indexation payload'''
        featured_weight = 1 if not featured else FEATURED_WEIGHT
        return temporal_weight * spatial_weight * featured_weight

    @classmethod
    def serialize(cls, dataset):
        organization = None
        owner = None
        image_url = None
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        certified = organization and organization.certified

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': getattr(dataset.license, 'id', None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest': [r.format.lower()
                               for r in dataset.resources if r.format],
            'frequency': dataset.frequency,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': dataset.last_modified.strftime(
                '%Y-%m-%dT%H:%M:%S'),
            'metrics': dataset.metrics,
            'featured': dataset.featured,
            'from_certified': certified,
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {
                    'start': start,
                    'end': end
                },
                'temporal_weight': temporal_weight,
            })
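            # e.g. a coverage spanning two years (730 days) yields
            # temporal_weight = min(730 / 365, MAX_TEMPORAL_WEIGHT) = 2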

        if dataset.spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values
                })
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': p} for p in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })
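            # spatial_weight grows as coverage_level shrinks: since
            # coverage_level keeps the minimum admin level seen above,
            # the coarsest covered zone drives the boost.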

        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document
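
# A minimal sketch (not part of the adapter above) of how the
# 'dataset_suggest' completion field could be queried with
# elasticsearch-dsl 2.x; the 'dataset' index name and an Elasticsearch
# node on localhost are assumptions.
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['localhost'])

def suggest_datasets(text, size=10):
    s = Search(index='dataset')
    s = s.suggest('datasets', text,
                  completion={'field': 'dataset_suggest', 'size': size})
    response = s.execute()
    # Each option carries the payload serialized above (id, slug, ...).
    return [option.payload
            for option in response.suggest.datasets[0].options]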
Exemplo n.º 30
0
class ReuseSearch(ModelSearchAdapter):
    model = Reuse
    fuzzy = True

    class Meta:
        doc_type = 'Reuse'

    title = String(analyzer=i18n_analyzer,
                   fields={'raw': String(index='not_analyzed')})
    description = String(analyzer=i18n_analyzer)
    url = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    type = String(index='not_analyzed')
    tags = String(index='not_analyzed',
                  fields={'i18n': String(index='not_analyzed')})
    badges = String(index='not_analyzed')
    topic = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    datasets = Object(properties={
        'id': String(index='not_analyzed'),
        'title': String(),
    })
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = Reuse.__search_metrics__
    featured = Boolean()
    reuse_suggest = Completion(analyzer=simple,
                               search_analyzer=simple,
                               payloads=True)
    extras = Object()
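    # The payloads flag on the Completion fields above only exists up to
    # Elasticsearch 2.x (payloads were dropped from the completion
    # suggester in 5.0), which dates this mapping to the es-dsl 2.x API.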

    facets = {
        'tag':
        TermsFacet(field='tags'),
        'organization':
        ModelTermsFacet(field='organization', model=Organization),
        'owner':
        ModelTermsFacet(field='owner', model=User),
        'dataset':
        ModelTermsFacet(field='dataset.id', model=Dataset),
        'type':
        TermsFacet(field='type', labelizer=reuse_type_labelizer),
        'datasets':
        RangeFacet(field='metrics.datasets',
                   ranges=[('none', (None, 1)), ('few', (1, 5)),
                           ('many', (5, None))],
                   labels={
                       'none': _('No datasets'),
                       'few': _('Few datasets'),
                       'many': _('Many datasets'),
                   }),
        'followers':
        RangeFacet(field='metrics.followers',
                   ranges=[('none', (None, 1)), ('few', (1, 5)),
                           ('many', (5, None))],
                   labels={
                       'none': _('No followers'),
                       'few': _('Few followers'),
                       'many': _('Many followers'),
                   }),
        'badge':
        TermsFacet(field='badges', labelizer=reuse_badge_labelizer),
        'featured':
        BoolFacet(field='featured'),
        'topic':
        TermsFacet(field='topic', labelizer=reuse_topic_labelizer),
    }
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'datasets': 'metrics.datasets',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }
    boosters = [
        BoolBooster('featured', lazy('featured_boost')),
        GaussDecay('metrics.datasets',
                   max_datasets,
                   decay=lazy('datasets_decay')),
        GaussDecay('metrics.followers',
                   max_followers,
                   decay=lazy('followers_decay')),
    ]
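    # Unlike the fixed 1.5/1.2 factors used for datasets, these boosts are
    # wrapped in lazy(), presumably so the values are resolved from the
    # application settings at query time rather than at import time.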

    @classmethod
    def is_indexable(cls, reuse):
        return (reuse.deleted is None and len(reuse.datasets) > 0
                and not reuse.private)

    @classmethod
    def serialize(cls, reuse):
        """By default use the ``to_dict`` method

        and exclude ``_id``, ``_cls`` and ``owner`` fields.
        """
        datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
        datasets = list(datasets.only('id', 'title').no_dereference())
        organization = None
        owner = None
        if reuse.organization:
            organization = Organization.objects(
                id=reuse.organization.id).first()
        elif reuse.owner:
            owner = User.objects(id=reuse.owner.id).first()
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'type': reuse.type,
            'topic': reuse.topic,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': to_iso_datetime(reuse.created_at),
            'last_modified': to_iso_datetime(reuse.last_modified),
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in datasets],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(500, external=True),
                },
            },
        }
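
# A rough sketch (an assumption, not udata's actual query builder) of the
# raw function_score query the boosters above translate to; the index
# name, match fields and weight/decay values are illustrative only.
from elasticsearch_dsl import Q, Search

def boosted_reuse_search(text, max_datasets=50, max_followers=500):
    query = Q('function_score',
              query=Q('multi_match', query=text,
                      fields=['title', 'description']),
              functions=[
                  # BoolBooster('featured', ...) ~ a filtered weight factor
                  {'filter': Q('term', featured=True), 'weight': 1.1},
                  # GaussDecay(...) ~ gaussian decay on the metric fields
                  {'gauss': {'metrics.datasets': {
                      'origin': max_datasets, 'scale': max_datasets,
                      'decay': 0.8}}},
                  {'gauss': {'metrics.followers': {
                      'origin': max_followers, 'scale': max_followers,
                      'decay': 0.8}}},
              ])
    return Search(index='reuse').query(query)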