class QA(InnerDoc):
    ans_id = Integer()
    ans_str = Text(fields={'raw': Keyword()})
    query_id = Integer()
    query_str = Text()
Example #2
class BlogPostIndex(DocType):
    id = Integer()
    title = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    char_num = Integer()
    allow_comments = Boolean()
    vote_num = Integer()
    category = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    tags = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    publish_date = Date()
    suggestions = Completion()

    class Meta:
        index = "blogpost-index"

    @classmethod
    def add(cls, **kwargs):
        id = kwargs.pop("id", None)
        if id is None:
            return False
        blog = cls(meta={"id": id}, **kwargs)
        blog.save()
        return blog

    @staticmethod
    def search_posts(words, delim="......<br>......<br>......"):
        """
        用原生写法拼装结果
        :param words:
        :return:
        """
        q = {
            "_source": ["title", "category", "tags", "publish_date"],
            "query": {
                "bool": {
                    "must": [],
                    "must_not": [],
                    "should": [
                        {
                            "term": {
                                "tags": "{}".format(words)
                            }
                        },
                        {
                            "term": {
                                "title": "{}".format(words)
                            }
                        },
                        {
                            "term": {
                                "content": "{}".format(words)
                            }
                        },
                    ],
                }
            },
            "highlight": {
                "number_of_fragments": 3,
                "fragment_size": 150,
                "fields": {
                    "title": {
                        "pre_tags": ["<em>"],
                        "post_tags": ["</em>"]
                    },
                    "content": {
                        "pre_tags": ["<em>"],
                        "post_tags": ["</em>"]
                    },
                },
            },
            "from": 0,
            "size": 50,
            "sort": [],
            "aggs": {},
        }
        response = es_client.search(index="blogpost-index", body=q)
        r = []
        for item in response["hits"]["hits"]:
            if item.get("highlight", None):
                if item["highlight"].get("title", None):
                    title = "".join(item["highlight"]["title"])
                else:
                    title = item["_source"]["title"]
                if item["highlight"].get("content", None):
                    content = delim.join(
                        item["highlight"]["content"]) + "......<br>"
                else:
                    content = ""
                r.append({
                    "origin_title": item["_source"]["title"],
                    "title": title,
                    "content": content,
                })
        return r

    @staticmethod
    def suggest_word(words):
        q = {
            "_source": False,
            "suggest": {
                "search-as-you-type-suggestion": {
                    "prefix": "{}".format(words),
                    "completion": {
                        "field": "suggestions",
                        "size": 10,
                        "fuzzy": {
                            "fuzziness": 2
                        },
                        "skip_duplicates": True,
                    },
                }
            },
        }
        response = es_client.search(index="blogpost-index", body=q)
        tmp = response["suggest"]["search-as-you-type-suggestion"]
        options = []
        if len(tmp) >= 1:
            options = [item["text"] for item in tmp[0]["options"]]
        return options

    @staticmethod
    def similar_recommends_post(words):
        pass
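
A minimal usage sketch for the class above, assuming elasticsearch-dsl is configured and that the module-level es_client used by the static methods points at the same cluster (hosts and field values are illustrative):

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])
BlogPostIndex.init()  # create the "blogpost-index" mapping if it does not exist yet

post = BlogPostIndex.add(
    id=1,
    title="Getting started with Elasticsearch",
    content="How to use elasticsearch-dsl ...",
    category="search",
    tags="elasticsearch",
    publish_date="2021-01-01",
)
if post:
    results = BlogPostIndex.search_posts("elasticsearch")
    suggestions = BlogPostIndex.suggest_word("ela")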
Example #3
class Listing(Document):
    """Base class containing the common fields."""
    access = Text()
    additional_house_rules = Text()
    allows_events = Boolean()
    amenities = Keyword(multi=True)
    amenity_ids = Keyword(multi=True)
    avg_rating = Float()
    bathrooms = Float()
    bedrooms = Integer()
    beds = Integer()
    business_travel_ready = Boolean()
    city = Text(fields={'keyword': Keyword()}, required=True)
    country = Text(fields={'keyword': Keyword()}, required=True)
    coordinates = GeoPoint()
    description = Text()
    host_id = Integer(fields={'keyword': Keyword()})
    house_rules = Text()
    interaction = Text()
    is_hotel = Boolean()
    max_nights = Integer()
    min_nights = Integer()
    monthly_price_factor = Float()
    name = Text(fields={'keyword': Keyword()}, required=True)
    neighborhood_overview = Text()
    # notes = Text()
    person_capacity = Integer()
    photo_count = Integer()
    photos = Keyword(multi=True)
    place_id = Text(fields={'keyword': Keyword()})
    price_rate = Float()
    price_rate_type = Text(fields={'keyword': Keyword()}, required=True)
    province = Text(fields={'keyword': Keyword()})
    rating_accuracy = Float()
    rating_checkin = Float()
    rating_cleanliness = Float()
    rating_communication = Float()
    rating_location = Float()
    rating_value = Float()
    review_count = Integer()
    reviews = Nested()
    room_and_property_type = Text(fields={'keyword': Keyword()}, required=True)
    room_type = Text(fields={'keyword': Keyword()}, required=True)
    room_type_category = Text(fields={'keyword': Keyword()}, required=True)
    satisfaction_guest = Float()
    star_rating = Float()
    state = Text(fields={'keyword': Keyword()}, required=True)
    transit = Text()
    url = Text(fields={'keyword': Keyword()}, required=True)
    weekly_price_factor = Float()

    class Index:
        name = 'scrapy_airbnb_listing'

    def save(self, **kwargs):
        return super(Listing, self).save(**kwargs)
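
A minimal indexing sketch for the Listing document above, assuming a default elasticsearch-dsl connection; the host and field values are illustrative:

from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])
Listing.init()  # create the 'scrapy_airbnb_listing' index and mapping

listing = Listing(
    meta={"id": 12345},
    name="Sunny loft",
    city="Lisbon",
    country="Portugal",
    state="Lisboa",
    coordinates={"lat": 38.72, "lon": -9.14},
    price_rate=80.0,
    price_rate_type="nightly",
    room_and_property_type="Entire loft",
    room_type="Entire home/apt",
    room_type_category="entire_home",
    url="https://www.example.com/rooms/12345",
)
listing.save()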
Example #4
class _EmailAddressDoc(InnerDoc):
    display_name = Text()
    address = Text()
Example #5
class Smlouva(Document):
    contractid = Keyword()
    contractor = Keyword()
    contractoraddresscity = Keyword()
    contractoraddressstreet = Keyword()
    contractoraddresszip = Keyword()
    contractorcompany = Keyword()
    contractorid = Keyword()
    contracttitle = Text(fields={'raw': Keyword()})
    contracttype = Keyword()
    dateconclusion = Date()
    datevalidity = Date()
    date_updated = Date()
    form = Keyword()
    ico = Keyword()
    id = Keyword()
    keyword = Keyword()
    originator = Keyword()
    originatorico = Keyword()
    radaevidcislo = Keyword()
    rok = Integer()
    title = Text(fields={'raw': Keyword()})
    total = Keyword()
    valuewithvat = Float()

    class Index:
        name = INDEX_MZP_SMLOUVA

    def save(self, **kwargs):
        try:
            self.date_updated = datetime.datetime.now()
            return super(Smlouva, self).save(**kwargs)
        except ElasticsearchDslException as err:
            print("CHYBA ES: {0}".format(err))
            return None

    def load_data(self, data):
        self.rok = -1
        self.dateconclusion = None
        dc_iso = consolidate_date(data['DateConclusion'])
        if dc_iso is not None:
            dca = dc_iso.split('-')
            self.dateconclusion = datetime.date(int(dca[0]), int(dca[1]),
                                                int(dca[2]))
        self.datevalidity = None
        dv_iso = consolidate_date(data['DateValidity'])
        if dv_iso is not None:
            dva = dv_iso.split('-')
            self.datevalidity = datetime.date(int(dva[0]), int(dva[1]),
                                              int(dva[2]))
        self.contractid = data['ContractID']
        self.title = data['Title']
        self.contractorid = data['ContractorID']
        self.contractorcompany = data['ContractorName']
        t: str = str(data['ValueWithVAT'])
        t = t.replace(' ', '')
        t = t.replace('Kč', '')
        t = t.replace(',', '.')
        if not isfloat(t):
            t1 = re.findall(r'\d+', t)
            if t1:
                t = t1[0]
            else:
                t = '-1'
        self.valuewithvat = float(t)
        if self.dateconclusion is not None:
            self.rok = int(dc_iso.split('-')[0])
        self.contracttitle = self.title
        self.contractor = self.contractorcompany
        self.keyword = [
            "MZP", "MŽP", "smlouva", "OpenData", "MUZO", "JASU", "EKIS"
        ]
        self.form = "smlouva"
        self.ico = self.contractorid
        self.total = str(self.valuewithvat)
        self.originator = "Ministerstvo životního prostředí"
        self.originatorico = "00164801"
        self.id = self.contractid
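
A sketch of how load_data might be fed, assuming the helper functions (consolidate_date, isfloat) and the index constant from the original module; the record values are illustrative:

record = {
    "ContractID": "S-2020-0042",
    "Title": "Smlouva o dílo",
    "ContractorID": "12345678",
    "ContractorName": "Example s.r.o.",
    "ValueWithVAT": "1 210 000,50 Kč",
    "DateConclusion": "2020-03-15",
    "DateValidity": "2020-12-31",
}

doc = Smlouva()
doc.load_data(record)   # parses the dates and the VAT value, fills the derived fields
doc.meta.id = doc.id    # use the contract id as the Elasticsearch document id
doc.save()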
Example #6
class _PolicyOverride(InnerDoc):
    type = Text()
    comment = Text()
Example #7
class _SPFResult(InnerDoc):
    domain = Text()
    scope = Text()
    results = Text()
Example #8
class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.
    It stores a bit of metadata so we don't have to hit the db
    when rendering search results.

    The search view will use the 'lang' and 'version' fields of the
    document's release to filter the search results, depending on
    which was found in the URL.

    The breadcrumbs are shown under the search result title.
    """
    model = Document

    id = Long()
    title = Text(analyzer=lower_whitespace_analyzer, boost=1.2)
    path = Text(index='no', analyzer=path_analyzer)
    content = Text(analyzer=lower_whitespace_analyzer)
    content_raw = Text(index_options='offsets')
    release = Object(properties={
        'id': Long(),
        'version': Keyword(),
        'lang': Keyword(),
    })
    breadcrumbs = Nested(properties={
        'title': Keyword(),
        'path': Keyword(),
    })

    class Meta:
        index = 'docs'
        doc_type = 'document'

    @classmethod
    def alias_to_main_index(cls, index_name, using=None):
        """
        Alias `index_name` to 'docs' (`cls._doc_type.index`).
        """
        body = {'actions': [{'add': {'index': index_name, 'alias': cls._doc_type.index}}]}

        client = connections.get_connection(using or cls._doc_type.using)
        client.indices.refresh(index=index_name)
        try:
            old_index_name = list(client.indices.get_alias('docs').keys())[0]
        except elasticsearch.exceptions.NotFoundError:
            old_index_name = None
        else:
            body['actions'].append({'remove': {'index': old_index_name, 'alias': cls._doc_type.index}})

        client.indices.update_aliases(body=body)
        # Delete the old index that was aliased to 'docs'.
        if old_index_name:
            client.indices.delete(old_index_name)

    @classmethod
    def index_queryset(cls):
        qs = super(DocumentDocType, cls).index_queryset()
        return (
            # don't index the module pages since source code is hard to
            # combine with full text search
            qs.exclude(path__startswith='_modules')
            # not the crazy big flattened index of the CBVs
              .exclude(path__startswith='ref/class-based-views/flattened-index')
              .select_related('release'))

    @classmethod
    def from_django(cls, obj):
        # turns HTML entities into unicode characters again and removes
        # all HTML tags, aka "plain text" version of the document
        raw_body = strip_tags(unescape_entities(obj.body).replace(u'¶', ''))
        doc = cls(path=obj.path,
                  title=obj.title,
                  content=obj.body,
                  content_raw=raw_body,
                  meta={'id': obj.id})
        doc.release = {
            'id': obj.release.id,
            'lang': obj.release.lang,
            'version': obj.release.version,
        }
        breadcrumbs = []
        for breadcrumb in cls.model.objects.breadcrumbs(obj):
            breadcrumbs.append({
                'title': breadcrumb.title,
                'path': breadcrumb.path,
            })
        doc.breadcrumbs = breadcrumbs
        return doc

    def get_absolute_url(self):
        return document_url(self)
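
A brief sketch of the rebuild-and-swap flow that alias_to_main_index supports; the timestamped index name is illustrative and its mapping is assumed to exist already:

new_index = 'docs-20240101120000'
for obj in DocumentDocType.index_queryset().iterator():
    DocumentDocType.from_django(obj).save(index=new_index)
# atomically point the 'docs' alias at the new index and delete the old one
DocumentDocType.alias_to_main_index(new_index)
Example #9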
class Declaration(DocType, RelatedDeclarationsMixin):
    """Declaration document.
    Assumes there's a dynamic mapping with all fields not indexed by default."""
    general = Object(
        properties={
            'full_name_suggest': Completion(preserve_separators=False),
            'full_name': Text(index=True, analyzer='ukrainian'),
            'name': Text(index=True, analyzer='ukrainian'),
            'patronymic': Text(index=True, analyzer='ukrainian'),
            'last_name': Text(index=True, analyzer='ukrainian'),
            'family_raw': Text(index=True, analyzer='ukrainian'),
            'family': Nested(
                properties={
                    'name': Text(index=True, analyzer='ukrainian'),
                    'relations': Keyword(index=False),
                    'inn': Keyword(index=False)
                }
            ),
            'post_raw': Text(index=True, analyzer='ukrainian'),
            'post': Object(
                properties={
                    'region': Text(index=True, analyzer='ukrainian', fields={'raw': Keyword(index=True)}),
                    'office': Text(index=True, analyzer='ukrainian', fields={'raw': Keyword(index=True)}),
                    'post': Text(index=True, analyzer='ukrainian', fields={'raw': Keyword(index=True)})
                }
            ),
            'addresses': Nested(
                properties={
                    'place': Text(index=False),
                    'place_hidden': Boolean(index=False),
                    'place_district': Text(index=False),
                    'place_district_hidden': Boolean(index=False),
                    'place_city': Text(index=False),
                    'place_city_hidden': Boolean(index=False),
                    'place_city_type': Keyword(index=False),
                    'place_city_type_hidden': Boolean(index=False),
                    'place_address': Text(index=False),
                    'place_address_hidden': Boolean(index=False),
                    'place_address_type': Keyword(index=False)
                }
            )
        }
    )
    declaration = Object(
        properties={
            'date': NoneAwareDate(),
            'notfull': Boolean(index=False),
            'notfull_lostpages': Keyword(index=False),
            'additional_info': Boolean(index=False),
            'additional_info_text': Text(index=False),
            'needs_scancopy_check': Boolean(index=False)
        }
    )
    intro = Object(
        properties={
            'declaration_year': Keyword(index=True)
        }
    )
    ft_src = Text(index=True, analyzer='ukrainian')

    INCOME_SINGLE_PROPERTIES = {
        'value': Keyword(index=False),
        'value_unclear': Boolean(index=False),
        'comment': Text(index=False),
        'family': Keyword(index=False),
        'family_unclear': Boolean(index=False),
        'family_comment': Text(index=False)
    }
    INCOME_LIST_PROPERTIES = {
        'country': Keyword(index=False),
        'country_comment': Text(index=False),
        'cur': Keyword(index=False),
        'cur_units': Keyword(index=False),
        'uah_equal': Keyword(index=False)
    }
    income = Object(
        properties={
            '5': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '6': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '7': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '8': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '9': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '10': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '11': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '12': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '13': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '14': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '15': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '16': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '17': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '18': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '19': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '20': Object(
                properties=INCOME_SINGLE_PROPERTIES
            ),
            '21': Nested(
                properties=INCOME_LIST_PROPERTIES
            ),
            '22': Nested(
                properties=INCOME_LIST_PROPERTIES
            )
        }
    )

    ESTATE_PROPERTIES = {
        'region': Text(index=False),
        'address': Text(index=False),
        'space': Keyword(index=False),
        'space_units': Keyword(index=False),
        'space_comment': Text(index=False),
        'costs': Keyword(index=False),
        'costs_comment': Text(index=False),
        'costs_rent': Keyword(index=False),
        'costs_rent_comment': Text(index=False),
        'costs_property': Keyword(index=False),
        'costs_property_comment': Text(index=False)
    }
    estate = Object(
        properties={
            '23': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '24': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '25': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '26': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '27': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '28': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '29': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '30': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '31': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '32': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '33': Nested(
                properties=ESTATE_PROPERTIES
            ),
            '34': Nested(
                properties=ESTATE_PROPERTIES
            )
        }
    )

    VEHICLE_PROPERTIES = {
        "brand": Text(index=False),
        "brand_info": Text(index=False),
        "year": Keyword(index=False),
        "sum": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_rent": Keyword(index=False),
        "sum_rent_comment": Text(index=False),
        "brand_hidden": Boolean(index=False),
        "brand_info_hidden": Boolean(index=False),
        "brand_info_unclear": Boolean(index=False)
    }
    vehicle = Object(
        properties={
            '35': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '36': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '37': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '38': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '39': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '40': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '41': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '42': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '43': Nested(
                properties=VEHICLE_PROPERTIES
            ),
            '44': Nested(
                properties=VEHICLE_PROPERTIES
            )
        }
    )

    BANKS_PROPERTIES = {
        'sum': Keyword(index=False),
        'sum_hidden': Boolean(index=False),
        'sum_units': Keyword(index=False),
        'sum_comment': Text(index=False),
        'sum_foreign': Keyword(index=False),
        'sum_foreign_units': Keyword(index=False),
        'sum_foreign_comment': Text(index=False)
    }
    banks = Object(
        properties={
            '45': Nested(
                properties=BANKS_PROPERTIES
            ),
            '46': Nested(
                properties=BANKS_PROPERTIES
            ),
            '47': Nested(
                properties=BANKS_PROPERTIES
            ),
            '48': Nested(
                properties=BANKS_PROPERTIES
            ),
            '49': Nested(
                properties=BANKS_PROPERTIES
            ),
            '50': Nested(
                properties=BANKS_PROPERTIES
            ),
            '51': Nested(
                properties=BANKS_PROPERTIES
            ),
            '52': Nested(
                properties=BANKS_PROPERTIES
            ),
            '53': Nested(
                properties=BANKS_PROPERTIES
            ),
        }
    )

    LIABILITIES_PROPERTIES = {
        'sum': Keyword(index=False),
        'sum_comment': Text(index=False),
        'sum_units': Keyword(index=False),
        'sum_foreign': Keyword(index=False),
        'sum_foreign_comment': Text(index=False)
    }
    liabilities = Object(
        properties={
            '54': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '55': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '56': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '57': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '58': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '59': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '60': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '61': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '62': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '63': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
            '64': Nested(
                properties=LIABILITIES_PROPERTIES
            ),
        }
    )

    class Meta:
        index = 'declarations_v2'
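
A hedged search sketch against the mapping above; the connection is assumed and the query strings are illustrative:

s = Declaration.search() \
    .query('match', **{'general.full_name': 'Іван Іваненко'}) \
    .filter('term', **{'intro.declaration_year': '2015'})
response = s.execute()

# completion suggester on the full-name field
s = Declaration.search().suggest(
    'name_suggest', 'Іва', completion={'field': 'general.full_name_suggest'})
options = s.execute().suggest.name_suggest[0].options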
Example #10
class LagouType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    positionId = Keyword()  # article id
    art_title = Text(analyzer="ik_max_word")
    art_time = Date()
    art_position = Text(analyzer="ik_max_word")
    art_salary = Text(analyzer="ik_max_word")  # salary
    art_work_year = Text(analyzer="ik_max_word")  # required years of experience
    art_education = Text(analyzer="ik_max_word")  # education requirement
    art_jobNature = Text(analyzer="ik_max_word")  # job nature, e.g. full-time
    company_hitags = Keyword()  # company benefits
    art_company_name = Text(analyzer="ik_max_word")
    art_company_id = Keyword()

    company_type = Keyword()
    company_size = Keyword()
    company_financestage = Keyword()  # company funding round
    company_label_list = Keyword()  # company selling points
    art_first_type = Text(analyzer="ik_max_word")  # specific job category
    art_second_type = Text(analyzer="ik_max_word")  # overall job category
    art_third_type = Text(analyzer="ik_max_word")  # language job category

    compangy_full_position = Text(analyzer="ik_max_word")  # specific location
    art_description = Text(analyzer="ik_max_word")
    fingerprint = Keyword()

    class Meta:
        index = "jobbole"
        doc_type = "article"
Example #11
class SFNIOT(InnerDoc):
    event_type = Text()
    domain_name = Text(analyzer='snowball', fields={'raw': Keyword()})
    device_name = Text(analyzer='snowball', fields={'raw': Keyword()})
    host = Text(analyzer='snowball', fields={'raw': Keyword()})
    threat_id = Text(analyzer='snowball')
    threat_name = Text(analyzer='snowball')
    tag_name = Text(fields={'raw': Keyword()})
    tag_class = Text(fields={'raw': Keyword()})
    tag_group = Text(fields={'raw': Keyword()})
    tag_description = Text(analyzer='snowball')
    public_tag_name = Text(analyzer='snowball')
    confidence_level = Integer()
    sample_date = Date()
    file_type = Text(fields={'raw': Keyword()})
    updated_at = Date()
    processed = Integer()
    src_ip = Ip()
    dst_ip = Ip()
Example #12
class EntityDepartment(DocType):
    name = Keyword()
    source_url = Text()

    class Meta:
        index = 'med_base'
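Example #13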
class BlogPostIndex(DocType):
    author = Text()
    posted_date = Date()
    title = Text()
    text = Text()
Example #14
class Item(Document):
    id = Integer()
    title = Text()
    description = Text()
    first_img = Text()
    basic_price = Float()
    price = Float()
    on_sale = Boolean()
    is_discounted = Boolean()
    created_at = Date()

    class Index:
        name = "flaskshop"

    @classmethod
    def add(cls, item):
        obj = cls(**get_item_data(item))
        obj.save()
        return obj

    @classmethod
    def update_item(cls, item):
        try:
            obj = cls.get(item.id)
        except NotFoundError:
            return cls.add(item)

        kw = get_item_data(item)
        try:
            obj.update(**kw)
        except ConflictError:
            obj = cls.get(item.id)
            obj.update(**kw)
        return True

    @classmethod
    def delete(cls, item):
        rs = cls.get(item.id)
        if rs:
            super(cls, rs).delete()
            return True
        return False

    @classmethod
    def bulk_update(cls, items, chunk_size=5000, op_type="update", **kwargs):
        index = cls._index._name
        _type = cls._doc_type.name
        obj = [{
            "_op_type": op_type,
            "_id": f"{doc.id}",
            "_index": index,
            "_type": _type,
            "_source": get_item_data(doc),
        } for doc in items]
        client = cls.get_es()
        rs = list(parallel_bulk(client, obj, chunk_size=chunk_size, **kwargs))
        return rs

    @classmethod
    def get_es(cls):
        search = cls.search()
        return connections.get_connection(search._using)

    @classmethod
    def new_search(cls, query, page, order_by=None, per_page=16):
        s = cls.search()
        s = s.query("multi_match", query=query, fields=SERACH_FIELDS)
        start = (page - 1) * per_page
        s = s.extra(**{"from": start, "size": per_page})
        s = s if order_by is None else s.sort(order_by)
        rs = s.execute()
        return Pagination(query, page, per_page, rs.hits.total, rs)
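
A usage sketch for the Item document above; get_item_data, Pagination and SERACH_FIELDS are assumed from the surrounding module, and `product` / `products` stand in for the ORM objects being indexed:

Item.init()                 # create the "flaskshop" index and mapping if needed
Item.update_item(product)   # insert, or update when the document already exists
Item.bulk_update(products)  # bulk-index an iterable of products via parallel_bulk

pagination = Item.new_search("shoes", page=1, order_by="-price")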
Example #15
class _EmailAttachmentDoc(Document):
    filename = Text()
    content_type = Text()
    sha256 = Text()
Example #16
class PodcastTranscriptionBlob(Document):
    """
    This defines a PodcastTranscriptionBlob model.

    This represents the groups of 3-5 sentences from each podcast's full
    transcription text that we use to search through podcasts.
    """
    podcast_id = Keyword()
    transcription_blob = Text()
    # The starting timestamp for this blob (in seconds since the start of the podcast's video)
    starting_timestamp_second = Integer()
    ending_timestamp_second = Integer()
    blob_index = Integer()
    lecture_num = Integer()

    # Filters that the user can search against
    department = Text()
    course_num = Keyword()
    quarter = Keyword()
    professor = Text()
    section_id = Keyword()

    date = Date()

    # Elasticsearch index settings
    class Index:
        name = "podcast_transription_blobs"

    def get_snippet_url(self):
        return url_for("podcasts.get_podcast_blob", blob_id=self.meta.id)

    @property
    def podcast(self):
        return Podcast.get(id=self.podcast_id)

    def convert_to_dict(self):
        dict_ = self.to_dict(include_meta=False)
        dict_['id'] = self.meta.id
        dict_['href'] = self.get_snippet_url()
        return dict_

    @staticmethod
    def search_podcasts(text_query,
                        department=None,
                        course_num=None,
                        professor=None,
                        quarter=None,
                        section_id=None,
                        page=1,
                        count=10):
        search_criteria = [
            Q('match', transcription_blob=text_query),
        ]

        if department:
            search_criteria.append(Q('match', department=department))

        if course_num:
            search_criteria.append(Q('match', course_num=course_num))

        if professor:
            search_criteria.append(Q('match', professor=professor))

        if quarter:
            search_criteria.append(Q('match', quarter=quarter))

        if section_id:
            search_criteria.append(Q('match', section_id=section_id))

        search_query = Q('bool', must=search_criteria)

        # TODO: Should we automatically sort the results by the date & starting_timestamp_second fields?
        # Because the professor might first introduce a topic in 1 lecture, and then refer to it in subsequent lectures.
        podcast_transcription_query = PodcastTranscriptionBlob.search(
        ).highlight("transcription_blob").query(search_query)

        # Slice the search query for the requested page & count
        podcast_transcription_query = podcast_transcription_query[
            (page - 1) * count:((page - 1) * count) + count]

        relevant_transcription_blobs = podcast_transcription_query.execute()

        return relevant_transcription_blobs[:count]
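
A sketch of calling the search helper above and reading back the highlighted snippets; the connection and indexed data are assumed and the filter values are illustrative:

blobs = PodcastTranscriptionBlob.search_podcasts(
    "dynamic programming",
    department="CSE",
    course_num="101",
    page=1,
    count=10,
)
for blob in blobs:
    if hasattr(blob.meta, "highlight"):
        snippet = blob.meta.highlight.transcription_blob[0]
    else:
        snippet = blob.transcription_blob
    print(blob.lecture_num, blob.starting_timestamp_second, snippet)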
Example #17
class _ForensicReportDoc(Document):
    class Index:
        name = "dmarc_forensic"

    feedback_type = Text()
    user_agent = Text()
    version = Text()
    original_mail_from = Text()
    arrival_date = Date()
    domain = Text()
    original_envelope_id = Text()
    authentication_results = Text()
    delivery_results = Text()
    source_ip_address = Ip()
    source_country = Text()
    source_reverse_dns = Text()
    source_authentication_mechanisms = Text()
    source_auth_failures = Text()
    dkim_domain = Text()
    original_rcpt_to = Text()
    sample = Object(_ForensicSampleDoc)
Example #18
class User(InnerDoc):
    name = Text(fields={'raw': Keyword()})
Example #19
class _DKIMResult(InnerDoc):
    domain = Text()
    selector = Text()
    result = Text()
Example #20
class History(InnerDoc):
    timestamp = Date()
    diff = Text()
Example #21
class _AggregateReportDoc(Document):
    class Index:
        name = "dmarc_aggregate"

    xml_schema = Text()
    org_name = Text()
    org_email = Text()
    org_extra_contact_info = Text()
    report_id = Text()
    date_range = Date()
    date_begin = Date()
    date_end = Date()
    errors = Text()
    published_policy = Object(_PublishedPolicy)
    source_ip_address = Ip()
    source_country = Text()
    source_reverse_dns = Text()
    source_Base_domain = Text()
    message_count = Integer()
    disposition = Text()
    dkim_aligned = Boolean()
    spf_aligned = Boolean()
    passed_dmarc = Boolean()
    policy_overrides = Nested(_PolicyOverride)
    header_from = Text()
    envelope_from = Text()
    envelope_to = Text()
    dkim_results = Nested(_DKIMResult)
    spf_results = Nested(_SPFResult)

    def add_policy_override(self, type_, comment):
        self.policy_overrides.append(_PolicyOverride(type=type_,
                                                     comment=comment))

    def add_dkim_result(self, domain, selector, result):
        self.dkim_results.append(_DKIMResult(domain=domain,
                                             selector=selector,
                                             result=result))

    def add_spf_result(self, domain, scope, result):
        self.spf_results.append(_SPFResult(domain=domain,
                                           scope=scope,
                                           result=result))

    def save(self, **kwargs):
        self.passed_dmarc = self.spf_aligned or self.dkim_aligned

        return super().save(**kwargs)
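
A brief sketch of assembling an aggregate-report document with the helper methods above (all values are illustrative):

doc = _AggregateReportDoc(
    org_name="example.net",
    report_id="2023.01.01.example",
    source_ip_address="203.0.113.7",
    spf_aligned=True,
    dkim_aligned=False,
)
doc.add_policy_override(type_="forwarded", comment="mailing list")
doc.add_dkim_result(domain="example.com", selector="default", result="pass")
doc.add_spf_result(domain="example.com", scope="mfrom", result="pass")
doc.save()  # derives passed_dmarc from the spf/dkim alignment flags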
Example #22
def text_with_raw():
    return Text(fields=dict(raw=Keyword()))
Example #23
class PoeItem(InnerDoc):
    """
    items
    """

    abyssJewel = Boolean()
    additionalProperties = Boolean(multi=True)
    artFilename = Text()
    category = Nested(PoeCategory)
    corrupted = Boolean()
    cosmeticMods = Text(multi=True)
    craftedMods = Text(multi=True)
    descrText = Text()
    duplicated = Boolean()
    elder = Boolean()
    enchantMods = Text(multi=True)
    explicitMods = Text(multi=True)
    flavourText = Text(multi=True)
    frameType = Integer()
    h = Integer()
    icon = Keyword()
    id = Keyword()
    identified = Boolean()
    ilvl = Integer()
    implicitMods = Text(multi=True)
    inventoryId = Text()
    isRelic = Boolean()
    league = Keyword()
    lockedToCharacter = Boolean()
    maxStackSize = Integer()
    name = Text()
    nextLevelRequirements = Nested(PoePropsReqs, multi=True)
    note = Keyword()
    properties = Nested(PoePropsReqs, multi=True)
    prophecyDiffText = Text()
    prophecyText = Text()
    requirements = Nested(PoePropsReqs, multi=True)
    secDescrText = Text()
    shaper = Boolean()
    socketedItems = Nested()
    sockets = Nested(PoeSockets)
    stackSize = Integer()
    support = Boolean()
    talismanTier = Integer()
    typeLine = Text()
    utilityMods = Text(multi=True)
    verified = Boolean()
    w = Integer()
    x = Integer()
    y = Integer()
Example #24
class ModsecRule(Document):
    rule_id = Integer()
    msg = Text(analyzer='snowball', fields={'raw': Keyword()})
    body = Text(analyzer='snowball')
    tags = Keyword()
    rule_txt = Text(analyzer='snowball')
    severity = Text(analyzer='snowball')
    phase = Text(analyzer='snowball')
    rev = Text(analyzer='snowball')
    maturity = Text(analyzer='snowball')
    accuracy = Text(analyzer='snowball')
    ver = Text(analyzer='snowball')
    filename = Text(analyzer='snowball')
    # Added 2019-10-07
    category = Text(analyzer='snowball')

    class Index:
        name = 'ngx_modsec_rules'
        settings = {
            "number_of_shards": 2,
        }

    def save(self, **kwargs):
        return super(ModsecRule, self).save(**kwargs)
Example #25
class LagouJobType(Document):
    """
    知乎answer类型
    """
    # Completion是es自带的自动补全提示工具
    # 由于报错,所以自定义了analyzer,但实际上什么都没做
    suggest = Completion(analyzer=ik_analyzer)
    # 以下是scrapy的item
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    job_city = Text(analyzer="ik_max_word")
    work_years = Text(analyzer="ik_max_word")
    degree_need = Text(analyzer="ik_max_word")
    job_type = Text(analyzer="ik_max_word")
    # Not processed further yet; kept as Text for now
    publish_time = Text(analyzer="ik_max_word")
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Index:
        name = 'lagou'
        doc_type = "job"

    class Meta:
        doc_type = "job"
Example #26
class EventIndex(DocType):
    name = Text()
    created_at = Text()

    class Meta:
        index = 'event-index'
Example #27
class Lagou(DocType):
    # Lagou.com job posting fields
    title_suggest = Completion(analyzer=ik_analyzer, search_analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word', search_analyzer="ik_max_word", fields={'title': Keyword()})
    id = Text()
    url = Text()
    salary = Text()
    job_city = Text()
    work_years = Text()
    degree_need = Text()
    job_type = Text()
    publish_time = Text()
    job_advantage = Text()
    job_desc = Text()
    job_addr = Text()
    company_name = Text()
    company_url = Text()
    tags = Text(analyzer='ik_max_word', fields={'tags': Keyword()})
    crawl_time = Date()

    class Meta:
        index = 'jobbole'
        doc_type = 'lagou_job'
Example #28
class DocTestSSLResult(Document):

    source = Text(fields={'raw': Keyword()})
    result = Boolean()
    timestamp = Date()
    ip = Keyword()
    hostname = Keyword()
    port = Integer()
    svcid = Keyword()
    protocols = Keyword(multi=True)
    ciphers = Text(multi=True, fields={'raw': Keyword()})
    ciphertests = Keyword(multi=True)
    serverpref = Object(
        properties={
            "cipher_order": Boolean(),
            "protocol": Keyword(),
            "cipher": Text(fields={'raw': Keyword()})
        })
    cert = Object(
        properties={
            "keysize": Short(),
            "signalgo": Text(fields={'raw': Keyword()}),
            "md5_fingerprint": Keyword(),
            "sha1_fingerprint": Keyword(),
            "sha256_fingerprint": Keyword(),
            "cn": Text(fields={'raw': Keyword()}),
            "san": Text(multi=True, fields={'raw': Keyword()}),
            "issuer": Text(fields={'raw': Keyword()}),
            "ev": Boolean(),
            "expiration": Date(),
            "ocsp_uri": Text(fields={'raw': Keyword()}),
            "Crl_url": Text(fields={'raw': Keyword()}),
            "ocsp_stapling": Boolean(),
        })
    vulnerabilities = Keyword(multi=True)

    def parseCSVLine(self, line):
        if line['id'] == "id":
            return

        if not self.ip or not self.hostname or not self.port:  # host, ip and port
            m = reIpHostColumn.search(line['fqdn/ip'])
            if m:
                self.hostname, self.ip = m.groups()
            self.port = int(line['port'])

        if reProtocol.search(line['id']) and reOffers.search(
                line['finding']):  # protocols
            self.result = True
            m = reProtocol.search(line['id'])
            if m:
                self.protocols.append(line['id'].upper())

        elif reCipherColumnName.search(line['id']):  # ciphers IT WORKS
            m = reCipherDetails.search(line['finding'])
            if m:
                self.ciphers.append(m.group(1))

        elif reCipherTests.search(line['id']) and reVulnerable.search(
                line['finding']):  # cipher tests
            m = reCipherTests.search(line['id'])
            print(m)
            if m:
                self.ciphertests.append(m.group(1))

        if line['id'] == "cipher_order":  # server prefers cipher IT WORKS
            self.serverpref.cipher_order = bool(reOk.search(line['severity']))

        elif line[
                'id'] == "protocol_negotiated":  # preferred protocol IT WORKS
            m = reDefaultProtocol.search(line['finding'])

            if m:
                self.serverpref.protocol = m.group(1)

        elif line['id'] == "cipher_negotiated":  # preferred cipher  IT WORKS
            m = reDefaultCipher.search(line['finding'])
            if m:
                self.serverpref.cipher = m.group(1)

        elif line['id'] == "cert_keySize":  # certificate key size IT WORKS
            m = reKeySize.search(line['finding'])
            if m:
                self.cert.keysize = int(m.group(1))

        elif line[
                'id'] == "cert_signatureAlgorithm":  # certificate sign algorithm IT WORKS
            m = reSignAlgorithm.search(line['finding'])
            if m:
                self.cert.signalgo = m.group(1)

        elif line[
                'id'] == "cert_fingerprintSHA1":  # certificate fingerprints SHA1 IT WORKS

            m = reFPSHA1.search(line['finding'])
            if m:
                self.cert.sha1_fingerprint = m.group(1)

        elif line[
                'id'] == "cert_fingerprintSHA256":  # certificate fingerprints SHA256 IT WORKS

            m = reFPSHA256.search(line['finding'])
            if m:
                self.cert.sha256_fingerprint = m.group(1)

        elif line[
                'id'] == "cert_fingerprintMD5":  # certificate fingerprints MD5 IT WORKS
            m = reFPMD5.search(line['finding'])
            if m:
                self.cert.md5_fingerprint = m.group(1)

        elif line['id'] == "cert_commonName":  # certificate CN IT WORKS
            m = reCN.search(line['finding'])
            if m:
                self.cert.cn = m.group(1)

        elif line[
                'id'] == "cert_subjectAltName":  # certificate SAN KINDA WORKS NEEDS REVISION
            m = reSAN.search(line['finding'])
            #print(m)
            if m:
                self.cert.san = m.group(1)

                # sans = m.group(1)
                # for san in sans.split(" "):
                #     if san != "--":
                #         self.cert.san.append(san)

        elif line['id'] == "cert_caIssuers":  # certificate issuer IT WORKS
            m = reIssuer.search(line['finding'])
            if m:
                self.cert.issuer = m.group(1)

        elif line['id'] == "ev":  # certificate extended validation NOT SUERE
            self.cert.ev = bool(reYes.search(line['finding']))

        elif line['id'] == "cert_notAfter":  # certificate expiration IT WORKS
            m = reExpiration.search(line['finding'])
            if m:
                unparsedDate = m.group(1)
                self.cert.expiration = datetime.strptime(
                    unparsedDate, "%Y-%m-%d %H:%M")

        elif line[
                'id'] == "cert_ocspURL":  # certificate OCSP URI IT WORKS ELSE NEEDS REWORK
            m = reOCSPURI.search(line['finding'])
            #print(m)
            if m:
                self.cert.ocsp_uri = m.group(1)
            else:
                self.cert.ocsp_uri = "-"

        elif line[
                'id'] == "cert_crlDistributionPoints":  # certificate CRL WORKS
            m = reAll.search(line['finding'])
            #print(m)
            if m:
                self.cert.Crl_url = m.group(1)
            else:
                self.cert.Crl_url = "-"

        elif line['id'] == "OCSP_stapling":  # certificate OCSP stapling
            self.cert.ocsp_stapling = not bool(
                reNotOffered.search(line['finding']))

        elif line['id'] in ("heartbleed", "CCS", "secure_renego",
                            "secure_client_renego", "CRIME_TLS", "SWEET32",
                            "POODLE_SSL", "fallback_SCSV", "FREAK", "DROWN",
                            "LOGJAM", "BEAST", "LUCKY13",
                            "RC4") and reVulnerable.search(line['severity']):
            m = reVulnerable.search(line['severity'])
            if str(m.group(1)) != '':
                self.vulnerabilities.append(line['id'].upper())

    def parseCSV(self, csvfile):
        if self.source:
            m = reDefaultFilename.search(self.source)
            if m:
                self.ip = m.group('ip')
                self.port = int(m.group('port') or 0)
                self.timestamp = datetime.strptime(m.group('datetime'),
                                                   "%Y%m%d-%H%M")
        csvReader = csv.DictReader(csvfile,
                                   fieldnames=("id", "fqdn/ip", "port",
                                               "severity", "finding", "cve",
                                               "cwe"),
                                   delimiter=',',
                                   quotechar='"')
        for line in csvReader:
            self.parseCSVLine(line)

    def save(self, **kwargs):
        if not self.timestamp:
            self.timestamp = datetime.now(tz)
        if not self.port:
            raise ValueError("Empty scan result")

        self.svcid = "%s:%d" % (self.ip, int(self.port) or 0)
        if not self.result:
            self.result = False

        if 'debug' in kwargs and kwargs['debug']:
            pp.pprint(self.to_dict())
        return super().save()
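
A sketch of feeding a testssl.sh CSV export into the parser above; the filename pattern, the module-level regexes, tz and the Elasticsearch connection are assumed from the original module:

with open("20230101-1200_example.com_p443.csv", newline="") as csvfile:
    scan = DocTestSSLResult(source=csvfile.name)
    scan.parseCSV(csvfile)
scan.save(debug=True)  # pretty-prints the parsed document, then indexes it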
Example #29
class LagouType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Text(analyzer="ik_max_word")
    salary_max = Text(analyzer="ik_max_word")
    job_city = Keyword()
    work_years_min = Text(analyzer="ik_max_word")
    work_years_max = Text(analyzer="ik_max_word")
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Text(analyzer="ik_max_word")
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "job"
Example #30
class Event(DocType):
    title = Text(
        multi=True,
        fields={
            'en': String(analyzer='english'),
            'ru': String(analyzer='russian'),
            # 'raw': String(analyzer='trigrams'),
            'raw': Keyword(),
        },
        analyzer='russian')
    title_localized = Object(doc_class=Locales,
                             properties={
                                 'raw': String(analyzer='trigrams'),
                                 'en': String(analyzer='english'),
                                 'ru': String(analyzer='russian'),
                                 'de': String(analyzer='dutch'),
                                 'fr': String(analyzer='french'),
                                 'es': String(analyzer='spanish'),
                             })
    title_suggest = Completion()
    description = Text(analyzer='russian')
    description_localized = Object(doc_class=Locales,
                                   properties={
                                       'raw': String(analyzer='trigrams'),
                                       'en': String(analyzer='english'),
                                       'ru': String(analyzer='russian'),
                                       'de': String(analyzer='dutch'),
                                       'fr': String(analyzer='french'),
                                       'es': String(analyzer='spanish'),
                                   })
    slug = Text(fields={'raw': Keyword()})
    provider = Text()
    provider_id = Text()
    ticket_price = Text()
    ticket_url = Text()
    video_url = Text()
    deleted = Boolean()
    image = Text()
    average_rate = Float()
    category = Object(
        properties={
            'title': Text(fields={'raw': Keyword()}, analyzer='russian'),
            'slug': Text(fields={'raw': Keyword()})
        })
    place = Object(doc_class=Place,
                   properties={
                       'place_id':
                       Text(fields={'raw': Keyword()}),
                       'provider_id':
                       Text(fields={'raw': Keyword()}),
                       'slug':
                       String(),
                       'title':
                       Text(fields={'raw': Keyword()}, analyzer='russian'),
                       'title_localized':
                       Object(doc_class=Locales,
                              properties={
                                  'raw': String(analyzer='trigrams'),
                                  'en': String(analyzer='english'),
                                  'ru': String(analyzer='russian'),
                                  'de': String(analyzer='dutch'),
                                  'fr': String(analyzer='french'),
                                  'es': String(analyzer='spanish'),
                              }),
                       'address':
                       Text(fields={'raw': Keyword()}, analyzer='russian'),
                       'address_localized':
                       Object(doc_class=Locales,
                              properties={
                                  'raw': String(analyzer='trigrams'),
                                  'en': String(analyzer='english'),
                                  'ru': String(analyzer='russian'),
                                  'de': String(analyzer='dutch'),
                                  'fr': String(analyzer='french'),
                                  'es': String(analyzer='spanish'),
                              }),
                       'city':
                       Text(fields={'raw': Keyword()}, analyzer='russian'),
                       'description':
                       Text(),
                       'description_localized':
                       Object(doc_class=Locales,
                              properties={
                                  'raw': String(analyzer='trigrams'),
                                  'en': String(analyzer='english'),
                                  'ru': String(analyzer='russian'),
                                  'de': String(analyzer='dutch'),
                                  'fr': String(analyzer='french'),
                                  'es': String(analyzer='spanish'),
                              }),
                       'lat':
                       Float(),
                       'lng':
                       Float(),
                       'geometry':
                       GeoPoint(),
                       'email':
                       Text(),
                       'website':
                       Text(),
                       'phone':
                       Text()
                   })
    images = Nested(properties={
        'image': Text(),
    })
    dates = Object(doc_class=Schedule,
                   properties={
                       'start_date':
                       Date(format="YYYY-MM-dd||"
                            "YYYY-MM-dd'T'HH:mm:ss"),
                       'end_date':
                       Date(format="YYYY-MM-dd||"
                            "YYYY-MM-dd'T'HH:mm:ss")
                   })
    schedules = Object(doc_class=Schedule,
                       properties={
                           'start_date':
                           Date(format="YYYY-MM-dd||"
                                "YYYY-MM-dd'T'HH:mm:ss.SSS||"
                                "YYYY-MM-dd'T'HH:mm:ss||"
                                "YYYY-MM-dd'T'HH:mm:ssZ||"
                                "dd.MM.YYYY'T'HH:mm:ss"),
                           'end_date':
                           Date(format="YYYY-MM-dd||"
                                "YYYY-MM-dd'T'HH:mm:ss.SSS||"
                                "YYYY-MM-dd'T'HH:mm:ss||"
                                "YYYY-MM-dd'T'HH:mm:ssZ||"
                                "dd.MM.YYYY'T'HH:mm:ss")
                       })
    date_added = Date()
    counters = Object(doc_class=Counter,
                      properties={
                          'favorites_count': Integer(),
                          'interested_count': Integer(),
                          'rating': Integer(),
                      })
    source_url = String()
    min_price = Integer()
    max_price = Integer()
    is_free = Boolean()
    currency = String()
    formatted_price = String()

    class Meta:
        doc_type = 'events'
        index = 'event-index'

    def save(self, **kwargs):
        if not self.date_added:
            self.date_added = datetime.now()
        return super(Event, self).save(**kwargs)

    @classmethod
    def search_events(cls, limit=100, **kwargs):
        """
        Build an ES query, filtered by `kwargs` params.
        kwargs = {
            'q': 'query string',
            'start_date': 'now'
            'end_date': 'now+7d',
            'radius': 0,
            'lat': 0.0,
            'lng': 0.0,
            'actual': False,
            ....
        }
        """
        search = cls.search()
        search = search.exclude('term', deleted=True)

        observable = kwargs.get('observable', False)
        start_date = kwargs.get('start_date', None)
        end_date = kwargs.get('end_date', None)

        limit_from = kwargs.get('limit_from', 0)
        limit_to = kwargs.get('limit_to', 18)

        if not start_date:
            search = search.query(
                'range',
                **{'schedules.end_date': {
                    'lte': 'now+1y/d',
                    'gte': 'now/d'
                }})

        if 'q' in kwargs and kwargs['q']:
            search = search.query('multi_match',
                                  query=kwargs['q'],
                                  type='most_fields',
                                  minimum_should_match='75%',
                                  operator='and',
                                  tie_breaker=0.8,
                                  fields=[
                                      'title^4', 'description', 'place.city',
                                      'place.title^2', 'place.address'
                                  ])

        if start_date and end_date:
            search = search.query(
                'range', **{
                    'schedules.start_date': {
                        'gte': '{start_date}||/d'.format(**kwargs),
                        'lte': '{end_date}||/d'.format(**kwargs)
                    }
                })

        elif start_date:
            search = search.query(
                'range', **{
                    'schedules.start_date': {
                        'gte': '{start_date}||/d'.format(**kwargs),
                        'lte': '{start_date}||/d'.format(**kwargs)
                    }
                })
        elif end_date:
            search = search.query(
                'range', **{
                    'schedules.start_date': {
                        'gte': '{end_date}||/d'.format(**kwargs),
                        'lte': '{end_date}||/d'.format(**kwargs)
                    }
                })

        if 'radius' in kwargs and 'lat' in kwargs and 'lng' in kwargs:
            search = search.query('bool',
                                  filter=Q(
                                      'geo_distance',
                                      distance='{radius}m'.format(**kwargs),
                                      **{
                                          'place.geometry': {
                                              'lat': kwargs['lat'],
                                              'lon': kwargs['lng']
                                          }
                                      }))

        search = search.sort('schedules.start_date')
        search = search[limit_from:limit_to]
        return search if observable else search.execute()
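
A usage sketch for the query builder above; the connection is assumed and the parameter values are illustrative:

response = Event.search_events(
    q="jazz concert",
    start_date="2024-06-01",
    end_date="2024-06-07",
    lat=55.75,
    lng=37.61,
    radius=5000,
    limit_from=0,
    limit_to=18,
)
for event in response:
    print(event.title, event.dates.start_date)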