class QA(InnerDoc):
    """Question/answer pair embedded in a parent document."""
    ans_id = Integer()
    ans_str = Text(fields={'raw': Keyword()})  # analyzed text + exact-match sub-field
    query_id = Integer()
    query_str = Text()
class BlogPostIndex(DocType):
    """Blog post search document backed by the ``blogpost-index`` index."""
    id = Integer()
    title = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    char_num = Integer()
    allow_comments = Boolean()
    vote_num = Integer()
    category = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    tags = Text(analyzer="ik_max_word", search_analyzer="ik_max_word")
    publish_date = Date()
    suggestions = Completion()

    class Meta:
        index = "blogpost-index"

    @classmethod
    def add(cls, **kwargs):
        """Index a new post; ``id`` is required and becomes the ES doc id.

        Returns the saved document, or False when ``id`` is missing.
        """
        doc_id = kwargs.pop("id", None)
        if doc_id is None:
            return False
        post = cls(meta={"id": doc_id}, **kwargs)
        post.save()
        return post

    @staticmethod
    def search_posts(words, delim="......<br>......<br>......"):
        """Assemble results using a raw query body.

        :param words: term searched in tags/title/content
        :param delim: separator joining highlighted content fragments
        :return: list of dicts with origin_title/title/content keys
        """
        body = {
            "_source": ["title", "category", "tags", "publish_date"],
            "query": {
                "bool": {
                    "must": [],
                    "must_not": [],
                    "should": [
                        {"term": {"tags": "{}".format(words)}},
                        {"term": {"title": "{}".format(words)}},
                        {"term": {"content": "{}".format(words)}},
                    ],
                }
            },
            "highlight": {
                "number_of_fragments": 3,
                "fragment_size": 150,
                "fields": {
                    "title": {"pre_tags": ["<em>"], "post_tags": ["</em>"]},
                    "content": {"pre_tags": ["<em>"], "post_tags": ["</em>"]},
                },
            },
            "from": 0,
            "size": 50,
            "sort": [],
            "aggs": {},
        }
        response = es_client.search(index="blogpost-index", body=body)
        results = []
        for hit in response["hits"]["hits"]:
            highlight = hit.get("highlight", None)
            if not highlight:
                # hits without highlights are skipped entirely
                continue
            if highlight.get("title", None):
                title = "".join(highlight["title"])
            else:
                title = hit["_source"]["title"]
            if highlight.get("content", None):
                content = delim.join(highlight["content"]) + "......<br>"
            else:
                content = ""
            results.append({
                "origin_title": hit["_source"]["title"],
                "title": title,
                "content": content,
            })
        return results

    @staticmethod
    def suggest_word(words):
        """Completion-suggester lookup returning up to 10 suggestion texts."""
        body = {
            "_source": False,
            "suggest": {
                "search-as-you-type-suggestion": {
                    "prefix": "{}".format(words),
                    "completion": {
                        "field": "suggestions",
                        "size": 10,
                        "fuzzy": {"fuzziness": 2},
                        "skip_duplicates": True,
                    },
                }
            },
        }
        response = es_client.search(index="blogpost-index", body=body)
        suggestions = response["suggest"]["search-as-you-type-suggestion"]
        if not suggestions:
            return []
        return [option["text"] for option in suggestions[0]["options"]]

    @staticmethod
    def similar_recommends_post(words):
        # Not implemented yet.
        pass
class Listing(Document):
    """Base class containing the common fields."""
    access = Text()
    additional_house_rules = Text()
    allows_events = Boolean()
    amenities = Keyword(multi=True)
    amenity_ids = Keyword(multi=True)
    avg_rating = Float()
    bathrooms = Float()
    bedrooms = Integer()
    beds = Integer()
    business_travel_ready = Boolean()
    city = Text(fields={'keyword': Keyword()}, required=True)
    country = Text(fields={'keyword': Keyword()}, required=True)
    coordinates = GeoPoint()
    description = Text()
    host_id = Integer(fields={'keyword': Keyword()})
    house_rules = Text()
    interaction = Text()
    is_hotel = Boolean()
    max_nights = Integer()
    min_nights = Integer()
    monthly_price_factor = Float()
    name = Text(fields={'keyword': Keyword()}, required=True)
    neighborhood_overview = Text()
    # notes = Text()
    person_capacity = Integer()
    photo_count = Integer()
    photos = Keyword(multi=True)
    place_id = Text(fields={'keyword': Keyword()})
    price_rate = Float()
    price_rate_type = Text(fields={'keyword': Keyword()}, required=True)
    province = Text(fields={'keyword': Keyword()})
    rating_accuracy = Float()
    rating_checkin = Float()
    rating_cleanliness = Float()
    rating_communication = Float()
    rating_location = Float()
    rating_value = Float()
    review_count = Integer()
    reviews = Nested()
    room_and_property_type = Text(fields={'keyword': Keyword()}, required=True)
    room_type = Text(fields={'keyword': Keyword()}, required=True)
    room_type_category = Text(fields={'keyword': Keyword()}, required=True)
    satisfaction_guest = Float()
    star_rating = Float()
    state = Text(fields={'keyword': Keyword()}, required=True)
    transit = Text()
    url = Text(fields={'keyword': Keyword()}, required=True)
    weekly_price_factor = Float()

    class Index:
        name = 'scrapy_airbnb_listing'

    def save(self, **kwargs):
        # Plain passthrough; kept as an explicit extension point.
        return super(Listing, self).save(**kwargs)
class _EmailAddressDoc(InnerDoc):
    """An e-mail address with its optional display name."""
    display_name = Text()
    address = Text()
class Smlouva(Document):
    """Contract record loaded from the MZP open-data feed."""
    contractid = Keyword()
    contractor = Keyword()
    contractoraddresscity = Keyword()
    contractoraddressstreet = Keyword()
    contractoraddresszip = Keyword()
    contractorcompany = Keyword()
    contractorid = Keyword()
    contracttitle = Text(fields={'raw': Keyword()})
    contracttype = Keyword()
    dateconclusion = Date()
    datevalidity = Date()
    date_updated = Date()
    form = Keyword()
    ico = Keyword()
    id = Keyword()
    keyword = Keyword()
    originator = Keyword()
    originatorico = Keyword()
    radaevidcislo = Keyword()
    rok = Integer()
    title = Text(fields={'raw': Keyword()})
    total = Keyword()
    valuewithvat = Float()

    class Index:
        name = INDEX_MZP_SMLOUVA

    def save(self, **kwargs):
        """Stamp ``date_updated`` and persist; returns None on an ES error."""
        try:
            # NOTE(review): relies on elasticsearch_dsl re-exporting datetime — confirm.
            self.date_updated = elasticsearch_dsl.datetime.now()
            return super(Smlouva, self).save(**kwargs)
        except ElasticsearchDslException as err:
            print("CHYBA ES: {0}".format(err))
            return None

    def load_data(self, data):
        """Populate this document from one raw contract dict of the feed."""
        self.rok = -1

        self.dateconclusion = None
        conclusion_iso = consolidate_date(data['DateConclusion'])
        if conclusion_iso is not None:
            pieces = conclusion_iso.split('-')
            self.dateconclusion = datetime.date(
                int(pieces[0]), int(pieces[1]), int(pieces[2]))

        self.datevalidity = None
        validity_iso = consolidate_date(data['DateValidity'])
        if validity_iso is not None:
            pieces = validity_iso.split('-')
            self.datevalidity = datetime.date(
                int(pieces[0]), int(pieces[1]), int(pieces[2]))

        self.contractid = data['ContractID']
        self.title = data['Title']
        self.contractorid = data['ContractorID']
        self.contractorcompany = data['ContractorName']

        # Normalize the money string (e.g. "1 234,56 Kč" -> "1234.56").
        raw_value: str = str(data['ValueWithVAT'])
        for old, new in ((' ', ''), ('Kč', ''), (',', '.')):
            raw_value = raw_value.replace(old, new)
        if not isfloat(raw_value):
            # Fall back to the first run of digits, or -1 as a sentinel.
            digits = re.findall(r'\d+', raw_value)
            raw_value = digits[0] if digits else '-1'
        self.valuewithvat = float(raw_value)

        if self.dateconclusion is not None:
            self.rok = int(conclusion_iso.split('-')[0])

        # Derived / duplicated presentation fields.
        self.contracttitle = self.title
        self.contractor = self.contractorcompany
        self.keyword = [
            "MZP", "MŽP", "smlouva", "OpenData", "MUZO", "JASU", "EKIS"
        ]
        self.form = "smlouva"
        self.ico = self.contractorid
        self.total = str(self.valuewithvat)
        self.originator = "Ministerstvo životního prostředí"
        self.originatorico = "00164801"
        self.id = self.contractid
class _PolicyOverride(InnerDoc):
    """A DMARC policy-override reason with its comment."""
    type = Text()
    comment = Text()
class _SPFResult(InnerDoc):
    """SPF check outcome for one domain/scope."""
    domain = Text()
    scope = Text()
    results = Text()
class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.

    It stores a bit of meta data so we don't have to hit the db when
    rendering search results. The search view will be using the 'lang'
    and 'version' fields of the document's release to filter the search
    results, depending which was found in the URL.

    The breadcrumbs are shown under the search result title.
    """
    model = Document

    id = Long()
    title = Text(analyzer=lower_whitespace_analyzer, boost=1.2)
    path = Text(index='no', analyzer=path_analyzer)
    content = Text(analyzer=lower_whitespace_analyzer)
    content_raw = Text(index_options='offsets')
    release = Object(properties={
        'id': Long(),
        'version': Keyword(),
        'lang': Keyword(),
    })
    breadcrumbs = Nested(properties={
        'title': Keyword(),
        'path': Keyword(),
    })

    class Meta:
        index = 'docs'
        doc_type = 'document'

    @classmethod
    def alias_to_main_index(cls, index_name, using=None):
        """
        Alias `index_name` to the main index (`cls._doc_type.index`) and
        delete whichever index previously held that alias.
        """
        # Consistency fix: use cls._doc_type.index everywhere instead of a
        # hard-coded 'docs' literal in the get_alias() lookup.
        alias = cls._doc_type.index
        body = {'actions': [{'add': {'index': index_name, 'alias': alias}}]}
        client = connections.get_connection(using or cls._doc_type.using)
        client.indices.refresh(index=index_name)
        try:
            old_index_name = list(client.indices.get_alias(alias).keys())[0]
        except elasticsearch.exceptions.NotFoundError:
            old_index_name = None
        else:
            body['actions'].append(
                {'remove': {'index': old_index_name, 'alias': alias}})
        client.indices.update_aliases(body=body)
        # Delete the old index that was aliased to the main index.
        if old_index_name:
            client.indices.delete(old_index_name)

    @classmethod
    def index_queryset(cls):
        """Queryset of Document rows worth indexing."""
        qs = super(DocumentDocType, cls).index_queryset()
        return (
            # don't index the module pages since source code is hard to
            # combine with full text search
            qs.exclude(path__startswith='_modules')
            # not the crazy big flattened index of the CBVs
            .exclude(path__startswith='ref/class-based-views/flattened-index')
            .select_related('release'))

    @classmethod
    def from_django(cls, obj):
        """Build an (unsaved) search doc from a Django model instance."""
        # Turn HTML entities back into unicode characters and remove all
        # HTML tags, i.e. the "plain text" version of the document.
        raw_body = strip_tags(unescape_entities(obj.body).replace(u'¶', ''))
        doc = cls(path=obj.path, title=obj.title, content=obj.body,
                  content_raw=raw_body, meta={'id': obj.id})
        doc.release = {
            'id': obj.release.id,
            'lang': obj.release.lang,
            'version': obj.release.version,
        }
        doc.breadcrumbs = [
            {'title': crumb.title, 'path': crumb.path}
            for crumb in cls.model.objects.breadcrumbs(obj)
        ]
        return doc

    def get_absolute_url(self):
        """Canonical URL for this document."""
        return document_url(self)
class Declaration(DocType, RelatedDeclarationsMixin):
    """Declaration document. Assumes there's a dynamic mapping with all fields
    not indexed by default."""

    general = Object(properties={
        'full_name_suggest': Completion(preserve_separators=False),
        'full_name': Text(index=True, analyzer='ukrainian'),
        'name': Text(index=True, analyzer='ukrainian'),
        'patronymic': Text(index=True, analyzer='ukrainian'),
        'last_name': Text(index=True, analyzer='ukrainian'),
        'family_raw': Text(index=True, analyzer='ukrainian'),
        'family': Nested(properties={
            'name': Text(index=True, analyzer='ukrainian'),
            'relations': Keyword(index=False),
            'inn': Keyword(index=False),
        }),
        'post_raw': Text(index=True, analyzer='ukrainian'),
        'post': Object(properties={
            'region': Text(index=True, analyzer='ukrainian',
                           fields={'raw': Keyword(index=True)}),
            'office': Text(index=True, analyzer='ukrainian',
                           fields={'raw': Keyword(index=True)}),
            'post': Text(index=True, analyzer='ukrainian',
                         fields={'raw': Keyword(index=True)}),
        }),
        'addresses': Nested(properties={
            'place': Text(index=False),
            'place_hidden': Boolean(index=False),
            'place_district': Text(index=False),
            'place_district_hidden': Boolean(index=False),
            'place_city': Text(index=False),
            'place_city_hidden': Boolean(index=False),
            'place_city_type': Keyword(index=False),
            'place_city_type_hidden': Boolean(index=False),
            'place_address': Text(index=False),
            'place_address_hidden': Boolean(index=False),
            'place_address_type': Keyword(index=False),
        }),
    })

    declaration = Object(properties={
        'date': NoneAwareDate(),
        'notfull': Boolean(index=False),
        'notfull_lostpages': Keyword(index=False),
        'additional_info': Boolean(index=False),
        'additional_info_text': Text(index=False),
        'needs_scancopy_check': Boolean(index=False),
    })

    intro = Object(properties={
        'declaration_year': Keyword(index=True),
    })

    ft_src = Text(index=True, analyzer='ukrainian')

    # Form sections 5-20 share one flat property set; 21-22 are list entries.
    INCOME_SINGLE_PROPERTIES = {
        'value': Keyword(index=False),
        'value_unclear': Boolean(index=False),
        'comment': Text(index=False),
        'family': Keyword(index=False),
        'family_unclear': Boolean(index=False),
        'family_comment': Text(index=False),
    }
    INCOME_LIST_PROPERTIES = {
        'country': Keyword(index=False),
        'country_comment': Text(index=False),
        'cur': Keyword(index=False),
        'cur_units': Keyword(index=False),
        'uah_equal': Keyword(index=False),
    }
    # NOTE: the per-section entries below stay explicit because a dict
    # comprehension at class scope cannot reference class-level names.
    income = Object(properties={
        '5': Object(properties=INCOME_SINGLE_PROPERTIES),
        '6': Object(properties=INCOME_SINGLE_PROPERTIES),
        '7': Object(properties=INCOME_SINGLE_PROPERTIES),
        '8': Object(properties=INCOME_SINGLE_PROPERTIES),
        '9': Object(properties=INCOME_SINGLE_PROPERTIES),
        '10': Object(properties=INCOME_SINGLE_PROPERTIES),
        '11': Object(properties=INCOME_SINGLE_PROPERTIES),
        '12': Object(properties=INCOME_SINGLE_PROPERTIES),
        '13': Object(properties=INCOME_SINGLE_PROPERTIES),
        '14': Object(properties=INCOME_SINGLE_PROPERTIES),
        '15': Object(properties=INCOME_SINGLE_PROPERTIES),
        '16': Object(properties=INCOME_SINGLE_PROPERTIES),
        '17': Object(properties=INCOME_SINGLE_PROPERTIES),
        '18': Object(properties=INCOME_SINGLE_PROPERTIES),
        '19': Object(properties=INCOME_SINGLE_PROPERTIES),
        '20': Object(properties=INCOME_SINGLE_PROPERTIES),
        '21': Nested(properties=INCOME_LIST_PROPERTIES),
        '22': Nested(properties=INCOME_LIST_PROPERTIES),
    })

    # Real-estate sections 23-34.
    ESTATE_PROPERTIES = {
        'region': Text(index=False),
        'address': Text(index=False),
        'space': Keyword(index=False),
        'space_units': Keyword(index=False),
        'space_comment': Text(index=False),
        'costs': Keyword(index=False),
        'costs_comment': Text(index=False),
        'costs_rent': Keyword(index=False),
        'costs_rent_comment': Text(index=False),
        'costs_property': Keyword(index=False),
        'costs_property_comment': Text(index=False),
    }
    estate = Object(properties={
        '23': Nested(properties=ESTATE_PROPERTIES),
        '24': Nested(properties=ESTATE_PROPERTIES),
        '25': Nested(properties=ESTATE_PROPERTIES),
        '26': Nested(properties=ESTATE_PROPERTIES),
        '27': Nested(properties=ESTATE_PROPERTIES),
        '28': Nested(properties=ESTATE_PROPERTIES),
        '29': Nested(properties=ESTATE_PROPERTIES),
        '30': Nested(properties=ESTATE_PROPERTIES),
        '31': Nested(properties=ESTATE_PROPERTIES),
        '32': Nested(properties=ESTATE_PROPERTIES),
        '33': Nested(properties=ESTATE_PROPERTIES),
        '34': Nested(properties=ESTATE_PROPERTIES),
    })

    # Vehicle sections 35-44.
    VEHICLE_PROPERTIES = {
        "brand": Text(index=False),
        "brand_info": Text(index=False),
        "year": Keyword(index=False),
        "sum": Keyword(index=False),
        "sum_comment": Text(index=False),
        "sum_rent": Keyword(index=False),
        "sum_rent_comment": Text(index=False),
        "brand_hidden": Boolean(index=False),
        "brand_info_hidden": Boolean(index=False),
        "brand_info_unclear": Boolean(index=False),
    }
    vehicle = Object(properties={
        '35': Nested(properties=VEHICLE_PROPERTIES),
        '36': Nested(properties=VEHICLE_PROPERTIES),
        '37': Nested(properties=VEHICLE_PROPERTIES),
        '38': Nested(properties=VEHICLE_PROPERTIES),
        '39': Nested(properties=VEHICLE_PROPERTIES),
        '40': Nested(properties=VEHICLE_PROPERTIES),
        '41': Nested(properties=VEHICLE_PROPERTIES),
        '42': Nested(properties=VEHICLE_PROPERTIES),
        '43': Nested(properties=VEHICLE_PROPERTIES),
        '44': Nested(properties=VEHICLE_PROPERTIES),
    })

    # Bank-deposit sections 45-53.
    BANKS_PROPERTIES = {
        'sum': Keyword(index=False),
        'sum_hidden': Boolean(index=False),
        'sum_units': Keyword(index=False),
        'sum_comment': Text(index=False),
        'sum_foreign': Keyword(index=False),
        'sum_foreign_units': Keyword(index=False),
        'sum_foreign_comment': Text(index=False),
    }
    banks = Object(properties={
        '45': Nested(properties=BANKS_PROPERTIES),
        '46': Nested(properties=BANKS_PROPERTIES),
        '47': Nested(properties=BANKS_PROPERTIES),
        '48': Nested(properties=BANKS_PROPERTIES),
        '49': Nested(properties=BANKS_PROPERTIES),
        '50': Nested(properties=BANKS_PROPERTIES),
        '51': Nested(properties=BANKS_PROPERTIES),
        '52': Nested(properties=BANKS_PROPERTIES),
        '53': Nested(properties=BANKS_PROPERTIES),
    })

    # Liability sections 54-64.
    LIABILITIES_PROPERTIES = {
        'sum': Keyword(index=False),
        'sum_comment': Text(index=False),
        'sum_units': Keyword(index=False),
        'sum_foreign': Keyword(index=False),
        'sum_foreign_comment': Text(index=False),
    }
    liabilities = Object(properties={
        '54': Nested(properties=LIABILITIES_PROPERTIES),
        '55': Nested(properties=LIABILITIES_PROPERTIES),
        '56': Nested(properties=LIABILITIES_PROPERTIES),
        '57': Nested(properties=LIABILITIES_PROPERTIES),
        '58': Nested(properties=LIABILITIES_PROPERTIES),
        '59': Nested(properties=LIABILITIES_PROPERTIES),
        '60': Nested(properties=LIABILITIES_PROPERTIES),
        '61': Nested(properties=LIABILITIES_PROPERTIES),
        '62': Nested(properties=LIABILITIES_PROPERTIES),
        '63': Nested(properties=LIABILITIES_PROPERTIES),
        '64': Nested(properties=LIABILITIES_PROPERTIES),
    })

    class Meta:
        index = 'declarations_v2'
class LagouType(DocType):
    """Lagou job posting document (jobbole index)."""
    suggest = Completion(analyzer=ik_analyzer)
    positionId = Keyword()  # posting id
    art_title = Text(analyzer="ik_max_word")
    art_time = Date()
    art_position = Text(analyzer="ik_max_word")
    art_salary = Text(analyzer="ik_max_word")  # salary
    art_work_year = Text(analyzer="ik_max_word")  # required years of experience
    art_education = Text(analyzer="ik_max_word")  # education requirement
    art_jobNature = Text(analyzer="ik_max_word")  # employment type (e.g. full-time)
    company_hitags = Keyword()  # company benefits
    art_company_name = Text(analyzer="ik_max_word")
    art_company_id = Keyword()
    company_type = Keyword()
    company_size = Keyword()
    company_financestage = Keyword()  # financing round
    company_label_list = Keyword()  # company selling points
    art_first_type = Text(analyzer="ik_max_word")  # specific job category
    art_second_type = Text(analyzer="ik_max_word")  # broad job category
    art_third_type = Text(analyzer="ik_max_word")  # language job category
    compangy_full_position = Text(analyzer="ik_max_word")  # full address
    art_description = Text(analyzer="ik_max_word")
    fingerprint = Keyword()

    class Meta:
        index = "jobbole"
        doc_type = "article"
class SFNIOT(InnerDoc):
    """IoT threat-event sub-document."""
    event_type = Text()
    domain_name = Text(analyzer='snowball', fields={'raw': Keyword()})
    device_name = Text(analyzer='snowball', fields={'raw': Keyword()})
    host = Text(analyzer='snowball', fields={'raw': Keyword()})
    threat_id = Text(analyzer='snowball')
    threat_name = Text(analyzer='snowball')
    tag_name = Text(fields={'raw': Keyword()})
    tag_class = Text(fields={'raw': Keyword()})
    tag_group = Text(fields={'raw': Keyword()})
    tag_description = Text(analyzer='snowball')
    public_tag_name = Text(analyzer='snowball')
    confidence_level = Integer()
    sample_date = Date()
    file_type = Text(fields={'raw': Keyword()})
    updated_at = Date()
    processed = Integer()
    src_ip = Ip()
    dst_ip = Ip()
class EntityDepartment(DocType):
    """Department entity stored in the medical base index."""
    name = Keyword()
    source_url = Text()

    class Meta:
        index = 'med_base'
class BlogPostIndex(DocType):
    """Simple blog-post search document.

    NOTE(review): a class of the same name is defined earlier in this
    module; the later definition shadows the earlier one — confirm both
    are intended.
    """
    author = Text()
    posted_date = Date()
    title = Text()
    text = Text()
class Item(Document):
    """Shop item search document kept in sync with the database model."""
    id = Integer()
    title = Text()
    description = Text()
    first_img = Text()
    basic_price = Float()
    price = Float()
    on_sale = Boolean()
    is_discounted = Boolean()
    created_at = Date()

    class Index:
        name = "flaskshop"

    @classmethod
    def add(cls, item):
        """Index `item` and return the created document."""
        doc = cls(**get_item_data(item))
        doc.save()
        return doc

    @classmethod
    def update_item(cls, item):
        """Update the indexed copy of `item`, creating it when missing."""
        try:
            doc = cls.get(item.id)
        except NotFoundError:
            return cls.add(item)
        payload = get_item_data(item)
        try:
            doc.update(**payload)
        except ConflictError:
            # version conflict: refetch and retry once
            doc = cls.get(item.id)
            doc.update(**payload)
        return True

    @classmethod
    def delete(cls, item):
        """Remove `item` from the index; True on success."""
        doc = cls.get(item.id)
        if doc:
            super(cls, doc).delete()
            return True
        return False

    @classmethod
    def bulk_update(cls, items, chunk_size=5000, op_type="update", **kwargs):
        """Bulk-sync `items` via parallel_bulk; returns the per-action results."""
        index_name = cls._index._name
        type_name = cls._doc_type.name
        actions = [
            {
                "_op_type": op_type,
                "_id": f"{doc.id}",
                "_index": index_name,
                "_type": type_name,
                "_source": get_item_data(doc),
            }
            for doc in items
        ]
        client = cls.get_es()
        return list(parallel_bulk(client, actions, chunk_size=chunk_size, **kwargs))

    @classmethod
    def get_es(cls):
        """Low-level Elasticsearch client behind this document's search."""
        return connections.get_connection(cls.search()._using)

    @classmethod
    def new_search(cls, query, page, order_by=None, per_page=16):
        """Paged multi-field search returning a Pagination wrapper."""
        search = cls.search().query(
            "multi_match", query=query, fields=SERACH_FIELDS)
        offset = (page - 1) * per_page
        search = search.extra(**{"from": offset, "size": per_page})
        if order_by is not None:
            search = search.sort(order_by)
        response = search.execute()
        return Pagination(query, page, per_page, response.hits.total, response)
class _EmailAttachmentDoc(Document):
    """An e-mail attachment identified by name, MIME type and hash."""
    filename = Text()
    content_type = Text()
    sha256 = Text()
class PodcastTranscriptionBlob(Document):
    """
    A group of 3-5 sentences from one podcast's full transcription text;
    the unit we search through when looking for podcasts.
    """
    podcast_id = Keyword()
    transcription_blob = Text()
    # Offsets in seconds since the start of the podcast's video.
    starting_timestamp_second = Integer()
    ending_timestamp_second = Integer()
    blob_index = Integer()
    lecture_num = Integer()

    # Filters that the user can search against.
    department = Text()
    course_num = Keyword()
    quarter = Keyword()
    professor = Text()
    section_id = Keyword()
    date = Date()

    # Elasticsearch index settings
    class Index:
        # NOTE(review): "transription" looks like a typo, but renaming the
        # index would orphan existing data — confirm before changing.
        name = "podcast_transription_blobs"

    def get_snippet_url(self):
        """URL of the endpoint serving this blob."""
        return url_for("podcasts.get_podcast_blob", blob_id=self.meta.id)

    @property
    def podcast(self):
        """The parent Podcast document."""
        return Podcast.get(id=self.podcast_id)

    def convert_to_dict(self):
        """Serializable representation including id and snippet href."""
        payload = self.to_dict(include_meta=False)
        payload['id'] = self.meta.id
        payload['href'] = self.get_snippet_url()
        return payload

    @staticmethod
    def search_podcasts(text_query, department=None, course_num=None,
                        professor=None, quarter=None, section_id=None,
                        page=1, count=10):
        """Paged full-text search over blobs with optional metadata filters."""
        criteria = [Q('match', transcription_blob=text_query)]
        optional_filters = (
            ('department', department),
            ('course_num', course_num),
            ('professor', professor),
            ('quarter', quarter),
            ('section_id', section_id),
        )
        criteria.extend(
            Q('match', **{field: value})
            for field, value in optional_filters if value)
        combined = Q('bool', must=criteria)

        # TODO: Should we automatically sort the results by the date &
        # starting_timestamp_second fields? Because the professor might first
        # introduce a topic in 1 lecture, and then refer to it in subsequent
        # lectures.
        search = PodcastTranscriptionBlob.search().highlight(
            "transcription_blob").query(combined)
        # Slice the search query for the requested page & count.
        offset = (page - 1) * count
        search = search[offset:offset + count]
        return search.execute()[:count]
class _ForensicReportDoc(Document):
    """A parsed DMARC forensic (failure) report."""
    class Index:
        name = "dmarc_forensic"

    feedback_type = Text()
    user_agent = Text()
    version = Text()
    original_mail_from = Text()
    arrival_date = Date()
    domain = Text()
    original_envelope_id = Text()
    authentication_results = Text()
    delivery_results = Text()
    source_ip_address = Ip()
    source_country = Text()
    source_reverse_dns = Text()
    source_authentication_mechanisms = Text()
    source_auth_failures = Text()
    dkim_domain = Text()
    original_rcpt_to = Text()
    sample = Object(_ForensicSampleDoc)
class User(InnerDoc):
    """A user referenced by name, with an exact-match sub-field."""
    name = Text(fields={'raw': Keyword()})
class _DKIMResult(InnerDoc):
    """DKIM check outcome for one domain/selector."""
    domain = Text()
    selector = Text()
    result = Text()
class History(InnerDoc):
    """A timestamped diff entry."""
    timestamp = Date()
    diff = Text()
class _AggregateReportDoc(Document):
    """One record of a parsed DMARC aggregate report."""
    class Index:
        name = "dmarc_aggregate"

    xml_schema = Text()
    org_name = Text()
    org_email = Text()
    org_extra_contact_info = Text()
    report_id = Text()
    date_range = Date()
    date_begin = Date()
    date_end = Date()
    errors = Text()
    published_policy = Object(_PublishedPolicy)
    source_ip_address = Ip()
    source_country = Text()
    source_reverse_dns = Text()
    source_Base_domain = Text()
    # BUG FIX: was `Integer` (the field class itself, never instantiated),
    # which does not declare a mapped field like the other attributes do.
    message_count = Integer()
    disposition = Text()
    dkim_aligned = Boolean()
    spf_aligned = Boolean()
    passed_dmarc = Boolean()
    policy_overrides = Nested(_PolicyOverride)
    header_from = Text()
    envelope_from = Text()
    envelope_to = Text()
    dkim_results = Nested(_DKIMResult)
    spf_results = Nested(_SPFResult)

    def add_policy_override(self, type_, comment):
        """Append a policy-override entry."""
        self.policy_overrides.append(_PolicyOverride(type=type_,
                                                     comment=comment))

    def add_dkim_result(self, domain, selector, result):
        """Append a DKIM authentication result."""
        self.dkim_results.append(_DKIMResult(domain=domain,
                                             selector=selector,
                                             result=result))

    def add_spf_result(self, domain, scope, result):
        """Append an SPF authentication result."""
        self.spf_results.append(_SPFResult(domain=domain,
                                           scope=scope,
                                           result=result))

    def save(self, ** kwargs):
        """Derive passed_dmarc (SPF or DKIM aligned) before persisting."""
        # The redundant `passed_dmarc = False` pre-assignment was removed;
        # it was immediately overwritten.
        self.passed_dmarc = self.spf_aligned or self.dkim_aligned
        return super().save(** kwargs)
def text_with_raw():
    """Analyzed Text field with an exact-match 'raw' keyword sub-field."""
    return Text(fields={'raw': Keyword()})
class PoeItem(InnerDoc):
    """A Path of Exile item as returned by the public stash API."""
    abyssJewel = Boolean()
    additionalProperties = Boolean(multi=True)
    artFilename = Text()
    category = Nested(PoeCategory)
    corrupted = Boolean()
    cosmeticMods = Text(multi=True)
    craftedMods = Text(multi=True)
    descrText = Text()
    duplicated = Boolean()
    elder = Boolean()
    enchantMods = Text(multi=True)
    explicitMods = Text(multi=True)
    flavourText = Text(multi=True)
    frameType = Integer()
    h = Integer()  # height in inventory cells
    icon = Keyword()
    id = Keyword()
    identified = Boolean()
    ilvl = Integer()
    implicitMods = Text(multi=True)
    inventoryId = Text()
    isRelic = Boolean()
    league = Keyword()
    lockedToCharacter = Boolean()
    maxStackSize = Integer()
    name = Text()
    nextLevelRequirements = Nested(PoePropsReqs, multi=True)
    note = Keyword()
    properties = Nested(PoePropsReqs, multi=True)
    prophecyDiffText = Text()
    prophecyText = Text()
    requirements = Nested(PoePropsReqs, multi=True)
    secDescrText = Text()
    shaper = Boolean()
    socketedItems = Nested()
    sockets = Nested(PoeSockets)
    stackSize = Integer()
    support = Boolean()
    talismanTier = Integer()
    typeLine = Text()
    utilityMods = Text(multi=True)
    verified = Boolean()
    w = Integer()  # width in inventory cells
    x = Integer()  # stash position
    y = Integer()
class ModsecRule(Document):
    """A ModSecurity rule indexed for search."""
    rule_id = Integer()
    msg = Text(analyzer='snowball', fields={'raw': Keyword()})
    body = Text(analyzer='snowball')
    tags = Keyword()
    rule_txt = Text(analyzer='snowball')
    severity = Text(analyzer='snowball')
    phase = Text(analyzer='snowball')
    rev = Text(analyzer='snowball')
    maturity = Text(analyzer='snowball')
    accuracy = Text(analyzer='snowball')
    ver = Text(analyzer='snowball')
    filename = Text(analyzer='snowball')
    # Added 2019-10-07.
    category = Text(analyzer='snowball')

    class Index:
        name = 'ngx_modsec_rules'
        settings = {
            "number_of_shards": 2,
        }

    def save(self, **kwargs):
        # Plain passthrough; kept as an explicit extension point.
        return super(ModsecRule, self).save(**kwargs)
class LagouJobType(Document):
    """
    Lagou job posting document.
    """
    # Completion is Elasticsearch's built-in search-as-you-type helper.
    # A custom analyzer is supplied only to work around an error; it does
    # nothing special in practice.
    suggest = Completion(analyzer=ik_analyzer)
    # Fields mirrored from the scrapy item.
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    job_city = Text(analyzer="ik_max_word")
    work_years = Text(analyzer="ik_max_word")
    degree_need = Text(analyzer="ik_max_word")
    job_type = Text(analyzer="ik_max_word")
    # Left as text for now, not parsed into a date.
    publish_time = Text(analyzer="ik_max_word")
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Index:
        name = 'lagou'
        doc_type = "job"

    class Meta:
        doc_type = "job"
class EventIndex(DocType):
    """Minimal event search document."""
    name = Text()
    # NOTE(review): stored as Text, not Date — confirm this is intentional.
    created_at = Text()

    class Meta:
        index = 'event-index'
class Lagou(DocType):
    """Lagou job-listing document (jobbole index)."""
    title_suggest = Completion(analyzer=ik_analyzer,
                               search_analyzer=ik_analyzer)
    title = Text(analyzer='ik_max_word', search_analyzer="ik_max_word",
                 fields={'title': Keyword()})
    id = Text()
    url = Text()
    salary = Text()
    job_city = Text()
    work_years = Text()
    degree_need = Text()
    job_type = Text()
    publish_time = Text()
    job_advantage = Text()
    job_desc = Text()
    job_addr = Text()
    company_name = Text()
    company_url = Text()
    tags = Text(analyzer='ik_max_word', fields={'tags': Keyword()})
    crawl_time = Date()

    class Meta:
        index = 'jobbole'
        doc_type = 'lagou_job'
class DocTestSSLResult(Document):
    """Scan result for one host:port parsed from a testssl.sh CSV report."""
    source = Text(fields={'raw': Keyword()})
    result = Boolean()
    timestamp = Date()
    ip = Keyword()
    hostname = Keyword()
    port = Integer()
    svcid = Keyword()
    protocols = Keyword(multi=True)
    ciphers = Text(multi=True, fields={'raw': Keyword()})
    ciphertests = Keyword(multi=True)
    serverpref = Object(
        properties={
            "cipher_order": Boolean(),
            "protocol": Keyword(),
            "cipher": Text(fields={'raw': Keyword()})
        })
    cert = Object(
        properties={
            "keysize": Short(),
            "signalgo": Text(fields={'raw': Keyword()}),
            "md5_fingerprint": Keyword(),
            "sha1_fingerprint": Keyword(),
            "sha256_fingerprint": Keyword(),
            "cn": Text(fields={'raw': Keyword()}),
            "san": Text(multi=True, fields={'raw': Keyword()}),
            "issuer": Text(fields={'raw': Keyword()}),
            "ev": Boolean(),
            "expiration": Date(),
            "ocsp_uri": Text(fields={'raw': Keyword()}),
            "Crl_url": Text(fields={'raw': Keyword()}),
            "ocsp_stapling": Boolean(),
        })
    vulnerabilities = Keyword(multi=True)

    def parseCSVLine(self, line):
        """Fold a single testssl.sh CSV row into this document's fields.

        Fixes vs. the original: stray debug ``print`` calls and
        commented-out dead code removed; duplicated regex searches hoisted.
        """
        if line['id'] == "id":  # header row
            return

        # host, ip and port (taken from the first row that carries them)
        if not self.ip or not self.hostname or not self.port:
            m = reIpHostColumn.search(line['fqdn/ip'])
            if m:
                self.hostname, self.ip = m.groups()
            self.port = int(line['port'])

        proto_match = reProtocol.search(line['id'])
        if proto_match and reOffers.search(line['finding']):
            # an offered protocol
            self.result = True
            self.protocols.append(line['id'].upper())
        elif reCipherColumnName.search(line['id']):
            # cipher list entry
            m = reCipherDetails.search(line['finding'])
            if m:
                self.ciphers.append(m.group(1))
        elif reCipherTests.search(line['id']) and reVulnerable.search(
                line['finding']):
            # failed cipher test
            m = reCipherTests.search(line['id'])
            if m:
                self.ciphertests.append(m.group(1))

        if line['id'] == "cipher_order":
            # whether the server enforces its own cipher order
            self.serverpref.cipher_order = bool(reOk.search(line['severity']))
        elif line['id'] == "protocol_negotiated":
            # preferred protocol
            m = reDefaultProtocol.search(line['finding'])
            if m:
                self.serverpref.protocol = m.group(1)
        elif line['id'] == "cipher_negotiated":
            # preferred cipher
            m = reDefaultCipher.search(line['finding'])
            if m:
                self.serverpref.cipher = m.group(1)
        elif line['id'] == "cert_keySize":
            m = reKeySize.search(line['finding'])
            if m:
                self.cert.keysize = int(m.group(1))
        elif line['id'] == "cert_signatureAlgorithm":
            m = reSignAlgorithm.search(line['finding'])
            if m:
                self.cert.signalgo = m.group(1)
        elif line['id'] == "cert_fingerprintSHA1":
            m = reFPSHA1.search(line['finding'])
            if m:
                self.cert.sha1_fingerprint = m.group(1)
        elif line['id'] == "cert_fingerprintSHA256":
            m = reFPSHA256.search(line['finding'])
            if m:
                self.cert.sha256_fingerprint = m.group(1)
        elif line['id'] == "cert_fingerprintMD5":
            m = reFPMD5.search(line['finding'])
            if m:
                self.cert.md5_fingerprint = m.group(1)
        elif line['id'] == "cert_commonName":
            m = reCN.search(line['finding'])
            if m:
                self.cert.cn = m.group(1)
        elif line['id'] == "cert_subjectAltName":
            # TODO(review): only the first SAN capture is stored; splitting
            # into multiple entries was left commented out upstream.
            m = reSAN.search(line['finding'])
            if m:
                self.cert.san = m.group(1)
        elif line['id'] == "cert_caIssuers":
            m = reIssuer.search(line['finding'])
            if m:
                self.cert.issuer = m.group(1)
        elif line['id'] == "ev":
            # extended validation
            self.cert.ev = bool(reYes.search(line['finding']))
        elif line['id'] == "cert_notAfter":
            # certificate expiration
            m = reExpiration.search(line['finding'])
            if m:
                self.cert.expiration = datetime.strptime(
                    m.group(1), "%Y-%m-%d %H:%M")
        elif line['id'] == "cert_ocspURL":
            m = reOCSPURI.search(line['finding'])
            self.cert.ocsp_uri = m.group(1) if m else "-"
        elif line['id'] == "cert_crlDistributionPoints":
            m = reAll.search(line['finding'])
            self.cert.Crl_url = m.group(1) if m else "-"
        elif line['id'] == "OCSP_stapling":
            self.cert.ocsp_stapling = not bool(
                reNotOffered.search(line['finding']))
        elif line['id'] in ("heartbleed", "CCS", "secure_renego",
                            "secure_client_renego", "CRIME_TLS", "SWEET32",
                            "POODLE_SSL", "fallback_SCSV", "FREAK", "DROWN",
                            "LOGJAM", "BEAST", "LUCKY13", "RC4"):
            m = reVulnerable.search(line['severity'])
            if m and str(m.group(1)) != '':
                self.vulnerabilities.append(line['id'].upper())

    def parseCSV(self, csvfile):
        """Parse a whole testssl.sh CSV file.

        Also recovers ip/port/timestamp from the filename stored in
        ``source`` when it matches the default testssl.sh naming scheme.
        """
        if self.source:
            m = reDefaultFilename.search(self.source)
            if m:
                self.ip = m.group('ip')
                self.port = int(m.group('port') or 0)
                self.timestamp = datetime.strptime(m.group('datetime'),
                                                   "%Y%m%d-%H%M")
        csvReader = csv.DictReader(
            csvfile,
            fieldnames=("id", "fqdn/ip", "port", "severity", "finding",
                        "cve", "cwe"),
            delimiter=',', quotechar='"')
        for line in csvReader:
            self.parseCSVLine(line)

    def save(self, **kwargs):
        """Validate derived fields and persist.

        Raises ValueError on an empty scan result.

        FIX: extra keyword arguments are now forwarded to Document.save()
        (they were previously dropped); the local-only 'debug' flag is
        removed from them first so it never reaches Elasticsearch.
        """
        debug = kwargs.pop('debug', False)
        if not self.timestamp:
            self.timestamp = datetime.now(tz)
        if not self.port:
            raise ValueError("Empty scan result")
        self.svcid = "%s:%d" % (self.ip, int(self.port) or 0)
        if not self.result:
            self.result = False
        if debug:
            pp.pprint(self.to_dict())
        return super().save(**kwargs)
class LagouType(DocType):
    """Elasticsearch mapping for Lagou.com job postings.

    Free-text fields use the Chinese "ik_max_word" analyzer; exact-match
    fields are Keywords. Stored in index "lagou", doc type "job".
    """
    # Search-as-you-type suggester; ik_analyzer is a custom analyzer
    # defined elsewhere in this module.
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    # Presumably a stable id derived from the url — confirm against the
    # crawler that writes these documents.
    url_object_id = Keyword()
    # Salary and experience bounds are kept as analyzed text, not numbers.
    salary_min = Text(analyzer="ik_max_word")
    salary_max = Text(analyzer="ik_max_word")
    job_city = Keyword()
    work_years_min = Text(analyzer="ik_max_word")
    work_years_max = Text(analyzer="ik_max_word")
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Text(analyzer="ik_max_word")
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_max_word")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()

    class Meta:
        index = "lagou"
        doc_type = "job"
class Event(DocType):
    """Elasticsearch document for an event (index "event-index", doc type
    "events").

    Free-text fields are analyzed in Russian by default; the
    ``*_localized`` Objects carry per-language sub-fields (en/ru/de/fr/es)
    plus a raw trigram field.
    """
    title = Text(
        multi=True,
        fields={
            'en': String(analyzer='english'),
            'ru': String(analyzer='russian'),
            # 'raw': String(analyzer='trigrams'),
            'raw': Keyword(),
        },
        analyzer='russian')
    # NOTE(review): in every localized Object below, 'de' is given the
    # 'dutch' analyzer — 'german' was probably intended; confirm.
    title_localized = Object(doc_class=Locales,
                             properties={
                                 'raw': String(analyzer='trigrams'),
                                 'en': String(analyzer='english'),
                                 'ru': String(analyzer='russian'),
                                 'de': String(analyzer='dutch'),
                                 'fr': String(analyzer='french'),
                                 'es': String(analyzer='spanish'),
                             })
    title_suggest = Completion()
    description = Text(analyzer='russian')
    description_localized = Object(doc_class=Locales,
                                   properties={
                                       'raw': String(analyzer='trigrams'),
                                       'en': String(analyzer='english'),
                                       'ru': String(analyzer='russian'),
                                       'de': String(analyzer='dutch'),
                                       'fr': String(analyzer='french'),
                                       'es': String(analyzer='spanish'),
                                   })
    slug = Text(fields={'raw': Keyword()})
    provider = Text()
    provider_id = Text()
    ticket_price = Text()
    ticket_url = Text()
    video_url = Text()
    # Soft-delete flag; search_events() excludes documents with True.
    deleted = Boolean()
    image = Text()
    average_rate = Float()
    category = Object(
        properties={
            'title': Text(fields={'raw': Keyword()}, analyzer='russian'),
            'slug': Text(fields={'raw': Keyword()})
        })
    # Embedded venue; 'geometry' backs the geo_distance filter in
    # search_events().
    place = Object(doc_class=Place,
                   properties={
                       'place_id': Text(fields={'raw': Keyword()}),
                       'provider_id': Text(fields={'raw': Keyword()}),
                       'slug': String(),
                       'title': Text(fields={'raw': Keyword()},
                                     analyzer='russian'),
                       'title_localized': Object(
                           doc_class=Locales,
                           properties={
                               'raw': String(analyzer='trigrams'),
                               'en': String(analyzer='english'),
                               'ru': String(analyzer='russian'),
                               'de': String(analyzer='dutch'),
                               'fr': String(analyzer='french'),
                               'es': String(analyzer='spanish'),
                           }),
                       'address': Text(fields={'raw': Keyword()},
                                       analyzer='russian'),
                       'address_localized': Object(
                           doc_class=Locales,
                           properties={
                               'raw': String(analyzer='trigrams'),
                               'en': String(analyzer='english'),
                               'ru': String(analyzer='russian'),
                               'de': String(analyzer='dutch'),
                               'fr': String(analyzer='french'),
                               'es': String(analyzer='spanish'),
                           }),
                       'city': Text(fields={'raw': Keyword()},
                                    analyzer='russian'),
                       'description': Text(),
                       'description_localized': Object(
                           doc_class=Locales,
                           properties={
                               'raw': String(analyzer='trigrams'),
                               'en': String(analyzer='english'),
                               'ru': String(analyzer='russian'),
                               'de': String(analyzer='dutch'),
                               'fr': String(analyzer='french'),
                               'es': String(analyzer='spanish'),
                           }),
                       'lat': Float(),
                       'lng': Float(),
                       'geometry': GeoPoint(),
                       'email': Text(),
                       'website': Text(),
                       'phone': Text()
                   })
    images = Nested(properties={
        'image': Text(),
    })
    dates = Object(doc_class=Schedule,
                   properties={
                       'start_date': Date(format="YYYY-MM-dd||"
                                                 "YYYY-MM-dd'T'HH:mm:ss"),
                       'end_date': Date(format="YYYY-MM-dd||"
                                               "YYYY-MM-dd'T'HH:mm:ss")
                   })
    # 'schedules' accepts more date formats than 'dates' and is the field
    # all range filters/sorting in search_events() use.
    schedules = Object(doc_class=Schedule,
                       properties={
                           'start_date': Date(
                               format="YYYY-MM-dd||"
                                      "YYYY-MM-dd'T'HH:mm:ss.SSS||"
                                      "YYYY-MM-dd'T'HH:mm:ss||"
                                      "YYYY-MM-dd'T'HH:mm:ssZ||"
                                      "dd.MM.YYYY'T'HH:mm:ss"),
                           'end_date': Date(
                               format="YYYY-MM-dd||"
                                      "YYYY-MM-dd'T'HH:mm:ss.SSS||"
                                      "YYYY-MM-dd'T'HH:mm:ss||"
                                      "YYYY-MM-dd'T'HH:mm:ssZ||"
                                      "dd.MM.YYYY'T'HH:mm:ss")
                       })
    date_added = Date()
    counters = Object(doc_class=Counter,
                      properties={
                          'favorites_count': Integer(),
                          'interested_count': Integer(),
                          'rating': Integer(),
                      })
    source_url = String()
    min_price = Integer()
    max_price = Integer()
    is_free = Boolean()
    currency = String()
    formatted_price = String()

    class Meta:
        doc_type = 'events'
        index = 'event-index'

    def save(self, **kwargs):
        # Stamp date_added on first save, then delegate to DocType.save().
        if not self.date_added:
            self.date_added = datetime.now()
        return super(Event, self).save(**kwargs)

    @classmethod
    def search_events(cls, limit=100, **kwargs):
        """Build (and optionally execute) an ES search filtered by kwargs.

        Recognized kwargs:
            q                     -- full-text query string
            start_date / end_date -- ES date-math strings, e.g. 'now',
                                     'now+7d'
            radius / lat / lng    -- geo-distance filter (radius in meters)
            limit_from / limit_to -- result slice (defaults 0 / 18)
            observable            -- when truthy, return the lazy Search
                                     object instead of executing it

        NOTE(review): the ``limit`` parameter is never used — slicing is
        driven by limit_from/limit_to; confirm before removing it.

        :returns: a Search when ``observable`` is truthy, otherwise the
                  executed Response.
        """
        search = cls.search()
        # Hide soft-deleted events.
        search = search.exclude('term', deleted=True)
        observable = kwargs.get('observable', False)
        start_date = kwargs.get('start_date', None)
        end_date = kwargs.get('end_date', None)
        limit_from = kwargs.get('limit_from', 0)
        limit_to = kwargs.get('limit_to', 18)
        if not start_date:
            # No explicit window: default to events whose schedule is still
            # running, up to one year out (day-rounded date math).
            search = search.query(
                'range',
                **{'schedules.end_date': {
                    'lte': 'now+1y/d',
                    'gte': 'now/d'
                }})
        if 'q' in kwargs and kwargs['q']:
            search = search.query('multi_match',
                                  query=kwargs['q'],
                                  type='most_fields',
                                  minimum_should_match='75%',
                                  operator='and',
                                  tie_breaker=0.8,
                                  fields=[
                                      'title^4', 'description', 'place.city',
                                      'place.title^2', 'place.address'
                                  ])
        if start_date and end_date:
            # Events starting inside [start_date, end_date], day-rounded.
            search = search.query(
                'range', **{
                    'schedules.start_date': {
                        'gte': '{start_date}||/d'.format(**kwargs),
                        'lte': '{end_date}||/d'.format(**kwargs)
                    }
                })
        elif start_date:
            # Single-day window pinned to start_date.
            search = search.query(
                'range', **{
                    'schedules.start_date': {
                        'gte': '{start_date}||/d'.format(**kwargs),
                        'lte': '{start_date}||/d'.format(**kwargs)
                    }
                })
        elif end_date:
            # NOTE(review): end_date alone also pins start_date to that one
            # day (gte == lte) — verify this is intended rather than an
            # 'lte'-only upper bound.
            search = search.query(
                'range', **{
                    'schedules.start_date': {
                        'gte': '{end_date}||/d'.format(**kwargs),
                        'lte': '{end_date}||/d'.format(**kwargs)
                    }
                })
        if 'radius' in kwargs and 'lat' in kwargs and 'lng' in kwargs:
            search = search.query('bool',
                                  filter=Q(
                                      'geo_distance',
                                      distance='{radius}m'.format(**kwargs),
                                      **{
                                          'place.geometry': {
                                              'lat': kwargs['lat'],
                                              'lon': kwargs['lng']
                                          }
                                      }))
        search = search.sort('schedules.start_date')
        # Pagination via slicing maps to ES from/size.
        search = search[limit_from:limit_to]
        return search if observable else search.execute()