def calc(self, w1, w2):
    w = stemmer.stem(w1) + " " + stemmer.stem(w2)
    q = {"query": {"match": {"gram": {"query": w, "minimum_should_match": "2"}}}}
    phraseHitsPos = es.search(index=posindex, body=q, size=0)['hits']['total'] + 0.01
    phraseHitsNeg = es.search(index=negindex, body=q, size=0)['hits']['total'] + 0.01
    if phraseHitsNeg < 2 or phraseHitsPos < 2:
        return 0
    else:
        q = {"query": {"match_all": {}}}
        totalPos = es.search(index=posindex, body=q, size=0)['hits']['total']
        totalNeg = es.search(index=negindex, body=q, size=0)['hits']['total']
        SO = math.log((phraseHitsPos * totalNeg) / (phraseHitsNeg * totalPos), 2)
        # print("pos: " + str(phraseHitsPos))
        # print("neg: " + str(phraseHitsNeg))
        # print(SO)
        return SO
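# A hand-worked illustration of the semantic-orientation formula in calc() above.
# The counts below are hypothetical, not taken from any real index:
#
#   phraseHitsPos = 40.01   # bigram hits in the positive index (+0.01 smoothing)
#   phraseHitsNeg = 10.01   # bigram hits in the negative index (+0.01 smoothing)
#   totalPos      = 1000    # documents in the positive index
#   totalNeg      = 2000    # documents in the negative index
#
#   SO = log2((40.01 * 2000) / (10.01 * 1000)) = log2(7.99...) ~= 3.0
#
# A positive SO means the phrase co-occurs proportionally more often with the
# positive corpus, a negative SO the opposite; calc() returns 0 when either
# index has too few hits for the ratio to be meaningful.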
def on_post(self, req, resp):
    cmd = req.get_param('cmd')
    result = {}
    if cmd == 'add':
        book = req.get_param('book')
        file_path = save_file(book)
        task_data = {'path': file_path}
        try:
            add_book_task.delay(task_data)
            result = {'msg': 'file put in queue'}
        except Exception as e:
            result = {'error': str(e)}
            delete_file(file_path)
    elif cmd == 'create':
        result = create_index()
    elif cmd == 'delete':
        result = delete_index()
    elif cmd == 'count':
        result = count_items()
    elif cmd == 'search':
        q = req.get_param('q')
        result = search(q)
    elif cmd == 'search_advanced':
        q = req.get_param('q')
        result = search_advanced(q)
    resp.body = json.dumps(result)
    resp.status = falcon.HTTP_200
def putKey(apiKey):
    matchAll = {"query": {"match_all": {}}}
    ssConfig = es.search(esService, matchAll, 'sweet_security', 'configuration')
    if ssConfig is not None:
        for config in ssConfig['hits']['hits']:
            body = {'doc': {'fileCheckKey': apiKey}}
            es.update(esService, body, 'sweet_security', 'configuration', config['_id'])
def get_fixits(q: str) -> List[str]:
    fixits: List[str] = []
    if q.find('tvtropes.org') >= 0:
        fixits += ['(note that tvtropes.org is not directly supported; instead, use the url of the actual fic)']
    if q.find('http://') != 0 and q.find('https://') != 0:
        fixits += ['(please try a full url including http:// or https:// at the start)']
    if q.find('fanfiction.net') >= 0:
        fixits += ['fanfiction.net is fragile at the moment; please try again later or check the discord']
    if q.find('fanfiction.net/u/') >= 0:
        fixits += ['user pages on fanfiction.net are not currently supported -- please try a specific story']
    if q.find('fictionpress.com') >= 0:
        fixits += ['fictionpress.com is fragile at the moment; please try again later or check the discord']
    try:
        import es
        import urllib.parse
        fis = es.search(q, limit=15)
        for fi in fis:
            u = urllib.parse.quote(fi.source, safe='')
            fixits += [f'<br/>did you mean <a href=/fic/{fi.id}>{fi.title} by {fi.author}</a>?']
    except Exception:
        pass
    return fixits
def search(search_term, result, start_result=0):
    es = Elasticsearch(host)
    q = {
        "fields": ["text"],
        "from": start_result,
        "size": result,
        "query": {
            "query_string": {
                "query": search_term,
            }
        },
        "highlight": {
            "fields": {
                "text": {"fragment_size": 100, "number_of_fragments": 5}
            }
        }
    }
    res = es.search(index=_index, body=q)
    hits = res['hits']['hits']
    results = []
    for hit in hits:
        d = {
            "urlAddress": hit['fields']['urlAddress'],
            "title": str(hit['fields']['title'][3:-2].decode('utf-8')),
            "id": hit['_id'],
            "score": hit['_score'],
        }
        if d['title'] == '':
            d['title'] = d['urlAddress']
        try:
            d.update({"highlight": hit['highlight']['text']})
        except KeyError:
            pass
        results.append(d)
    return {'result_list': results, 'result_count': res['hits']['total']}
def search():
    keyword = request.args.get("keyword")
    results = es.search(keyword)
    if "username" in session:
        username = session["username"]
        u = users.find_one(username=username)
        return render_template("search_results.html", results=results, keyword=keyword, user=u)
    return render_template("search_results.html", results=results, keyword=keyword)
def getKey():
    # Get Configuration Settings
    apiKey = None
    matchAll = {"query": {"match_all": {}}}
    ssConfig = es.search(esService, matchAll, 'sweet_security', 'configuration')
    if ssConfig is not None:
        for config in ssConfig['hits']['hits']:
            if 'fileCheckKey' in config['_source']:
                apiKey = config['_source']['fileCheckKey']
    return apiKey
def do_task(task_data):
    """
    Search, send the result by email, and run the log task.

    :param task_data: data to log
    :type task_data: dict
    :return: data structure for log task
    :rtype: dict
    """
    results = search(task_data['q'])
    send_email(results, task_data)
    log_task_data = log_results(results, task_data)
    return log_task_data
def getSSConfig():
    config = {
        'defaultIsolate': 0,
        'defaultLogRetention': 30,
        'defaultMonitor': 0,
        'defaultFW': 0
    }
    matchAll = {"query": {"match_all": {}}}
    ssConfigData = es.search(esService, matchAll, 'sweet_security', 'configuration')
    if ssConfigData is not None:
        for entry in ssConfigData['hits']['hits']:
            config = entry['_source']
    return config
def run():
    logger = logging.getLogger('SweetSecurityServerLogger')
    logger.info('Running Baseliner')
    matchAll = {"query": {"match_all": {}}}
    # Create TARDIS index if it is missing
    tardisQuery = es.search(esService, matchAll, 'tardis', 'known_hosts')
    if tardisQuery is None:
        logger.info('Creating TARDIS Index')
        os.popen(
            'curl -XPUT \'localhost:9200/tardis?pretty\' '
            '-H \'Content-Type: application/json\' '
            '-d\' {"mappings" : {"known_hosts" : {"properties" : { '
            '"mac" : { "type" : "text", "fields": {"raw": {"type": "keyword"}}},'
            '"destination" : { "type" : "text", "fields": {"raw": {"type": "keyword"}}},'
            '"port" : { "type" : "text", "fields": {"raw": {"type": "keyword"}}}}}}}\''
        ).read()
    # Get list of known devices
    allDevices = es.search(esService, matchAll, 'sweet_security', 'devices')
    if allDevices is not None:
        for host in allDevices['hits']['hits']:
            logger.info("Searching Device %s(%s : %s)" % (host['_source']['nickname'], host['_source']['ip4'], host['_source']['mac']))
            logger.info(" Searching conn.log")
            conn = connSearch(host['_source']['ip4'], host['_source']['mac'])
            logger.info(" Found %d new entries" % conn)
            logger.info(" Searching dns.log")
            dns = dnsSearch(host['_source']['ip4'], host['_source']['mac'])
            logger.info(" Found %d new entries" % dns)
            logger.info(" Searching http.log")
            http = httpSearch(host['_source']['ip4'], host['_source']['mac'])
            logger.info(" Found %d new entries" % http)
            logger.info(" Searching ssl.log")
            ssl = sslSearch(host['_source']['ip4'], host['_source']['mac'])
            logger.info(" Found %d new entries" % ssl)
def after_crawling():
    # Search an Elasticsearch index
    # search_index = input("Enter the Elasticsearch index name to search: ")
    index = es.search("olympic")  # result of the Elasticsearch search
    data_list = es.convert_to_list(index)  # keep only _source (data values) and convert to a list

    # Data preprocessing: 1. morphological analysis  2. noun extraction  2-1. stop-word removal
    # 1. morphological analysis
    # data_preprocessing.m_analysis(data_list)
    # 2. noun extraction
    nouns_list = data_preprocessing.noun_extraction(data_list)
    # 2-1. stop-word removal (on the extracted nouns)
    result = data_preprocessing.stopword(nouns_list)
    # Store the stop-word-filtered result in MySQL
    mysql.nouns_store(result)
def getLogData():
    files = []
    # The timestamp window is 10m to make sure we don't miss anything;
    # the check is skipped if the file was already checked.
    fileQuery = {
        "query": {
            "bool": {
                "must": [
                    {"match_phrase": {"path": "/opt/nsm/bro/logs/current/files.log"}},
                    {"range": {"@timestamp": {"gt": "now-10m"}}}
                ]
            }
        }
    }
    fileData = es.search(esService, fileQuery, 'logstash-*', 'logs', 10000)
    logTotal = fileData['hits']['total']
    for log in fileData['hits']['hits']:
        files.append(log)
    return files
def get_definition_mdx(word, builder) -> List[bytes]:
    """Look up the MDX dictionary definition for a keyword."""
    if not word:
        return [not_found.encode('utf-8')]
    word = word.lower()
    content = builder.mdx_lookup(word)
    if len(content) < 1:
        word = spellchecker.correction(word)
        content = builder.mdx_lookup(word)
    if len(content) < 1:
        content = builder.mdx_lookup(word.upper())
    if len(content) < 1:
        content = builder.mdx_lookup(plural2singular(word.lower()))
    if is_chinese(word):
        content += search(word)
    if len(content) < 1:
        return [not_found.encode('utf-8')]
    # Follow MDX redirect entries of the form @@@LINK=<target>
    pattern = re.compile(r"@@@LINK=([\w\s]*)")
    rst = pattern.match(content[0])
    if rst is not None:
        link = rst.group(1).strip()
        content = builder.mdx_lookup(link)
    # remove \r\n and entry:/
    str_content = ""
    if len(content) > 0:
        for c in content:
            str_content += c.replace("\r\n", "").replace("entry:/", "")
    # Append any bundled HTML snippets from the mdx resource directory
    injection_html = ''
    try:
        base_path = os.path.dirname(sys.executable)
    except IOError:
        base_path = os.path.abspath(".")
    resource_path = os.path.join(base_path, 'mdx')
    injection = get_all_files(resource_path)
    for p in injection:
        if match_file_ext(p, 'html'):
            injection_html += read_all_lines(p)
    output_html = str_content + injection_html
    return [output_html.encode('utf-8')]
def dnsSearch(ip, mac):
    numFound = 0
    dnsData = getLogs(ip, '/opt/nsm/bro/logs/current/dns.log')
    knownQueries = []
    knownDnsQuery = {"query": {"match_phrase": {"mac": {"query": mac}}}}
    knownDnsData = es.search(esService, knownDnsQuery, 'tardis', 'known_dnsqueries')
    for query in knownDnsData['hits']['hits']:
        if query['_source']['query'] not in knownQueries:
            knownQueries.append(query['_source']['query'])
    for log in dnsData['hits']['hits']:
        if log['_source']['query'] not in knownQueries:
            numFound += 1
            knownQueries.append(log['_source']['query'])
            dnsEntry = {'mac': mac, 'query': log['_source']['query']}
            es.write(esService, dnsEntry, 'tardis', 'known_dnsqueries')
            alertMessage = 'A new DNS query was added to the baseline: %s' % log['_source']['query']
            alert.send('Baseliner', alertMessage, log['_id'], log['_index'])
    return numFound
def connSearch(ip, mac):
    numFound = 0
    connData = getLogs(ip, '/opt/nsm/bro/logs/current/conn.log')
    knownHosts = []
    knownHostQuery = {"query": {"match_phrase": {"mac": {"query": mac}}}}
    knownHostData = es.search(esService, knownHostQuery, 'tardis', 'known_hosts')
    for device in knownHostData['hits']['hits']:
        if device['_source']['ip'] not in knownHosts:
            knownHosts.append(device['_source']['ip'])
    for log in connData['hits']['hits']:
        if log['_source']['resp_h'] not in knownHosts:
            numFound += 1
            knownHosts.append(log['_source']['resp_h'])
            hostData = {'mac': mac, 'ip': log['_source']['resp_h']}
            es.write(esService, hostData, 'tardis', 'known_hosts')
            alertMessage = 'A new IP was added to the baseline: %s' % log['_source']['resp_h']
            alert.send('Baseliner', alertMessage, log['_id'], log['_index'])
    return numFound
def deleteOldLogs():
    logger = logging.getLogger('SweetSecurityServerLogger')
    logger.info('Checking local disk space')
    diskUsage = checkDisk()
    # Warn user if disk storage is above 85%
    if diskUsage > 84:
        message = 'Server disk usage is at %d%%' % diskUsage
        response = alert.send('Disk Check', message, None, None)
    logger.info('Cleaning up logs')
    ssConfig = getSSConfig()
    defaultLogRetention = ssConfig['defaultLogRetention']
    if defaultLogRetention == 0:
        logger.info('System configured to never delete logs')
        return 'Logs configured to never delete'
    else:
        logger.info('System is configured to delete logs older than %d days' % defaultLogRetention)
        matchAll = {"query": {"match_all": {}}}
        logsDeleted = 0
        today = datetime.datetime.now()
        indices = []
        for index in esService.indices.get('logstash-*'):
            indices.append(index)
        logger.info("There are %d days worth of logs" % len(indices))
        indices = sorted(indices)
        for index in indices:
            indexData = es.search(esService, matchAll, index, 'logs')
            logCount = indexData['hits']['total']
            indexDate = datetime.datetime.strptime(index[-10:], "%Y.%m.%d")
            indexDaysOld = today - indexDate
            indexDaysOld = indexDaysOld.days
            logger.info("%s is %d days old and has %d logs" % (index, indexDaysOld, logCount))
            if indexDaysOld > defaultLogRetention:
                logger.info("Deleting index %s" % index)
                # esService.indices.delete(index=index)
                logsDeleted += logCount
        logger.info("Deleted %d logs" % logsDeleted)
        return "Deleted %d logs" % logsDeleted
def httpSearch(ip, mac):
    numFound = 0
    httpData = getLogs(ip, '/opt/nsm/bro/logs/current/http.log')
    knownWebsites = []
    knownHostQuery = {"query": {"match_phrase": {"mac": {"query": mac}}}}
    knownHostData = es.search(esService, knownHostQuery, 'tardis', 'known_websites')
    for url in knownHostData['hits']['hits']:
        if url['_source']['server_name'] not in knownWebsites:
            knownWebsites.append(url['_source']['server_name'])
    for log in httpData['hits']['hits']:
        if log['_source']['server_name'] not in knownWebsites:
            numFound += 1
            knownWebsites.append(log['_source']['server_name'])
            hostData = {'mac': mac, 'server_name': log['_source']['server_name']}
            es.write(esService, hostData, 'tardis', 'known_websites')
            alertMessage = 'A new website was added to the baseline: %s' % log['_source']['server_name']
            alert.send('Baseliner', alertMessage, log['_id'], log['_index'])
    return numFound
def getLogs(ip, log):
    logQuery = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"orig_h": ip}},
                    {"match_phrase": {"path": log}},
                    {"range": {"@timestamp": {"gt": "now-1h"}}}
                ]
            }
        }
    }
    logData = es.search(esService, logQuery, 'logstash-*', 'logs')
    return logData
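# The SweetSecurity snippets above call a local helper module as
# es.search(esService, query, index, docType, size) and es.write(...).
# That module is not shown here; the sketch below is only a guess at its shape,
# assuming the pre-7.x elasticsearch-py client (doc types, integer hits.total)
# that the surrounding code appears to target.

from elasticsearch import Elasticsearch, exceptions


def search(esService, query, index, docType, size=10):
    """Run a search and return the raw response, or None if the index is missing."""
    try:
        return esService.search(index=index, doc_type=docType, body=query, size=size)
    except exceptions.NotFoundError:
        return None


def write(esService, body, index, docType):
    """Index a new document."""
    return esService.index(index=index, doc_type=docType, body=body)


def update(esService, body, index, docType, docId):
    """Partially update an existing document (body is expected to be {'doc': {...}})."""
    return esService.update(index=index, doc_type=docType, id=docId, body=body)


# Hypothetical usage, mirroring the calls made in run() and getLogs() above:
# esService = Elasticsearch(['localhost:9200'])
# devices = search(esService, {"query": {"match_all": {}}}, 'sweet_security', 'devices')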
def search(query, offset=0, length=300, t_from="", t_to="", titles=[], title_indexes=[]):
    #titles = title_filter(titles)
    #title_indexes.extend()
    #if not title_indexes:
    #    title_indexes = [0, 1, 33, 74]
    count, data = es.search(query, offset=offset, t_from=t_from, t_to=t_to, title_indexes=title_indexes)
    rdata = []
    for d in data:
        dx = {}
        content = d["value"]
        soup = BeautifulSoup(content, "lxml")
        content = soup.getText()
        content = content.replace("\n", "").replace("\u3000", "")
        # Build a snippet of up to length * 2 characters around the first match
        pos = content.find(query)
        s = pos - length if pos > length else 0
        dx["value"] = content[s:s + (length * 2)]
        if len(content) > (length * 2):
            dx["value"] += "..."
        dx["publisher"] = d["publisher"]
        dx["term"] = d["term"]
        dx["term_from"] = dt_convert(d["term_from"])
        dx["term_to"] = dt_convert(d["term_to"])
        rdata.append(dx)
    return count, rdata
def main():
    # Run the crawler
    article_data = crawler.crawling()

    # Store the raw crawled data in Elasticsearch
    store_index = input("Enter the Elasticsearch index name to store into: ")
    es.store(store_index, article_data)

    # Search an Elasticsearch index
    search_index = input("Enter the Elasticsearch index name to search: ")
    index = es.search(search_index)  # result of the Elasticsearch search
    data_list = es.convert_to_list(index)  # keep only _source (data values) and convert to a list

    # Data preprocessing: 1. morphological analysis  2. noun extraction  2-1. stop-word removal
    # 1. morphological analysis
    # data_preprocessing.m_analysis(data_list)
    # 2. noun extraction
    nouns_list = data_preprocessing.noun_extraction(data_list)
    # 2-1. stop-word removal (on the extracted nouns)
    result = data_preprocessing.stopword(nouns_list)
    # Store the stop-word-filtered result in MySQL
    mysql.nouns_store(result)

    # TF calculation
    words = mysql.search_in_dataResult()  # fetch only the noun column needed for TF
    df_tf = tfidf.cal_tf(words)  # compute TF values
    mysql.store_tf_value(df_tf)  # store the TF dataframe (id, noun, count)

    # TF-IDF vector - sklearn
    # corpus = tfidf.make_list_for_tfidf(words)
    # tfidf.cal_vector(corpus)

    # ngram - run the related-keyword (top word) function
    realted_keyword()
def find(self, body):
    """Find something given a query or criterion."""
    res = es.search(index=dbname, body=body)
    return res
def search():
    query = request.json.get('query')
    facets = request.json.get('facets')
    return jsonify(es.search(query, facets))