class ESPipeline(object):

    def __init__(self, *args, **kwargs):
        self.client = ElasticSearch('http://localhost:9200/')

    def process_item(self, item, spider):
        self.client.index('wiki', 'page', dict(item))
        return item
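# A minimal sketch (not part of the snippet above) of how such a Scrapy item
# pipeline is typically enabled; the module path 'myproject.pipelines' and the
# priority value 300 are assumptions for illustration only.
#
#     # settings.py
#     ITEM_PIPELINES = {'myproject.pipelines.ESPipeline': 300}
#
# Scrapy then calls process_item() for every scraped item, which indexes the
# item dict into the 'wiki' index under the 'page' doc type.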
def set_in_index(self, documentList):
    """
    Store the list of documents in the Elasticsearch index via HTTP APIs

    @type documentList: List
    @param documentList: List of image layer JSON documents
    """
    # Get the Elasticsearch address from the config file
    cfg = config.load()

    # Store the document list in Elasticsearch
    es = ElasticSearch(cfg.search_options.get("address"))
    try:
        es.bulk_index(cfg.search_options.get("index"),
                      cfg.search_options.get("type"),
                      documentList,
                      id_field='id')
    except InvalidJsonResponseError:
        logger.debug("InvalidJsonResponseError!")
    except Timeout:
        logger.debug("Timeout!")
    except ConnectionError:
        logger.debug("ConnectionError!")
    except ElasticHttpError:
        logger.debug("ElasticHttpError!")
    except ElasticHttpNotFoundError:
        logger.debug("ElasticHttpNotFoundError!")
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type)
        hits = res['hits']
        print 'Document found: %d' % hits['total']
        return hits['hits']
class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)     # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name), layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)
            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None
def __init__(self, es_url='http://localhost:9200/', batch_size=10, **kwargs):
    """
    Do what is necessary to create/open the index.
    """
    self.batch_size = batch_size
    self.batch_count = 0
    self.es_url = es_url
    self.fast = kwargs.get('fast', False)
    if kwargs.get('noisy', False):
        from logging import getLogger, StreamHandler, DEBUG
        import sys
        logger = getLogger('pyelasticsearch')
        logger.setLevel(DEBUG)
        logger.addHandler(StreamHandler(sys.stdout))
    self.es = ElasticSearch(self.es_url)
    try:
        self.es.count('*')
    except ConnectionError:
        print "Error connecting to ElasticSearch server!"
        raise
    self.urls = defaultdict(set)        # track urls to be deleted before committing new content
    self.batches = defaultdict(list)    # site: [list of docs]
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'diff', struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
        hits = res['hits']['hits']
        if len(hits) > 0:
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"

    return [None, None]
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and '.join(queryStr[0:]),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
def get_documents(terms, term_field, fields=["text"], es_index='memex',
                  es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch('http://localhost:9200/')

    if len(terms) > 0:
        results = {}
        for term in terms:
            query = {
                "query": {
                    "term": {
                        term_field: term
                    }
                },
                "fields": fields
            }
            res = es.search(query, index=es_index, doc_type=es_doc_type)
            if res['hits']['hits']:
                hits = res['hits']['hits'][0]
                if not hits.get('fields') is None:
                    hits = hits['fields']
                    record = {}
                    for field in fields:
                        if not hits.get(field) is None:
                            record[field] = hits[field][0]
                    results[term] = record
        return results
def _get(self):
    """Build and run the ES query"""
    opts = self.opts
    es = ElasticSearch(opts.url)
    query = {'sort': {'@timestamp': 'desc'}, 'size': 1}
    if opts.query:
        query['query'] = {
            'filtered': {
                'query': {
                    'query_string': {'query': opts.query}
                }
            }
        }

    # ElasticSearch allows us to pass an array of indices. However,
    # it will throw an exception if any of these don't exist. This
    # isn't the right behavior, because there may not actually be
    # a logstash index from X days ago. Instead, we need to iterate
    # through the daily log indexes in reverse order until we get a
    # non-error response.
    result = None
    for index in self._indexes():
        try:
            result = es.search(query, index=index)
            break
        except ElasticHttpNotFoundError, e:
            pass
class ElasticSearchBackend(BaseBackend):

    def __init__(self, es_url='http://localhost:9200/', batch_size=10, **kwargs):
        """
        Do what is necessary to create/open the index.
        """
        self.batch_size = batch_size
        self.batch_count = 0
        self.es_url = es_url
        self.fast = kwargs.get('fast', False)
        if kwargs.get('noisy', False):
            from logging import getLogger, StreamHandler, DEBUG
            import sys
            logger = getLogger('pyelasticsearch')
            logger.setLevel(DEBUG)
            logger.addHandler(StreamHandler(sys.stdout))
        self.es = ElasticSearch(self.es_url)
        try:
            self.es.count('*')
        except ConnectionError:
            print "Error connecting to ElasticSearch server!"
            raise
        self.urls = defaultdict(set)        # track urls to be deleted before committing new content
        self.batches = defaultdict(list)    # site: [list of docs]

    def create_index(self, name):
        name = name.lower()
        try:
            self.es.create_index(name)
            self.update_mapping(name)
        except Exception, e:
            print e
        return
def main():
    """
    Method to kick things off
    """
    # Setup workers
    pool = Pool(processes=CPU_COUNT)

    # Prepare URLs
    urls = []
    for url in CRAWL_URLS:
        urls.append(str(BASE_URL + url))

    if USE_ES:
        # Create connection
        es = ElasticSearch(ES_URL)
        try:
            # Delete the existing index
            es.delete_index(ES_INDEX)
        except:
            # In case the index does not exist
            pass
        # Create the index to use
        es.create_index(ES_INDEX)
    else:
        # Setup the database tables, connect
        init_db()

    # Scrape and store async
    pool.map(scrape, urls)
def main():
    # Train the Naive Bayes Classifier
    f = open('./data_set/naivebayes_trained_model.pickle')
    NBClassifier = pickle.load(f)

    # ElasticSearch - call the es_indexer file to create the 'sentiment_analysis'
    # index and store the contents of the tweet file in that index
    es = ElasticSearch('http://localhost:9200/')
    es_indexer()

    ########## Indexing into Elasticsearch ##########
    i = 0
    for each in tweet_data():
        i += 1
        testTweet = each
        processedTestTweet = process_tweet(testTweet)
        sentiment = NBClassifier.classify(
            extract_features(build_feature_vector(processedTestTweet)))
        es.index("sentiment_analysis", "document",
                 {"text": testTweet, "sentiment": sentiment}, id=i)
    print "Indexing completed."

    es.refresh(index="sentiment_analysis")
    print "Index refreshed."

    f.close()
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": queryStr,
                        "operator": "and"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query, index='memex', doc_type='page', size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def get_documents(urls):
    host = environ['ELASTICSEARCH_SERVER'] if environ.get('ELASTICSEARCH_SERVER') else 'http://localhost:9200'
    es = ElasticSearch(host)

    if len(urls) > 0:
        results = {}
        for url in urls:
            query = {
                "query": {
                    "term": {
                        "url": url
                    }
                },
                "fields": ["text"]
            }
            res = es.search(query,
                            index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                            doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
            hits = res['hits']
            try:
                results[url] = hits['hits'][0]['fields']['text'][0]
            except KeyError, e:
                print url, e, " not found in database"
            except IndexError, e:
                print url, e, " not found in database"
def get_image(url, output_path=""):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)

    if output_path:
        output_path = output_path + '/'

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail"]
        }
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        # Check the hit list itself, not the enclosing 'hits' dict
        if len(hits['hits']) > 0:
            img = base64.b64decode(hits['hits'][0]['fields']['thumbnail'][0])
            with open(output_path + urllib2.quote(url).replace("/", "%2F") + '.png', 'wb') as f:
                f.write(img)
        else:
            print "No thumbnail found"
def range(field, from_val, to_val, ret_fields=[], epoch=None, es_index='memex',
          es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if not (epoch is None):
        if epoch:
            from_val = datetime.utcfromtimestamp(long(from_val)).strftime('%Y-%m-%dT%H:%M:%S')
            to_val = datetime.utcfromtimestamp(long(to_val)).strftime('%Y-%m-%dT%H:%M:%S')

    query = {
        "query": {
            "range": {
                field: {
                    "from": from_val,
                    "to": to_val
                }
            },
        },
        "fields": ret_fields
    }

    res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
    hits = res['hits']['hits']

    results = []
    for hit in hits:
        results.append(hit['fields'])
    return results
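# A minimal usage sketch (the field name and timestamps below are assumptions,
# for illustration only): with epoch=True the numeric bounds are converted to
# the 'YYYY-MM-DDTHH:MM:SS' strings that the range filter above expects.
#
#     docs = range('retrieved', 1420070400, 1422748800,
#                  ret_fields=['url'], epoch=True)
#     # each entry in docs is the 'fields' dict of one matching page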
def search_posts_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')
    #for result in es.search("_type:post", index=token.lower())['hits']['hits']:
    #    print result["_source"]
    print es.search("id:sdifhsdihf", index="caacedeose0cban4zbmltsbcyxgzbzfrvq7uiqksk1uxep0njzgza7jtxei59ekp1izcjbg9czbum5qm0ojjuekaa3vwnn8tnxezcplgyaa2esvpi1dzcycai6xyvfwbrzco8quwns9orejsbecktw738yglnevljlqeascfgdfc0xdrjc1s0n40uun4ypytklsjarzand9gtfazdzd")
def analyze_post(token, text):
    response = {
        'post_now': False,
        'hours_to_wait': 1,
        'total_score': 0,
        'time_score': 0,
        'text_score': 0,
        'hint': "Building index",
    }
    try:
        data = Newsfeed.filter_only_posts_by_people(token)
    except Exception, e:
        es = ElasticSearch('http://localhost:9200/')
        try:
            es.create_index(token.lower())
            Newsfeed.newsfeed(token, [], 0, None, 1)
            t = threading.Thread(target=Newsfeed.newsfeed,
                                 args=(token, [], 0, None, 1500))
            t.setDaemon(True)
            t.start()
        except Exception, e:
            print e.message
def get_context(terms):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)

    if len(terms) > 0:
        query = {
            "query": {
                "match": {
                    "text": {
                        "query": ' and '.join(terms[0:]),
                        "operator": "and"
                    }
                }
            },
            "highlight": {
                "fields": {
                    "text": {
                        "fragment_size": 100,
                        "number_of_fragments": 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        print 'Document found: %d' % hits['total']
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'])
        return highlights
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)

    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def _check_es_health(product, env):
    if product is not None:
        hosts = [_url_for_host(env)]
    else:
        logging.info("No product specified ; Checking health of all Elasticsearch hosts for env '%s'\n" % env)
        all_hosts = set(product_host.values())
        hosts = []
        for host in all_hosts:
            hosts.append(_url_for_host(env, host))

    es = ElasticSearch(hosts, port=port)

    # Add check on elasticsearch health
    health = es.health()
    if health['status'] == 'red':
        logging.error("Elasticsearch status is red. Search will hang. Exiting\n")
        sys.exit(-1)
    elif health['status'] == 'yellow':
        logging.warning('Elasticsearch status is yellow. Search quality will be degraded\n')
def search(request, doc_type, search_args):
    """Search elastic search for any matches in the node's text"""
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': search_args.page * search_args.page_size,
        'size': search_args.page_size,
    }
    text_match = {'match': {'text': search_args.q, 'doc_type': doc_type}}
    if search_args.version or search_args.regulation:
        term = {}
        if search_args.version:
            term['version'] = search_args.version
        if search_args.regulation:
            term['regulation'] = search_args.regulation
        if search_args.is_root is not None:
            term['is_root'] = search_args.is_root
        if search_args.is_subpart is not None:
            term['is_subpart'] = search_args.is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields']
                                      for h in results['hits']['hits']])
    })
def get_image(url, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if url:
        query = {
            "query": {
                "term": {
                    "url": url
                }
            },
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']['hits']
        if len(hits) > 0:
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"

    return [None, None]
def search(q):
    """
    Implement search method with ElasticSearch
    """
    # Create connection
    es = ElasticSearch(ES_URL)

    # Get results from index
    results = es.search(
        {
            "query": {
                "query_string": {
                    "query": q
                }
            }
        },
        index=[ES_INDEX],
        doc_type=['watch']
    )

    return {
        'count': results['hits']['total'],
        'results': [hh.get('_source') for hh in results['hits']['hits']]
    }
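# A minimal usage sketch (ES_URL and ES_INDEX are module-level settings in the
# snippet above; the query string below is just an example of Lucene
# query_string syntax, not taken from the original code):
#
#     hits = search('title:elasticsearch AND status:active')
#     print hits['count'], 'matching watch documents'
#     for doc in hits['results']:
#         print doc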
def get_context(terms, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    if len(terms) > 0:
        query = {
            "query": {
                "match": {
                    "text": {
                        "query": ' and '.join(terms[0:]),
                        "operator": "and"
                    }
                }
            },
            "highlight": {
                "fields": {
                    "text": {
                        "fragment_size": 100,
                        "number_of_fragments": 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'][0])
        return highlights
def get_elasticsearch_connection():
    es_conn = env.get_service(label='elasticsearch-swarm-1.7.1')
    if es_conn:
        es = ElasticSearch(es_conn.get_url(url='uri'))
    else:
        es = ElasticSearch('http://localhost:9200')
    return es
def get_available_domains(es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")

    query = {
        "query": {
            "match_all": {}
        },
    }

    res = es.search(query, index='config', doc_type='domains', size=100)
    hits = res['hits']['hits']

    res = []
    for hit in hits:
        res.append(hit['_source'])

    for i in range(0, len(res)):
        res[i]['timestamp'] = long(convert_to_epoch(
            datetime.strptime(res[i]['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')))
        print datetime.utcfromtimestamp(res[i]['timestamp'])

    return res
class ItvacaturesParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "it-vacatures"
        # bind elasticsearch to es
        self.es = ElasticSearch("http://localhost:9200/")

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        p = re.compile(r"<.*?>")
        werkgever = p.sub("", str(infoTwee))
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        locatieEen = infoTwee.find_next()
        p = re.compile(r"<.*?>")
        locatieTwee = p.sub("", str(locatieEen))
        p = re.compile(r"Locatie")
        locatie = p.sub("", str(locatieTwee))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "job-description"})
        p = re.compile(r"<.*?>")
        inhoud = p.sub("", str(body))
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse the page
        titel = self.parseTitel(soup)
        try:
            werkgever = self.parseWerkgever(soup)
        except:
            werkgever = "-"
        try:
            locatie = self.parseLocatie(soup)
        except:
            locatie = "-"
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r"(?s)/\*.*\*/", "", websiteUrl)
        datum = time.strftime("%d-%m-%Y")

        # generate id (string)
        id = self.website + "-" + re.sub(r"\W+", "", titel)

        # make document to be sent to elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)

        # index (store) the vacancy in the ES database
        self.es.index("vacature-index", "vacature", document, id=document["id"])
        print "Es: " + titel
def __init__(self, config=None, es_instance=None):
    if es_instance:
        self.es = es_instance
    else:
        self.config = config
        self.excludes_fields = self.config['excludes_fields']
        self.es = ElasticSearch('http://{host}:{port}/'.format(
            host=self.config['host'], port=self.config['port']))
def _query_applications(indices):
    hosts = [_url_for_host(env)]
    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices,
                           query_params={'ignore_unavailable': 'true'})
    applications = map((lambda result: result['key']),
                       es_results['aggregations']['applications']['buckets'])
    return applications
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]
    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)
    applications = map((lambda result: result['term']),
                       es_results['facets']['applications']['terms'])
    return applications
class IitjobsParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "iitjobs"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        titel = titel.strip()
        return titel

    def parseWerkgever(self, soup):
        body = soup.find(
            "span", {"id": "ctl00_middleContent_idShowJobDetails_lblCompanyName"})
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(body))
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        body = soup.find(
            "span", {"id": "ctl00_middleContent_idShowJobDetails_lblCountryID"})
        p = re.compile(r'<.*?>')
        locatie = p.sub('', str(body))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "divJobDescrip"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        inhoud = inhoud.strip()
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse the page
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")

        # generate id for website (string)
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # make document to be sent to elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)

        # index (store) the vacancy in the ES database
        self.es.index('vacature-index', 'vacature', document, id=document['id'])
        print('Es: ' + titel)
def query(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    query = {"query": {"bool": {}}}

    # Building the query
    dict_value = dict(request.POST)
    for key in dict_value['query']:
        value = ast.literal_eval(key)
        AndQueries = []
        OrQueries = []
        for index, key in enumerate(value['exact_query']):
            if key['condition'] == 'is equal to':
                query_values = {"term": {key['column']: key['value']}}
            if key['condition'] == 'is less than':
                query_values = {"range": {key['column']: {"lt": key['value']}}}
            if key['condition'] == 'is greater than':
                query_values = {"range": {key['column']: {"gt": key['value']}}}
            if key['condition'] == 'is less than or equal to':
                query_values = {"range": {key['column']: {"lte": key['value']}}}
            if key['condition'] == 'is greater than or equal to':
                query_values = {"range": {key['column']: {"gte": key['value']}}}
            if key['condition'] == 'is not equal to':
                query_values = {"must_not": {"term": {key['column']: key['value']}}}

            if key['operation'] == 'and':
                AndQueries.append(query_values)
            if key['operation'] == 'or':
                OrQueries.append(query_values)
            if key['operation'] == '':
                if index < (len(value['exact_query']) - 1):
                    next_value = value['exact_query'][index + 1]
                    if next_value['operation'] == 'and':
                        AndQueries.append(query_values)
                    if next_value['operation'] == 'or':
                        OrQueries.append(query_values)
                else:
                    query['query']['bool']['must'] = query_values

        if len(AndQueries) != 0:
            query['query']['bool']['must'] = AndQueries
        if len(OrQueries) != 0:
            query['query']['bool']['should'] = OrQueries

    results = es.search(query, index=dict_value['index'][0], size=10000)
    return HttpResponse(json.dumps({'success': "Added successfully",
                                    'results': results}),
                        content_type="application/json")
def get_posts_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')
    r = []
    for result in es.search("_type:post", index=token.lower(), size=1000)['hits']['hits']:
        r.append(result["_source"])
    return r
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)
    es.bulk([es.index_op(doc) for doc in entries],
            index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
            doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
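# A minimal usage sketch (the field names below are assumptions, chosen only to
# match the 'memex'/'page' documents used elsewhere in these snippets):
#
#     add_document([
#         {'url': 'http://example.com/a', 'text': 'page text a'},
#         {'url': 'http://example.com/b', 'text': 'page text b'},
#     ])
#
# es.index_op() wraps each dict in a bulk "index" action, and es.bulk() sends
# all of the actions to Elasticsearch in a single request.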
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]
    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)
    applications = map((lambda result: result['term']),
                       es_results['facets']['applications']['terms'])
    return applications
def search(request, doc_type):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0

    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')

    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        if is_root:
            term['is_root'] = is_root
        if is_subpart:
            term['is_subpart'] = is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)

    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields']
                                      for h in results['hits']['hits']])
    })
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)
    es.bulk([es.index_op(doc) for doc in entries],
            index='memex', doc_type='page')
def query(self, query_dict, maxhits=10000):
    from pyelasticsearch import ElasticSearch

    if self.es_connection is None:
        es_server = self.server.rstrip('/') + ':9200'
        self.es_connection = ElasticSearch(es_server)

    results = self.es_connection.search(query_dict, index='encoded',
                                        doc_type=self.search_name,
                                        size=maxhits)
    return results
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):
    with open(config_file, "rb") as f:
        con = json.loads(f.read())
    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0], user, passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:
        # delete the existing index
        try:
            stamp = 0
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)
    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)
    perform_bulk_index(host, index_name, doc_type, documents,
                       docs_per_chunk, bytes_per_chunk, parallel)
    print "end:" + time.strftime(ISOTIMEFORMAT, time.localtime()) + '\n all records import complete.'
class ElasticSearchTestCase(unittest.TestCase):

    def setUp(self):
        self.conn = ElasticSearch('http://localhost:9200/')

    def tearDown(self):
        self.conn.delete_index("test-index")

    def assertResultContains(self, result, expected):
        for (key, value) in expected.items():
            self.assertEquals(value, result[key])
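# A minimal sketch of how this base class might be used; the test body, the
# document fields, and the 'test-type' doc type are assumptions for
# illustration (only the 'test-index' name comes from tearDown above):
#
#     class IndexingTestCase(ElasticSearchTestCase):
#         def test_index_and_get(self):
#             self.conn.index("test-index", "test-type", {"name": "Joe"}, id=1)
#             self.conn.refresh(index="test-index")
#             result = self.conn.get("test-index", "test-type", 1)
#             self.assertResultContains(result['_source'], {"name": "Joe"})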
def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.

    This deletes the indices that have the format version of the copy of DXR
    this runs under.

    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)

    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']

        try:
            es.delete_index(index.replace(" ", ""))
        except:
            pass
        es.create_index(index.replace(" ", ""))

        # Loop over the dataframe records and bulk-index them into Elasticsearch
        docs = json.loads(data_for_es.to_json(orient='records'))
        es.bulk((es.index_op(doc) for doc in docs),
                index=index.replace(" ", ""),
                doc_type=index)

        # Create segment template
        file_names = []
        for file in rawfiles:
            file_names.append(file.name)
        segment = Segments(name=index, files_added=",".join(file_names),
                           es_index=index.replace(" ", ""))
        segment.save()

    segment = Segments.objects.get(name=index)
    return render(request, 'analyse.html', {'segment': segment})
class IctergezochtParseStrategy(ParseStrategy.ParseStrategy):

    def __init__(self):
        self.website = "ictergezocht"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseWerkgever(self, soup):
        info = soup.find(class_="highlight")
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(info))
        return werkgever

    def parseLocatie(self, soup):
        infoTwee = soup.find(class_="bf")
        locatieEen = infoTwee.find_next()
        locatieTwee = locatieEen.find_next()
        locatieDrie = locatieTwee.find_next()
        locatieVier = locatieDrie.find_next()
        p = re.compile(r'<.*?>')
        locatieVijf = p.sub('', str(locatieVier))
        p = re.compile(r'Locatie')
        locatie = p.sub('', str(locatieVijf))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find(class_="vacancybody")
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        return inhoud

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)
        titel = self.parseTitel(soup)

        if titel.startswith("Vacature"):
            # parse the page
            werkgever = self.parseWerkgever(soup)
            locatie = self.parseLocatie(soup)
            inhoud = self.parseInhoud(soup)
            websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
            datum = time.strftime("%d-%m-%Y")

            # generate id for website (string)
            id = self.website + "-" + re.sub(r'\W+', '', titel)

            # make document to be sent to elasticsearch database
            document = self.makeDocument(id, titel, websiteUrl, self.website,
                                         datum, werkgever, locatie, "-", inhoud)

            # index (store) the vacancy in the ES database
            self.es.index('vacature-index', 'vacature', document, id=document['id'])
            print "Es: " + titel
def update_document(url, doc):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']

    es = ElasticSearch(es_server)
    try:
        es.update(index='memex', doc_type='page', id=url, script=doc)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        pass
def search(elastic_config, fqdn):
    pattern = elastic_config.index_pattern
    lookback = elastic_config.lookback
    indices = common.get_indexes(lookback, pattern)
    hosts = elastic_config.hosts
    port = elastic_config.port
    username = elastic_config.username
    password = elastic_config.password
    environment = elastic_config.environment

    es = ElasticSearch(hosts, port=port, username=username, password=password)
    #try:
    doc = es.search(common.build_query(fqdn, environment), index=indices)
    return doc, fqdn
def update_process_datetime(doc_id, timestamp):
    ''' Updates the last_update_date for the document id passed into function.
        The document id in will be the name of another index in the cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {}
    _map[_type] = {}
    _map[_type]['properties'] = {}
    _map[_type]['properties']['last_update_date'] = {}
    _map[_type]['properties']['last_update_date']['type'] = 'date'
    _map[_type]['properties']['last_update_date']['format'] = 'dateOptionalTime'

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError as e:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except:
        logging.fatal('Could not create the mapping')

    new_doc = {}
    new_doc['last_update_date'] = timestamp
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
def send(self, messages):
    if self.type == '@type':
        self.type = messages[0].get('@type')
        logger.debug('Type is \'@type\' - setting it to %r', self.type)
    es = ElasticSearch('http://%s:%s' % (self.host, self.port))

    now = datetime.utcnow()
    index = now.strftime('logstash-%Y.%m.%d')

    result = es.bulk_index(index=index, doc_type=self.type, docs=messages)
    logger.debug('Elasticsearch bulk_index run returned with:\n\n%s\n',
                 pformat(result))
    return True
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)
            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)    # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query, index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree', size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])