def flush(docs):
    """Post buffered movie docs to Solr (commitWithin 1500ms) and empty the buffer.

    Closure: reads `self` and `index` from the enclosing scope.
    """
    print('Flushing {} movies'.format(len(docs)))
    update_url = '{}/{}/update?commitWithin=1500'.format(self.solr_base_ep, index)
    resp = requests.post(update_url, json=docs)
    resp_msg(msg="Done", resp=resp)
    # Buffer is cleared in place so the caller can keep appending to the same list.
    docs.clear()
def create_index(self, index):
    """Take the local Elasticsearch config files for `index` and reload them into ES."""
    settings_path = os.path.join(self.configs_dir, "%s_settings.json" % index)
    with open(settings_path) as src:
        settings = json.load(src)
    resp = self.es.indices.create(index, body=settings)
    resp_msg(msg="Created index {}".format(index), resp=ElasticResp(resp))
def create_featureset(self, index, name, ftr_config):
    """Validate `ftr_config` and upload it to Solr's feature store for `index`.

    Fix: the original passed `name` as a third argument to a format string
    with only two placeholders — it was silently ignored. The feature-store
    endpoint takes the store name from the payload, not the URL.
    """
    self.validate_featureset(name, ftr_config)
    resp = requests.put(
        '{}/{}/schema/feature-store'.format(self.solr_base_ep, index),
        json=ftr_config)
    resp_msg(msg='Created {} feature store under {}:'.format(name, index), resp=resp)
def submit_model(self, featureset, index, model_name, solr_model):
    """Replace `model_name` in Solr's model store: delete any old entry, then PUT the new one."""
    store_url = '{}/{}/schema/model-store'.format(self.solr_base_ep, index)
    delete_resp = requests.delete('{}/{}'.format(store_url, model_name))
    resp_msg(msg='Deleted Model {}'.format(model_name), resp=delete_resp)
    create_resp = requests.put(store_url, json=solr_model)
    resp_msg(msg='Created Model {}'.format(model_name), resp=create_resp)
def model_query(self, index, model, model_params, query):
    """Run `query` against `index`, rescoring the top 1000 hits with LTR `model`."""
    rescore_clause = {
        "window_size": 1000,
        "query": {
            "rescore_query": {
                "sltr": {
                    "params": model_params,
                    "model": model
                }
            }
        }
    }
    body = {"query": query, "rescore": rescore_clause, "size": 1000}
    resp = self.es.search(index, body=body)
    resp_msg(msg="Searching {} - {}".format(index, str(query)[:20]), resp=SearchResp(resp))

    # Transform to consistent format between ES/Solr.
    return [hit['_source'] for hit in resp['hits']['hits']]
def create_index(self, index, settings):
    """Create a Solr core named `index` via the cores admin API.

    NOTE(review): `settings` is accepted but never used, and the configSet is
    hard-coded to 'tmdb' (unlike the sibling that uses configSet=index) —
    confirm this is intentional before changing it.
    """
    core_params = {
        'action': 'CREATE',
        'name': index,
        'configSet': 'tmdb'
    }
    resp = requests.get('{}/admin/cores?'.format(self.solr_base_ep), params=core_params)
    resp_msg(msg="Created index {}".format(index), resp=resp)
def reset_ltr(self, index):
    """Drop and re-initialize the default LTR feature store in Elasticsearch.

    The delete is best-effort (throw=False) so a fresh cluster doesn't error.
    """
    resp = requests.delete(self.elastic_ep)
    resp_msg(msg="Removed Default LTR feature store", resp=resp, throw=False)
    resp = requests.put(self.elastic_ep)
    resp_msg(msg="Initialize Default LTR feature store", resp=resp)
def submit_model(self, featureset, index, model_name, model_payload):
    """Install `model_payload` under `featureset` in the ES LTR plugin, replacing any prior model."""
    model_ep = "{}/_model/".format(self.elastic_ep)
    create_ep = "{}/_featureset/{}/_createmodel".format(self.elastic_ep, featureset)
    # Best-effort delete of an existing model; status is only printed, not checked.
    del_resp = requests.delete('{}{}'.format(model_ep, model_name))
    print('Delete model {}: {}'.format(model_name, del_resp.status_code))
    create_resp = requests.post(create_ep, json=model_payload)
    resp_msg(msg="Created Model {}".format(model_name), resp=create_resp)
def reset_ltr(self):
    """Delete a fixed set of LTR models and feature stores from the 'tmdb' core.

    NOTE(review): model/store names and the core are hard-coded here; the
    sibling reset_ltr(index) variant discovers them dynamically.
    """
    for model in ['classic', 'genre', 'latest', 'title', 'title_fuzzy']:
        resp = requests.delete('{}/tmdb/schema/model-store/{}'.format(self.solr_base_ep, model))
        resp_msg(msg='Deleted {} model'.format(model), resp=resp)

    for store in ['_DEFAULT', 'genre', 'release', 'title', 'title_fuzzy']:
        resp = requests.delete('{}/tmdb/schema/feature-store/{}'.format(self.solr_base_ep, store))
        resp_msg(msg='Deleted {} Featurestore'.format(store), resp=resp)
def reset_ltr(self, index):
    """Remove every LTR model and feature store currently registered for `index`."""
    for model_name in self.get_models(index):
        resp = requests.delete(
            '{}/{}/schema/model-store/{}'.format(self.solr_base_ep, index, model_name))
        resp_msg(msg='Deleted {} model'.format(model_name), resp=resp)

    for store_name in self.get_feature_stores(index):
        resp = requests.delete(
            '{}/{}/schema/feature-store/{}'.format(self.solr_base_ep, index, store_name))
        resp_msg(msg='Deleted {} Featurestore'.format(store_name), resp=resp)
def delete_index(self, index):
    """Unload Solr core `index` and delete its index/data/instance dirs; missing core tolerated."""
    unload_params = {
        'action': 'UNLOAD',
        'core': index,
        'deleteIndex': 'true',
        'deleteDataDir': 'true',
        'deleteInstanceDir': 'true'
    }
    resp = requests.get('{}/admin/cores?'.format(self.solr_base_ep), params=unload_params)
    # throw=False: deleting a nonexistent core is not an error for callers.
    resp_msg(msg="Deleted index {}".format(index), resp=resp, throw=False)
def create_index(self, index):
    """Create Solr core `index` using the configSet of the same name.

    Presumes there is a link between the docker container and the 'index'
    directory under docker/solr/ (i.e. docker/solr/tmdb/ is linked into the
    Docker container's configsets).
    """
    create_params = {
        'action': 'CREATE',
        'name': index,
        'configSet': index,
    }
    resp = requests.get('{}/admin/cores?'.format(self.solr_base_ep), params=create_params)
    resp_msg(msg="Created index {}".format(index), resp=resp)
def model_query(self, index, model, model_params, query):
    """Search `index` reranked by LTR `model`; return the result docs.

    Fix: `model_params` was accepted but silently dropped. They are now
    forwarded as efi.* external feature information in the !ltr rq clause,
    matching the efi syntax already used by log_query and the behavior of
    the ES counterpart (which passes them as sltr params). With an empty
    `model_params` the request is byte-identical to before.
    """
    url = '{}/{}/select?'.format(self.solr_base_ep, index)
    rq_parts = ['model={}'.format(model)]
    for key, val in (model_params or {}).items():
        rq_parts.append('efi.{}="{}"'.format(key, val))
    params = {
        'q': query,
        'rq': '{{!ltr {}}}'.format(' '.join(rq_parts)),
        'rows': 10000
    }

    resp = requests.post(url, data=params)
    resp_msg(msg='Search keywords - {}'.format(query), resp=resp)
    return resp.json()['response']['docs']
def log_query(self, index, featureset, ids, params=None):
    """Log feature values for `featureset` against docs in `index`.

    ids: optional list of document _ids to restrict logging to; None logs
        against the unfiltered (filter-only) query.
    params: optional dict of sltr query-time params (efi.*).

    Fix: the original used a mutable default (`params={}`) and then rebound
    the same name to the request body, shadowing the argument. The default
    is now None and the body has its own name.

    Returns a list of _source dicts, each with an added 'ltr_features' list
    of float feature values (missing values logged as 0.0).
    """
    sltr_params = {} if params is None else params
    body = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "sltr": {
                            "_name": "logged_features",
                            "featureset": featureset,
                            "params": sltr_params
                        }
                    }
                ]
            }
        },
        "ext": {
            "ltr_log": {
                "log_specs": {
                    "name": "ltr_features",
                    "named_query": "logged_features"
                }
            }
        },
        "size": 1000
    }

    terms_query = [
        {
            "terms": {
                "_id": ids
            }
        }
    ]
    if ids is not None:
        # Restrict logging to the requested document ids.
        body["query"]["bool"]["must"] = terms_query

    resp = self.es.search(index, body=body)
    resp_msg(msg="Searching {} - {}".format(index, str(terms_query)[:20]),
             resp=SearchResp(resp))

    matches = []
    for hit in resp['hits']['hits']:
        # Feature values come back in fields._ltrlog; absent value -> 0.0.
        hit['_source']['ltr_features'] = [
            feature.get('value', 0.0)
            for feature in hit['fields']['_ltrlog'][0]['ltr_features']
        ]
        matches.append(hit['_source'])
    return matches
def query(self, index, query):
    """Execute a raw ES query body against `index`; return hit sources with '_score' copied in."""
    resp = self.es.search(index, body=query)
    resp_msg(msg="Searching {} - {}".format(index, str(query)[:20]), resp=SearchResp(resp))

    # Transform to consistent format between ES/Solr.
    matches = []
    for hit in resp['hits']['hits']:
        source = hit['_source']
        source['_score'] = hit['_score']
        matches.append(source)
    return matches
def index_documents(self, index, movie_source):
    """Bulk-index movie dicts into `index` (doc type 'movie', _id taken from movie['id'])."""
    def to_bulk_actions(movies):
        # One index action per movie, streamed lazily to the bulk helper.
        for movie in movies:
            yield {
                "_index": index,
                "_type": "movie",
                "_id": movie['id'],
                "_source": movie
            }

    resp = elasticsearch.helpers.bulk(self.es, to_bulk_actions(movie_source), chunk_size=100)
    resp_msg(msg="Streaming Bulk index DONE {}".format(index), resp=BulkResp(resp))
def query(self, index, query):
    """POST a raw query dict to Solr's select handler for `index`; return the docs."""
    select_url = '{}/{}/select?'.format(self.solr_base_ep, index)
    resp = requests.post(select_url, data=query)
    resp_msg(msg='Query {}...'.format(str(query)[:10]), resp=resp)
    body = resp.json()

    # Transform to be consistent with the ES client: mirror 'score' as '_score'.
    docs = body['response']['docs']
    for doc in docs:
        if 'score' in doc:
            doc['_score'] = doc['score']
    return docs
def feature_set(self, index, name):
    """Fetch Solr feature store `name` for `index`.

    Returns (mapping, raw_features): mapping is [{'name': ...}, ...] in
    store order; raw_features is the store's full feature list as returned.
    """
    resp = requests.get('{}/{}/schema/feature-store/{}'.format(
        self.solr_base_ep, index, name))
    resp_msg(msg='Feature Set {}...'.format(name), resp=resp)

    raw_features = resp.json()['features']
    mapping = [{'name': feature['name']} for feature in raw_features]
    return mapping, raw_features
def index_documents(self, index, doc_type, doc_src):
    """Bulk-index docs from `doc_src` into `index` as `doc_type`.

    Each doc must carry an 'id' field used as the ES _id; a missing 'id'
    aborts the stream with ValueError.
    """
    def to_bulk_actions(docs):
        for doc in docs:
            if 'id' not in doc:
                raise ValueError("Expecting docs to have field 'id' that uniquely identifies document")
            yield {
                "_index": index,
                "_type": doc_type,
                "_id": doc['id'],
                "_source": doc
            }

    resp = elasticsearch.helpers.bulk(self.es, to_bulk_actions(doc_src), chunk_size=100)
    resp_msg(msg="Streaming Bulk index DONE {}".format(index), resp=BulkResp(resp))
def feature_set(self, index, name):
    """Fetch featureset `name` from the ES LTR store.

    Raises RuntimeError when the featureset does not exist. Returns
    (mapping, raw_features) in the same shape as the Solr client.
    """
    resp = requests.get('{}/_featureset/{}'.format(self.elastic_ep, name))
    payload = resp.json()
    if not payload['found']:
        raise RuntimeError("Unable to find {}".format(name))
    resp_msg(msg="Fetched FeatureSet {}".format(name), resp=resp)

    raw_features = payload['_source']['featureset']['features']
    mapping = [{'name': feature['name']} for feature in raw_features]
    return mapping, raw_features
def submit_model(self, featureset, index, model_name, model_payload):
    """Wrap `model_payload` as a model/ranklib definition and install it under `featureset`."""
    model_ep = "{}/_model/".format(self.elastic_ep)
    create_ep = "{}/_featureset/{}/_createmodel".format(self.elastic_ep, featureset)
    # Best-effort delete of any existing model with this name; status only printed.
    del_resp = requests.delete('{}{}'.format(model_ep, model_name))
    print('Delete model {}: {}'.format(model_name, del_resp.status_code))

    body = {
        'model': {
            'name': model_name,
            'model': {
                'type': 'model/ranklib',
                'definition': model_payload
            }
        }
    }
    resp = requests.post(create_ep, json=body)
    resp_msg(msg="Created Model {}".format(model_name), resp=resp)
def submit_ranklib_model(self, featureset, index, model_name, model_payload):
    """Submit a Ranklib model, converting it to Solr representation.

    Fix: removed a dead `feature_dict` that was built from the feature-store
    response but never read — the mapping actually used comes from
    feature_set(). The GET is kept: resp_msg logs it and surfaces HTTP
    errors before any conversion work happens.
    """
    resp = requests.get('{}/{}/schema/feature-store/{}'.format(
        self.solr_base_ep, index, featureset))
    resp_msg(msg='Submit Model {} Ftr Set {}'.format(model_name, featureset), resp=resp)

    feature_mapping, _ = self.feature_set(index, featureset)
    solr_model = convert(model_payload, model_name, featureset, feature_mapping)
    self.submit_model(featureset, index, model_name, solr_model)
def log_query(self, index, featureset, ids, options=None, id_field='id'):
    """Log feature values for `featureset` against docs in Solr `index`.

    ids: optional list of doc ids (matched via {!terms} on `id_field`);
        None logs against all docs (*:*).
    options: optional dict forwarded as efi.* external feature information.
    id_field: unique-key field name used for the terms filter and fl.

    Fixes: mutable default argument (`options={}`) replaced with None;
    `ids == None` replaced with the identity check `ids is None`.

    Returns the response docs, each with an added 'ltr_features' float list.
    """
    opts = {} if options is None else options
    efi_options = ['efi.{}="{}"'.format(key, val) for key, val in opts.items()]
    efi_str = ' '.join(efi_options)

    if ids is None:
        query = "*:*"
    else:
        query = "{{!terms f={}}}{}".format(id_field, ','.join(ids))
    print(query)

    params = {
        'fl': '{},[features store={} {}]'.format(id_field, featureset, efi_str),
        'q': query,
        'rows': 1000,
        'wt': 'json'
    }
    resp = requests.post('{}/{}/select'.format(self.solr_base_ep, index), data=params)
    resp_msg(msg='Searching {}'.format(index), resp=resp)
    resp = resp.json()

    def parseFeatures(features):
        # "[features]" arrives as "name1=0.5,name2=1.0,..."; keep the values.
        fv = []
        for feature in features.split(','):
            elements = feature.split('=')
            fv.append(float(elements[1]))
        return fv

    # Clean up features to consistent format.
    for doc in resp['response']['docs']:
        doc['ltr_features'] = parseFeatures(doc['[features]'])
    return resp['response']['docs']
def submit_model(self, featureset, model_name, model_payload):
    """Convert a Ranklib model for `featureset` and (re)install it into the 'tmdb' core.

    Fix: removed a dead `feature_dict` that was built from the feature-store
    response but never read. The initial GET is kept so resp_msg logs it and
    surfaces HTTP errors before conversion.

    NOTE(review): the core name 'tmdb' is hard-coded here, unlike the
    index-parameterized sibling — confirm before generalizing.
    """
    # Fetch feature metadata (logged; errors raised via resp_msg).
    resp = requests.get('{}/tmdb/schema/feature-store/{}'.format(
        self.solr_base_ep, featureset))
    resp_msg(msg='Submit Model {} Ftr Set {}'.format(model_name, featureset), resp=resp)

    feature_mapping, _ = self.feature_set('tmdb', featureset)
    solr_model = convert(model_payload, model_name, featureset, feature_mapping)

    # Delete any existing model first, then PUT the converted one.
    url = '{}/tmdb/schema/model-store'.format(self.solr_base_ep)
    resp = requests.delete('{}/{}'.format(url, model_name))
    resp_msg(msg='Deleted Model {}'.format(model_name), resp=resp)
    resp = requests.put(url, json=solr_model)
    resp_msg(msg='Created Model {}'.format(model_name), resp=resp)
def create_featureset(self, index, name, ftr_config):
    """POST `ftr_config` to the ES LTR plugin as featureset `name`."""
    featureset_url = '{}/_featureset/{}'.format(self.elastic_ep, name)
    resp = requests.post(featureset_url, json=ftr_config)
    resp_msg(msg="Create {} feature set".format(name), resp=resp)
def create_index(self, index, settings):
    """Create ES index `index` with the given settings/mappings body."""
    create_resp = self.es.indices.create(index, body=settings)
    resp_msg(msg="Created index {}".format(index), resp=ElasticResp(create_resp))
def commit():
    """Issue an explicit Solr commit for `index`.

    Closure: reads `self` and `index` from the enclosing scope.
    """
    commit_url = '{}/{}/update?commit=true'.format(self.solr_base_ep, index)
    resp = requests.get(commit_url)
    resp_msg(msg="Committed index {}".format(index), resp=resp)
def create_index(self, index):
    """Take the local Elasticsearch config files for `index` and reload them into ES.

    NOTE(review): settings path is hard-coded relative to the working dir
    ('docker/elasticsearch/...') — must be run from the repo root.
    """
    settings_path = "docker/elasticsearch/%s_settings.json" % index
    with open(settings_path) as src:
        settings = json.load(src)
    resp = self.es.indices.create(index, body=settings)
    resp_msg(msg="Created index {}".format(index), resp=ElasticResp(resp))
def delete_index(self, index):
    """Delete ES index `index`; 400/404 responses are tolerated rather than raised."""
    del_resp = self.es.indices.delete(index=index, ignore=[400, 404])
    resp_msg(msg="Deleted index {}".format(index), resp=ElasticResp(del_resp), throw=False)
def flush(docs):
    """Send buffered docs to Solr's update handler.

    Closure: reads `self` and `index` from the enclosing scope; posts via
    the instance's `self.solr` session.
    """
    update_url = '{}/{}/update'.format(self.solr_base_ep, index)
    resp = self.solr.post(update_url, json=docs)
    resp_msg(msg="{} Docs Sent".format(len(docs)), resp=resp)