def simple_query(page):
    d = dict()
    query = buildQuery()
    print(query)
    solr = SolrClient(current_app.config['SOLR'])
    res = solr.query('scripties', {
        'q': query,
        'rows': '0',
    })
    count = res.get_num_found()
    pages = math.ceil(count / 10)
    start = (page - 1) * 10
    res = solr.query('scripties', {
        'q': query,
        'rows': '10',
        'start': start,
        'fl': 'id,titel,auteur,jaar',
        'facet': True,
        'facet.field': ['jaar', 'type', 'faculteit'],
    })
    facets = res.get_facets()
    d['result'] = res
    d['pages'] = pages
    d['page'] = page
    d['f_jaar'] = facets['jaar']
    d['f_type'] = facets['type']
    d['f_faculteit'] = collect(facets['faculteit'])
    d['f'] = request.args.get('faculteit')
    d['j'] = request.args.get('jaar')
    d['t'] = request.args.get('type')
    return d
def test_solr_to_solr_reindexer_per_shard(self):
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(
        source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date"
    )
    # Make sure only source has data
    self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
    reindexer.reindex()
    # Sloppy check over here, will improve later
    self.assertEqual(
        len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
        len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
    )
def get_tags_by_pmi(target_tag, solr: SolrClient, core_name="tags"):
    # http://localhost:8983/solr/tags/select?indent=on&q=tag_text:banmuslims%20AND%20type:1&wt=json
    rows = 100  # 100 results per page
    stop = False
    start = 0
    q = 'tag_text:' + target_tag + ' AND type:1'  # 0=single tag; 1=tag pairs
    # We need tags similar to this target, so fetch all tag pairs and process them
    while not stop:
        res = solr.query(core_name, {
            'q': q,
            'rows': rows,
            'fl': '*',          # return all fields from the index (when available)
            'start': start,     # start from
            'sort': 'pmi desc'  # sort by pmi descending
        })
        start += rows  # advancing start moves to the next page; for a specific page, start = pagenum * rows
        print("total number found={}".format(res.num_found))
        if start > res.num_found:
            stop = True
        # Now go through every result on the current page
        for d in res.docs:  # res.docs only contains documents on the CURRENT page
            tags = d['tag_text'].split(" ")
            relevant_tag = tags[0]
            if relevant_tag == target_tag:
                relevant_tag = tags[1]
            print(relevant_tag + ", pmi=" + str(d['pmi']))
def suggest():
    query_key = request.args.get('query')
    solr = SolrClient('http://localhost:8983/solr')
    res = solr.query('myexample', {
        'q': query_key,
    }, 'suggest')
    return json.dumps(res.data['suggest']['suggest'][query_key]['suggestions'])
def search(query_dict):
    # pdb.set_trace()
    # Instantiate solr connection
    solr = SolrClient('http://localhost:8983/solr')
    # Generic search if no query input given
    if len(query_dict) == 0:
        query_string = '*:*'
    # Otherwise retrieve each field value and prepare a query string
    else:
        query_string = ''
        query_op = ' AND '
        item_count = 0
        for key in query_dict:
            if len(query_dict[key]) > 0:
                if item_count > 0:
                    query_string = query_string + query_op + key + ':' + query_dict[key]
                else:
                    query_string = query_string + key + ':' + query_dict[key]
                item_count += 1
    res = solr.query('lyrics', {
        'q': query_string,
    })
    return res.data['response']['docs']
def update(solr: SolrClient, tweet_core_name, tag_core_name, timespan, rows,
           feat_vectorizer, ml_model, selected_features, hate_indicative_features,
           scaling_option, sysout, logger):
    stop = False
    start = 0
    while not stop:
        logger.warn("Processing from {} for a batch of {}".format(start, rows))
        print("Processing from {} for a batch of {}".format(start, rows))
        res = solr.query(tweet_core_name, {
            'q': 'created_at:' + timespan,
            'rows': rows,
            'fl': '*',
            'start': start,
            'sort': 'id asc'
        })
        start += rows
        if start > res.num_found:
            stop = True
        # Apply the pretrained ML model to tag the data and update the index
        update_ml_tag(solr, tweet_core_name, tag_core_name, res.docs,
                      feat_vectorizer, ml_model, selected_features,
                      hate_indicative_features, scaling_option, sysout, logger)
def get_tweets_by_time(timespan, solr: SolrClient, core_name="tweets"):
    rows = 100  # 100 results per page
    stop = False
    start = 0
    facet_counts = None
    q = 'created_at:' + timespan + ' AND ml_tag:0'
    while not stop:
        res = solr.query(core_name, {
            'q': q,                             # remember we only show tweets tagged as hate (0)
            'facet.field': 'entities_hashtag',  # count results per facet (NOTE: not every tweet has a hashtag, but this is ok)
            'facet': "on",                      # switch on facet search
            'facet.mincount': "1",              # show facets that have at least 1 result
            'rows': rows,
            'fl': '*',                          # return all fields from the index (when available)
            'start': start,                     # start from
            'sort': 'tweet_risk desc'})         # sort by risk score descending
        start += rows  # advancing start moves to the next page; for a specific page, start = pagenum * rows
        print("total number found={}".format(res.num_found))
        if start > res.num_found:
            stop = True
        # Assign facet results to another var; facet counts are for the whole dataset, not just this page
        if facet_counts is None:
            facet_counts = res.data['facet_counts']['facet_fields']['entities_hashtag']
        # Now go through every page, every result
        for d in res.docs:  # res.docs only contains documents on the CURRENT page
            print("https://twitter.com/" + d['user_screen_name'] + "/" + d['id'])
            if 'coordinates' in d.keys():
                print(d['coordinates'])
    # Finally print facet counts
    print(facet_counts)
def detail_query(key):
    solr = SolrClient(current_app.config['SOLR'])
    q = 'id:{}'.format(key)
    res = solr.query('scripties', {
        'q': q,
        'fl': 'titel,auteur,jaar,supervisor,type,faculteit,opleiding,taal',
    })
    return res
def get_solr():
    solr = SolrClient(current_app.config['SOLR'])
    res = solr.query('scripties', {
        'q': 'titel:muslim',
        'facet': True,
        'facet.field': 'taal',
    })
    return res.get_results_count()
def get_tag_riskscore(solr: SolrClient, core_name, tag):
    if tag[0] == '#':
        tag = tag[1:]
    tag = tag.lower()
    res = solr.query(core_name, {
        'q': 'id:' + tag,
        'fl': iu.tag_index_field_risk_score
    })
    for d in res.docs:
        score = d[iu.tag_index_field_risk_score]
        return score
    return 0.0
def read_all():
    client = SolrClient('http://localhost:8983/solr')
    res = client.query('test', {
        'q': '*:*'
    })
    res = json.loads(res.get_json())
    docs = res['response']['docs']
    for doc in docs:
        print(doc)
def test_index_multiproc(self):
    index = IndexQ(test_config['indexqbase'], 'testq')
    solr = SolrClient(test_config['SOLR_SERVER'], devel=True, auth=test_config['SOLR_CREDENTIALS'])
    solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    buff = []
    files = []
    for doc in self.docs:
        files.append(index.add(doc, finalize=True))
    index.index(solr, test_config['SOLR_COLLECTION'], threads=10)
    solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
    for doc in self.docs:
        res = solr.query(test_config['SOLR_COLLECTION'], {'q': 'id:{}'.format(doc['id'])})
        self.assertTrue(res.get_results_count() == 1)
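# Hedged sketch of the same IndexQ flow outside a test: queue documents to disk,
# then push them to Solr in parallel. The base path, queue name, Solr URL and
# collection name below are hypothetical placeholders; the IndexQ calls mirror
# the ones used in the test above.
from SolrClient import SolrClient, IndexQ

queue = IndexQ('/tmp/indexq', 'example_queue')                 # hypothetical queue location
client = SolrClient('http://localhost:8983/solr')              # hypothetical Solr URL
for doc in [{'id': '1'}, {'id': '2'}]:
    queue.add(doc, finalize=True)                              # write each doc out as its own queue file
queue.index(client, 'example_collection', threads=2)           # index queued files with 2 worker threads
client.commit('example_collection', openSearcher=True)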
class SolrSearch(object):

    def __init__(self, core, url, limit=3):
        self.core = core
        self.url = url
        self.limit = limit
        self.solr = SolrClient(url)

    def query(self, question):
        passages = set()
        res = self.solr.query(self.core, {
            'q': 'context_text:{}'.format(question)
        })
        for doc in res.docs:
            passages.add(doc['context_text'][0])
        return list(passages)[:self.limit]
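# Minimal usage sketch for the SolrSearch wrapper above. The Solr URL, core name
# and question text are hypothetical placeholders.
if __name__ == '__main__':
    searcher = SolrSearch(core='qa_passages', url='http://localhost:8983/solr', limit=3)
    for passage in searcher.query('what is solr'):
        print(passage)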
def get_tweets_by_time_and_tag(tag, timespan, solr: SolrClient, core_name="tweets"):
    rows = 100  # 100 results per page
    stop = False
    start = 0
    facet_counts = None
    res = solr.query(core_name, {
        'q': 'created_at:' + timespan + ' AND ml_tag:0' + ' AND entities_hashtag:' + tag,  # only tweets tagged as hate (0)
        'facet.field': 'entities_hashtag',  # count results per facet (NOTE: not every tweet has a hashtag, but this is ok)
        'facet': "on",                      # switch on facet search
        'facet.mincount': "1",              # show facets that have at least 1 result
        'rows': rows,
        'fl': '*',                          # return all fields from the index (when available)
        'start': start,                     # start from
        'sort': 'tweet_risk desc'})         # sort by risk score descending
def get_latest_update(url, collection, query):
    dttm = None
    solr = SolrClient(url)
    res = solr.query(collection, {
        'q': query,
        'rows': 1,
        'sort': 'system_mtime desc'
    })
    pp.pprint(res.get_results_count())
    if res.get_results_count() == 1:
        pp.pprint(res.docs[0]['system_mtime'])
        date = res.docs[0]['system_mtime']
        dttm = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
        pp.pprint(dttm)
    return dttm
def computeScores1(type, query, output_file):
    solr = SolrClient('http://localhost:8983/solr')
    res = solr.query(query['index'], {
        'q': '*:*',
        'wt': 'json',
        'indent': True,
        'rows': 1000,
    })
    docs = res.data['response']['docs']
    with open(output_file, "wb") as outF:
        a = csv.writer(outF, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        a.writerow(["type", "x-coordinate", "y-coordinate", "Similarity_score"])
        for doc in docs:
            for key in doc:
                if key in ["id", "_version_"]:
                    continue
                try:
                    doc[key] = doc[key][0].encode("ascii", "ignore")
                except:
                    doc[key] = str(doc[key][0]).decode("unicode_escape").encode("ascii", "ignore")
        doc_tuples = itertools.combinations(docs, 2)
        for raw1, raw2 in doc_tuples:
            doc1 = raw1.copy()
            doc2 = raw2.copy()
            if "Name" in doc1:
                row_cosine_distance = [type, doc1["Name"], doc2["Name"]]
            else:
                row_cosine_distance = [type, doc1["name"], doc2["name"]]
            v1 = Vector(row_cosine_distance[0], doc1)
            v2 = Vector(row_cosine_distance[1], doc2)
            row_cosine_distance.append(v1.cosTheta(v2))
            a.writerow(row_cosine_distance)
def readDocumentsFromSolr(numberofrecords):
    contents = []
    solr = SolrClient('http://52.41.35.204:8983/solr')
    res = solr.query('tennisCollection', {'q': '*:*', 'rows': numberofrecords})
    for doc in res.docs:
        processedDocument = {}
        processedDocument['id'] = ''
        processedDocument['title'] = ''
        processedDocument['content'] = ''
        if 'id' in doc:
            processedDocument['id'] = doc['id']
        if 'title' in doc:
            processedDocument['title'] = doc['title']
        if 'content' in doc:
            processedDocument['content'] = doc['content']
        contents.append(processedDocument)
    return contents
def get_tweets_by_coordinates(lat, lon, range, timespan, solr: SolrClient, core_name="tweets"):
    lat_min = lat - range
    lat_max = lat + range
    lon_min = lon - range
    lon_max = lon + range
    rows = 100  # 100 results per page
    stop = False
    start = 0
    facet_counts = None
    res = solr.query(core_name, {
        'q': 'created_at:' + timespan + ' AND ml_tag:0'
             + ' AND coordinate_lat:' + '[{} TO {}]'.format(lat_min, lat_max)
             + ' AND coordinate_lon:' + '[{} TO {}]'.format(lon_min, lon_max),
        'facet.field': 'entities_hashtag',  # count results per facet (NOTE: not every tweet has a hashtag, but this is ok)
        'facet': "on",                      # switch on facet search
        'facet.mincount': "1",              # show facets that have at least 1 result
        'rows': rows,
        'fl': '*',                          # return all fields from the index (when available)
        'start': start,                     # start from
        'sort': 'tweet_risk desc'})         # sort by risk score descending
def get_existing(solr: SolrClient, core_name, pagesize):
    stop = False
    start = 0
    tags = {}
    tag_pairs = {}
    while not stop:
        res = solr.query(core_name, {
            'q': '*:*',
            'rows': pagesize,
            'fl': '*',
            'start': start})
        start += pagesize
        if start > res.num_found:
            stop = True
        for d in res.docs:
            if d['type'] == '0':
                # single tag
                tags[d['id']] = d
            else:
                tag_pairs[d['id']] = d
    return tags, tag_pairs
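# Hedged alternative sketch: the start/rows paging used above can skip or repeat
# documents if the index changes mid-scan; Solr's cursorMark deep paging avoids
# that, and is what the Reindexer in this document uses internally. The URL and
# core name are hypothetical placeholders, and the collection is assumed to have
# a unique 'id' field (required for cursorMark sorting).
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')   # hypothetical URL
cursor = '*'
while True:
    res = solr.query('tags', {'q': '*:*', 'rows': 100, 'sort': 'id asc', 'cursorMark': cursor})
    for doc in res.docs:
        pass  # process doc here
    next_cursor = res.data['nextCursorMark']
    if next_cursor == cursor:                      # cursor stops advancing when all docs are read
        break
    cursor = next_cursor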
def get(self, request: HttpRequest, lang='en', keys=''):
    lang = request.LANGUAGE_CODE
    uuid_regex = self.uuid_pattern()
    uuids = keys.split(',')
    valid_uuids = []
    for uuid in uuids:
        if uuid_regex.match(uuid):
            valid_uuids.append(uuid)
    keys = ",".join(valid_uuids)

    # Get the titles
    solr = SolrClient(settings.OPEN_DATA_SOLR_SERVER_URL)
    q_text = " OR id:".join(valid_uuids)
    q_text = "id:" + q_text
    solr_query = {'q': q_text, 'defType': 'edismax', 'sow': True}
    solr_response = solr.query(settings.OPEN_DATA_CORE, solr_query)
    titles = {}
    title_field = 'title_fr_s' if lang == "fr" else 'title_en_s'
    for doc in solr_response.docs:
        titles[doc['id']] = doc[title_field]

    context = {
        "language": lang,
        "keys": keys,
        "titles": titles,
        "open_data_url": settings.OPEN_DATA_BASE_URL_FR if lang == "fr" else settings.OPEN_DATA_BASE_URL_EN,
        "rcs_config": 'ramp/config.rcs.fr-CA.json' if lang == "fr" else 'ramp/config.rcs.en-CA.json',
        "toggle_url": self._get_toggle(lang, keys)
    }
    return render(request, 'ramp.html', context)
def get(self):
    term = self.get_argument('term')
    client = SolrClient('http://localhost:8983/solr')
    res = client.query('stocks', {
        # 'q': 'symbol:%s' % '*'
        'q': term
    })
    res = json.loads(res.get_json())
    docs = res['response']['docs']
    formatted = []
    for doc in docs:
        formatted.append({
            'name': doc['name'],
            'symbol': doc['symbol'],
            'sector': doc['sector'],
            'open': doc['open']
        })
    self.write(json.dumps(formatted))
from SolrClient import SolrClient
import sys
import json
import requests
from config import ec_uri, solr_uri

# If this is run again, probably not ec_uri (use uri instead)
#uri = 'http://192.168.1.122'
# If run again this may also need to be changed (to uri)
solr_new = SolrClient(ec_uri + ':8983/solr')
solr_old = SolrClient(solr_uri + '/solr')
#solr_old = SolrClient(ec_uri+':8983/solr')
#solr_new = SolrClient(uri+':8983/solr')

collection = 'sonos_companion'

start = 0
temp = [1]
while len(temp) > 0:
    result = solr_old.query(collection, {'q': '*', 'rows': 1000, 'start': start})
    temp = result.data['response']['docs']
    #print(repr(temp).encode('cp1252', errors='replace'))
    start += 1000
    documents = []
    for item in temp:
        #document = {'id':item['id'].lower()}
        # Apparently ran the first time to transfer to raspi without track in the list;
        # the reason so few tracks actually have a track number (I did a few starting 08072016)
        #document.update({k:item[k] for k in item if k in ('id','album','artist','title','uri','track')})
        document = {k: item[k] for k in item if k in ('id', 'album', 'artist', 'title', 'uri', 'track')}
        documents.append(document)
    #print(documents)

n = 0
class ReindexerTests(unittest.TestCase):

    # Methods to create the schema in the collections
    def create_fields(self):
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config['collections']['fields']:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    # Field already exists probably
                    pass

    def create_copy_fields(self):
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config['collections']['copy_fields']:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    # Field already exists probably
                    pass

    def setUp(self):
        [self.solr.delete_doc_by_id(coll, '*') for coll in self.colls]
        [self.solr.commit(coll, openSearcher=True) for coll in self.colls]

    def _index_docs(self, numDocs, coll):
        '''
        Generates and indexes random data while maintaining counts of items in various date ranges.
        These counts in self.date_counts are used later to validate some reindexing methods.
        Brace yourself or have a drink.....
        '''
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate
        import random
        # Assign random times to generated documents to spread them out over multiple time ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}
        # Save the newest and oldest timestamps and assign them to the first and second doc
        self.docs[0]['date'] = sdate.isoformat() + 'Z'
        self.date_counts[sdate.date().isoformat()] = 1
        self.docs[1]['date'] = edate.isoformat() + 'Z'
        self.date_counts[edate.date().isoformat()] = 1
        for doc in self.docs[2:]:
            # Make a new date and store a count of it so I can compare later
            new_date = (sdate + datetime.timedelta(hours=random.choice(hour_range)))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc['date'] = new_date.isoformat() + 'Z'
        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith('.gz'):
                f = gzip.open(efile, 'rt', encoding='utf-8')
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(self):
        logging.debug("Starting to run Reindexer Tests")
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.colls = [
            test_config['SOLR_REINDEXER_COLLECTION_S'],
            test_config['SOLR_REINDEXER_COLLECTION_D']
        ]
        self.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        '''
        Will export documents from Solr and put them into an IndexQ.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query('source_coll', {'q': '*:*', 'rows': 5000}).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x['id']),
                         sorted(from_solr, key=lambda x: x['id']))

    def test_ignore_fields(self):
        '''
        Will export documents from Solr and put them into an IndexQ.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
        for field in ['_version_', 'product_name_exact']:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        '''
        Checks to make sure ignore_fields override works
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index, ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        '''
        Checks to make sure ignore_fields override works
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index,
                              ignore_fields=['_text_', '_any_other_field'])
        self.assertEqual(reindexer._ignore_fields, ['_text_', '_any_other_field'])

    def test_get_copy_fields(self):
        '''
        Tests the method to get copy fields from Solr.
        '''
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll='doesntmatter')
        self.assertEqual(reindexer._get_copy_fields(), [
            field['dest'] for field in self.solr.schema.get_schema_copyfields(self.colls[0])
        ])

    def test_query_gen(self):
        '''
        Tests the query generation method.
        '''
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll='doesntmatter')
        self.assertEqual(
            reindexer._get_query('cursor'),
            {'cursorMark': 'cursor', 'rows': reindexer._rows, 'q': '*:*', 'sort': 'id desc'})

    def test_query_gen_pershard_distrib(self):
        '''
        Tests the query generation method with per_shard enabled.
        '''
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr,
                              dest_coll='doesntmatter', per_shard=True)
        q = reindexer._get_query('cursor')
        self.assertTrue('distrib' in q and q['distrib'] == 'false')

    def test_query_gen_date(self):
        '''
        Tests the query generation method with a date field.
        '''
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr,
                              dest_coll='doesntmatter', date_field='ddddd')
        self.assertEqual(
            reindexer._get_query('cursor'),
            {'cursorMark': 'cursor', 'rows': reindexer._rows, 'q': '*:*',
             'sort': 'ddddd asc, id desc'})

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
                    # self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=self.solr, dest_coll='dest_coll')
        reindexer.reindex()
        self.assertEqual(
            self.solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs.sort(key=lambda x: x['id']),
            self.solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs.sort(key=lambda x: x['id']),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='index_date')
        reindexer.reindex()
        try:
            self.assertTrue(solr.transport._action_log[1]['params']['params']['sort'] == 'index_date asc, id desc')
        except KeyError:
            self.assertTrue(solr.transport._action_log[2]['params']['params']['sort'] == 'index_date asc, id desc')
        self.assertEqual(
            solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs.sort(key=lambda x: x['id']),
            solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs.sort(key=lambda x: x['id']),
        )

    def test_get_edge_date(self):
        '''
        Checks to make sure _get_edge_date returns correct start and end dates.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='index_date')
        solr_end_date_string = reindexer._get_edge_date('date', 'desc')
        solr_start_date_string = reindexer._get_edge_date('date', 'asc')
        self.assertTrue(
            self._start_date.date(),
            datetime.datetime.strptime(solr_start_date_string, '%Y-%m-%dT%H:%M:%S.%fZ'))
        self.assertTrue(
            self._end_date.date(),
            datetime.datetime.strptime(solr_end_date_string, '%Y-%m-%dT%H:%M:%S.%fZ'))

    def test_get_date_range_query(self):
        '''
        Checks the date_range_query generation function.
        Since it's pretty simple, running all the tests as one
        '''
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='index_date')
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11'),
            {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
             'facet.range': 'index_date', 'facet.range.start': '2015-11-10',
             'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1DAY'})
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11', date_field='date123'),
            {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
             'facet.range': 'date123', 'facet.range.start': '2015-11-10',
             'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1DAY'})
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11', date_field='date123', timespan='MONTH'),
            {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
             'facet.range': 'date123', 'facet.range.start': '2015-11-10',
             'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1MONTH'})
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11', timespan='MONTH'),
            {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
             'facet.range': 'index_date', 'facet.range.start': '2015-11-10',
             'q': '*:*', 'facet.range.include': 'all', 'facet.range.gap': '+1MONTH'})

    def test_get_date_facet_counts(self):
        '''
        Checks the date range facet counts.
        Makes sure the date ranges returned match what got indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date', start_date=self._start_date.date().isoformat())
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        '''
        Checks the date range facet counts when no start date is supplied.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts('DAY', 'date')
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        '''
        Checks that a non-DAY timespan raises a ValueError.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Testing this one
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts('MONTH', 'date')

    ## These tests are focused on methods related to resuming re-indexing

    def test_solr_to_solr_resume_checkonly(self):
        '''
        Runs resume in check-only mode and makes sure nothing gets indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
        reindexer.resume(check=True)
        # Make sure nothing got indexed
        self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        '''
        Runs a basic resume from source to an empty destination.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs),
            len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs))

    def test_solr_to_solr_reindex_and_resume(self):
        '''
        Only reindexes half of the collection on the first run.
        Then goes back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now()
                    - datetime.timedelta(days=((self._end_date - self._start_date).days / 2)))
        # Reindex approximately half of the data by restricting the FQ
        reindexer.reindex(fq=['date:[* TO {}]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs),
            len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs))

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        '''
        Only reindexes half of the collection on the first run, starting from the midpoint.
        Then goes back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                              dest_coll='dest_coll', date_field='date')
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now()
                    - datetime.timedelta(days=((self._end_date - self._start_date).days / 2)))
        # Reindex approximately half of the data by restricting the FQ
        reindexer.reindex(fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs),
            len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs))

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
        reindexer = Reindexer(source=solr, source_coll='source_coll_shard1_replica1', dest=solr,
                              dest_coll=self.colls[1], per_shard=True, date_field='date')
        reindexer.reindex()
        reindexer = Reindexer(source=solr, source_coll='source_coll_shard2_replica1', dest=solr,
                              dest_coll=self.colls[1], per_shard=True, date_field='date')
        reindexer.reindex()
        self.solr.commit(self.colls[1], openSearcher=True)
        # Sloppy check over here, will improve later
        self.assertEqual(
            len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs),
            len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs))
def scan():
    # Get search result
    # query_key, page_rank = parameters.split('&')
    # query_key = query_key.split('=')[1]
    # page_rank = page_rank.split('=')[1]
    query_key = request.args.get('query')
    page_rank = request.args.get('pagerank')
    solr = SolrClient('http://localhost:8983/solr')
    if page_rank == '1':
        res = solr.query('myexample', {
            'q': query_key,
            'sort': 'pageRankFile desc',
        })
    else:
        res = solr.query('myexample', {
            'q': query_key,
        })
    if res is None:
        json_result = {'query': None}
        return json.dumps(json_result, ensure_ascii=False)
    else:
        for value in res.docs:
            # Add snippets
            snippet = get_snippet(value['id'], query_key)
            value['snippet'] = snippet
            if 'description' not in value:
                value['description'] = 'NULL'
            if 'og_url' not in value:
                with open('./mapNBCNewsDataFile.csv') as f:
                    key = value['id'].split('/')[-1]
                    for line in f:
                        if line.split(',')[0] == key:
                            value['og_url'] = (line.split(',')[-1])
                            break
        # Use Norvig's result to replace the Solr suggestion
        correct_res = res.data['spellcheck']
        correct_word_list = []
        if correct_res.get('suggestions'):
            query_key_list = query_key.split()
            for i in query_key_list:
                correct_word_list.append(correction(i))
            res.data['spellcheck']['collations'][1] = ' '.join(correct_word_list)
        return json.dumps(res.data, ensure_ascii=False, indent=4)
class SOLR():

    def __init__(self, url):
        self.url = url
        self.solr = SolrClient(self.url)
        self.solr_home = home_dir + '/solr-7.1.0/server/solr/'

    def solr_core_exists(self, core_name):
        url = self.url + '/admin/cores?action=STATUS&core=' + core_name
        response = requests.get(url)
        r = response.json()
        if r['status'][core_name]:
            return 1
        else:
            return 0

    def create_solr_core(self, core_name):
        core_dir = os.path.join(self.solr_home, core_name)
        if os.path.exists(core_dir):
            shutil.rmtree(core_dir)
        os.makedirs(core_dir)
        src_dir = os.path.join(self.solr_home, 'configsets/_default/conf')
        # 'configsets/sample_techproducts_configs/conf')
        dst_dir = os.path.join(core_dir, 'conf')
        shutil.copytree(src_dir, dst_dir)
        url1 = self.url + '/admin/cores?action=CREATE&name=' + core_name
        url2 = '&instanceDir=' + self.solr_home + core_name
        r = requests.get(url1 + url2)
        # print(r.text)

    def delete_solr_core(self, core_name):
        url1 = self.url + '/admin/cores?action=UNLOAD&core=' + core_name
        url2 = '&deleteIndex=true&deleteDataDir=true&deleteInstanceDir=true'
        r = requests.get(url1 + url2)
        # print(r.text)

    def update_solr(self, data, core_name):
        url = self.url + '/' + core_name + '/update?wt=json'
        headers = {'Content-Type': 'application/json', 'Connection': 'close'}
        params = {'boost': 1.0, 'overwrite': 'true', 'commitWithin': 1000}
        data = {'add': {'doc': data}}
        r = requests.post(url, headers=headers, params=params, json=data)
        # print(r.text)

    def delete_solr_by_id(self, core_name, _id):
        url = self.url + '/' + core_name + '/update?wt=json'
        headers = {'Content-Type': 'application/xml'}
        params = {'commit': 'true'}
        data = "<delete><id>" + _id + "</id></delete>"
        data = data.encode('utf8')
        r = requests.post(url, headers=headers, params=params, data=data)
        # print(r.text)

    def delete_solr_by_query(self, core_name, query):
        url = self.url + '/' + core_name + '/update?wt=json'
        headers = {'Content-Type': 'application/xml'}
        params = {'commit': 'true'}
        data = "<delete><query>" + query + "</query></delete>"
        data = data.encode('utf8')
        r = requests.post(url, headers=headers, params=params, data=data)
        # print(r.text)

    def query_question_solr(self, core_name, question, fields, num):
        query = {
            'q': 'question_ik:' + question,
            'fl': fields,
            'rows': num,
        }
        res = self.solr.query(core_name, query)
        return res

    def query_solr(self, core_name, select, fields, num):
        query = {
            'q': select,
            'fl': fields,
            'rows': num,
        }
        res = self.solr.query(core_name, query)
        return res
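# Minimal usage sketch for the SOLR wrapper above. The base URL, core name and
# document contents are hypothetical placeholders, and the sketch assumes a local
# standalone Solr with the _default configset available under solr_home.
if __name__ == '__main__':
    s = SOLR('http://localhost:8983/solr')          # hypothetical Solr URL
    if not s.solr_core_exists('faq'):                # hypothetical core name
        s.create_solr_core('faq')
    s.update_solr({'id': '1', 'question_ik': 'how to reindex solr', 'answer': 'use a Reindexer'}, 'faq')
    print(s.query_question_solr('faq', 'reindex', 'id,answer', 5).docs)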
print("Could not bring back queue") else: existing_queue = json.loads(z.decode('utf-8')) # you are just going to create duplicate playlists if you do this n = 1 for track in existing_queue: print(n, track[0],track[1]) n+=1 else: action = 'play' try: while 1: track_title = input("\nwhat is the title of the track that you want to add to the queue (Ctrl-C if done)? ") s = 'title:' + ' AND title:'.join(track_title.split()) result = solr.query(collection, {'q':s, 'rows':10, 'fl':['score', 'id', 'uri', 'title', 'artist', 'album'], 'sort':'score desc'}) tracks = result.docs count = result.get_results_count() if count==0: print("Didn't find any tracks\n") elif count==1: track = tracks[0] try: print('id: ' + track['id']) print('artist: ' + track['artist']) print('album: ' + track['album']) print('song: ' + track['title']) print('uri: ' + track['uri']) except Exception as e: print(e) print('---------------------------------------------------------------')
class ClientTestQuery(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(test_config['SOLR_COLLECTION'], field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(test_config['SOLR_COLLECTION'], field)
            except:
                pass
        # Index some data
        self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)

    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))

    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] += 1
            except:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets, r.get_facets()['facet_test'])
        except Exception as e:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise

    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets()['facet_test'][first_facet_field]
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
            'fq': 'facet_test:{}'.format(first_facet_field)
        })
        self.assertEqual(r.get_num_found(), first_facet_field_count)

    def test_facet_range(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.range': 'price',
            'facet.range.start': 0,
            'facet.range.end': 100,
            'facet.range.gap': 10
        })
        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x // 10 * 10)
        out = {}
        for k, g in itertools.groupby(sorted(prices), div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out, res.get_facets_ranges()['price'])

    def test_facet_pivot(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.pivot': ['facet_test,price', 'facet_test,id']
        })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']] = 1
            else:
                out[doc['facet_test']][doc['price']] += 1
        self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price'])

    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
        })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results, temp)

    def test_get_facet_values_as_list(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
from solr import *
import pysolr
#conn = solr.solr("http://solr.example.net/solr")
#conn = solr.Solr("http://solr.example.net/solr")
#solr.SearchHandler(conn, "/select")
#conn.query()
import sklearn
from SolrClient import SolrClient

solr = SolrClient('http://192.168.1.100:8983/solr/')
result = solr.query('tableAbstract', {'q': 'memBody:blood', 'facet': True,
                                      'facet.range.start': 0, 'facet.range.end': 1000000})
for x in result.docs:
    #print(x['id'])
    print(int(float(x['id'])))
print(result.get_num_found())
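# The query above sets facet.range.start/end without naming the field to facet on;
# a range facet normally also needs 'facet.range' and 'facet.range.gap' (as in the
# test_facet_range test earlier in this document). A hedged sketch of a complete
# range-facet request; the numeric field name 'memLength' and the gap value are
# hypothetical placeholders.
ranged = solr.query('tableAbstract', {
    'q': 'memBody:blood',
    'rows': 0,
    'facet': True,
    'facet.range': 'memLength',        # hypothetical numeric field to bucket on
    'facet.range.start': 0,
    'facet.range.end': 1000000,
    'facet.range.gap': 100000,
})
print(ranged.get_facets_ranges())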
dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs) s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs) self.assertTrue(s_count > dest_count > s_count * 0.20) reindexer.resume() sleep(10) # Make sure countc match up after reindex self.assertEqual( len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), ) def test_solr_to_solr_reindex_and_resume_reverse(self): """ Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works. """ self._index_docs(50000, self.colls[0]) solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"]) reindexer = Reindexer( source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date" ) # Make sure only source has data self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000) self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0) # This gets somehwat of a mid point date in the range. midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2)) # Reindex approximately half of the data by restricting FQ reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")]) sleep(10) # Make sure we have at least 20% of the data. dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs) s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs) self.assertTrue(s_count > dest_count > s_count * 0.20) reindexer.resume() sleep(10) # Make sure countc match up after reindex self.assertEqual( len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), ) def test_solr_to_solr_reindexer_per_shard(self): self._index_docs(50000, self.colls[0]) solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"]) reindexer = Reindexer( source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date" ) # Make sure only source has data self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000) self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0) reindexer.reindex() # sloppy check over here, will improve later self.assertEqual( len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), )
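# A minimal sketch of driving the Reindexer helper that the tests above exercise.
# Assumption: the helper is importable as SolrClient.helpers.Reindexer (the import is not
# shown in the snippets here); the server URL and collection names are placeholders.
from SolrClient import SolrClient
from SolrClient.helpers import Reindexer

solr = SolrClient('http://localhost:8983/solr')
reindexer = Reindexer(source=solr, source_coll='source_coll',
                      dest=solr, dest_coll='dest_coll',
                      date_field='date')  # a date field is what enables resume support

# Copy documents matching an optional fq, then let resume() fill in anything missed.
reindexer.reindex(fq=['date:[2015-01-01T00:00:00Z TO *]'])
reindexer.resume(check=True)  # per the resume_checkonly test, check=True indexes nothing
reindexer.resume()            # re-sends date ranges whose source/dest counts differ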
#!/usr/bin/env python from __future__ import division import json import os from SolrClient import SolrClient import sys from tika import detector solr = SolrClient('http://localhost:8983/solr') walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1])) walk_i = 0 ratios = {} for root, dirs, files in os.walk(sys.argv[1]): for file in files: path = root + '/' + file file_size = os.stat(path).st_size if file_size == 0: continue mime = detector.from_file(path) sum, n = ratios.get(mime, (0, 0)) ratios[mime] = sum + len( json.dumps( solr.query('collection1', { 'q': 'id:' + file }).data['response']['docs'])) / file_size, n + 1 walk_i += 1 print str(walk_i * 100 // walk_n) + '%\r', with open('size-diversity.json', 'w') as f: json.dump({mime: sum / n for mime, (sum, n) in ratios.iteritems()}, f)
''' Create a playlist manually by entering songs one at a time and searching solr for the particular song There is also create_playlist_from_queue.py that has you put the songs on the queue (from a playlist or whatever) and creates a playlist from the queue ''' from SolrClient import SolrClient from config import ec_uri solr = SolrClient(ec_uri+':8983/solr') collection = 'sonos_companion' track_title = input("\nwhat is the title of the track that you are looking for? ") s = 'title:' + ' AND title:'.join(track_title.split()) result = solr.query(collection, {'q':s, 'rows':10, 'fl':['score', 'id', 'uri', 'title', 'artist', 'album'], 'sort':'score desc'}) tracks = result.docs count = result.get_results_count() if count==0: print("Didn't find any tracks\n") elif count==1: track = tracks[0] try: print('id: ' + track['id']) print('artist: ' + track['artist']) print('album: ' + track['album']) print('song: ' + track['title']) print('uri: ' + track['uri']) except Exception as e: print(e) print('------------------------------------------------------------------------------------------------') else:
if uri.startswith('pndrradio'): meta = meta_format_pandora.format(title=station[0], service=station[2]) master.play_uri(uri, meta, station[0]) # station[0] is the title of the station elif uri.startswith('x-sonosapi-stream'): uri = uri.replace('&', '&amp;') # need to escape '&' in radio URIs meta = meta_format_radio.format(title=station[0], service=station[2]) master.play_uri(uri, meta, station[0]) # station[0] is the title of the station else: print("{} radio is not a preset station.".format(task['station'])) elif action in ('play','add') and task.get('trackinfo'): #The query below only searches title and artist fields so you don't get every song on After the Gold Rush #result = cloudsearchdomain.search(query=task['trackinfo'], queryOptions='{"fields":["title", "artist"]}') s = 'artist:' + ' artist:'.join(task['trackinfo'].split()) + ' title:' + ' title:'.join(task['trackinfo'].split()) result = solr.query(collection, {'q':s, 'rows':1}) #..'rows':25 ...}, queryOptions='{"fields":["title", "artist"]}') if result.get_results_count(): track = result.data['response']['docs'][0] try: print('artist: ' + track.get('artist', ['No artist'])) print('album: ' + track.get('album', ['No album'])) print('song: ' + track.get('title', ['No title'])) except Exception as e: print("Unicode error") uri = track.get('uri', ['']) print('uri: ' + uri) print("---------------------------------------------------------------") if 'amz' in uri: i = uri.find('amz')
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
res = solr.query('dev', {'q': 'test'})
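# A short follow-on sketch, assuming the same local Solr instance and 'dev' collection
# as the snippet above: query() returns a response object whose results the other
# snippets in this file read through .docs and .get_num_found().
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
res = solr.query('dev', {'q': 'test', 'rows': 10})
print(res.get_num_found())    # total number of matches reported by Solr
for doc in res.docs:          # the returned documents, as a list of dicts
    print(doc.get('id'))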
def computeScores2(type, query, output_file, is_all_key): na_metadata = ["id", "_version_", "Name", "name"] solr = SolrClient('http://localhost:8983/solr') res = solr.query(query['index'], { 'q': '*:*', 'wt': 'json', 'indent': True, 'rows': 1000, }) docs = res.data['response']['docs'] with open(output_file, "wb") as outF: a = csv.writer(outF, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) a.writerow(["type", "x-coordinate", "y-coordinate", "Similarity_score"]) for doc in docs: for key in doc: if key in ["id", "_version_"]: continue try: doc[key] = doc[key][0].encode("ascii", "ignore") except: doc[key] = str(doc[key][0]).decode("unicode_escape").encode("ascii", "ignore") doc_tuples = itertools.combinations(docs, 2) for raw1, raw2 in doc_tuples: doc1 = raw1.copy() doc2 = raw2.copy() if "Name" in doc1: row_edit_distance = [type, doc1["Name"], doc2["Name"]] else: row_edit_distance = [type, doc1["name"], doc2["name"]] intersect_features = set(doc1.keys()) & set(doc2.keys()) intersect_features = [feature for feature in intersect_features if feature not in na_metadata] file_edit_distance = 0.0 for feature in intersect_features: file1_feature_value = stringify(doc1[feature]) file2_feature_value = stringify(doc2[feature]) if len(file1_feature_value) == 0 and len(file2_feature_value) == 0: feature_distance = 0.0 else: feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / ( len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len( file2_feature_value)) file_edit_distance += feature_distance if is_all_key: file1_only_features = set(doc1.keys()) - set(intersect_features) file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata] file2_only_features = set(doc2.keys()) - set(intersect_features) file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata] file_edit_distance += len(file1_only_features) + len( file2_only_features) # increment by 1 for each disjunct feature in (A-B) & (B-A), file1_disjunct_feature_value/file1_disjunct_feature_value = 1 file_edit_distance /= float( len(intersect_features) + len(file1_only_features) + len(file2_only_features)) else: file_edit_distance /= float(len(intersect_features)) # average edit distance row_edit_distance.append(1 - file_edit_distance) a.writerow(row_edit_distance)
return [distortion, clusters] if __name__ == "__main__": argParser = argparse.ArgumentParser('k-means Clustering of documents based on metadata values') argParser.add_argument('--inCore', required=True, help='Solr Core Name to be queried') argParser.add_argument('--outJSON', required=True, help='/path/to/clusters.json containing k-means cluster assignments') argParser.add_argument('--Kvalue', required=True, help='number of clusters to find') argParser.add_argument('--accept', nargs='+', type=str, help='Optional: compute similarity only on specified IANA MIME Type(s)') args = argParser.parse_args() if args.inCore and args.outJSON and args.Kvalue: solr=SolrClient('http://localhost:8983/solr') res=solr.query(args.inCore,{'q':'*:*'}) data = res.docs list_of_points = [] for eachFile in data: fileName=eachFile['id'] list_of_points.append(Vector(eachFile, fileName)) union_features = set() for point in list_of_points: union_features |= set(point.features.keys()) #Randomly initialize Centroids for each iteration to find global minima global_minima = K_Means(list_of_points, int(args.Kvalue))
from SolrClient import SolrClient

while True:
    query = input("Query: ")
    query = query.replace(" ", "+")
    # Target solr collection
    solr = SolrClient('http://127.0.0.1:8983/solr')
    res = solr.query('Assignment1', {
        "q": 'question:' + query,
        "indent": "on",
        "rows": "10",
        "wt": "json"})
    print(''.join(res.docs[0]['answer']))
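# A hedged variant of the lookup loop above that guards against empty result sets before
# indexing into res.docs[0]. The 'Assignment1' collection and the question/answer fields
# come from the snippet above; everything else is illustrative.
from SolrClient import SolrClient

solr = SolrClient('http://127.0.0.1:8983/solr')
query = input("Query: ").replace(" ", "+")
res = solr.query('Assignment1', {'q': 'question:' + query, 'rows': 10, 'wt': 'json'})
if res.get_num_found() and res.docs:
    print(''.join(res.docs[0]['answer']))
else:
    print("No matching question found.")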
class ClientTestIndexing(unittest.TestCase): #High Level Client Tests @classmethod def setUpClass(self): self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS']) self.rand_docs = RandomTestData() self.docs = self.rand_docs.get_docs(50) for field in test_config['collections']['copy_fields']: try: self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'],field) except: pass for field in test_config['collections']['fields']: try: self.solr.schema.create_field(test_config['SOLR_COLLECTION'],field) except: pass def setUp(self): self.delete_docs() self.commit() def delete_docs(self): self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*') self.commit() def commit(self): self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True) sleep(5) @unittest.skip("Skipping for now") def test_access_without_auth(self): if not test_config['SOLR_CREDENTIALS'][0]: return solr = SolrClient(test_config['SOLR_SERVER'],devel=True) with self.assertRaises(ConnectionError) as cm: solr.query('SolrClient_unittest',{'q':'not_gonna_happen'}) def test_indexing_json(self): self.docs = self.rand_docs.get_docs(53) self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs)) self.commit() sleep(5) for doc in self.docs: logging.debug("Checking {}".format(doc['id'])) self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1) self.delete_docs() self.commit() def test_indexing_conn_log(self): self.docs = self.rand_docs.get_docs(53) self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs)) self.commit() sleep(5) for doc in self.docs: logging.debug("Checking {}".format(doc['id'])) self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1) logging.info(self.solr.transport._action_log) self.delete_docs() self.commit() def test_index_json_file(self): self.docs = self.rand_docs.get_docs(55) with open('temp_file.json','w') as f: json.dump(self.docs,f) r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json') self.commit() r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'}) self.assertEqual(r.get_num_found(),len(self.docs)) self.delete_docs() self.commit() try: os.remove('temp_file.json.gz') os.remove('temp_file.json') except: pass def test_stream_file_gzip_file(self): self.docs = self.rand_docs.get_docs(60) with gzip.open('temp_file.json.gz','wb') as f: f.write(json.dumps(self.docs).encode('utf-8')) r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz') self.commit() r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'}) self.assertEqual(r.get_num_found(),len(self.docs)) self.delete_docs() self.commit() try: os.remove('temp_file.json.gz') os.remove('temp_file.json') except: pass @unittest.skip("Don't test remote indexing in travis") def test_index_json_file(self): self.docs = self.rand_docs.get_docs(61) with open('temp_file.json','w') as f: json.dump(self.docs,f) r = self.solr.local_index(test_config['SOLR_COLLECTION'],'temp_file.json') self.commit() r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'}) self.assertEqual(r.get_num_found(),len(self.docs)) self.delete_docs() self.commit() try: os.remove('temp_file.json.gz') os.remove('temp_file.json') except: pass def test_paging_query_with_rows(self): self.docs = self.rand_docs.get_docs(1000) with gzip.open('temp_file.json.gz','wb') as f: f.write(json.dumps(self.docs).encode('utf-8')) r = 
self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz') self.commit() queries = 0 docs = [] for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}, rows=50): self.assertTrue(len(res.docs) == 50) docs.extend(res.docs) queries +=1 self.assertEqual( [x['id'] for x in sorted(docs, key= lambda x: x['id'])], [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])] ) self.assertTrue(1000/50 == queries) self.delete_docs() self.commit() try: os.remove('temp_file.json.gz') os.remove('temp_file.json') except: pass def test_paging_query(self): self.docs = self.rand_docs.get_docs(1000) with gzip.open('temp_file.json.gz','wb') as f: f.write(json.dumps(self.docs).encode('utf-8')) r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz') self.commit() queries = 0 docs = [] for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}): self.assertTrue(len(res.docs) == 1000) docs.extend(res.docs) queries +=1 self.assertTrue(queries == 1) self.assertEqual( [x['id'] for x in sorted(docs, key= lambda x: x['id'])], [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])] ) self.delete_docs() self.commit() try: os.remove('temp_file.json.gz') os.remove('temp_file.json') except: pass def test_paging_query_with_max(self): self.docs = self.rand_docs.get_docs(1000) with gzip.open('temp_file.json.gz','wb') as f: f.write(json.dumps(self.docs).encode('utf-8')) r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz') self.commit() queries = 0 docs = [] for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q':'*:*'}, rows = 50, max_start = 502): self.assertTrue(len(res.docs) == 50) queries +=1 docs.extend(res.docs) ids = [x['id'] for x in docs] for item in docs: self.assertTrue(item['id'] in ids) self.assertEqual(11, queries) self.delete_docs() self.commit() try: os.remove('temp_file.json.gz') os.remove('temp_file.json') except: pass
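# A minimal sketch of the paging_query generator the two tests above drive: it yields one
# query response per page (rows documents at a time) until the result set, or max_start,
# is exhausted. Collection name and page size are placeholders.
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
all_docs = []
for page in solr.paging_query('SolrClient_unittest', {'q': '*:*'}, rows=50):
    all_docs.extend(page.docs)  # each page behaves like a normal query response
print(len(all_docs))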
def test_access_without_auth(self): if not test_config['SOLR_CREDENTIALS'][0]: return solr = SolrClient(test_config['SOLR_SERVER'],devel=True) with self.assertRaises(ConnectionError) as cm: solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
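# Sketch of the authenticated client setup this test depends on. The tests pass
# auth=test_config['SOLR_CREDENTIALS']; here it is assumed to be a (user, password)
# pair, which is how the truthiness check on SOLR_CREDENTIALS[0] above reads.
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr', auth=('solr_user', 'solr_password'))
res = solr.query('SolrClient_unittest', {'q': '*:*'})
print(res.get_num_found())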
sys.exit() s3 = boto3.resource('s3') obj = s3.Object('sonos-scrobble','location') location = obj.get()['Body'].read().decode('utf-8') queue_name = 'echo_sonos_ct' if location=='ct' else 'echo_sonos' print("location = ", location) print("queue_name =", queue_name) sqs = boto3.resource('sqs', region_name='us-east-1') queue = sqs.get_queue_by_name(QueueName=queue_name) solr = SolrClient(ec_uri+':8983/solr') collection = 'sonos_companion' s = 'album:' + ' AND album:'.join(album.split()) result = solr.query(collection, {'q':s, 'rows':25, 'fields':['score','track','uri','album'], 'sort':'score desc'}) if result.docs: selected_album = result.docs[0]['album'] tracks = sorted([t for t in result.docs], key=itemgetter('track')) # The "if t['album']==selected_album" below only comes into play if we retrieved tracks from more than one album uris = [t['uri'] for t in tracks if t['album']==selected_album] sqs_response = queue.send_message(MessageBody=json.dumps({'action':'play', 'uris':uris})) print("Status Code =", sqs_response['ResponseMetadata']['HTTPStatusCode']) print("I will play {} songs from {}".format(len(uris), selected_album)) else: print("I couldn't find {}. Try again.".format(album))
class ClientTestQuery(unittest.TestCase): @classmethod def setUpClass(self): self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS']) self.rand_docs = RandomTestData() self.docs = self.rand_docs.get_docs(50) self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*') for field in test_config['collections']['copy_fields']: try: self.solr.collections.delete_copy_field( test_config['SOLR_COLLECTION'], field) except: pass for field in test_config['collections']['fields']: try: self.solr.collections.create_field( test_config['SOLR_COLLECTION'], field) except: pass #Index Some data self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(self.docs)) self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True) def test_basic_query(self): r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'}) self.assertEqual(r.get_num_found(), len(self.docs)) def test_facet(self): r = self.solr.query(test_config['SOLR_COLLECTION'], { 'q': '*:*', 'facet': 'true', 'facet.field': 'facet_test', }) local_facets = {} for doc in self.docs: try: local_facets[doc['facet_test']] += 1 except: local_facets[doc['facet_test']] = 1 try: self.assertDictEqual(local_facets, r.get_facets()['facet_test']) except Exception as e: logging.info("local") logging.info(local_facets) logging.info("facets") logging.info(r.get_facets()) raise def test_facet_with_fq(self): r = self.solr.query(test_config['SOLR_COLLECTION'], { 'q': '*:*', 'facet': True, 'facet.field': 'facet_test', }) first_facet_field = list(r.get_facets()['facet_test'].keys())[0] first_facet_field_count = r.get_facets( )['facet_test'][first_facet_field] r = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': '*:*', 'facet': True, 'facet.field': 'facet_test', 'fq': 'facet_test:{}'.format(first_facet_field) }) self.assertEqual(r.get_num_found(), first_facet_field_count) def test_facet_range(self): res = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': '*:*', 'facet': True, 'facet.range': 'price', 'facet.range.start': 0, 'facet.range.end': 100, 'facet.range.gap': 10 }) prices = [doc['price'] for doc in self.docs] div = lambda x: str(x // 10 * 10) out = {} for k, g in itertools.groupby(sorted(prices), div): out[k] = len(list(g)) or 0 self.assertDictEqual(out, res.get_facets_ranges()['price']) def test_facet_pivot(self): res = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': '*:*', 'facet': True, 'facet.pivot': ['facet_test,price', 'facet_test,id'] }) out = {} for doc in self.docs: if doc['facet_test'] not in out: out[doc['facet_test']] = {} if doc['price'] not in out[doc['facet_test']]: out[doc['facet_test']][doc['price']] = 1 else: out[doc['facet_test']][doc['price']] += 1 self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price']) def test_get_field_values_as_list(self): res = self.solr.query(test_config['SOLR_COLLECTION'], { 'q': '*:*', }) results = res.get_field_values_as_list('product_name_exact') docs = res.docs temp = [] for doc in docs: if 'product_name_exact' in doc: temp.append(doc['product_name_exact']) self.assertEqual(results, temp) def test_get_facet_values_as_list(self): r = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': '*:*', 'facet': 'true', 'facet.limit': -1, 'facet.field': 'facet_test', }) self.assertEqual( sorted(r.data['facet_counts']['facet_fields']['facet_test'][1::2]), sorted(r.get_facet_values_as_list('facet_test'))) def test_grouped_count_1(self): ''' Get a dict of grouped docs ''' r = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': 
'*:*', 'group': True, 'group.field': 'id', 'group.ngroups': True, }) self.assertEqual(r.get_ngroups(), 50) self.assertEqual(r.get_ngroups('id'), 50) def test_grouped_docs(self): ''' Get a dict of grouped docs ''' r = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': '*:*', 'group': True, 'group.field': 'id', 'group.ngroups': True, }) self.assertEqual(len(r.docs), 10) self.assertTrue('doclist' in r.docs[0]) def test_grouped_docs(self): ''' Get a dict of grouped docs ''' r = self.solr.query( test_config['SOLR_COLLECTION'], { 'q': '*:*', 'group': True, 'group.field': 'id', 'group.ngroups': True, }) self.assertEqual(len(r.docs), 10) self.assertTrue('doclist' in r.docs[0]) def test_flat_groups(self): ''' Get a dict of grouped docs ''' r = self.solr.query(test_config['SOLR_COLLECTION'], { 'q': '*:*', 'group': True, 'group.field': 'id' }) flats = r.get_flat_groups() self.assertEqual(len(flats), 10) self.assertTrue('date' in flats[0]) def test_json_facet(self): ''' Get a dict of grouped docs ''' #Just lazy getting a new response object r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'}) a = r.get_jsonfacet_counts_as_dict( 'test', { 'count': 50, 'test': { 'buckets': [{ 'count': 10, 'pr': { 'buckets': [{ 'count': 2, 'unique': 1, 'val': 79 }, { 'count': 1, 'unique': 1, 'val': 9 }] }, 'pr_sum': 639.0, 'val': 'consectetur' }, { 'count': 8, 'pr': { 'buckets': [ { 'count': 1, 'unique': 1, 'val': 9 }, { 'count': 1, 'unique': 1, 'val': 31 }, { 'count': 1, 'unique': 1, 'val': 33 }, ] }, 'pr_sum': 420.0, 'val': 'auctor' }, { 'count': 8, 'pr': { 'buckets': [ { 'count': 2, 'unique': 1, 'val': 94 }, { 'count': 1, 'unique': 1, 'val': 25 }, ] }, 'pr_sum': 501.0, 'val': 'nulla' }] } }) b = { 'test': { 'auctor': { 'count': 8, 'pr': { 9: { 'count': 1, 'unique': 1 }, 31: { 'count': 1, 'unique': 1 }, 33: { 'count': 1, 'unique': 1 } }, 'pr_sum': 420.0 }, 'consectetur': { 'count': 10, 'pr': { 9: { 'count': 1, 'unique': 1 }, 79: { 'count': 2, 'unique': 1 } }, 'pr_sum': 639.0 }, 'nulla': { 'count': 8, 'pr': { 25: { 'count': 1, 'unique': 1 }, 94: { 'count': 2, 'unique': 1 } }, 'pr_sum': 501.0 } } } self.assertEqual(a, b)
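# Condensed sketch of the grouped-query pattern asserted on above: group.ngroups enables
# get_ngroups(), and get_flat_groups() flattens each group's doclist back into a single
# list of documents. Collection name is a placeholder.
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
r = solr.query('SolrClient_unittest', {
    'q': '*:*',
    'group': True,
    'group.field': 'id',
    'group.ngroups': True,
})
print(r.get_ngroups('id'))       # number of distinct groups
for doc in r.get_flat_groups():  # documents pulled out of every group's doclist
    print(doc['id'])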
#!/usr/bin/env python from __future__ import division import collections import json import os from SolrClient import SolrClient import sys from tika import detector solr = SolrClient('http://localhost:8983/solr') walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1])) walk_i = 0 entities = collections.defaultdict(lambda: []) for root, dirs, files in os.walk(sys.argv[1]): for file in files: path = root + '/' + file mime = detector.from_file(path) for val in solr.query('collection1', {'q': 'id:' + file}).data['response']['docs'][0].values(): if type(val) is list: entities[mime].extend(val) walk_i += 1 print str(walk_i * 100 // walk_n) + '%\r', with open('classification-path.json', 'w') as f: json.dump({k: collections.Counter(v) for k, v in entities.iteritems()}, f)