Example #1
 def test_solr_to_solr_reindex_and_resume_reverse(self):
     '''
     Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works.
     '''
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr, dest_coll='dest_coll',
                           date_field='date')
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
     # This gets somewhat of a midpoint date in the range.
     midpoint = (datetime.datetime.now() - datetime.timedelta(days=
                                                              ((self._end_date - self._start_date).days / 2)
                                                              ))
     # Reindex approximately half of the data by restricting FQ
     reindexer.reindex(fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
     # Make sure we have at least 20% of the data.
     dest_count = len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs)
     s_count = len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs)
     self.assertTrue(s_count > dest_count > s_count * .20)
     reindexer.resume()
     # Make sure counts match up after reindex
     self.assertEqual(
         len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs),
         len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs))
Example #2
 def test_solr_to_solr_with_date(self):
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0],
                       devel=True,
                       auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr,
                           source_coll='source_coll',
                           dest=solr,
                           dest_coll='dest_coll',
                           date_field='index_date')
     reindexer.reindex()
     try:
         self.assertTrue(solr.transport._action_log[1]['params']['params']
                         ['sort'] == 'index_date asc, id desc')
     except KeyError:
         self.assertTrue(solr.transport._action_log[2]['params']['params']
                         ['sort'] == 'index_date asc, id desc')
     # use sorted() here: list.sort() returns None, which would make the assert vacuous
     self.assertEqual(
         sorted(solr.query(self.colls[0], {
             'q': '*:*',
             'rows': 10000000
         }).docs, key=lambda x: x['id']),
         sorted(solr.query(self.colls[1], {
             'q': '*:*',
             'rows': 10000000
         }).docs, key=lambda x: x['id']),
     )
Example #3
def simple_query(page):
    d = dict()
    query = buildQuery()
    print(query)
    solr = SolrClient(current_app.config['SOLR'])
    res = solr.query('scripties', {
        'q': query,
        'rows': '0',
    })
    count = res.get_num_found()
    pages = math.ceil(count / 10)
    start = (page - 1) * 10
    res = solr.query(
        'scripties', {
            'q': query,
            'rows': '10',
            'start': start,
            'fl': 'id,titel,auteur,jaar',
            'facet': True,
            'facet.field': ['jaar', 'type', 'faculteit'],
        })
    facets = res.get_facets()
    d['result'] = res
    d['pages'] = pages
    d['page'] = page
    d['f_jaar'] = facets['jaar']
    d['f_type'] = facets['type']
    d['f_faculteit'] = collect(facets['faculteit'])
    d['f'] = request.args.get('faculteit')
    d['j'] = request.args.get('jaar')
    d['t'] = request.args.get('type')
    return d
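
A brief usage sketch for simple_query; it has to run inside a Flask request context (it reads current_app and request.args), and the page number below is arbitrary:

    # inside a Flask view function, for example
    data = simple_query(1)
    print(data['pages'], 'pages in total;', data['result'].get_num_found(), 'matching documents')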
Example #4
 def test_solr_to_solr_reindex_and_resume_reverse(self):
     """
     Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works. 
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     # This gets somewhat of a midpoint date in the range.
     midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
     # Reindex approximately half of the data by restricting FQ
     reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
     sleep(10)
     # Make sure we have at least 20% of the data.
     dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
     s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
     self.assertTrue(s_count > dest_count > s_count * 0.20)
     reindexer.resume()
     sleep(10)
     # Make sure counts match up after reindex
     self.assertEqual(
         len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
         len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
     )
Example #5
 def test_solr_to_solr_reindexer_per_shard(self):
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     reindexer.reindex()
     # sloppy check over here, will improve later
     self.assertEqual(
         len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
         len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
     )
Example #6
 def test_solr_to_solr_resume_checkonly(self):
     '''
     Runs resume in check-only mode and makes sure nothing actually gets indexed.
     '''
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr, dest_coll='dest_coll',
                           date_field='date')
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
     reindexer.resume(check=True)
     # Makes sure nothing got indexed
     self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
Example #7
def get_tags_by_pmi(target_tag, solr:SolrClient, core_name="tags"):
    #http://localhost:8983/solr/tags/select?indent=on&q=tag_text:banmuslims%20AND%20type:1&wt=json
    rows=100 #100 results per page
    stop=False
    start=0

    q='tag_text:' + target_tag+' AND type:1' #0=single tag; 1=tag pairs
    #because we need to get tags similar to this target, so we need to get all pairs and process them

    while not stop:
        res = solr.query(core_name, {
            'q':q, #query for tag pairs that involve the target tag (type:1)
            'rows':rows,
            'fl':'*',   #return all fields from the index (when available)
            'start':start, #start from
            'sort':'pmi desc'}) #sort by pmi descending
        start+=rows #advancing start moves to the next page; for a specific page, compute start as pagenum*rows
        print("total number found={}".format(res.num_found))
        if start>res.num_found:
            stop=True

        #now go through every page, every result
        for d in res.docs: #res.docs only contain documents on the CURRENT page
            tags=d['tag_text'].split(" ")
            relevant_tag=tags[0]
            if relevant_tag==target_tag:
                relevant_tag=tags[1]
            print(relevant_tag + ", pmi=" + str(d['pmi']))
Example #8
def suggest():
    query_key = request.args.get('query')
    solr = SolrClient('http://localhost:8983/solr')
    res = solr.query('myexample',{
            'q':query_key,
        },'suggest')
    return json.dumps(res.data['suggest']['suggest'][query_key]['suggestions'])
Example #9
def search(query_dict):
    #pdb.set_trace()
    #instantiate solr connection
    solr = SolrClient('http://localhost:8983/solr')

    # Generic search if no query input given
    if len(query_dict) == 0:
        query_string = '*:*'
    #retrieve value of field in table and prepare a query string
    else:
        query_string = ''
        query_op = ' AND '
        item_count = 0
        for key in query_dict:
            if len(query_dict[key]) > 0:
                if item_count > 0:
                    query_string = query_string + query_op + key + ':' + query_dict[key]
                else:
                    query_string = query_string + key + ':' + query_dict[key]
                item_count += 1
    res = solr.query('lyrics', {
        'q': query_string,
    })
    return res.data['response']['docs']
Example #10
def update(solr: SolrClient, tweet_core_name, tag_core_name, timespan, rows,
           feat_vectorizer, ml_model, selected_features,
           hate_indicative_features, scaling_option, sysout, logger):

    stop = False
    start = 0
    while not stop:
        logger.warn("Processing from {} for a batch of {}".format(start, rows))
        print("Processing from {} for a batch of {}".format(start, rows))
        res = solr.query(
            tweet_core_name, {
                'q': 'created_at:' + timespan,
                'rows': rows,
                'fl': '*',
                'start': start,
                'sort': 'id asc'
            })
        start += rows
        if start > res.num_found:
            stop = True

        #apply pretrained ML model to tag data and update them
        update_ml_tag(solr, tweet_core_name, tag_core_name, res.docs,
                      feat_vectorizer, ml_model, selected_features,
                      hate_indicative_features, scaling_option, sysout, logger)

    pass
Example #11
def get_tweets_by_time(timespan, solr:SolrClient, core_name="tweets"):
    rows=100 #100 results per page
    stop=False
    start=0
    facet_counts=None

    q='created_at:' + timespan+' AND ml_tag:0'

    while not stop:
        res = solr.query(core_name, {
            'q':q, #remember we only show tweets tagged as hate (0)
            'facet.field':'entities_hashtag', #count results per facet (NOTE: not every tweet will have a hashtag, but this is ok)
            'facet':"on", #switch on facet search
            'facet.mincount':"1", #show facets that have at least 1 result
            'rows':rows,
            'fl':'*',   #return all fields from the index (when available)
            'start':start, #start from
            'sort':'tweet_risk desc'}) #sort by risk_score descending
        start+=rows #advancing start moves to the next page; for a specific page, compute start as pagenum*rows
        print("total number found={}".format(res.num_found))
        if start>res.num_found:
            stop=True

        #assign facet results to another var. facet counts is for the whole dataset, not just this page
        if facet_counts is None:
            facet_counts=res.data['facet_counts']['facet_fields']['entities_hashtag']

        #now go through every page, every result
        for d in res.docs: #res.docs only contain documents on the CURRENT page
            print("https://twitter.com/"+d['user_screen_name']+"/"+d['id'])
            if 'coordinates' in d.keys():
                print(d['coordinates'])

    #finally print facet counts
    print(facet_counts)
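
Solr returns facet_fields as a flat [term, count, term, count, ...] list, which is what gets printed above; a small helper like the hypothetical sketch below (not part of the example) pairs it up into a dict:

    def facets_to_dict(flat_list):
        # facet_fields alternate term and count: [term1, count1, term2, count2, ...]
        return dict(zip(flat_list[::2], flat_list[1::2]))

    hashtag_counts = facets_to_dict(facet_counts or [])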
Example #12
 def test_solr_to_solr_resume_checkonly(self):
     """
     Runs resume in check-only mode and makes sure nothing actually gets indexed.
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     reindexer.resume(check=True)
     # Makes sure nothing got indexed
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
Example #13
    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)

        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard1_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard2_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()

        self.solr.commit(self.colls[1], openSearcher=True)
        #sloppy check over here, will improve later
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))
Example #14
def detail_query(key):
    solr = SolrClient(current_app.config['SOLR'])
    q = 'id:{}'.format(key)
    res = solr.query(
        'scripties', {
            'q': q,
            'fl': 'titel,auteur,jaar,supervisor,type,faculteit,opleiding,taal',
        })
    return res
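
A brief usage sketch for detail_query; the key value below is illustrative:

    res = detail_query('abc123')
    if res.get_results_count():
        doc = res.docs[0]
        print(doc['titel'], doc['auteur'], doc['jaar'])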
Example #15
 def test_solr_to_solr_resume_basic(self):
     """
     Runs a basic resume and makes sure the source and destination counts match afterwards.
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
     )
     # Make sure only source has data
     self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
     self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
     reindexer.resume()
     sleep(10)
     # Make sure counts match up after reindex
     self.assertEqual(
         len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
         len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
     )
Example #16
def get_solr():
    solr = SolrClient(current_app.config['SOLR'])
    res = solr.query('scripties', {
        'q': 'titel:muslim',
        'facet': True,
        'facet.field': 'taal',
    })
    return res.get_results_count()


# def get_post(id, check_author=True):
#     post = get_db().execute(
#         'SELECT p.id, title, body, created, author_id, username'
#         ' FROM post p JOIN user u ON p.author_id = u.id'
#         ' WHERE p.id = ?',
#         (id,)
#     ).fetchone()
#     if post is None:
#         abort(404, "Post id {0} doesn't exist.".format(id))
#     if check_author and post['author_id'] != g.user['id']:
#         abort(403)
#     return post

# @bp.route('/<int:id>/update', methods=('GET', 'POST'))
# @login_required
# def update(id):
#     post = get_post(id)
#     if request.method == 'POST':
#         title = request.form['title']
#         body = request.form['body']
#         error = None
#         if not title:
#             error = 'Title is required.'
#         if error is not None:
#             flash(error)
#         else:
#             db = get_db()
#             db.execute(
#                 'UPDATE post SET title = ?, body = ?'
#                 ' WHERE id = ?',
#                 (title, body, id)
#             )
#             db.commit()
#             return redirect(url_for('blog.index'))
#     return render_template('blog/update.html', post=post)

# @bp.route('/<int:id>/delete', methods=('POST',))
# @login_required
# def delete(id):
#     get_post(id)
#     db = get_db()
#     db.execute('DELETE FROM post WHERE id = ?', (id,))
#     db.commit()
#     return redirect(url_for('blog.index'))
Example #17
def get_tag_riskscore(solr: SolrClient, core_name, tag):
    if tag[0] == '#':
        tag = tag[1:]
    tag = tag.lower()
    res = solr.query(core_name, {
        'q': 'id:' + tag,
        'fl': iu.tag_index_field_risk_score
    })
    for d in res.docs:
        score = d[iu.tag_index_field_risk_score]
        return score
    return 0.0
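
A usage sketch for get_tag_riskscore, assuming a local Solr with a 'tags' core; `iu` is taken to be the project's index-utilities module that names the risk-score field:

    solr = SolrClient('http://localhost:8983/solr')
    score = get_tag_riskscore(solr, 'tags', '#example')
    print('risk score:', score)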
Example #18
 def test_solr_to_solr_resume_basic(self):
     '''
     Runs a basic resume and makes sure the source and destination counts match afterwards.
     '''
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0],
                       auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr,
                           source_coll='source_coll',
                           dest=solr,
                           dest_coll='dest_coll',
                           date_field='date')
     #Make sure only source has data
     self.assertEqual(
         len(
             solr.query(self.colls[0], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs), 50000)
     self.assertEqual(
         len(
             solr.query(self.colls[1], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs), 0)
     reindexer.resume()
     sleep(10)
     #Make sure counts match up after reindex
     self.assertEqual(
         len(
             solr.query(self.colls[0], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs),
         len(
             solr.query(self.colls[1], {
                 'q': '*:*',
                 'rows': 10000000
             }).docs))
Example #19
def read_all():

    client = SolrClient('http://localhost:8983/solr')

    res = client.query('test', {
        'q' : '*:*'
    })

    res = json.loads(res.get_json())
    docs = res['response']['docs']

    for doc in docs:
        print (doc)
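
SolrClient responses also expose the parsed documents directly, so the get_json/json.loads round trip above is optional; an equivalent sketch against the same 'test' core:

    from SolrClient import SolrClient

    client = SolrClient('http://localhost:8983/solr')
    res = client.query('test', {'q': '*:*'})
    for doc in res.docs:  # res.docs is the already-parsed ['response']['docs'] list
        print(doc)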
Example #20
 def test_index_multiproc(self):
     index = IndexQ(test_config['indexqbase'], 'testq')
     solr = SolrClient(test_config['SOLR_SERVER'], devel=True, auth=test_config['SOLR_CREDENTIALS'])
     solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
     buff = []
     files = []
     for doc in self.docs:
         files.append(index.add(doc, finalize=True))
     index.index(solr,test_config['SOLR_COLLECTION'],threads=10)
     solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
     for doc in self.docs:
         res = solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])})
         self.assertTrue(res.get_results_count()==1)
Example #21
class SolrSearch(object):
    def __init__(self, core, url, limit=3):
        self.core = core
        self.url = url
        self.limit = limit
        self.solr = SolrClient(url)

    def query(self, question):
        passages = set()
        res = self.solr.query(self.core, {
            'q': 'context_text:{}'.format(question)
        })
        for doc in res.docs:
            passages.add(doc['context_text'][0])
        return list(passages)[:self.limit]
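
A minimal usage sketch for SolrSearch; the core name, Solr URL, and question text are illustrative:

    searcher = SolrSearch('passages', 'http://localhost:8983/solr', limit=3)
    for passage in searcher.query('what is solr'):
        print(passage)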
Example #22
def get_tweets_by_time_and_tag(tag, timespan, solr:SolrClient, core_name="tweets"):
    rows=100 #100 results per page
    stop=False
    start=0
    facet_counts=None
    res = solr.query(core_name, {
            'q':'created_at:' + timespan+' AND ml_tag:0'
                    +' AND entities_hashtag:'+tag, #remember we only show tweets tagged as hate (0)
            'facet.field':'entities_hashtag', #count results per facet (NOTE: not every tweet will have a hashtag, but this is ok)
            'facet':"on", #switch on facet search
            'facet.mincount':"1", #show facets that have at least 1 result
            'rows':rows,
            'fl':'*',   #return all fields from the index (when available)
            'start':start, #start from
            'sort':'tweet_risk desc'}) #sort by risk_score descending
Example #23
 def test_solr_to_solr_with_date(self):
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
     )
     reindexer.reindex()
     try:
         self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
     except KeyError:
         self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
     self.assertEqual(
         sorted(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
         sorted(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
     )
Example #24
def get_latest_update(url,collection, query):
    dttm = None
    solr = SolrClient(url)
    res = solr.query(collection, {
            'q': query,
            'rows': 1,
            'sort': 'system_mtime desc'
    })
    pp.pprint(res.get_results_count())
    
    if res.get_results_count() == 1:
        pp.pprint(res.docs[0]['system_mtime'])
        date = res.docs[0]['system_mtime']
        dttm = datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ")
        pp.pprint(dttm)
    return dttm
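
A sketch of how get_latest_update might feed an incremental harvest; the Solr URL, collection name and query here are assumptions:

    last = get_latest_update('http://localhost:8983/solr', 'collection1', '*:*')
    if last is not None:
        # only pick up records modified after the newest document already in Solr
        fq = 'system_mtime:[{}Z TO *]'.format(last.isoformat())
        print('incremental filter:', fq)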
Example #25
def computeScores1(type, query, output_file):
    solr = SolrClient('http://localhost:8983/solr')

    res = solr.query(query['index'], {
        'q': '*:*',
        'wt': 'json',
        'indent': True,
        'rows': 1000,
    })

    docs = res.data['response']['docs']

    with open(output_file, "wb") as outF:
        a = csv.writer(outF, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        a.writerow(["type", "x-coordinate", "y-coordinate", "Similarity_score"])

        for doc in docs:
            for key in doc:
                if key in ["id", "_version_"]:
                    continue
                try:
                    doc[key] = doc[key][0].encode("ascii", "ignore")
                except:
                    doc[key] = str(doc[key][0]).decode("unicode_escape").encode("ascii", "ignore")

        doc_tuples = itertools.combinations(docs, 2)
        for raw1, raw2 in doc_tuples:

            doc1 = raw1.copy()
            doc2 = raw2.copy()

            if "Name" in doc1:
                row_cosine_distance = [type, doc1["Name"], doc2["Name"]]
            else:
                row_cosine_distance = [type, doc1["name"], doc2["name"]]

            v1 = Vector(row_cosine_distance[0], doc1)
            v2 = Vector(row_cosine_distance[1], doc2)

            row_cosine_distance.append(v1.cosTheta(v2))

            a.writerow(row_cosine_distance)
Example #26
def readDocumentsFromSolr(numberofrecords):
    contents = []
    solr = SolrClient('http://52.41.35.204:8983/solr')
    res = solr.query('tennisCollection', {'q': '*:*', 'rows': numberofrecords})
    for doc in res.docs:
        processedDocument = {}
        processedDocument['id'] = ''
        processedDocument['title'] = ''
        processedDocument['content'] = ''

        if 'id' in doc:
            processedDocument['id'] = doc['id']
        if 'title' in doc:
            processedDocument['title'] = doc['title']
        if 'content' in doc:
            processedDocument['content'] = doc['content']

        contents.append(processedDocument)

    return contents
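
A short usage sketch for readDocumentsFromSolr; the row count is arbitrary:

    documents = readDocumentsFromSolr(200)
    for d in documents[:5]:
        print(d['id'], d['title'][:60])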
Example #27
def get_tweets_by_coordinates(lat, lon, range, timespan, solr:SolrClient, core_name="tweets"):
    lat_min=lat-range
    lat_max=lat+range
    lon_min=lon-range
    lon_max=lon+range
    rows=100 #100 results per page
    stop=False
    start=0
    facet_counts=None
    res = solr.query(core_name, {
            'q':'created_at:' + timespan+' AND ml_tag:0'
                    +' AND coordinate_lat:'+'[{} TO {}]'.format(lat_min, lat_max)
                    +' AND coordinate_lon:'+'[{} TO {}]'.format(lon_min, lon_max),
            'facet.field':'entities_hashtag', #count results per facet (NOTE: not every tweet will have a hashtag, but this is ok)
            'facet':"on", #switch on facet search
            'facet.mincount':"1", #show facets that have at least 1 result
            'rows':rows,
            'fl':'*',   #return all fields from the index (when available)
            'start':start, #start from
            'sort':'tweet_risk desc'}) #sort by risk_score descending
Example #28
def get_existing(solr: SolrClient, core_name, pagesize):
    stop = False
    start = 0
    tags = {}
    tag_pairs = {}
    while not stop:
        res = solr.query(core_name, {
            'q': '*:*',
            'rows': pagesize,
            'fl': '*',
            'start': start})
        start += pagesize
        if start > res.num_found:
            stop = True

        for d in res.docs:
            if d['type'] == '0':  # single tag
                tags[d['id']] = d
            else:
                tag_pairs[d['id']] = d
    return tags, tag_pairs
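
A usage sketch for get_existing, assuming a local Solr with a 'tags' core and an arbitrary page size:

    from SolrClient import SolrClient

    solr = SolrClient('http://localhost:8983/solr')
    tags, tag_pairs = get_existing(solr, 'tags', pagesize=500)
    print(len(tags), 'single tags,', len(tag_pairs), 'tag pairs')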
Example #29
    def get(self, request: HttpRequest, lang='en', keys=''):
        lang = request.LANGUAGE_CODE
        uuid_regex = self.uuid_pattern()
        uuids = keys.split(',')
        valid_uuids = []
        for uuid in uuids:
            if uuid_regex.match(uuid):
                valid_uuids.append(uuid)
        keys = ",".join(valid_uuids)

        # Get the titles

        solr = SolrClient(settings.OPEN_DATA_SOLR_SERVER_URL)
        q_text = " OR id:".join(valid_uuids)
        q_text = "id:" + q_text
        solr_query = {'q': q_text, 'defType': 'edismax', 'sow': True}
        solr_response = solr.query(settings.OPEN_DATA_CORE, solr_query)
        titles = {}
        title_field = 'title_fr_s' if lang == "fr" else 'title_en_s'
        for doc in solr_response.docs:
            titles[doc['id']] = doc[title_field]

        context = {
            "language": lang,
            "keys": keys,
            "titles": titles,
            "open_data_url": settings.OPEN_DATA_BASE_URL_FR if lang == "fr" else settings.OPEN_DATA_BASE_URL_EN,
            "rcs_config": 'ramp/config.rcs.fr-CA.json' if lang == "fr" else 'ramp/config.rcs.en-CA.json',
            "toggle_url": self._get_toggle(lang, keys),
        }
        return render(request, 'ramp.html', context)
Example #30
    def get(self):

        term = self.get_argument('term')

        client = SolrClient('http://localhost:8983/solr')
        res = client.query('stocks', {
            #'q' : 'symbol:%s' % '*'
            'q' : term
        })

        res = json.loads(res.get_json())
        docs = res['response']['docs']

        formatted = []

        for doc in docs:
            formatted.append({
                'name' : doc['name'],
                'symbol' : doc['symbol'],
                'sector' : doc['sector'],
                'open' : doc['open']
            })

        self.write(json.dumps(formatted))
Example #31
from SolrClient import SolrClient
import sys
import json
import requests
from config import ec_uri, solr_uri # if this is run again probably not ec_uri (from uri)
#uri = 'http://192.168.1.122' #if run again this may also need to be changed (to uri)

solr_new = SolrClient(ec_uri+':8983/solr')
solr_old = SolrClient(solr_uri+'/solr')
#solr_old = SolrClient(ec_uri+':8983/solr')
#solr_new = SolrClient(uri+':8983/solr')
collection = 'sonos_companion'
start = 0
temp = [1]
while len(temp) > 0:
    result = solr_old.query(collection, {'q':'*', 'rows':1000, 'start':start}) 
    temp = result.data['response']['docs']
    #print(repr(temp).encode('cp1252', errors='replace'))
    start+=1000

    documents = []
    for item in temp:
        #document = {'id':item['id'].lower()}
        # apparently ran the first time to transfer to raspi without track in the list
        # the reason so few tracks actually have a track number (I did a few starting 08072016)
        #document.update({k:item[k] for k in item if k in ('id','album','artist','title','uri','track')})
        document = {k:item[k] for k in item if k in ('id','album','artist','title','uri','track')}
        documents.append(document)
    #print(documents)

    n = 0
Example #32
class ReindexerTests(unittest.TestCase):

    #Methods to create the schema in the collections
    def create_fields(self):
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config['collections']['fields']:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    #Field probably already exists
                    pass

    def create_copy_fields(self):
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config['collections']['copy_fields']:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    #Field probably already exists
                    pass

    def setUp(self):
        [self.solr.delete_doc_by_id(coll, '*') for coll in self.colls]
        [self.solr.commit(coll, openSearcher=True) for coll in self.colls]

    def _index_docs(self, numDocs, coll):
        '''
        Generates and indexes random data while maintaining counts of items in various date ranges.

        These counts in self.date_counts are used later to validate some reindexing methods.

        Brace yourself or have a drink.....
        '''
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate

        import random
        #Assign random times to documents that are generated. This is used to spread out the documents over multiple time ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}

        #Save the newest and oldest timestamps as well as assign them to first and second doc
        self.docs[0]['date'] = sdate.isoformat() + 'Z'
        self.date_counts[sdate.date().isoformat()] = 1

        self.docs[1]['date'] = edate.isoformat() + 'Z'
        self.date_counts[edate.date().isoformat()] = 1

        for doc in self.docs[2:]:
            #Make a new date and store a count of it so I can compare later
            new_date = (sdate +
                        datetime.timedelta(hours=random.choice(hour_range)))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc['date'] = new_date.isoformat() + 'Z'

        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith('.gz'):
                f = gzip.open(efile, 'rt', encoding='utf-8')
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(cls):
        logging.debug("Starting to run Reindexer Tests")
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0],
                              devel=True,
                              auth=test_config['SOLR_CREDENTIALS'])
        cls.colls = [
            test_config['SOLR_REINDEXER_COLLECTION_S'],
            test_config['SOLR_REINDEXER_COLLECTION_D']
        ]
        cls.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        '''
        Will export documents from Solr and put them into an IndexQ.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query('source_coll', {
            'q': '*:*',
            'rows': 5000
        }).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x['id']),
                         sorted(from_solr, key=lambda x: x['id']))

    def test_ignore_fields(self):
        '''
        Checks that the default ignore fields are set on the Reindexer.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        for field in ['_version_', 'product_name_exact']:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        '''
        Checks that ignore_fields can be disabled.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index,
                              ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        '''
        Checks to make sure ignore_fields override works
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index,
                              ignore_fields=['_text_', '_any_other_field'])
        self.assertEqual(reindexer._ignore_fields,
                         ['_text_', '_any_other_field'])

    def test_get_copy_fields(self):
        '''
        Tests the method to get copy fields from Solr.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter')
        self.assertEqual(reindexer._get_copy_fields(), [
            field['dest']
            for field in self.solr.schema.get_schema_copyfields(self.colls[0])
        ])

    def test_query_gen(self):
        '''
        Tests the query generation method.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter')
        self.assertEqual(
            reindexer._get_query('cursor'), {
                'cursorMark': 'cursor',
                'rows': reindexer._rows,
                'q': '*:*',
                'sort': 'id desc'
            })

    def test_query_gen_pershard_distrib(self):
        '''
        Tests that per-shard query generation sets distrib=false.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter',
                              per_shard=True)
        q = reindexer._get_query('cursor')
        self.assertTrue('distrib' in q and q['distrib'] == 'false')

    def test_query_gen_date(self):
        '''
        Tests query generation when a date field is supplied.
        '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter',
                              date_field='ddddd')
        self.assertEqual(
            reindexer._get_query('cursor'), {
                'cursorMark': 'cursor',
                'rows': reindexer._rows,
                'q': '*:*',
                'sort': 'ddddd asc, id desc'
            })

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
                #self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=self.solr,
                              dest_coll='dest_coll')
        reindexer.reindex()
        # use sorted() here: list.sort() returns None, which would make the assert vacuous
        self.assertEqual(
            sorted(self.solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
            sorted(self.solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        reindexer.reindex()
        try:
            self.assertTrue(solr.transport._action_log[1]['params']['params']
                            ['sort'] == 'index_date asc, id desc')
        except KeyError:
            self.assertTrue(solr.transport._action_log[2]['params']['params']
                            ['sort'] == 'index_date asc, id desc')
        self.assertEqual(
            sorted(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
            sorted(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs, key=lambda x: x['id']),
        )

    def test_get_edge_date(self):
        '''
        Checks to make sure _get_edge_date returns correct start and end dates.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        solr_end_date_string = reindexer._get_edge_date('date', 'desc')
        solr_start_date_string = reindexer._get_edge_date('date', 'asc')
        self.assertEqual(
            self._start_date.date(),
            datetime.datetime.strptime(solr_start_date_string,
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date())
        self.assertEqual(
            self._end_date.date(),
            datetime.datetime.strptime(solr_end_date_string,
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date())

    def test_get_date_range_query(self):
        '''
        Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
        '''
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11'), {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'index_date',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1DAY'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            date_field='date123'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'date123',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1DAY'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            date_field='date123',
                                            timespan='MONTH'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'date123',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1MONTH'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            timespan='MONTH'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'index_date',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1MONTH'
            })

    def test_get_date_facet_counts(self):
        '''
        Checks the date facet counts. Makes sure the counts per date range match what got indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date', start_date=self._start_date.date().isoformat())
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(
                dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range],
                                                   self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        '''
        Checks the date facet counts when no start date is supplied.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date')
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(
                dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range],
                                                   self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        '''
        Makes sure the date facet count helper raises ValueError for a non-DAY timespan.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Testing this one
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts(
                'MONTH', 'date')

    ## These tests are focused on methods related to resuming re-indexing

    def test_solr_to_solr_resume_checkonly(self):
        '''
        Runs resume in check-only mode and makes sure nothing actually gets indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        reindexer.resume(check=True)
        #Makes sure nothing got indexed
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        '''
        Runs a basic resume and makes sure the source and destination counts match afterwards.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        reindexer.resume()
        sleep(10)
        #Make sure counts match up after reindex
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))

    def test_solr_to_solr_reindex_and_resume(self):
        '''
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        #This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now() - datetime.timedelta(days=(
            (self._end_date - self._start_date).days / 2)))
        #Reindex approximately half of the data by restricting FQ
        reindexer.reindex(
            fq=['date:[* TO {}]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        #Make sure we have at least 20% of the data.
        dest_count = len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        s_count = len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        #Make sure counts match up after reindex
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        '''
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)
        #This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now() - datetime.timedelta(days=(
            (self._end_date - self._start_date).days / 2)))
        #Reindex approximately half of the data by restricting FQ
        reindexer.reindex(
            fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        #Make sure we have at least 20% of the data.
        dest_count = len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        s_count = len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        #Make sure counts match up after reindex
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        #Make sure only source has data
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 50000)
        self.assertEqual(
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs), 0)

        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard1_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard2_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()

        self.solr.commit(self.colls[1], openSearcher=True)
        #sloppy check over here, will improve later
        self.assertEqual(
            len(
                solr.query(self.colls[0], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs),
            len(
                solr.query(self.colls[1], {
                    'q': '*:*',
                    'rows': 10000000
                }).docs))
Example #33
0
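This snippet starts mid-module, so its imports are missing; a guess at what it relies on is sketched below. The Flask request object is an assumption based on the request.args.get calls, and get_snippet/correction are project helpers defined elsewhere.

import json

from flask import request          # assumed: request.args.get(...) below is Flask-style
from SolrClient import SolrClient
# get_snippet() and correction() (a Norvig-style spell corrector) are assumed to be
# defined elsewhere in this project.
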
def scan():
    # Get search result
    #print(parameters)
    # query_key, page_rank = parameters.split('&')
    # query_key = query_key.split('=')[1]
    # page_rank = page_rank.split('=')[1]
    query_key = request.args.get('query')
    page_rank = request.args.get('pagerank')
    # print(query_key)
    # print(page_rank)
    solr = SolrClient('http://localhost:8983/solr')
    if page_rank == '1':
        #print('exe1')
        res = solr.query('myexample',{
            'q':query_key,
            'sort':'pageRankFile desc',
        })
    else:
        #print('exe0')
        res = solr.query('myexample',{
            'q':query_key,
            })
    if res is None:
        json_result = {'query':None}
        return json.dumps(json_result, ensure_ascii=False)
    else:
        #print(res)
        for value in res.docs:
            #print(value['id'])

            # Add snippets
            snippet = get_snippet(value['id'], query_key)
            value['snippet'] = snippet

            if 'description' not in value:
                value['description']='NULL'
            if 'og_url' not in value:
                with open('./mapNBCNewsDataFile.csv') as f:
                    key = value['id'].split('/')[-1]
                    for line in f:
                        if line.split(',')[0] == key:
                            value['og_url'] = (line.split(',')[-1])
                            break

        # Use Norvig's result to replace the Solr suggestion
        # correct_res = res.data['spellcheck']
        # if correct_res.get('suggestions'):
        #     correct_word = correction(query_key)
        #     res.data['spellcheck']['suggestions'][1]['suggestion'][0]=correct_word
        correct_res = res.data['spellcheck']
        correct_word_list=[]
        if correct_res.get('suggestions'):
            query_key_list = query_key.split()
            
            for i in query_key_list:
                # correct_word = correction(query_key)
                correct_word_list.append(correction(i))
            res.data['spellcheck']['collations'][1]=' '.join(correct_word_list)


        return json.dumps(res.data, ensure_ascii=False,indent=4)
Example #34
0
class SOLR():
    def __init__(self, url):
        self.url = url
        self.solr = SolrClient(self.url)
        self.solr_home = home_dir + '/solr-7.1.0/server/solr/'

    def solr_core_exists(self, core_name):
        url = self.url + '/admin/cores?action=STATUS&core=' + core_name
        response = requests.get(url)
        r = response.json()
        if r['status'][core_name]:
            return 1
        else:
            return 0

    def create_solr_core(self, core_name):
        core_dir = os.path.join(self.solr_home, core_name)
        if os.path.exists(core_dir):
            shutil.rmtree(core_dir)
        os.makedirs(core_dir)
        src_dir = os.path.join(self.solr_home, 'configsets/_default/conf')
        #'configsets/sample_techproducts_configs/conf')
        dst_dir = os.path.join(core_dir, 'conf')
        shutil.copytree(src_dir, dst_dir)
        url1 = self.url + '/admin/cores?action=CREATE&name=' + core_name
        url2 = '&instanceDir=' + self.solr_home + core_name
        r = requests.get(url1 + url2)
        #print(r.text)

    def delete_solr_core(self, core_name):
        url1 = self.url + '/admin/cores?action=UNLOAD&core=' + core_name
        url2 = '&deleteIndex=true&deleteDataDir=true&deleteInstanceDir=true'
        r = requests.get(url1 + url2)
        #print(r.text)

    def update_solr(self, data, core_name):
        url = self.url + '/' + core_name + '/update?wt=json'
        headers = {'Content-Type': 'application/json', 'Connection': 'close'}
        params = {'boost': 1.0, 'overwrite': 'true', 'commitWithin': 1000}
        data = {'add': {'doc': data}}
        r = requests.post(url, headers=headers, params=params, json=data)
        #print(r.text)

    def delete_solr_by_id(self, core_name, _id):
        url = self.url + '/' + core_name + '/update?wt=json'
        headers = {'Content-Type': 'application/xml'}
        params = {'commit': 'true'}
        data = "<delete><id>" + _id + "</id></delete>"
        data = data.encode('utf8')
        r = requests.post(url, headers=headers, params=params, data=data)
        #print(r.text)

    def delete_solr_by_query(self, core_name, query):
        url = self.url + '/' + core_name + '/update?wt=json'
        headers = {'Content-Type': 'application/xml'}
        params = {'commit': 'true'}
        data = "<delete><query>" + query + "</query></delete>"
        data = data.encode('utf8')
        r = requests.post(url, headers=headers, params=params, data=data)
        #print(r.text)

    def query_question_solr(self, core_name, question, fields, num):
        query = {
            'q': 'question_ik:' + question,
            'fl': fields,
            'rows': num,
        }
        res = self.solr.query(core_name, query)
        return res

    def query_solr(self, core_name, select, fields, num):
        query = {
            'q': select,
            'fl': fields,
            'rows': num,
        }
        res = self.solr.query(core_name, query)
        return res
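
A minimal usage sketch for the wrapper above, assuming home_dir is defined at module level, a Solr 7.1.0 install under that directory, and a server on the default port; the core name, document, and query are placeholders.

import os
home_dir = os.path.expanduser('~')   # the class above reads this module-level name

s = SOLR('http://localhost:8983/solr')
if not s.solr_core_exists('qa_core'):
    s.create_solr_core('qa_core')
s.update_solr({'id': '1', 'question_ik': 'how do I reset my password'}, 'qa_core')
res = s.query_question_solr('qa_core', 'reset password', 'id,question_ik', 5)
print(res.docs)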
Example #35
0
        print("Could not bring back queue")
    else:
        existing_queue = json.loads(z.decode('utf-8')) # you are just going to create duplicate playlists if you do this
        n = 1
        for track in existing_queue:
            print(n, track[0],track[1])
            n+=1

else:
    action = 'play'

try:
    while 1:
        track_title = input("\nwhat is the title of the track that you want to add to the queue (Ctrl-C if done)? ")
        s = 'title:' + ' AND title:'.join(track_title.split())
        result = solr.query(collection, {'q':s, 'rows':10, 'fl':['score', 'id', 'uri', 'title', 'artist', 'album'], 'sort':'score desc'}) 
        tracks = result.docs
        count = result.get_results_count()
        if count==0:
            print("Didn't find any tracks\n")
        elif count==1:
            track = tracks[0]
            try:
                print('id: ' + track['id'])
                print('artist: ' + track['artist'])
                print('album: ' + track['album'])
                print('song: ' + track['title'])
                print('uri: ' + track['uri'])
            except Exception as e:
                print(e)
            print('---------------------------------------------------------------')
Example #36
0
class ClientTestQuery(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')

        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass

        #Index Some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)

    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))

    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] += 1
            except:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets, r.get_facets()['facet_test'])
        except Exception as e:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise

    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets(
        )['facet_test'][first_facet_field]
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.field': 'facet_test',
                'fq': 'facet_test:{}'.format(first_facet_field)
            })
        self.assertEqual(r.get_num_found(), first_facet_field_count)

    def test_facet_range(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.range': 'price',
                'facet.range.start': 0,
                'facet.range.end': 100,
                'facet.range.gap': 10
            })

        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x // 10 * 10)
        out = {}
        for k, g in itertools.groupby(sorted(prices), div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out, res.get_facets_ranges()['price'])

    def test_facet_pivot(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.pivot': ['facet_test,price', 'facet_test,id']
            })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']] = 1
            else:
                out[doc['facet_test']][doc['price']] += 1
        self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price'])

    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
        })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results, temp)

    def test_get_facet_values_as_list(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
Example #37
0
from solr import *
import pysolr

#conn = solr.solr("http://solr.example.net/solr")
#conn = solr.Solr("http://solr.example.net/solr")
#solr.SearchHandler(conn,"/select")
#conn.query()
import sklearn
from SolrClient import SolrClient

solr=SolrClient('http://192.168.1.100:8983/solr/')

result=solr.query('tableAbstract',{'q':'memBody:blood','facet':True,'facet.range.start':0,'facet.range.end':1000000})
for x in result.docs:
    #print(x['id'])
    print(int(float(x['id'])))
    #print(x['id'])
print (result.get_num_found())
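
Note that the facet parameters above are incomplete: no facet.range field or facet.range.gap is given, so Solr will not return range buckets. Below is a sketch of a complete range-facet request, mirroring the test_facet_range example elsewhere on this page; the field name is a placeholder for a numeric field in that schema.

result = solr.query('tableAbstract', {
    'q': 'memBody:blood',
    'facet': True,
    'facet.range': 'some_numeric_field',   # placeholder field name
    'facet.range.start': 0,
    'facet.range.end': 1000000,
    'facet.range.gap': 100000,
})
print(result.get_facets_ranges())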
Example #38
0
class ReindexerTests(unittest.TestCase):

    # Methods to create the schema in the collections
    def create_fields(self):
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config["collections"]["fields"]:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    # Field probably already exists
                    pass

    def create_copy_fields(self):
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config["collections"]["copy_fields"]:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    # Field probably already exists
                    pass

    def setUp(self):
        [self.solr.delete_doc_by_id(coll, "*") for coll in self.colls]
        [self.solr.commit(coll, openSearcher=True) for coll in self.colls]

    def _index_docs(self, numDocs, coll):
        """
        Generates and indexes random data while maintaining counts of items in various date ranges.

        These counts in self.date_counts are used later to validate some reindexing methods. 

        Brace yourself or have a drink.....
        """
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate

        import random

        # Assign random times to documents that are generated. This is used to spread out the documents over multiple time ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}

        # Save the newest and oldest timestamps as well as assign them to first and second doc
        self.docs[0]["date"] = sdate.isoformat() + "Z"
        self.date_counts[sdate.date().isoformat()] = 1

        self.docs[1]["date"] = edate.isoformat() + "Z"
        self.date_counts[edate.date().isoformat()] = 1

        for doc in self.docs[2:]:
            # Make a new date and store a count of it so I can compare later
            new_date = sdate + datetime.timedelta(hours=random.choice(hour_range))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc["date"] = new_date.isoformat() + "Z"

        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith(".gz"):
                f = gzip.open(efile, "rt", encoding="utf-8")
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(self):
        logging.debug("Starting to run Reindexer Tests")
        self.solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        self.colls = [test_config["SOLR_REINDEXER_COLLECTION_S"], test_config["SOLR_REINDEXER_COLLECTION_D"]]
        self.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        """
        Will export documents from Solr and put them into an IndexQ. 
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query("source_coll", {"q": "*:*", "rows": 5000}).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x["id"]), sorted(from_solr, key=lambda x: x["id"]))

    def test_ignore_fields(self):
        """
        Checks that the default ignore fields are present on the Reindexer.
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        for field in ["_version_", "product_name_exact"]:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        """
        Checks to make sure ignore_fields override works
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index, ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        """
        Checks to make sure ignore_fields override works
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        reindexer = Reindexer(
            source=self.solr, source_coll="source_coll", dest=index, ignore_fields=["_text_", "_any_other_field"]
        )
        self.assertEqual(reindexer._ignore_fields, ["_text_", "_any_other_field"])

    def test_get_copy_fields(self):
        """
        Tests the method to get copy fields from Solr. 
        """
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter")
        self.assertEqual(
            reindexer._get_copy_fields(),
            [field["dest"] for field in self.solr.schema.get_schema_copyfields(self.colls[0])],
        )

    def test_query_gen(self):
        """
        Tests the cursorMark query generation method.
        """
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter")
        self.assertEqual(
            reindexer._get_query("cursor"),
            {"cursorMark": "cursor", "rows": reindexer._rows, "q": "*:*", "sort": "id desc"},
        )

    def test_query_gen_pershard(self):
        """
        Tests cursorMark query generation when per_shard is enabled.
        """
        reindexer = Reindexer(
            source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter", per_shard=True
        )
        self.assertEqual(
            reindexer._get_query("cursor"),
            {"cursorMark": "cursor", "rows": reindexer._rows, "q": "*:*", "sort": "id desc", "distrib": "false"},
        )

    def test_query_gen_date(self):
        """
        Tests cursorMark query generation when a date field is set.
        """
        reindexer = Reindexer(
            source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter", date_field="ddddd"
        )
        self.assertEqual(
            reindexer._get_query("cursor"),
            {
                "cursorMark": "cursor",
                "rows": reindexer._rows,
                "q": "*:*",
                "sort": "id desc",
                "sort": "ddddd asc, id desc",
            },
        )

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
                # self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=self.solr, dest_coll="dest_coll")
        reindexer.reindex()
        self.assertEqual(
            self.solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
            self.solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        reindexer.reindex()
        try:
            self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
        except KeyError:
            self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
        self.assertEqual(
            solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
            solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
        )

    def test_get_edge_date(self):
        """
        Checks to make sure _get_edge_date returns correct start and end dates. 
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        solr_end_date_string = reindexer._get_edge_date("date", "desc")
        solr_start_date_string = reindexer._get_edge_date("date", "asc")
        # assertTrue with two arguments treats the second as a message and always
        # passes here; assertEqual on the date portions is what was intended.
        self.assertEqual(
            self._start_date.date(),
            datetime.datetime.strptime(solr_start_date_string, "%Y-%m-%dT%H:%M:%S.%fZ").date(),
        )
        self.assertEqual(
            self._end_date.date(),
            datetime.datetime.strptime(solr_end_date_string, "%Y-%m-%dT%H:%M:%S.%fZ").date(),
        )

    def test_get_date_range_query(self):
        """
        Checks the date_range_query generation function. Since it's pretty simple, running all the tests as one
        """
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "index_date",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1DAY",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", date_field="date123"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "date123",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1DAY",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", date_field="date123", timespan="MONTH"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "date123",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1MONTH",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", timespan="MONTH"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "index_date",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1MONTH",
            },
        )

    def test_get_date_facet_counts(self):
        """
        Checks _get_date_facet_counts. Makes sure the date range counts returned match what got indexed.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            "DAY", "date", start_date=self._start_date.date().isoformat()
        )
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, "%Y-%m-%dT%H:%M:%SZ").date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        """
        Checks _get_date_facet_counts when no start date is supplied; counts should still match what got indexed.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts("DAY", "date")
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, "%Y-%m-%dT%H:%M:%SZ").date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        """
        Makes sure _get_date_facet_counts raises a ValueError when the timespan is not DAY.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts("MONTH", "date")

    ## These tests are focused on methods related to resuming re-indexing

    def test_solr_to_solr_resume_checkonly(self):
        """
        Makes sure resume(check=True) does not index anything into the destination.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.resume(check=True)
        # Makes sure nothing got indexed
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        """
        Runs a basic resume and makes sure source and destination counts match afterwards.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindex_and_resume(self):
        """
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works. 
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
        # Reindex approximately half of the data by restricting FQ
        reindexer.reindex(fq=["date:[* TO {}]".format(midpoint.isoformat() + "Z")])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * 0.20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        """
        Only reindexes half of the collection on the first time. Then goes back and does a resume to make sure it works. 
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
        # Reindex approximately half of the data by restricting FQ
        reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * 0.20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.reindex()
        # sloppy check over here, will improve later
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )
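
Pulled together from the tests above, a minimal sketch of driving the Reindexer outside a test harness. The server URL and collection names are placeholders, and the import path for Reindexer is an assumption; the tests only reference the class name.

from SolrClient import SolrClient
from SolrClient.helpers import Reindexer   # import path assumed

solr = SolrClient('http://localhost:8983/solr')
reindexer = Reindexer(source=solr, source_coll='source_coll',
                      dest=solr, dest_coll='dest_coll', date_field='date')
reindexer.reindex()   # full copy, paged with a cursorMark query as shown in test_query_gen
reindexer.resume()    # compares per-day facet counts and re-indexes missing ranges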
Example #39
0
#!/usr/bin/env python

from __future__ import division
import json
import os
from SolrClient import SolrClient
import sys
from tika import detector

solr = SolrClient('http://localhost:8983/solr')
walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1]))
walk_i = 0
ratios = {}
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        path = root + '/' + file
        file_size = os.stat(path).st_size
        if file_size == 0: continue
        mime = detector.from_file(path)
        sum, n = ratios.get(mime, (0, 0))
        ratios[mime] = sum + len(
            json.dumps(
                solr.query('collection1', {
                    'q': 'id:' + file
                }).data['response']['docs'])) / file_size, n + 1
        walk_i += 1
        print str(walk_i * 100 // walk_n) + '%\r',
with open('size-diversity.json', 'w') as f:
    json.dump({mime: sum / n for mime, (sum, n) in ratios.iteritems()}, f)
Example #40
0
'''
Create a playlist manually by entering songs one at a time
and searching solr for the particular song
There is also create_playlist_from_queue.py that has you put the songs on the queue
(from a playlist or whatever) and creates a playlist from the queue 
'''
from SolrClient import SolrClient
from config import ec_uri

solr = SolrClient(ec_uri+':8983/solr')
collection = 'sonos_companion'

track_title = input("\nwhat is the title of the track that you are looking for? ")
s = 'title:' + ' AND title:'.join(track_title.split())
result = solr.query(collection, {'q':s, 'rows':10, 'fl':['score', 'id', 'uri', 'title', 'artist', 'album'], 'sort':'score desc'}) 
tracks = result.docs
count = result.get_results_count()
if count==0:
    print("Didn't find any tracks\n")
elif count==1:
    track = tracks[0]
    try:
        print('id: ' + track['id'])
        print('artist: ' + track['artist'])
        print('album: ' + track['album'])
        print('song: ' + track['title'])
        print('uri: ' + track['uri'])
    except Exception as e:
        print(e)
    print('------------------------------------------------------------------------------------------------')
else:    
Example #41
0
                    if uri.startswith('pndrradio'):
                        meta = meta_format_pandora.format(title=station[0], service=station[2])
                        master.play_uri(uri, meta, station[0]) # station[0] is the title of the station
                    elif uri.startswith('x-sonosapi-stream'):
                        uri = uri.replace('&', '&amp;') # need to escape '&' in radio URIs
                        meta = meta_format_radio.format(title=station[0], service=station[2])
                        master.play_uri(uri, meta, station[0]) # station[0] is the title of the station
                else:
                    print("{} radio is not a preset station.".format(task['station']))

        elif action in ('play','add') and task.get('trackinfo'): 

            #The query below only searches title and artist fields so you don't get every song on After the Gold Rush
            #result = cloudsearchdomain.search(query=task['trackinfo'], queryOptions='{"fields":["title", "artist"]}')
            s = 'artist:' + ' artist:'.join(task['trackinfo'].split()) + ' title:' + ' title:'.join(task['trackinfo'].split())
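            # e.g. trackinfo "harvest moon" builds the query
            # 'artist:harvest artist:moon title:harvest title:moon'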
            result = solr.query(collection, {'q':s, 'rows':1}) #..'rows':25 ...}, queryOptions='{"fields":["title", "artist"]}')

            if result.get_results_count():
                track = result.data['response']['docs'][0]
                try:
                    print('artist: ' + track.get('artist', ['No artist']))
                    print('album: ' + track.get('album', ['No album']))
                    print('song: ' + track.get('title', ['No title']))
                except Exception as e:
                    print("Unicode error")
                uri = track.get('uri', [''])
                print('uri: ' + uri)
                print("---------------------------------------------------------------")

                if 'amz' in uri:
                    i = uri.find('amz')
Example #42
0
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')
res = solr.query('dev', {
    'q': 'test'
})
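
A short follow-up sketch showing how the response object above is typically consumed; 'id' is a placeholder field name.

print(res.get_num_found())   # total number of matching documents
for doc in res.docs:         # parsed documents from the response
    print(doc.get('id'))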
Example #43
0
def computeScores2(type, query, output_file, is_all_key):
    na_metadata = ["id", "_version_", "Name", "name"]

    solr = SolrClient('http://localhost:8983/solr')

    res = solr.query(query['index'], {
        'q': '*:*',
        'wt': 'json',
        'indent': True,
        'rows': 1000,
    })

    docs = res.data['response']['docs']

    with open(output_file, "wb") as outF:
        a = csv.writer(outF, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        a.writerow(["type", "x-coordinate", "y-coordinate", "Similarity_score"])

        for doc in docs:
            for key in doc:
                if key in ["id", "_version_"]:
                    continue
                try:
                    doc[key] = doc[key][0].encode("ascii", "ignore")
                except:
                    doc[key] = str(doc[key][0]).decode("unicode_escape").encode("ascii", "ignore")

        doc_tuples = itertools.combinations(docs, 2)
        for raw1, raw2 in doc_tuples:

            doc1 = raw1.copy()
            doc2 = raw2.copy()

            if "Name" in doc1:
                row_edit_distance = [type, doc1["Name"], doc2["Name"]]
            else:
                row_edit_distance = [type, doc1["name"], doc2["name"]]

            intersect_features = set(doc1.keys()) & set(doc2.keys())
            intersect_features = [feature for feature in intersect_features if feature not in na_metadata]

            file_edit_distance = 0.0
            for feature in intersect_features:

                file1_feature_value = stringify(doc1[feature])
                file2_feature_value = stringify(doc2[feature])

                if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                    feature_distance = 0.0
                else:
                    feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / (
                        len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(
                            file2_feature_value))

                file_edit_distance += feature_distance

            if is_all_key:
                file1_only_features = set(doc1.keys()) - set(intersect_features)
                file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                file2_only_features = set(doc2.keys()) - set(intersect_features)
                file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                file_edit_distance += len(file1_only_features) + len(
                    file2_only_features)  # each disjoint feature in (A-B) and (B-A) adds an edit distance of 1
                file_edit_distance /= float(
                    len(intersect_features) + len(file1_only_features) + len(file2_only_features))

            else:
                file_edit_distance /= float(len(intersect_features))  # average edit distance

            row_edit_distance.append(1 - file_edit_distance)
            a.writerow(row_edit_distance)
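
A hypothetical invocation of the function above; the type label, core name, and output path are made up, and is_all_key=True also penalises metadata keys present in only one of the two documents.

computeScores2('metadata', {'index': 'collection1'}, 'edit-distance-scores.csv', True)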
Example #44
0
#!/usr/bin/env python

from __future__ import division
import json
import os
from SolrClient import SolrClient
import sys
from tika import detector

solr = SolrClient('http://localhost:8983/solr')
walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1]))
walk_i = 0
ratios = {}
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        path = root + '/' + file
        file_size = os.stat(path).st_size
        if file_size == 0: continue
        mime = detector.from_file(path)
        sum, n = ratios.get(mime, (0, 0))
        ratios[mime] = sum + len(json.dumps(solr.query('collection1', {'q': 'id:' + file}).data['response']['docs'])) / file_size, n + 1
        walk_i += 1
        print str(walk_i * 100 // walk_n) + '%\r',
with open('size-diversity.json', 'w') as f:
    json.dump({mime: sum / n for mime, (sum, n) in ratios.iteritems()}, f)
Example #45
0
class ClientTestQuery(unittest.TestCase):
    
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],devel=True,auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        
        #Index Some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
    
    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        
    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':'true',
            'facet.field':'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] +=1
            except:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets,r.get_facets()['facet_test'])
        except Exception as e:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise
    
    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.field':'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets()['facet_test'][first_facet_field]
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.field':'facet_test',
            'fq':'facet_test:{}'.format(first_facet_field)
        })
        self.assertEqual(r.get_num_found(),first_facet_field_count)
        
    def test_facet_range(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.range':'price',
            'facet.range.start':0,
            'facet.range.end':100,
            'facet.range.gap':10
            })
        
        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x//10 * 10)
        out = {}
        for k,g in itertools.groupby(sorted(prices),div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out,res.get_facets_ranges()['price'])
    
    def test_facet_pivot(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':True,
            'facet.pivot':['facet_test,price','facet_test,id']
        })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']]=1
            else:
                out[doc['facet_test']][doc['price']]+=1
        self.assertDictEqual(out,res.get_facet_pivot()['facet_test,price'])
        
    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results,temp)
        
    def test_get_facet_values_as_list(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'],{
            'q':'*:*',
            'facet':'true',
            'facet.field':'facet_test',
        })
Example #46
0
    return [distortion, clusters]
    

if __name__ == "__main__":

    argParser = argparse.ArgumentParser('k-means Clustering of documents based on metadata values')
    argParser.add_argument('--inCore', required=True, help='Solr Core Name to be queried')
    argParser.add_argument('--outJSON', required=True, help='/path/to/clusters.json containing k-means cluster assignments')
    argParser.add_argument('--Kvalue', required=True, help='number of clusters to find')
    argParser.add_argument('--accept', nargs='+', type=str, help='Optional: compute similarity only on specified IANA MIME Type(s)')
    args = argParser.parse_args()

    if args.inCore and args.outJSON and args.Kvalue:

        solr=SolrClient('http://localhost:8983/solr')
        res=solr.query(args.inCore,{'q':'*:*'})
        jsonData=res.get_json()
        list_of_points = []
        with open(jsonData) as data_file:    
            data = json.load(data_file)

        for eachFile in data:
            fileName=eachFile['id']
            list_of_points.append(Vector(eachFile, fileName))
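        # union_features is not defined in this truncated snippet; in the full
        # script it is presumably a module-level set. Initialise it here so the
        # loop below can accumulate feature names.
        union_features = set()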
        
        for point in list_of_points:
            union_features |= set(point.features.keys())

        
        #Randomly initialize Centroids for each iteration to find global minima
        global_minima = K_Means(list_of_points, int(args.Kvalue))
Example #47
0
from SolrClient import SolrClient

while True:
    query = input("Query: ")

    query = query.replace(" ", "+")

    # Target solr collection
    solr = SolrClient('http://127.0.0.1:8983/solr')

    res = solr.query('Assignment1', {
        "q": 'question:' + query,
        "indent": "on",
        "rows": "10",
        "wt": "json"}
    )

    # Guard against empty result sets before printing the top answer
    if res.docs:
        print(''.join(res.docs[0]['answer']))
    else:
        print("No matching question found")
Example #48
0
class ClientTestIndexing(unittest.TestCase):
    #High Level Client Tests
    
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
                
    def setUp(self):
        self.delete_docs()
        self.commit()
    
    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        self.commit()
        
    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
        sleep(5)
    
    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
            
    
    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        self.delete_docs()
        self.commit()
    
    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()
    
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
            
    
    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
            
    @unittest.skip("Don't test remote indexing in travis")
    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}, rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries +=1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.assertTrue(1000/50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass   

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries +=1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
            )
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass              
            
    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q':'*:*'}, rows = 50, max_start = 502):
            self.assertTrue(len(res.docs) == 50)
            queries +=1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]

        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass    
Example #49
0
 def test_access_without_auth(self):
     if not test_config['SOLR_CREDENTIALS'][0]:
         return
     solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
     with self.assertRaises(ConnectionError) as cm:
         solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})
Example #50
0
    sys.exit()

s3 = boto3.resource('s3')
obj = s3.Object('sonos-scrobble','location')
location = obj.get()['Body'].read().decode('utf-8')
queue_name = 'echo_sonos_ct' if location=='ct' else 'echo_sonos'
print("location = ", location)
print("queue_name =", queue_name)

sqs = boto3.resource('sqs', region_name='us-east-1')
queue = sqs.get_queue_by_name(QueueName=queue_name)

solr = SolrClient(ec_uri+':8983/solr')
collection = 'sonos_companion'

s = 'album:' + ' AND album:'.join(album.split())
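# e.g. for album "harvest moon" this builds the query 'album:harvest AND album:moon'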
result = solr.query(collection, {'q':s, 'rows':25, 'fields':['score','track','uri','album'], 'sort':'score desc'})
if  result.docs:
    selected_album = result.docs[0]['album']
    tracks = sorted([t for t in result.docs], key=itemgetter('track'))

    # The "if t['album']==selected_album" below only comes into play if we retrieved tracks from more than one album
    uris = [t['uri'] for t in tracks if t['album']==selected_album]
    sqs_response = queue.send_message(MessageBody=json.dumps({'action':'play', 'uris':uris}))
    print("Status Code =", sqs_response['ResponseMetadata']['HTTPStatusCode'])
    print("I will play {} songs from {}".format(len(uris), selected_album))
else:
    print("I couldn't find {}. Try again.".format(album))


Example #51
0
class ClientTestQuery(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
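        # Reset the schema below: drop any leftover copy fields and (re)create the
        # test fields, ignoring errors if they are already in the desired state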

        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

        # Index some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)

    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))

    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
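        # Tally the expected facet counts locally from the indexed docs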
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] += 1
            except KeyError:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets, r.get_facets()['facet_test'])
        except Exception as e:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise

    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets(
        )['facet_test'][first_facet_field]
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.field': 'facet_test',
                'fq': 'facet_test:{}'.format(first_facet_field)
            })
        self.assertEqual(r.get_num_found(), first_facet_field_count)

    def test_facet_range(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.range': 'price',
                'facet.range.start': 0,
                'facet.range.end': 100,
                'facet.range.gap': 10
            })

        prices = [doc['price'] for doc in self.docs]
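        # Each price falls into a 10-wide bucket keyed by its start value
        # (price // 10 * 10), stringified to match Solr's range facet keys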
        div = lambda x: str(x // 10 * 10)
        out = {}
        for k, g in itertools.groupby(sorted(prices), div):
            out[k] = len(list(g))
        self.assertDictEqual(out, res.get_facets_ranges()['price'])

    def test_facet_pivot(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.pivot': ['facet_test,price', 'facet_test,id']
            })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']] = 1
            else:
                out[doc['facet_test']][doc['price']] += 1
        self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price'])

    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
        })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results, temp)

    def test_get_facet_values_as_list(self):
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': 'true',
                'facet.limit': -1,
                'facet.field': 'facet_test',
            })
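        # facet_fields comes back as a flat [value, count, value, count, ...] list;
        # [1::2] picks every other entry to compare against the helper's output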
        self.assertEqual(
            sorted(r.data['facet_counts']['facet_fields']['facet_test'][1::2]),
            sorted(r.get_facet_values_as_list('facet_test')))

    def test_grouped_count_1(self):
        '''
        Check the number of groups (ngroups) returned for a grouped query
        '''
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'group': True,
                'group.field': 'id',
                'group.ngroups': True,
            })
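        # Grouping on the unique 'id' field yields one group per document,
        # so ngroups should equal the 50 docs indexed in setUpClass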
        self.assertEqual(r.get_ngroups(), 50)
        self.assertEqual(r.get_ngroups('id'), 50)

    def test_grouped_docs(self):
        '''
        Get a dict of grouped docs
        '''
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'group': True,
                'group.field': 'id',
                'group.ngroups': True,
            })
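        # With grouping enabled, docs holds the groups themselves:
        # 10 of them here, each carrying its own 'doclist'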
        self.assertEqual(len(r.docs), 10)
        self.assertTrue('doclist' in r.docs[0])

    def test_flat_groups(self):
        '''
        Get a flat list of docs from a grouped response
        '''
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'group': True,
            'group.field': 'id'
        })
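        # get_flat_groups() unwraps the grouped doclists back into a flat list of plain documents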
        flats = r.get_flat_groups()
        self.assertEqual(len(flats), 10)
        self.assertTrue('date' in flats[0])

    def test_json_facet(self):
        '''
        Convert nested JSON facet buckets into a plain nested dict
        '''
        # Just lazily grabbing a new response object to work with
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
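        # get_jsonfacet_counts_as_dict should reshape the nested bucket lists below
        # into plain dicts keyed by each bucket's 'val'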

        a = r.get_jsonfacet_counts_as_dict(
            'test', {
                'count': 50,
                'test': {
                    'buckets': [{
                        'count': 10,
                        'pr': {
                            'buckets': [{
                                'count': 2,
                                'unique': 1,
                                'val': 79
                            }, {
                                'count': 1,
                                'unique': 1,
                                'val': 9
                            }]
                        },
                        'pr_sum': 639.0,
                        'val': 'consectetur'
                    }, {
                        'count': 8,
                        'pr': {
                            'buckets': [
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 9
                                },
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 31
                                },
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 33
                                },
                            ]
                        },
                        'pr_sum': 420.0,
                        'val': 'auctor'
                    }, {
                        'count': 8,
                        'pr': {
                            'buckets': [
                                {
                                    'count': 2,
                                    'unique': 1,
                                    'val': 94
                                },
                                {
                                    'count': 1,
                                    'unique': 1,
                                    'val': 25
                                },
                            ]
                        },
                        'pr_sum': 501.0,
                        'val': 'nulla'
                    }]
                }
            })

        b = {
            'test': {
                'auctor': {
                    'count': 8,
                    'pr': {
                        9: {
                            'count': 1,
                            'unique': 1
                        },
                        31: {
                            'count': 1,
                            'unique': 1
                        },
                        33: {
                            'count': 1,
                            'unique': 1
                        }
                    },
                    'pr_sum': 420.0
                },
                'consectetur': {
                    'count': 10,
                    'pr': {
                        9: {
                            'count': 1,
                            'unique': 1
                        },
                        79: {
                            'count': 2,
                            'unique': 1
                        }
                    },
                    'pr_sum': 639.0
                },
                'nulla': {
                    'count': 8,
                    'pr': {
                        25: {
                            'count': 1,
                            'unique': 1
                        },
                        94: {
                            'count': 2,
                            'unique': 1
                        }
                    },
                    'pr_sum': 501.0
                }
            }
        }

        self.assertEqual(a, b)
#!/usr/bin/env python

from __future__ import division
import collections
import json
import os
from SolrClient import SolrClient
import sys
from tika import detector

solr = SolrClient('http://localhost:8983/solr')
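# Count the files up front so progress can be reported as a percentage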
walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1]))
walk_i = 0
entities = collections.defaultdict(lambda: [])
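# For every file, detect its MIME type with Tika and collect all list-valued
# fields from the matching Solr doc under that MIME type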
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        path = root + '/' + file
        mime = detector.from_file(path)
        for val in solr.query('collection1', {'q': 'id:' + file}).data['response']['docs'][0].values():
            if type(val) is list:
                entities[mime].extend(val)
        walk_i += 1
        print(str(walk_i * 100 // walk_n) + '%', end='\r')
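# Write the per-MIME-type counts of the collected values out as JSON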
with open('classification-path.json', 'w') as f:
    json.dump({k: collections.Counter(v) for k, v in entities.items()}, f)