예제 #1
0
def submitDocs(docs, name):
	"""Index ``docs`` into both Solr servers and print how many were sent.

	Each server receives the complete batch followed by a commit before the
	next server is contacted.
	"""
	servers = (
		'http://localhost:8080/solr/edfu',
		'http://vlib.sub.uni-goettingen.de/solr/edfu',
	)
	for url in servers:
		index = solr.Solr(url)
		index.add_many(docs)
		index.commit()

	summary = str(len(docs)) + ' ' + name + u' Dokumente indexiert'
	print(summary)
예제 #2
0
 def _make_solr_inst(self) -> "solr.Solr":
     """Create a ``solr.Solr`` client from this object's connection settings."""
     # ``debug=True`` may also be passed, but it makes things pretty verbose.
     connection = solr.Solr(
         self.solr_conn_addr,
         persistent=self.solr_persistent_connection,
         timeout=self.solr_timeout,
     )
     return connection
예제 #3
0
 def __init__(self, url_harvest, query, **query_params):
     """Connect to the Solr index at ``url_harvest`` and run ``query`` once.

     The response of that first select is kept in ``self.resp`` and its
     ``numFound`` in ``self.numFound``; ``self.index`` starts at 0.
     ``query_params`` is accepted but not used here.
     """
     super(SolrFetcher, self).__init__(url_harvest, query)
     self.solr = solr.Solr(url_harvest)  # debug=True can be passed here
     self.query = query
     first_response = self.solr.select(self.query)
     self.resp = first_response
     self.numFound = first_response.numFound
     self.index = 0
예제 #4
0
    def get(self, request, *args, **kwargs):
        """
        Return Solr suggestions for the ``q`` term from the given suggester.

        The ``q`` and ``dictionary`` GET parameters are required; an empty
        ``Response`` is returned when either one is missing. ``manuscript``
        is optional and defaults to ``'*'``.
        """
        if 'q' not in request.GET or 'dictionary' not in request.GET:
            return Response()

        q = u"{0}".format(request.GET['q'])
        # The suggester to use
        dictionary = u"{0}".format(request.GET['dictionary'])
        # Can be '*' when searching through all manuscripts. The value is only
        # needed by the disabled context-filter query below, so a request
        # without it must not fail with a KeyError.
        manuscript = u"{0}".format(request.GET.get('manuscript', '*'))

        connection = solr.Solr(settings.SOLR_SERVER)
        search_handler = solr.SearchHandler(connection, "/suggest")

        # TODO fix solr so that the suggesters work with a context field (cfq)
        # search_results = search_handler(q=q, suggest_dictionary=dictionary, suggest_cfq=manuscript)
        search_results = search_handler(q=q, suggest_dictionary=dictionary)

        results = search_results.suggest[dictionary][q]

        # Remove duplicates from the suggestions and limit the number returned
        results['suggestions'] = self._get_filtered_results(
            results['suggestions'])

        return Response(results)
예제 #5
0
    def _delete_solr_records(self, solr_base_url, core=None,
                             query=DEFAULT_QUERY):
        """Delete every record matching ``query`` from a Solr index.

        :param solr_base_url: Base URL of the Solr server.
        :param core: Optional core name appended to ``solr_base_url``; when
            ``None`` the base URL is used as is.
        :param query: Delete query sent to Solr.
        """
        if core is None:
            solr_url = solr_base_url
        else:
            solr_url = solr_base_url + "/" + core

        solr_server = solr.Solr(solr_url)
        try:
            solr_server.delete_query(query)
        finally:
            # Release the connection even when the delete request fails.
            solr_server.close()
예제 #6
0
 def __init__(self, request):
     """Open the Solr connection, keep ``request`` and prepare the query.

     The working attributes start out empty before ``_parse_request`` and
     ``_prep_q`` are called.
     """
     self.server = solr.Solr(settings.SOLR_SERVER)
     self.request = request
     self.parsed_request, self.solr_params = {}, {}
     self.prepared_query = ''
     self._parse_request()
     self._prep_q()
예제 #7
0
    def __init__(self, type_str, uuid, solr_conn_addr,
                 type_field, uuid_field, vector_field, timestamp_field,
                 timeout=10, persistent_connection=False, commit_on_set=True):
        """
        Create a descriptor element that is stored in a Solr index.

        :param type_str: Type of descriptor, usually the name of the content
            descriptor that generated this vector.
        :type type_str: str

        :param uuid: Unique ID reference of the descriptor.
        :type uuid: collections.Hashable

        :param solr_conn_addr: HTTP(S) address of the Solr index to use.
        :type solr_conn_addr: str

        :param type_field: Solr index field holding the descriptor type
            string value.
        :type type_field: str

        :param uuid_field: Solr index field holding the descriptor UUID
            string value.
        :type uuid_field: str

        :param vector_field: Solr index field holding the descriptor vector
            of floats.
        :type vector_field: str

        :param timestamp_field: Solr index field holding floating-point UNIX
            timestamps.
        :type timestamp_field: str

        :param timeout: Timeout value handed to the ``solr.Solr`` connection.
        :type timeout: int

        :param persistent_connection: Maintain a connection between Solr
            index interactions.
        :type persistent_connection: bool

        :param commit_on_set: Immediately commit changes when a vector is set.
        :type commit_on_set: bool

        """
        super(SolrDescriptorElement, self).__init__(type_str, uuid)

        # Names of the Solr fields used by this element.
        self.type_field = type_field
        self.uuid_field = uuid_field
        self.vector_field = vector_field
        self.timestamp_field = timestamp_field
        self.commit_on_set = commit_on_set

        # ``debug=True`` can be added here, but it makes things pretty verbose.
        connection_options = {
            'persistent': persistent_connection,
            'timeout': timeout,
        }
        self.solr = solr.Solr(solr_conn_addr, **connection_options)
예제 #8
0
    def _commit_solr(self, solr_base_url):
        """Send a commit to every core listed in ``CORES``.

        :param solr_base_url: Base URL of the Solr server; each core name is
            appended to it to form the index URL.
        """
        for core in CORES:
            solr_url = solr_base_url + "/" + core
            logging.info("Committing to Solr index: %s", solr_url)
            solr_server = solr.Solr(solr_url)
            try:
                solr_server.commit()
            finally:
                # Close the connection even if the commit raises.
                solr_server.close()
예제 #9
0
    def _optimize_solr(self, solr_base_url):
        """Send an optimize request to every core listed in ``CORES``.

        :param solr_base_url: Base URL of the Solr server; each core name is
            appended to it to form the index URL.
        """
        for core in CORES:
            solr_url = solr_base_url + "/" + core
            logging.info("Optimizing Solr index: %s", solr_url)
            solr_server = solr.Solr(solr_url)
            try:
                solr_server.optimize()
            finally:
                # Close the connection even if the optimize request raises.
                solr_server.close()
예제 #10
0
 def __init__(self, request, additional_query_params=None):
     """Open the Solr connection, keep the inputs and prepare the query.

     The working attributes start out empty before ``_parse_request`` and
     ``_prepare_query`` are called.
     """
     self.server = solr.Solr(settings.SOLR_SERVER)
     self.request = request
     self.additional_query_params = additional_query_params
     self.parsed_request, self.solr_params = {}, {}
     self.prepared_query = u""
     self._parse_request()
     self._prepare_query()
예제 #11
0
    def __init__(self,
                 solr_conn_addr: str,
                 set_uuid: str,
                 set_uuid_field: str,
                 d_uid_field: str,
                 descriptor_field: str,
                 timestamp_field: str,
                 solr_params: Dict[str, Any] = None,
                 commit_on_add: bool = True,
                 max_boolean_clauses: int = 1024,
                 pickle_protocol: int = -1):
        """
        Construct a descriptor set pointing to a Solr instance.

        :param solr_conn_addr: HTTP(S) address for the Solr set to use
        :param set_uuid: Unique ID for the descriptor set to use within the
            configured Solr set.
        :param set_uuid_field: Solr set field to store/locate set UUID
            value.
        :param d_uid_field: Solr set field to store/locate descriptor UUID
            values
        :param descriptor_field: Solr set field to store the code-associated
            descriptor object.
        :param timestamp_field: Solr set field to store floating-point UNIX
            timestamps.
        :param solr_params: Optional dictionary of additional keyword
            parameters passed to the ``solr.Solr`` constructor. ``None`` (the
            default) means no additional parameters. See the documentation of
            the ``solr`` client library for available parameters and values.
        :param commit_on_add: Immediately commit changes when one or many
            descriptor are added.
        :param max_boolean_clauses: Solr instance's configured
            maxBooleanClauses configuration property (found in solrconfig.xml
            file). This is needed so we can correctly chunk up batch queries
            without breaking the server. This may also be less than the Solr
            instance's set value.
        :param pickle_protocol: Pickling protocol to use. We will use -1 by
            default (latest version, probably binary).
        """
        super(SolrDescriptorSet, self).__init__()

        self.set_uuid = set_uuid

        self.set_uuid_field = set_uuid_field
        self.d_uid_field = d_uid_field
        self.descriptor_field = descriptor_field
        self.timestamp_field = timestamp_field

        self.commit_on_add = commit_on_add
        self.max_boolean_clauses = int(max_boolean_clauses)
        assert self.max_boolean_clauses >= 2, "Need more clauses"

        self.pickle_protocol = pickle_protocol

        # ``None`` cannot be unpacked with ``**``; fall back to an empty
        # mapping so the documented default actually works.
        if solr_params is None:
            solr_params = {}
        self.solr_params = solr_params
        self.solr = solr.Solr(solr_conn_addr, **solr_params)
예제 #12
0
    def _check_record(self, solr_base_url, core, record_id):
        '''Return True when a record with the given id exists in the core.

        :param solr_base_url: Base URL of the Solr server.
        :param core: Name of the core, appended to ``solr_base_url``.
        :param record_id: Value looked up in the ``id`` field.
        '''
        solr_server = solr.Solr(solr_base_url + "/" + core)
        try:
            response = solr_server.select("id:%s" % record_id)
        finally:
            # Close the connection even when the query fails.
            solr_server.close()
        return response.numFound > 0
예제 #13
0
def collectionView(request, collection_id):
    """
    Render the search results page for one collection.

    For GET requests the ``q``, ``rows``, ``start`` and ``view_format``
    parameters are read from the query string (with defaults), the collection
    details are fetched from the registry API, and a faceted search restricted
    to the collection's name is run through ``SOLR``. Any other request method
    renders the generic search results template.

    :param request: The incoming HTTP request.
    :param collection_id: Registry id of the collection, used to build the
        registry API URL.
    """
    if request.method != 'GET':
        return render(request, 'public_interface/searchResults.html', {'yay': 'yamy'})

    q = reduce(concat_query, request.GET.getlist('q')) if 'q' in request.GET else '*:*'
    rows = request.GET.get('rows', '16')
    start = request.GET.get('start', '0')
    view_format = request.GET.get('view_format', 'thumbnails')

    collection_url = 'https://registry.cdlib.org/api/v1/collection/' + collection_id + '/?format=json'
    registry_response = urllib2.urlopen(collection_url)
    try:
        collection_details = json.loads(registry_response.read())
    finally:
        # Do not leave the registry connection open.
        registry_response.close()

    filters = {
        facet_type[0]: request.GET.getlist(facet_type[0])
        for facet_type in FACET_TYPES
    }
    filters['collection_name'] = [collection_details['name']]
    fq = solrize_filters(filters)

    # perform the search
    solr_response = SOLR.select(
        q=q,
        rows=rows,
        start=start,
        fq=fq,
        facet='true',
        facet_field=[facet_type[0] for facet_type in FACET_TYPES]
    )

    for item in solr_response.results:
        if 'reference_image_md5' in item:
            item['reference_image_http'] = md5_to_http_url(item['reference_image_md5'])

    facets = {}
    for facet_type in FACET_TYPES:
        facets[facet_type[0]] = process_facets(
            solr_response.facet_counts['facet_fields'][facet_type[0]],
            filters[facet_type[0]]
        )

    # ``rows=0`` is a valid request; avoid dividing by zero when paging.
    rows_per_page = int(rows)
    if rows_per_page:
        pages = int(math.ceil(float(solr_response.numFound) / rows_per_page))
    else:
        pages = 0

    return render(request, 'public_interface/collectionResults.html', {
        'q': q,
        'filters': filters,
        'rows': rows,
        'start': start,
        'search_results': solr_response.results,
        'facets': facets,
        'FACET_TYPES': FACET_TYPES,
        'numFound': solr_response.numFound,
        'pages': pages,
        'view_format': view_format,
        'collection': collection_details
    })
예제 #14
0
 def __setstate__(self, state):
     """Restore the attributes from ``state`` and open a new Solr connection."""
     self._type_label = state['type_label']
     self._uuid = state['uuid']
     # These attributes are stored in ``state`` under their own names.
     same_named = ('type_field', 'uuid_field', 'vector_field',
                   'timestamp_field', 'commit_on_set')
     for attribute in same_named:
         setattr(self, attribute, state[attribute])
     # ``debug=True`` can be added here for verbose output.
     self.solr = solr.Solr(
         state['solr_url'],
         persistent=state['solr_persistent'],
         timeout=state['solr_timeout'],
     )
예제 #15
0
    def __setstate__(self, state):
        """Restore the attributes from ``state`` and open a new Solr connection."""
        # (attribute name, key in ``state``), restored in this order.
        restored = (
            ('uuid', 'uuid'),
            ('commit_on_add', 'commit_on_add'),
            ('max_boolean_clauses', 'max_boolean_clauses'),
            ('idx_uuid_field', 'field_uuid'),
            ('code_field', 'field_code'),
            ('d_uid_field', 'field_descr_uuid'),
            ('descriptor_field', 'field_descr_obj'),
            ('timestamp_field', 'field_timestamp'),
        )
        for attribute, key in restored:
            setattr(self, attribute, state[key])

        connection_options = {
            'persistent': state['solr_persistent'],
            'timeout': state['solr_timeout'],
        }
        self.solr = solr.Solr(state['solr_url'], **connection_options)
예제 #16
0
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    """
    Normalize every CouchDB document, save it back and push it to Solr.

    For each non-design document the ``collection`` values of
    ``originalRecord`` and ``sourceResource`` are wrapped in a list when they
    are not one already, and ``sourceResource.subject`` becomes a list of
    dicts (non-dict entries are wrapped as ``{'name': entry}``). The document
    is then saved to CouchDB and pushed to Solr; one commit is issued after
    all documents were processed.

    :param url_solr: URL of the Solr index.
    :param url_couchdb: CouchDB URL handed to ``get_couchdb``.
    :param couchdb_db: CouchDB database name handed to ``get_couchdb``.
    """
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' in doc_couch['_id']:
            continue

        # Wrap scalar collection values in a list. The progress line shows the
        # field that was just changed, under the label of its own section.
        for section, label in (('originalRecord', 'orgRec'),
                               ('sourceResource', 'srcRes')):
            try:
                collection = doc_couch[section]['collection']
            except KeyError:
                continue
            if not isinstance(collection, list):
                doc_couch[section]['collection'] = [collection]
                print("{}.Collection: {}".format(
                    label, doc_couch[section]['collection']))

        try:
            # NOTE(review): a document without a subject ends up with
            # [{'name': None}] — confirm that this is intended.
            subject = doc_couch['sourceResource'].get('subject', None)
            if not isinstance(subject, list):
                subject = [subject]
            doc_couch['sourceResource']['subject'] = [
                sub if isinstance(sub, dict) else {'name': sub}
                for sub in subject
            ]
        except KeyError:
            # No sourceResource section: nothing to normalize.
            pass

        db.save(doc_couch)
        try:
            push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                             solr_db=solr_db)
            print("PUSHED {} to solr".format(doc_couch['_id']))
        except TypeError:
            # NOTE(review): documents raising TypeError are skipped silently,
            # presumably on purpose — confirm.
            pass
    solr_db.commit()
예제 #17
0
 def __init__(self, limit=100000):
     """Load the data, build the KD-tree over it and flag the module as ready.

     ``_load`` is expected to fill ``self.data``, since ``self.data['loc']`` is
     read right after it returns. The elapsed time is kept in
     ``self.loadtime``.
     """
     global lock, ready
     started = time()
     self.limit = limit
     self.api = solr.Solr('http://localhost:8983/solr/samos')
     self.wktreg = re.compile(r'[-+]?\d*\.\d+|\d+')
     self.data = None
     loaded = self._load()
     self.size, self.total = loaded
     # Author's note: the python3 version of cKDTree does a periodic quick
     # sort, which severely hurts performance on sorted data, and solr
     # usually returns semi sorted data. compact_nodes=False barely affects
     # performance, so it is not passed.
     points = self.data['loc'][:self.total]
     self.tree = cKDTree(points, balanced_tree=False)
     self.loadtime = time() - started
     with lock:
         ready = True
예제 #18
0
def solr_image_paths(solr_addr, begin_time, end_time, username, password, batch_size):
    """Yield the ids of image documents indexed within a time range.

    The matching documents are counted with a first request and then fetched
    in pages of ``batch_size`` rows. Each yielded value is the document's
    ``id`` without its first five characters.
    """
    log = logging.getLogger(__name__)

    conn = solr.Solr(solr_addr, http_user=username, http_pass=password)
    q = 'mainType:image AND indexedAt:[%s TO %s]' % (begin_time, end_time)
    # rows=0: only the number of matching documents is needed here
    num_results = conn.select(q, fields=['id'], rows=0).numFound
    log.debug("Found: %d", num_results)

    # One extra iteration when the last page is only partially filled.
    full_batches, remainder = divmod(num_results, batch_size)
    loops = full_batches + (remainder > 0)
    log.debug("Making %d iterations", loops)

    for batch_no in xrange(loops):
        page = conn.select(q, fields=['id'], rows=batch_size,
                           start=batch_no * batch_size)
        for doc in page.results:
            yield doc['id'][5:]
예제 #19
0
def solr_connection(core):
    """
    Creates a :class:`solr:solr.Solr` connection for the core ``core``.

    The core's ping handler is requested first, so an unreachable core is
    reported before a connection object is handed out.

    :param str core: name of the core, appended to the configured Solr URI
    :raises urllib2.URLError: if a ping to the cores ping handler doesn't
                              succeed
    :rtype: :class:`solr:solr.Solr`
    """
    solr_uri = config.CFG.get("solr", "uri")
    core_uri = solr_uri + "/" + core
    ping_uri = core_uri + "/admin/ping"

    logger.info("Setting up a connection to %s", solr_uri)
    logger.debug("Pinging %s", ping_uri)
    # Only the success of the request matters; close the response right away
    # so its socket is not left open.
    urllib2.urlopen(ping_uri).close()

    logger.debug("Connection to the Solr core at %s", core_uri)
    return solr.Solr(core_uri)
 def delete_entry(self, file_id):
     """ Deletes the solr entry for `file_id`, commits, and records the result.
         Called by run_remove_index_file() """
     log.debug('file_id, `{}`'.format(file_id))
     s = solr.Solr(self.SOLR_URL)
     try:
         response = s.delete(file_id)
         s.commit()
     finally:
         # Release the connection even when the delete or the commit fails.
         s.close()
     log.debug(
         'deletion-post complete; response, ```{}```'.format(response))
     # The status detail is always stored as a string.
     status_str: str = response if isinstance(response, str) else repr(response)
     process_status_updater.update_single_status(
         inscription_id=file_id,
         status='deletion-processed',
         status_detail=status_str)
예제 #21
0
def Main():
    """Index every row of the ``t_talents`` table into the local Solr server.

    Each row becomes one Solr document that is added and committed on its
    own; a progress line is printed after every document.
    """
    s = solr.Solr("http://localhost:8080/solr")
    con = sqlite3.connect("d:/temp/test.db")
    cursor = con.cursor()
    cursor.execute("select * from t_talents")

    # (Solr field name, column index in the row), in document order.
    columns = (
        ('id', 0),
        ('talent_name', 1),
        ('talent_title', 2),
        ('talent_locality', 4),
        ('talent_industry', 5),
        ('talent_curposition', 6),
        ('talent_profile', 9),
    )
    for count, row in enumerate(cursor.fetchall(), 1):
        docn = {}
        for field, position in columns:
            docn[field] = row[position]
        s.add(docn, commit=True)
        print('done:' + str(count))
    con.close()
예제 #22
0
def collectionsExplore(request):
    """
    Render the "explore collections" page.

    The collection facet values are read from ``SOLR``; for each collection
    URL the details are fetched from that URL as JSON and a few items of the
    collection are selected for display.

    :param request: The incoming HTTP request.
    """
    collections_solr_query = SOLR.select(q='*:*', rows=0, start=0, facet='true', facet_field=['collection'], facet_limit='10')
    solr_collections = collections_solr_query.facet_counts['facet_fields']['collection']

    # Compiled once: the pattern is the same for every collection.
    collection_url_pattern = re.compile('https://registry.cdlib.org/api/v1/collection/([0-9]+)[/]?')

    collections = []
    for collection_url in solr_collections:
        collection_api = urllib2.urlopen(collection_url + "?format=json")
        try:
            collection_details = json.loads(collection_api.read())
        finally:
            # Do not leave one open connection behind per collection.
            collection_api.close()

        # One item less is requested when the collection has a description.
        rows = '4' if collection_details['description'] != '' else '5'
        display_items = SOLR.select(
            q='*:*',
            fields='reference_image_md5, title, id',
            rows=rows,
            start=0,
            fq=['collection: \"' + collection_url + '\"']
        )

        for item in display_items:
            if 'reference_image_md5' in item:
                item['reference_image_http'] = md5_to_http_url(item['reference_image_md5'])

        collection_id = collection_url_pattern.match(collection_url)

        collections.append({
            'name': collection_details['name'],
            'description': collection_details['description'],
            'slug': collection_details['slug'],
            'collection_id': collection_id.group(1),
            'display_items': display_items.results
        })

    return render(request, 'public_interface/collections-explore.html', {'collections': collections})
예제 #23
0
        return row.value


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Make csv report of indexed collections')
    parser.add_argument('auth_token', help='Authentication token')
    parser.add_argument('--solr_url', help='Solr index url')
    parser.add_argument('--couchdb_url', help='CouchDB url')
    args = parser.parse_args()
    solr_url = args.solr_url if args.solr_url else SOLR_URL
    print "SOLR_URL:{}".format(solr_url)
    SOLR = solr.SearchHandler(
        solr.Solr(
            solr_url,
            post_headers={
                'X-Authentication-Token': args.auth_token,
            },
        ), "/query")
    if args.couchdb_url:
        cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc')
    else:
        cdb = get_couchdb(dbname='ucldc')
    collections = get_indexed_collection_list(SOLR)
    date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M')
    fname = 'indexed_collections-{}.csv'.format(date_to_minute)
    with open(fname, 'wb') as csvfile:
        csvwriter = UnicodeWriter(csvfile)
        csvwriter.writerow(
            ('Collection Name', 'Collection URL', 'Number in index',
             'Number in couchdb', 'Number in OAC', 'Couch missing in solr',
             'OAC missing in couch', 'Repository Name', 'Repository URL',
예제 #24
0
from __future__ import print_function
import solr
import sys
import re

# Connection to the local Solr core; its "/mlt" handler is called by process().
con = solr.Solr('http://localhost:8983/solr/wikipedia_core')
mlt = solr.SearchHandler(con, "/mlt")

# Minimum ratio of a hit's score to the response's maxScore for the hit to be
# reported by process().
THRESHOLD = 0.9


def process(line):
    """Return the comma-joined page names of the best "/mlt" hits for ``line``.

    ``line`` is reduced to ASCII first and sent as the stream body of the
    request. A hit is kept when its score divided by the response's maximum
    score reaches ``THRESHOLD``. The joined names are returned UTF-8 encoded.
    """
    substitutions = (
        # replace some known utf-8 chars with ascii
        ("\xe2\x80\x99", "x"),  # U+2019 (right single quotation mark)
        ("\xe2\x80\x93", "-"),  # U+2013 (EN-DASH)
        # remove the rest of the non-ascii chars
        (r'[^\x00-\x7F]+', ' '),
    )
    for pattern, replacement in substitutions:
        line = re.sub(pattern, replacement, line)

    r = mlt(mlt_fl='page_text', mlt_mindf=1, mlt_mintf=1, stream_body=line)
    page_names = []
    for d in r.results:
        if d['score'] / r.maxScore >= THRESHOLD:
            page_names.append(d['page_name'])
    return ",".join(page_names).encode('utf-8')


def main():
    try:
        for line in sys.stdin:
            print(process(line.strip()))
    except:
예제 #25
0
#! /usr/bin/env python
import sys
import os
import iso8601
import json
import solr
import urllib
import urllib2

from utils import solr_escape, read_blob, get_accounts_for_blob, get_filters

import logging
# Log at DEBUG level and use a logger named after this module.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Connection to the "messages" core of the local Solr server.
solrMessages = solr.Solr('http://localhost:8983/solr/messages')

# NOTE(review): presumably the number of rows requested per Solr query —
# confirm against its uses.
ROWS = 100

# NOTE(review): presumably the label that marks a message as being in the
# inbox — confirm against its uses.
INBOX_LABEL = "inbox"


def run_filters(account_hash, message_id, query_label_pairs):

    labels = set()
    skip_inbox = False

    for query, label, _skip_inbox in query_label_pairs:

        caged_query = '+id:%s +(%s) +account:%s -labels:"%s"' % (
            message_id, query, account_hash, solr_escape(label))
예제 #26
0
def search(keyword, value):
    """Select ``keyword:value`` from the local "minor" core and print the hits.

    The response object is printed first, then every result on its own line.
    """
    s = solr.Solr("http://localhost:8983/solr/minor")
    query = keyword + ':' + value
    response = s.select(query)
    print(response)
    for hit in response.results:
        print(hit)
예제 #27
0
 def __init__(self):
     """Read the field mapping and open the Solr connection from ``settings``."""
     field_mapping = settings['SOLR_MAPPING']
     self.mapping = field_mapping.items()
     solr_url = settings['SOLR_URL']
     self.solr = solr.Solr(solr_url)
예제 #28
0
 def connection(self):
     """Return the stored connection, or a new ``solr.Solr`` for this core.

     A new connection is built from ``settings.SOLR_HOST`` and ``self.core``
     when ``self._connection`` is not set; it is not stored on the instance.
     """
     if not self._connection:
         return solr.Solr(settings.SOLR_HOST + '/' + self.core)
     return self._connection
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Solr utilities.
"""

import itertools

from invenio.config import CFG_SOLR_URL
from intbitset import intbitset
from invenio.ext.logging import register_exception

if CFG_SOLR_URL:
    # Solr is optional: the client library is imported and the connections
    # are created only when a Solr URL is configured.
    import solr
    conn = solr.Solr(CFG_SOLR_URL)
    SOLR_CONNECTION = solr.SolrConnection(CFG_SOLR_URL)  # pylint: disable=E1101
    # Search handler bound to the "/mlt" path of the same server.
    SOLR_MLT_CONNECTION = solr.SearchHandler(conn, "/mlt")

# NOTE(review): presumably maps query operator symbols to the boolean keywords
# used in Solr queries — confirm against its uses.
BOOLEAN_EQUIVALENTS = {"+": "AND", "|": "OR", "-": "NOT"}


def get_collection_filter(hitset, cutoff_amount):
    # The last n hitset records are considered to be the newest and therefore the most relevant
    start_index = len(hitset) - cutoff_amount
    if start_index < 0:
        start_index = 0
    it = itertools.islice(hitset, start_index, None)
    ids = ' '.join([str(recid) for recid in it])

    if ids:
예제 #30
0
# coding:utf-8
import solr
import sys
# Python 2 only (``reload`` builtin and ``print`` statements): set the
# process-wide default string encoding to UTF-8 and show the result.
reload(sys)
sys.setdefaultencoding('utf-8')
print sys.getdefaultencoding()

# create a connection to a solr server
conn = solr.Solr('http://localhost:8080/solr', timeout=1000)

# add a document to the index
tdoc = {"id": 3, "title": "Lucene in Action"}

# show every field of the document before it is sent
for k in tdoc:
    print "dict[%s] =" % k, tdoc[k]

# send the document and commit so that it can be found by the search below
conn.add(tdoc)
conn.commit()

# do a search
response = conn.select('Lucene')
# print every hit of the response
for hit in response.results:

    print hit