def solr_connect(self):
    """ Connects to solr """
    if self.solr is not None:
        # We are already connected, so skip connecting.
        return None
    if configs.USE_TEST_SOLR_CONNECTION:
        # Connect to the testing solr server
        self.solr = SolrConnection(
            exit_on_error=False,
            solr_host=settings.SOLR_HOST_TEST,
            solr_port=settings.SOLR_PORT_TEST,
            solr_collection=settings.SOLR_COLLECTION_TEST
        ).connection
    else:
        # Connect to the default solr server
        self.solr = SolrConnection(False).connection
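# A minimal sketch (values hypothetical) of the Django settings and configs
# names that solr_connect() above assumes exist. Only the names come from the
# code; the values below are placeholders, not the project's real settings.
#
#   # settings.py
#   SOLR_HOST_TEST = 'localhost'
#   SOLR_PORT_TEST = 8983
#   SOLR_COLLECTION_TEST = 'open-context-test'
#
#   # configs module
#   USE_TEST_SOLR_CONNECTION = True  # route indexing to the test server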
class Crawler():
    '''
    The Open Context Crawler indexes Open Context items and makes them
    searchable in Apache Solr.
    '''
    def __init__(self):
        '''
        To use, import this library and instantiate a crawler object:

        from opencontext_py.apps.indexer.crawler import Crawler
        crawler = Crawler()

        # Then crawl as follows:
        crawler.crawl(100)

        Crawling a single document is also supported with the
        index_single_document method. Just provide the document's UUID.
        For example:

        crawler.index_single_document('9E474B89-E36B-4B9D-2D38-7C7CCBDBB030')
        '''
        # The list of Open Context items to crawl
        self.uuidlist = UUIDList().uuids
        # Connect to Solr
        self.solr = SolrConnection().connection
        # Flag as human remains sensitive
        self.human_remains = 0  # 0 means no, positive values mean yes
        self.max_geo_zoom = 30  # the max geo tile zoom to use for indexing

    def crawl(self, chunksize=100):
        '''
        For efficiency, this method processes documents in "chunks." The
        default chunk size is 100, but one can specify other values. For
        example, to specify a chunksize of 500, use this method as follows:

        crawler = Crawler()
        crawler.crawl(500)
        '''
        # Get a logger
        logger = logging.getLogger(__name__)
        start_time = time.time()
        print('\n\nStarting crawl...\n')
        print("(#)\tUUID")
        print('--------------------------------------------')
        document_count = 0
        while self.uuidlist is not None:
            documents = []
            ok_manifests = []
            # Process the UUID list in chunks
            for uuid in islice(self.uuidlist, 0, chunksize):
                try:
                    sd_obj = SolrDocument(uuid)
                    if isinstance(self.max_geo_zoom, int) and self.max_geo_zoom > 5:
                        # only positive integers
                        sd_obj.max_geo_zoom = self.max_geo_zoom
                    sd_obj.process_item()
                    solrdocument = sd_obj.fields
                    if crawlutil().is_valid_document(solrdocument):
                        try:
                            manifest = Manifest.objects.get(uuid=uuid)
                            ok_manifests.append(manifest)
                            # manifest.indexed_save()  # saves the time this was indexed
                        except Manifest.DoesNotExist:
                            print('Error: Missing manifest record -----> ' + uuid)
                            logger.error(
                                '[' + datetime.now().strftime('%x %X ')
                                + settings.TIME_ZONE
                                + '] Error: Missing manifest record for => '
                                + uuid
                            )
                        documents.append(solrdocument)
                        document_count += 1
                        print("(" + str(document_count) + ")\t" + uuid)
                    else:
                        print('Error: Skipping document due to a datatype '
                              'mismatch -----> ' + uuid)
                except Exception as error:
                    print("Error: {0}".format(error) + " -----> " + uuid)
                    logger.error(
                        '[' + datetime.now().strftime('%x %X ')
                        + settings.TIME_ZONE
                        + '] Error: ' + str(error) + ' => ' + uuid
                    )
            # Send the documents to Solr, saving the response so we can check
            # the status code (e.g., 200, 400, etc.) without re-sending.
            response = self.solr.update(documents, 'json', commit=False)
            if response.status == 200:
                self.solr.commit()
                print('--------------------------------------------')
                print('Crawl Rate: '
                      + crawlutil().get_crawl_rate_in_seconds(
                          document_count, start_time)
                      + ' documents per second')
                print('Updating indexed time for '
                      + str(len(ok_manifests)) + ' manifest objects.')
                for manifest in ok_manifests:
                    manifest.indexed_save()  # saves the time this was indexed
                print('--------------------------------------------')
            else:
                print('Error: ' + str(response.raw_content['error']['msg']))
        # Once the crawl has completed...
        self.solr.optimize()
        print('\n--------------------------------------------')
        print('Crawl completed')
        print('--------------------------------------------\n')

    def index_document_list(self, uuid_list, chunksize=20, stop_at_invalid=True):
        """ Indexes a list of uuids. The list is generated elsewhere.
""" if isinstance(uuid_list, list): document_count = 0 documents = [] total_count = len(uuid_list) i = 0; for uuid in uuid_list: try: manifest = Manifest.objects.get(uuid=uuid) except Manifest.DoesNotExist: print('Where is ' + uuid + ' in the manifest?') manifest = False if manifest is not False: try: sd_obj = SolrDocument(uuid) if isinstance(self.max_geo_zoom, int): if self.max_geo_zoom > 5: # only positive integers sd_obj.max_geo_zoom = self.max_geo_zoom sd_obj.process_item() if isinstance(self.human_remains, int): if self.human_remains > 0: sd_obj.fields['human_remains'] = self.human_remains solrdocument = sd_obj.fields if crawlutil().is_valid_document(solrdocument): if solrdocument is not None: i += 1 print('OK to index: ' + uuid) documents.append(solrdocument) manifest.indexed_save() # saves the time this was indexed else: print('Something wrong with: ' + uuid) if stop_at_invalid: break else: print('Not valid: ' + uuid) if stop_at_invalid: break except Exception as error: print("Error: {0}".format(error) + " -----> " + uuid) if stop_at_invalid: break if len(documents) >= chunksize: ok = self.commit_documents(documents, i, total_count) if ok is False and stop_at_invalid: # a problem in committing the documents break documents = [] # now finish off the remaining documents if len(documents) > 0: ok = self.commit_documents(documents, i, total_count) def commit_documents(self, documents, last_index=False, total_docs=False): """ commits a set of documents to the Solr index """ ok = False solr_status = self.solr.update(documents, 'json', commit=False).status if solr_status == 200: last_message = '' if last_index is not False and total_docs is not False: last_message = '(' + str(last_index) + ' of ' + str(total_docs) + ')' ok = True self.solr.commit() print('--------------------------------------------') print('Committed : ' + str(len(documents)) + ' docs. ' + last_message) print('--------------------------------------------') else: print('Error: ' + \ str(self.solr.update( documents, 'json', commit=False ))) return ok def index_single_document(self, uuid): ''' Use this method to crawl a single document. Provide the item's UUID as an argument. For example: crawler = Crawler() crawler.index_single_document('9E474B89-E36B-4B9D-2D38-7C7CCBDBB030') ''' print('\nAttempting to index document ' + uuid + '...\n') start_time = time.time() try: sd_obj = SolrDocument(uuid) sd_obj.process_item() solrdocument = sd_obj.fields if crawlutil().is_valid_document(solrdocument): # Commit the document and save the response status. # Note: solr.update() expects a list solr_status = self.solr.update( [solrdocument], 'json', commit=True).status if solr_status == 200: print('Successfully indexed ' + uuid + ' in ' + crawlutil().get_elapsed_time_in_seconds(start_time) + ' seconds.') else: print('Error: ' + str(self.solr.update( [solrdocument], 'json', commit=True ).raw_content['error']['msg']) ) else: print('Error: Unable to index ' + uuid + ' due to ' 'a datatype mismatch.') except TypeError as error: print("Type Error {0}".format(error) + ": Unable to process document " + uuid + '.') except Exception as error: print("Error: {0}".format(error) + " -----> " + uuid)
def stats_ranges_query_dict_via_solr(
    stats_query,
    default_group_size=20,
    solr=None,
    return_pre_query_response=False
):
    """ Makes a stats range facet query dict by processing a solr query """
    if not solr:
        # Connect to solr.
        if configs.USE_TEST_SOLR_CONNECTION:
            # Connect to the testing solr server
            solr = SolrConnection(
                exit_on_error=False,
                solr_host=settings.SOLR_HOST_TEST,
                solr_port=settings.SOLR_PORT_TEST,
                solr_collection=settings.SOLR_COLLECTION_TEST
            ).connection
        else:
            # Connect to the default solr server
            solr = SolrConnection(False).connection
    response = solr.search(**stats_query)  # execute solr query
    solr_json = response.raw_content
    if not isinstance(solr_json, dict):
        return None
    if 'stats' not in solr_json:
        return None
    if 'stats_fields' not in solr_json['stats']:
        return None
    query_dict = {}
    if return_pre_query_response:
        # This is for testing purposes.
        query_dict['pre-query-response'] = solr_json
    query_dict['facet.range'] = []
    query_dict['stats.field'] = []
    for solr_field_key, stats in solr_json['stats']['stats_fields'].items():
        group_size = default_group_size
        if not stats or not stats.get('count'):
            continue
        if solr_field_key not in query_dict['facet.range']:
            query_dict['facet.range'].append(solr_field_key)
        if solr_field_key not in query_dict['stats.field']:
            query_dict['stats.field'].append(solr_field_key)
        fstart = 'f.{}.facet.range.start'.format(solr_field_key)
        fend = 'f.{}.facet.range.end'.format(solr_field_key)
        fgap = 'f.{}.facet.range.gap'.format(solr_field_key)
        findex = 'f.{}.facet.range.sort'.format(solr_field_key)
        fother = 'f.{}.facet.range.other'.format(solr_field_key)
        finclude = 'f.{}.facet.range.include'.format(solr_field_key)
        query_dict[fother] = 'all'
        query_dict[finclude] = 'all'
        query_dict[findex] = 'index'  # sort by index, not by count
        if (stats['count'] / group_size) < 3:
            group_size = 4
        if solr_field_key.endswith('___pred_date'):
            query_dict[fstart] = utilities.convert_date_to_solr_date(
                stats['min']
            )
            query_dict[fend] = utilities.convert_date_to_solr_date(
                stats['max']
            )
            query_dict[fgap] = utilities.get_date_difference_for_solr(
                stats['min'], stats['max'], group_size
            )
        elif solr_field_key.endswith('___pred_int'):
            query_dict[fstart] = int(round(stats['min'], 0))
            query_dict[fend] = int(round(stats['max'], 0))
            query_dict[fgap] = int(
                round(((stats['max'] - stats['min']) / group_size), 0)
            )
            if query_dict[fgap] > stats['mean']:
                query_dict[fgap] = int(round((stats['mean'] / 3), 0))
            if query_dict[fgap] < 1:
                query_dict[fgap] = 1
        else:
            query_dict[fstart] = stats['min']
            query_dict[fend] = stats['max']
            query_dict[fgap] = ((stats['max'] - stats['min']) / group_size)
            if query_dict[fgap] > stats['mean']:
                query_dict[fgap] = stats['mean'] / 3
            if query_dict[fgap] == 0:
                query_dict[fgap] = 0.001
    return query_dict
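# A minimal usage sketch for stats_ranges_query_dict_via_solr(), with a
# hypothetical numeric field name. A rows=0 stats query first gathers the
# min/max for each field; the returned facet.range parameters can then be
# merged into the main search query.
#
#   stats_query = {
#       'q': '*:*',
#       'fq': [],
#       'rows': 0,
#       'stats': 'true',
#       'stats.field': ['24_object_count___pred_int'],  # hypothetical field
#   }
#   range_params = stats_ranges_query_dict_via_solr(stats_query)
#   if range_params:
#       main_query.update(range_params)  # main_query composed elsewhere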
class StatsQuery():
    """ Methods to get stats information for 1 or more fields from Solr.
        This is useful in composing queries for numeric range facets,
        where we don't know the min or max of the filtered set.
    """

    def __init__(self):
        self.solr = False
        self.solr_connect()
        self.solr_response = False
        self.stats_fields = []
        self.q = '*:*'  # main solr query
        self.q_op = 'AND'  # default operator for q terms
        self.fq = []  # filter query

    def solr_connect(self):
        """ Connects to solr """
        self.solr = SolrConnection(False).connection

    def add_stats_ranges_from_solr(self, query):
        """ Searches solr for stats on the fields in self.stats_fields,
            then adds range facet parameters for those fields to the
            given query.
        """
        stats_query = self.compose_query()  # make the stats query
        response = self.solr.search(**stats_query)  # execute solr query
        solr_json = response.raw_content
        if not isinstance(solr_json, dict):
            return query
        if 'stats' not in solr_json:
            return query
        if 'stats_fields' not in solr_json['stats']:
            return query
        qm = QueryMaker()
        for solr_field_key, stats in solr_json['stats']['stats_fields'].items():
            if stats is None:
                continue
            groups = qm.histogram_groups  # reset the group count per field
            if solr_field_key not in query['facet.range']:
                query['facet.range'].append(solr_field_key)
            if solr_field_key not in query['stats.field']:
                query['stats.field'].append(solr_field_key)
            fstart = 'f.' + solr_field_key + '.facet.range.start'
            fend = 'f.' + solr_field_key + '.facet.range.end'
            fgap = 'f.' + solr_field_key + '.facet.range.gap'
            findex = 'f.' + solr_field_key + '.facet.sort'
            fother = 'f.' + solr_field_key + '.facet.range.other'
            finclude = 'f.' + solr_field_key + '.facet.range.include'
            query[fother] = 'all'
            query[finclude] = 'all'
            if 'count' in stats:
                if (stats['count'] / qm.histogram_groups) < 3:
                    groups = 4
            if '___pred_date' in solr_field_key:
                query[fstart] = qm.convert_date_to_solr_date(stats['min'])
                query[fend] = qm.convert_date_to_solr_date(stats['max'])
                query[fgap] = qm.get_date_difference_for_solr(
                    stats['min'], stats['max'], groups)
                query[findex] = 'index'  # sort by index, not by count
            else:
                query[fstart] = stats['min']
                query[fend] = stats['max']
                query[fgap] = ((stats['max'] - stats['min']) / groups)
                if query[fgap] > stats['mean']:
                    query[fgap] = stats['mean'] / 3
                query[findex] = 'index'  # sort by index, not by count
        return query

    def compose_query(self):
        """ Composes a stats query using attributes of this class """
        query = {}
        query['debugQuery'] = 'false'
        query['stats'] = 'true'
        query['rows'] = 0
        query['q'] = self.q
        query['fq'] = self.fq
        query['stats.field'] = self.stats_fields
        return query
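# A minimal usage sketch for StatsQuery, again with hypothetical field and
# filter values. Note that add_stats_ranges_from_solr() expects the query
# dict to already contain 'facet.range' and 'stats.field' lists.
#
#   stats_q = StatsQuery()
#   stats_q.fq = ['item_type:subjects']  # hypothetical filter
#   stats_q.stats_fields = ['24_object_count___pred_int']
#   query = {'facet.range': [], 'stats.field': []}
#   query = stats_q.add_stats_ranges_from_solr(query)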
class ProjectsQuery():
    """ Methods to get projects from solr, to see if we need to have
        project facet fields. If we have 1 project, then we need to show
        facet fields for project-specific descriptive properties.
    """

    def __init__(self):
        self.solr = False
        self.solr_connect()

    def solr_connect(self):
        """ Connects to solr """
        self.solr = SolrConnection(False).connection

    def check_single_project(self, query):
        """ Checks to see if the query results in only a single project.
            If it does, then we need to show facet fields for
            project-specific descriptive properties.
        """
        single_project = False
        projs_query = self.compose_query(query)  # make the projects query
        response = self.solr.search(**projs_query)  # execute solr query
        solr_json = response.raw_content
        if isinstance(solr_json, dict):
            if 'facet_counts' in solr_json:
                if 'facet_fields' in solr_json['facet_counts']:
                    ff_dict = solr_json['facet_counts']['facet_fields']
                    if SolrDocument.ROOT_PROJECT_SOLR in ff_dict:
                        proj_list = ff_dict[SolrDocument.ROOT_PROJECT_SOLR]
                        num_projects = 0
                        last_proj_val = None
                        for proj_val in proj_list:
                            if isinstance(proj_val, str) and '___' in proj_val:
                                last_proj_val = proj_val
                                num_projects += 1
                        if num_projects == 1:
                            # We have 1 project. Check to make sure it is not
                            # the parent of a daughter project.
                            proj_ex = last_proj_val.split('___')
                            if len(proj_ex) > 3:
                                # Get a uuid from a value like:
                                # 22-kenan-tepe___id___/projects/3DE4CD9C-259E-4C14-9B03-8B10454BA66E___Kenan Tepe
                                p_uuid = proj_ex[2].replace('/projects/', '')
                                ch_projs = Project.objects\
                                    .filter(project_uuid=p_uuid)\
                                    .exclude(uuid=p_uuid)[:1]
                                if len(ch_projs) < 1:
                                    # This project has no child projects, so it
                                    # is OK to consider it a single project.
                                    single_project = True
        return single_project

    def compose_query(self, old_query):
        """ Composes a query to get a summary of the projects that will be
            shown for an old_query.
        """
        query = {}
        if 'q' in old_query:
            query['q'] = old_query['q']
        if 'q.op' in old_query:
            query['q.op'] = old_query['q.op']
        if 'fq' in old_query:
            query['fq'] = old_query['fq']
        query['debugQuery'] = 'false'
        query['facet'] = 'true'
        query['facet.mincount'] = 1
        query['rows'] = 0
        query['start'] = 0
        query['facet.field'] = [SolrDocument.ROOT_PROJECT_SOLR]
        return query
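# A minimal usage sketch for ProjectsQuery: pass the current search query
# (the dict below is a placeholder; real filters are composed elsewhere)
# and branch on whether only one project remains in the filtered set.
#
#   projs = ProjectsQuery()
#   current_query = {'q': '*:*', 'fq': []}  # filters composed elsewhere
#   if projs.check_single_project(current_query):
#       # add project-specific descriptive facet fields to the main query
#       pass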
def __init__(self):
    # Connect to Solr
    self.solr = SolrConnection().connection
    self.request_error = False