def ssearch_all_count():
    try:
        solr = sunburnt.SolrInterface(settings.SOLR['host'])
        response = solr.query(**{'*': '*'}).field_limit("id").execute()
    except socket.error:
        return {'count': 0}
    return {'count': response.result.numFound}
def statictics():
    solr_connection = httplib2.Http(disable_ssl_certificate_validation=True)
    solr = sunburnt.SolrInterface(settings.SOLR['host'], http_connection=solr_connection)
    facet_fields = ['fond_sf']
    qkwargs = {'*': '*'}
    solr_searcher = solr.query(**qkwargs).paginate(start=0, rows=0)
    exclude_kwargs = {'system-catalog_s': u"1"}
    solr_searcher = solr_searcher.exclude(**exclude_kwargs)
    solr_searcher = solr_searcher.facet_by(field=facet_fields, limit=30, mincount=1)
    solr_searcher = solr_searcher.field_limit("id")
    response = solr_searcher.execute()

    collections = {}
    for key in response.facet_counts.facet_fields.keys():
        for val in response.facet_counts.facet_fields[key]:
            collections[val[0]] = val[1]

    stats = {
        'collections': collections,
        'count_all': 0,
        'count_last_month': 0,
    }

    now = datetime.datetime.now()
    before_30_now = now - datetime.timedelta(30)

    count_all = Record.objects.using('records').filter(
        source_id='2').exclude(deleted=True).count()
    count_last_month = Record.objects.using('records').filter(
        add_date__year=now.year, add_date__month=now.month,
        source_id='2').exclude(deleted=True).count()
    count_last_30 = Record.objects.using('records').filter(
        add_date__gte=before_30_now, add_date__lte=now,
        source_id='2').exclude(deleted=True).count()

    stats['count_all'] = count_all
    stats['count_last_month'] = count_last_month
    stats['count_last_30'] = count_last_30
    return stats
def upload(self, network_name=None, network_file=None, annotations_file=None,
           type="public", email="*****@*****.**"):
    #cherrypy.log("Privacy: " + type)
    #cherrypy.log("annotations: " + repr(network_file))
    #cherrypy.log("network: " + repr(annotations_file))
    if cherrypy.request.method == "GET":
        tmpl = lookup.get_template("upload_form.mako")
        return tmpl.render()
    else:
        network_name = pinv.clean(network_name)
        solr_url = '%s/solr/%s' % (settings.SOLR_SERVER, network_name)
        time.sleep(2)
        inv = InteractionNetwork(network_name, "description text",
                                 network_file.file, annotations_file.file)
        time.sleep(2)
        message, result = pinv.create_new_solr(network_name)
        time.sleep(2)
        cherrypy.log("*** UPLOAD. Connecting to: " + solr_url)
        si = sunburnt.SolrInterface(solr_url)
        time.sleep(2)
        inv.upload(si)
        inv.createClusters(si, 3, 7)
        cherrypy.log(str(inv.errors))

        view_key, delete_key = auth.save_key(network_name, email, type)
        if type == "private":
            view_url = "http://biosual.cbio.uct.ac.za/pinViewer.html?core=%(core)s&key=%(key)s" % {
                'core': network_name, 'key': view_key}
            delete_url = "http://biosual.cbio.uct.ac.za/solr/admin/cores?action=UNLOAD&deleteIndex=true&core=%(core)s&key=%(key)s" % {
                'core': network_name, 'key': delete_key}
        else:
            view_url = "http://biosual.cbio.uct.ac.za/pinViewer.html?core=%(core)s" % {
                'core': network_name}
            delete_url = "http://biosual.cbio.uct.ac.za/solr/admin/cores?action=UNLOAD&deleteIndex=true&core=%(core)s&key=%(key)s" % {
                'core': network_name, 'key': delete_key}

        msg = auth.sendmail(email, view_url, delete_url, network_name)
        #cherrypy.log(msg)
        errormessage = "<br/>".join(inv.errors)
        tmpl = lookup.get_template("upload_result.mako")
        return tmpl.render(network_name=network_name,
                           annotation_head="|".join(inv.ahead),
                           annotation_count=len(inv.annotations),
                           annotation_file=annotations_file.filename,
                           network_head="|".join(inv.nhead),
                           network_count=len(inv.network),
                           network_file=network_file.filename,
                           message="",
                           errors=errormessage)
def connect(self):
    # noinspection PyUnusedLocal
    try:
        return solr.SolrInterface(self.host, mode="rw", retry_timeout=self.retry)
    except Exception as e:
        _logger.exception(u"Could not connect to [%s]", self.host)
        return None
def __init__(self):
    self.connection = Connection('localhost', 27017)
    self.db = self.connection.nSquared
    self.COLLECTION = 'thumbs'
    self.r = redis.StrictRedis(host='localhost', port=6379, db=0)
    self.SOLR_URL = 'http://10.10.10.31:8443/solr/'
    self.solr = sunburnt.SolrInterface(self.SOLR_URL)
    self.PAGE_LENGTH = 1000
    self.connection, self.cursor = connect_mysql()
def solr_interface(solr_url=None):
    if not solr_url:
        solr_url = settings.SOLR_SERVER_URL
    http_opts = {}
    if hasattr(settings, 'SOLR_CA_CERT_PATH'):
        http_opts['ca_certs'] = settings.SOLR_CA_CERT_PATH
    if getattr(settings, 'SOLR_DISABLE_CERT_CHECK', False):
        http_opts['disable_ssl_certificate_validation'] = True
    http = httplib2.Http(**http_opts)
    solr = sunburnt.SolrInterface(solr_url, http_connection=http)
    return solr
def participant_income(request):
    sigla = request.GET.get('sigla', None)
    solr_connection = httplib2.Http(disable_ssl_certificate_validation=True)
    solr = sunburnt.SolrInterface(settings.SOLR['local_records_host'],
                                  http_connection=solr_connection)

    if sigla:
        query = solr.Q(**{'holder-sigla_s': sigla})
    else:
        query = solr.Q(**{'*': '*'})

    solr_searcher = solr.query(query)
    solr_searcher = solr_searcher.field_limit("id")
    solr_searcher = solr_searcher.sort_by('-record-create-date_dts')

    paginator = Paginator(solr_searcher, 20)  # Show 20 results per page

    page = request.GET.get('page')
    try:
        results_page = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        results_page = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        results_page = paginator.page(paginator.num_pages)

    docs = []
    for row in results_page.object_list:
        docs.append(replace_doc_attrs(row))

    doc_ids = []
    for doc in docs:
        doc_ids.append(doc['id'])

    records_dict = {}
    records = list(
        Record.objects.using('local_records').filter(gen_id__in=doc_ids))
    for record in records:
        records_dict[record.gen_id] = etree.tostring(
            xslt_bib_draw_transformer(etree.XML(record.content),
                                      abstract='false()'),
            encoding='utf-8')

    for doc in docs:
        doc['record'] = records_dict.get(doc['id'])

    return render(request, 'ssearch/frontend/income.html', {
        'results_page': results_page,
        'docs': docs
    })
def publishMeta(mdList):
    """Establish the Solr instance, add the metadata, and commit it."""
    try:
        # Instantiate the interface to the Solr instance
        si = sunburnt.SolrInterface("http://%s:%s/solr/%s/" %
                                    (solrServer, solrPort, solrInstance))
        # Add the XML metadata to the instance
        si.add(mdList)
    except:
        raise
    finally:
        # Commit/Save the metadata
        si.commit()
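# A minimal usage sketch for publishMeta(), not part of the original source.
# It assumes solrServer, solrPort and solrInstance are defined at module level
# and that the target schema accepts "id" and "title" fields; those field names
# and values are illustrative assumptions only.
if __name__ == "__main__":
    sample_docs = [
        {"id": "doc-1", "title": "First metadata record"},
        {"id": "doc-2", "title": "Second metadata record"},
    ]
    # sunburnt's add() accepts a list of dict documents, as in the other snippets here
    publishMeta(sample_docs)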
def indexes(self, *fieldnames):
    try:
        import sunburnt
    except ImportError:
        raise ImportError("Cannot find sunburnt, it is necessary to access Solr")
    self.fieldnames = fieldnames
    if not os.path.exists(self.schema_filename):
        schema = '<fields><field name="id" type="int" indexed="true" stored="true" required="true" />%s</fields>' \
            % ''.join('<field name="%s" type="string" />' % name for name in fieldnames)
        open(self.schema_filename, 'w').write(schema)
    try:
        self.interface = sunburnt.SolrInterface(self.url, self.schema_filename)
    except:
        raise RuntimeError("Cannot connect to Solr: %s" % self.url)
def index_email_address():
    email_loaded_nodes = pickle.load(open(SETTINGS.email_object_file, "rb"))
    sEmail = sunburnt.SolrInterface("http://localhost:8983/solr/emailAddress/")
    docs = []
    for key, value in email_loaded_nodes.iteritems():
        doc = {
            "nodeId": key,
            "address": value._address,
            "fullyObserved": value._fully_observed
        }
        docs.append(doc)
    sEmail.add(docs)
    sEmail.commit()
def index_person():
    person_loaded_nodes = pickle.load(open(SETTINGS.person_object_file, "rb"))
    sPerson = sunburnt.SolrInterface("http://localhost:8983/solr/person/")
    docs = []
    for key, value in person_loaded_nodes.iteritems():
        doc = {
            "nodeId": key,
            "lastname": value._last_name,
            "firstname": value._first_name,
            "provenance": value._provenance
        }
        docs.append(doc)
    sPerson.add(docs)
    sPerson.commit()
def participant_income(sigla):
    solr_connection = httplib2.Http(disable_ssl_certificate_validation=True)
    solr = sunburnt.SolrInterface(settings.SOLR['local_records_host'],
                                  http_connection=solr_connection)

    if sigla:
        query = solr.Q(**{'holder-sigla_s': sigla})
    else:
        query = solr.Q(**{'*': '*'})

    solr_searcher = solr.query(query)
    solr_searcher = solr_searcher.field_limit(['id', 'record-create-date_dts'])
    solr_searcher = solr_searcher.sort_by('-record-create-date_dts')

    paginator = Paginator(solr_searcher, 10)  # Show 10 results per page

    # Always deliver the first page.
    results_page = paginator.page(1)

    docs = []
    for row in results_page.object_list:
        docs.append(replace_doc_attrs(row))

    doc_ids = []
    for doc in docs:
        doc_ids.append(doc['id'])

    records_dict = {}
    records = list(
        Record.objects.using('local_records').filter(gen_id__in=doc_ids))
    for record in records:
        records_dict[record.gen_id] = rusmarc_template.beautify(
            etree.tostring(xslt_bib_draw_transformer(etree.XML(record.content),
                                                     abstract='false()'),
                           encoding='utf-8'))

    for doc in docs:
        doc['record'] = records_dict.get(doc['id'])

    return {
        # 'results_page': results_page,
        'docs': docs,
        'sigla': sigla
    }
def document_delete_index(document, user_id=None):
    import sunburnt

    document = json.loads(document)
    table = s3db.doc_document
    id = document["id"]
    filename = document["filename"]

    si = sunburnt.SolrInterface(settings.get_base_solr_url())

    # Delete and commit the indices of the deleted document
    si.delete(id)
    si.commit()

    # After removing the index, set has_been_indexed value to False in the database
    db(table.id == id).update(has_been_indexed=False)
    db.commit()
def adapter_connect(host, mode, retry_timeout, raise_ex=False):
    """ Connect to backend. """
    if host is None:
        utils.exit(
            utils.ENVIRONMENT_ENTRY_NOT_FOUND,
            'settings["backend_host"] missing - please, specify it in settings.py.'
        )
    try:
        _logger.info("Connecting to indexer [%s, %s].", host, mode)
        return solr.SolrInterface(host, mode=mode, retry_timeout=retry_timeout)
    except:
        _logger.exception("Connecting to indexer failed.")
        if raise_ex:
            raise
    return None
def categorize(schema, text, n_categories=5, n_terms=30,
               server='http://localhost:8983/solr', terms=False):
    """Categorize a piece of text using a MoreLikeThis query on Solr.

    This is basically an approximated k-Nearest Neighbors using the
    TF-IDF similarity of the Solr index. The query is truncated to the
    top n_terms terms with maximum weights for efficiency reasons.
    """
    solr = sunburnt.SolrInterface(server, schema)
    interestingTerms = 'list' if terms else 'none'
    q = solr.mlt_query("text", content=text, maxqt=n_terms,
                       interestingTerms=interestingTerms)
    q = q.paginate(rows=n_categories)
    q = q.field_limit(score=True, all_fields=True)
    return q.execute()
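# A minimal usage sketch for categorize(), not part of the original source.
# The schema path, query text and result field names below are illustrative
# assumptions, not values taken from the snippet above.
if __name__ == '__main__':
    matches = categorize('schema.xml',
                         u"open source search engines built on Lucene",
                         n_categories=3)
    for match in matches:
        # the executed response is iterable and yields dict-like documents;
        # 'score' is present because field_limit(score=True) was requested
        print("%s %s" % (match.get('id'), match.get('score')))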
def upload(self, network_name, network_file, annotations_file):
    out = """<html>
    <body>
    Network name: %s<br/><br/>
    Annotations columns: %s<br/>
    Annotations count: %i<br />
    Annotations filename: %s<br />
    Network head: %s<br />
    Network length: %i<br />
    Network filename: %s<br />
    %s <br/>
    Errors:<br/>
    %s
    </body>
    </html>"""

    network_name = pinv.clean(network_name)
    if settings.TEST and not (network_file.file and annotations_file.file):
        network_file.file = open("../data/test.txt")
        annotations_file.file = open("../data/test_annot.txt")

    solr_url = settings.SOLR_URL
    solr_url = '%s/solr/%s' % (settings.SOLR_SERVER, network_name)
    time.sleep(2)
    inv = InteractionNetwork(network_name, "description text",
                             network_file.file, annotations_file.file)
    time.sleep(2)
    message, result = pinv.create_new_solr(network_name)
    time.sleep(2)
    print "--- UPLOAD. Connecting to:", solr_url
    si = sunburnt.SolrInterface(solr_url)
    time.sleep(2)
    inv.upload(si)

    errormessage = "<br/>".join(inv.errors)
    return out % (network_name, "|".join(inv.ahead), len(inv.annotations),
                  repr(annotations_file.filename), "|".join(inv.nhead),
                  len(inv.network), repr(network_file.filename), message,
                  errormessage)
def index_edges():
    edge_loaded_nodes = pickle.load(open(SETTINGS.edge_object_file, "rb"))
    sEdges = sunburnt.SolrInterface("http://localhost:8983/solr/edge/")
    docs = []
    for key, value in edge_loaded_nodes.iteritems():
        doc = {
            "edgeId": key,
            "source": value._source,
            "target": value._target,
            "label": value._label,
            "epochSecs": value._epoch_secs,
            "order": value._order,
            "datetime": value._datetime,
            "edgeType": value._edge_type,
            "startDatetime": value._start_datetime,
            "endDatetime": value._end_datetime,
            "evidenceType": value._evidence_type
        }
        docs.append(doc)
    sEdges.add(docs)
    sEdges.commit()
def create_ngram_dict():
    word_dict_unigram = {}
    word_dict_bigram = {}
    word_dict_trigram = {}

    # contains the list of results
    result_list = []
    sMessage = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    for result in sMessage.query(text="*").field_limit(["body", "subject"]).paginate(start=0, rows=255636).execute():
        result_list.append(result)

    for list_element in result_list:
        # for body and subject type
        for key, value in list_element.iteritems():
            # Converting unicode to string
            value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
            put_ngram_word(word_dict_unigram, word_dict_bigram, word_dict_trigram, value)

    sorted_word_dict_unigram = sorted(word_dict_unigram.iteritems(),
                                      key=operator.itemgetter(1), reverse=True)
    sorted_word_dict_bigram = sorted(word_dict_bigram.iteritems(),
                                     key=operator.itemgetter(1), reverse=True)
    sorted_word_dict_trigram = sorted(word_dict_trigram.iteritems(),
                                      key=operator.itemgetter(1), reverse=True)
    return sorted_word_dict_unigram, sorted_word_dict_bigram, sorted_word_dict_trigram
def get_solr_interface(site):
    """Cache the solr interface for an hour at a time so we don't need to
    fetch the schema on every single query."""
    global saved_solr_interface
    global solr_interface_created

    if site not in settings.SOLR_SERVER_URLS:
        raise InvalidQueryError("Unknown site: %s" % site)

    if site not in saved_solr_interface:
        too_old = True
    else:
        age = datetime.now() - solr_interface_created[site]
        too_old = age > timedelta(hours=1)

    if too_old:
        try:
            saved_solr_interface[site] = sunburnt.SolrInterface(
                settings.SOLR_SERVER_URLS[site],
                http_connection=Connection(),
                format='json')
            solr_interface_created[site] = datetime.now()
        except Exception as e:
            logger.error("get_solr_interface: %s" % e, exc_info=True)
            raise SolrUnavailableError('Solr is not responding (using %s )' %
                                       settings.SOLR_SERVER_URLS[site])
    return saved_solr_interface[site]
def index_message():
    filter_words = load_filter_words()
    message_loaded_nodes = pickle.load(open(SETTINGS.message_object_file, "rb"))
    sPerson = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    docs = []
    for key, value in message_loaded_nodes.iteritems():
        # Checking if the subject or body contains filter words (non-compliant words)
        compliantFlag = True
        # NoneType check
        if value._subject is None:
            text = value._body
        elif value._body is None:
            text = value._subject
        else:
            text = value._subject + value._body
        if is_filter_word_present(text, filter_words):
            compliantFlag = False
        doc = {
            "nodeId": key,
            "datetime": value._datetime,
            "epochSecs": value._epoch_secs,
            "subject": value._subject,
            "body": clean_data(value._body),
            "emailId": value._email_id,
            "compliantFlag": compliantFlag
        }
        # doc = {"nodeId": key, "datetime": value._datetime, "epochSecs": value._epoch_secs, "subject": value._subject, "body": value._body, "emailId": value._email_id, "compliantFlag": compliantFlag}
        docs.append(doc)
    sPerson.add(docs)
    sPerson.commit()
import requests
import sunburnt
import MySQLdb

THUMB_URL = 'http://209.17.190.27/rcw_wp/0.51.0/cache_image_lookup.php'
SOLR_URL = 'http://10.10.10.31:8443/solr/'
solr = sunburnt.SolrInterface(SOLR_URL)
MYSQL_SETTINGS = '10.10.10.17', 'vulcan', '', 'linksDBProd'


def find_thumb(urls, domain):
    params = {}
    for url in urls:
        params['image_url'] = url
        params['domain'] = domain
        thumb_request = requests.get(THUMB_URL, params=params)
        if thumb_request.status_code == 200:
            return thumb_request.content
    return None


def get_domain(rssid, connection=None):
    query = ("SELECT keyCode FROM `domains` WHERE rssid = %s", str(rssid))
    domain = execute_fetchone(connection, query)
    if domain and len(domain) > 0:
        domain = domain[0]
    return domain


def get_rssid(domain, connection=None):
    query = ("SELECT rssid FROM `domains` WHERE keyCode = %s", str(domain))
    rssid = execute_fetchone(connection, query)
    if rssid and len(rssid) > 0:
        rssid = rssid[0]
    return rssid
def document_create_index(document, user_id=None):

    import os
    from xlrd import open_workbook
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    import sunburnt

    document = json.loads(document)
    table = s3db.doc_document
    id = document["id"]
    name = document["name"]
    filename = document["filename"]

    filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"),
                                     request.application, filename)

    si = sunburnt.SolrInterface(settings.get_base_solr_url())

    extension = os.path.splitext(filename)[1][1:]

    if extension == "pdf":
        data = os.popen("pdf2txt.py " + filename).read()
    elif extension == "doc":
        data = os.popen("antiword " + filename).read()
    elif extension == "xls":
        wb = open_workbook(filename)
        data = " "
        for s in wb.sheets():
            for row in range(s.nrows):
                values = []
                for col in range(s.ncols):
                    values.append(str(s.cell(row, col).value))
                data = data + ",".join(values) + "\n"
    elif extension == "rtf":
        doct = Rtf15Reader.read(open(filename))
        data = PlaintextWriter.write(doct).getvalue()
    else:
        data = os.popen("strings " + filename).read()

    # The text needs to be in unicode or ascii, with no control characters
    data = str(unicode(data, errors="ignore"))
    data = "".join(c if ord(c) >= 32 else " " for c in data)

    # Put the data according to the Multiple Fields
    # @ToDo: Also, would change this according to requirement of Eden
    document = {
        "id": str(id),          # doc_document.id
        "name": data,           # the data of the file
        "url": filename,        # the encoded file name stored in uploads/
        "filename": name,       # the filename actually uploaded by the user
        "filetype": extension   # x.pdf -> pdf is the extension of the file
    }

    # Add and commit indices
    si.add(document)
    si.commit()

    # After indexing, set the value for has_been_indexed to True in the database
    db(table.id == id).update(has_been_indexed=True)
    db.commit()
def create_feature_vector():
    feature_map = pickle.load(open(SETTINGS.feature_map, "rb"))
    print feature_map
    feature_vectors = []
    modified_feature_vectors = []
    target_values = []

    # create a list of feature vectors along with a list of target values from the data set
    result_list = []
    sMessage = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    for result in sMessage.query(text="*").field_limit(["body", "subject", "compliantFlag"]).paginate(start=0, rows=255636).execute():
        result_list.append(result)

    # For each result
    for list_element in result_list:
        # initialize it to the size of features selected i.e. 5000+500+200
        feature_vector = [0] * 5700
        for key_feature, value_feature in feature_map.iteritems():
            words_in_feature_key = key_feature.split()
            if len(words_in_feature_key) == 1:
                if (key_feature.lower() in list_element['subject'].lower().split() or
                        key_feature in list_element['body'].lower().split()):
                    # Mark the value 1 at that index
                    feature_vector[value_feature] = 1
            else:
                if (key_feature.lower() in list_element['subject'].lower() or
                        key_feature in list_element['body'].lower()):
                    # Mark the value 1 at that index
                    feature_vector[value_feature] = 1

        if list_element['compliantFlag'] == False:
            y = 0
        else:
            y = 1

        feature_vectors.append(feature_vector)
        target_values.append(y)
        modified_feature_vector = feature_vector + list(str(y))
        modified_feature_vectors.append(modified_feature_vector)

    assert_equal(len(feature_vectors), len(target_values))

    # Shuffling the data for training and testing
    random.shuffle(modified_feature_vectors)

    # Separating out training and testing data
    train_data = modified_feature_vectors[:178945]
    test_data = modified_feature_vectors[178945:]

    # For training data
    train_data_features = []
    train_data_targets = []
    for i in range(0, len(train_data)):
        train_data_features.append(train_data[i][:-1])
        train_data_targets.append(train_data[i][-1])

    # For testing data
    test_data_features = []
    test_data_targets = []
    for i in range(0, len(test_data)):
        test_data_features.append(test_data[i][:-1])
        test_data_targets.append(test_data[i][-1])

    # Dumping
    pickle.dump(train_data_features, open(SETTINGS.train_data_features, "wb"))
    pickle.dump(train_data_targets, open(SETTINGS.train_data_targets, "wb"))
    pickle.dump(test_data_features, open(SETTINGS.test_data_features, "wb"))
    pickle.dump(test_data_targets, open(SETTINGS.test_data_targets, "wb"))
def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(
        str(os.getpid()), filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface(
        "http://%s:%s/solr/%s" % (conf["solr"]['host'], conf["solr"]['port'],
                                  get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server(
        'http://%s:%s' % (conf["hyphe-core"]["host"], conf["hyphe-core"]["port"]),
        version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]

    while True:
        we = web_entity_pile.get()

        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])

        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)

        # getting web page URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % (we["name"]))
        web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(
            logging.INFO, "retrieved %s pages of web entity %s" %
            (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))

        # getting mongo html web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"]!=0]
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info(
            "retrieving + indexing HTML pages from mongo to solr of web entity %s"
            % (we["name"]))
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(
                coll.find(
                    {
                        "url": {"$in": urls_slice},
                        "status": 200,
                        "content_type": {"$in": accepted_content_types},
                        "body": {"$exists": True}
                    },
                    projection=["_id", "encoding", "url", "lru", "depth", "body"]))

            # local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" %
                       (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))

            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html": body,
                    "text": html2text.textify(body, encoding)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })

            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc

            # log
            welog.info("%s %s: indexed %s pages" %
                       (we["name"], we["id"], nb_slice_indexed))

            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len

        del we["web_pages"]
        del web_pages
        del urls
        welog.log(
            logging.INFO, "'%s' indexed (%s web pages on %s)" %
            (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                mainlog.info("STILL BROKEN, giving up on %s %s" % (we['id'], we['name']))
                # relying on autocommit
        processlog.info("%s: indexed %s on %s Html pages" %
                        (we["name"], nb_pages_indexed, nb_pages_mongo))

        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
                                      background=True))
    mainlog.info("index on content_type done")

    # prepare content_type filter
    accepted_content_types = []
    with open(conf['mongo']['contenttype_whitelist_filename']) as content_type_whitelist:
        accepted_content_types = content_type_whitelist.read().split("\n")
except Exception as e:
    sys.stderr.write("%s: %s\n" % (type(e), e))
    sys.stderr.write('ERROR: Could not initiate connection to MongoDB\n')
    sys.exit(1)

# solr
try:
    solr = sunburnt.SolrInterface(
        "http://%s:%s/solr/%s" % (conf["solr"]['host'], conf["solr"]['port'],
                                  get_solr_instance_name(conf["solr"]['path'])))
    if args.delete_index:
        solr.delete_all()
        solr.commit()
except Exception as e:
    sys.stderr.write("%s: %s\n" % (type(e), e))
    sys.stderr.write('ERROR: Could not initiate connection to SOLR node\n')
    sys.exit(1)

# hyphe core
try:
    hyphe_core = jsonrpclib.Server(
        'http://%s:%s' % (conf["hyphe-core"]["host"], conf["hyphe-core"]["port"]),
        version=1)
except Exception as e:
def __init__(self):
    print 'starting'
    self.si = sunburnt.SolrInterface("http://localhost:8080/solr")
def _indexing(slug, reset=False):
    sources_index = {}
    print 'getting source'
    sources = list(Source.objects.using('records').all())
    print 'total sources', len(sources)
    for source in sources:
        sources_index[source.id] = source

    try:
        solr_address = settings.SOLR['host']
        db_conf = settings.DATABASES.get(
            settings.SOLR['catalogs'][slug]['database'], None)
    except KeyError:
        raise Exception(u'Catalog not found')

    if not db_conf:
        raise Exception(
            u'Settings do not contain information about the database with the records.')

    if db_conf['ENGINE'] != 'django.db.backends.mysql':
        raise Exception(u'Only a MySQL database containing the records is supported.')

    print 'connect to db', db_conf['HOST']
    try:
        conn = MySQLdb.connect(host=db_conf['HOST'],
                               user=db_conf['USER'],
                               passwd=db_conf['PASSWORD'],
                               db=db_conf['NAME'],
                               port=int(db_conf['PORT']),
                               compress=True,
                               charset='utf8',
                               use_unicode=True,
                               cursorclass=MySQLdb.cursors.SSDictCursor)
    except MySQLdb.OperationalError as e:
        conn = MySQLdb.connect(unix_socket=db_conf['HOST'],
                               user=db_conf['USER'],
                               passwd=db_conf['PASSWORD'],
                               db=db_conf['NAME'],
                               port=int(db_conf['PORT']),
                               compress=True,
                               charset='utf8',
                               use_unicode=True,
                               cursorclass=MySQLdb.cursors.SSDictCursor)
    print 'connected to db'

    print 'load holdings'
    holdings_index = _load_holdings(conn)
    print 'load orgs'
    orgs_index = _load_orgs()
    print 'load sources'
    sources_index = _load_sources()

    try:
        index_status = IndexStatus.objects.get(catalog=slug)
    except IndexStatus.DoesNotExist:
        index_status = IndexStatus(catalog=slug)

    print 'index_status', index_status.last_index_date

    # select_query = "SELECT * FROM records where deleted = 0 AND LENGTH(content) > 0 and record_id='ru\\\\nlrt\\\\1359411'"
    select_query = "SELECT * FROM records where deleted = 0 AND LENGTH(content) > 0"
    # if not getattr(index_status, 'last_index_date', None):
    #     select_query = "SELECT * FROM records where deleted = 0 and content != NULL"
    # else:
    #     select_query = "SELECT * FROM records where update_date >= '%s' and deleted = 0" % (
    #         str(index_status.last_index_date))

    solr = sunburnt.SolrInterface(
        solr_address,
        http_connection=httplib2.Http(disable_ssl_certificate_validation=True))

    docs = list()
    start_index_date = datetime.datetime.now()
    print 'execute query', select_query
    conn.query(select_query)
    print 'query executed', select_query
    rows = conn.use_result()
    res = rows.fetch_row(how=1)
    print 'start fetching'
    i = 0
    while res:
        if not res[0]['content']:
            res = rows.fetch_row(how=1)
            continue
        zf = zipfile.ZipFile(io.BytesIO((res[0]['content'])))
        content = zf.read('1.xml').decode('utf-8')
        doc_tree = etree.XML(content)
        doc_tree = xslt_indexing_transformer(doc_tree)
        doc = doc_tree_to_dict(doc_tree)
        doc = add_sort_fields(doc)

        date_of_publication = doc.get('date-of-publication_s')
        if date_of_publication:
            cleaned_date_of_publication = ''.join(
                ONLY_DIGITS_RE.findall(date_of_publication[0]))
            if cleaned_date_of_publication:
                doc['date_of_publication_l'] = [cleaned_date_of_publication]

        # For sorting by volume: extract the string containing the volume number
        # (or a range) and store the computed value in the tom_f field, which is
        # later used for sorting.
        # If the string looks like "т.1", float 1 is stored; if it contains
        # "т.1-2", (1 + 2) / 2 is stored, i.e. the arithmetic mean, so that
        # ranges sort by their midpoint.
        tom = doc.get('tom_s', None)
        if tom and isinstance(tom, unicode):
            tom = tom.strip().replace(u' ', u'')
            r = re_t1_t2.search(tom)
            if r:
                groups = r.groups()
                doc['tom_f'] = (int(groups[0]) + int(groups[1])) / 2.0
            else:
                r = re_t1.search(tom)
                if r:
                    doc['tom_f'] = float(r.groups()[0])

        issn = doc.get('issn_t')
        if issn:
            for issn_item in issn:
                new_issn_value = issn_item.replace('-', '').replace(' ', '')
                if new_issn_value != issn_item:
                    doc['issn_t'].append(new_issn_value)

        isbn = doc.get('isbn_t')
        if isbn:
            for isbn_item in isbn:
                new_isbn_value = isbn_item.replace('-', '').replace(' ', '')
                if new_isbn_value != isbn_item:
                    doc['isbn_t'].append(new_isbn_value)

        try:
            record_create_date = doc.get('record-create-date_dt', None)
            if record_create_date:
                doc['record-create-date_dts'] = record_create_date
        except Exception as e:
            print 'Error record-create-date_dt'

        holder_codes = _get_holdings(source_id=res[0]['source_id'],
                                     record_id=res[0]['record_id'],
                                     orgs_index=orgs_index,
                                     holdings_index=holdings_index,
                                     sources_index=sources_index)
        if holder_codes:
            doc['system-holder_s'] = holder_codes
            org_types = set()
            for holder_code in holder_codes:
                org_type = orgs_index.get('code', {}).get(holder_code, {}).get('org_type', '')
                if org_type:
                    org_types.add(org_type)
            if org_types:
                doc['org_type_s'] = list(org_types)

        doc['system-add-date_dt'] = res[0]['add_date']
        doc['system-add-date_dts'] = res[0]['add_date']
        doc['system-update-date_dt'] = res[0]['update_date']
        doc['system-update-date_dts'] = res[0]['update_date']
        doc['system-catalog_s'] = res[0]['source_id']
        # doc['source-type_s'] = sources_index[res[0]['source_id']].source_type

        if str(doc['system-catalog_s']) == '2':
            full_text_file = None
            urls = doc.get('doc-id_s', None)
            if urls and type(urls) == list:
                for url in doc.get('doc-id_s', None):
                    if url:
                        full_text_file = url.split('/')[-1]
            else:
                if urls:
                    full_text_file = urls.split('/')[-1]
            if full_text_file:
                text = full_text_extract(full_text_file)
                if text:
                    doc['full-text'] = text

        docs.append(doc)
        i += 1
        if i % 100 == 0:
            print 'indexed', i
        if len(docs) > 100:
            solr.add(docs)
            docs = list()
        res = rows.fetch_row(how=1)

    if docs:
        solr.add(docs)
    solr.commit()
    index_status.indexed = i

    # deleting records that were marked as deleted
    records = []
    if getattr(index_status, 'last_index_date', None):
        records = Record.objects.using('records').filter(
            deleted=True,
            update_date__gte=index_status.last_index_date).values('gen_id')
    else:
        records = Record.objects.using('records').filter(deleted=True).values(
            'gen_id', 'update_date')

    record_gen_ids = []
    for record in list(records):
        record_gen_ids.append(record['gen_id'])

    if record_gen_ids:
        solr.delete(record_gen_ids)
        solr.commit()

    index_status.deleted = len(record_gen_ids)
    index_status.last_index_date = start_index_date
    index_status.save()

    conn.query('DELETE FROM records WHERE deleted = 1')
    return True
def main(global_config, **settings):
    """ This function returns a Pyramid WSGI application. """
    config = Configurator(settings=settings)
    config.scan('raggregate.models')
    engine = engine_from_config(settings, 'sqlalchemy.')
    sqlahelper.add_engine(engine)
    initialize_sql(engine)
    session_factory = pyramid_beaker.session_factory_from_settings(settings)

    template_static_asset = "{0}/static".format(settings['mako.directories'])
    settings['template_static_asset'] = template_static_asset

    config = Configurator(settings=settings)
    config.include('pyramid_tm')

    if 'solr.address' in settings:
        import sunburnt
        solr_conn = sunburnt.SolrInterface(settings['solr.address'])
        config.registry.solr_conn = solr_conn

    if 'twitter.app_key' in settings and 'twitter.app_secret' in settings:
        from twython import Twython
        app_twit = Twython(settings['twitter.app_key'], settings['twitter.app_secret'])
        config.registry.app_twit = app_twit

    config.set_session_factory(session_factory)

    # @TODO: the name "mako.directories" implies this could be a list
    # right now we don't care. Someone should fix this.
    config.add_static_view('static', template_static_asset)
    config.add_static_view('user_imgs', settings['user.picture_upload_package'])
    config.add_static_view('section_imgs', settings['section.picture_upload_package'])

    config.add_route('home', '/')
    config.add_route('login', '/login')
    config.add_route('list', '/list')
    config.add_route('post', '/post')
    config.add_route('new_page', '/new_page')
    config.add_route('new_post', '/new_post')
    config.add_route('ban', '/ban')
    config.add_route('vote', '/vote/{way}')
    config.add_route('full', '/full/{sub_id}')
    config.add_route('epistle', '/messages/{box}')
    config.add_route('follow', '/follow')
    config.add_route('save', '/save')
    config.add_route('notify', '/notify')
    config.add_route('search', '/search')
    config.add_route('twit_sign', '/twit_sign')
    config.add_route('user_info', '/user_info')
    config.add_route('user_preferences', '/user_preferences')
    config.add_route('buttons', '/buttons')
    config.add_route('favicon', '/favicon.ico')
    config.add_route('atom_story', '/atom_story.xml')
    config.add_route('atom_self_story', '/atom_self_story.xml')
    config.add_route('atom_combined', '/atom_combined.xml')
    config.add_route('atom_comment', '/atom_comment.xml')
    config.add_route('section', '/section')
    config.add_route('motd', '/motd')
    config.add_route('sublist', '/sublist/{sub_title}')
    config.add_route('sublistc', '/sublist_create')
    config.add_route('lost_password', '/lost_password')

    config.add_subscriber(subscribers.ban, NewResponse)
    config.add_subscriber(subscribers.user_session_handler, BeforeRender)
    config.add_subscriber(subscribers.clear_per_request_session, NewRequest)
    config.add_subscriber(subscribers.clean_inputs, NewRequest)

    config.scan('raggregate.views')
    pyramid_beaker.set_cache_regions_from_settings(settings)
    return config.make_wsgi_app()
# -*- coding: utf-8 -*-

import time
import socket
import xml.parsers.expat

import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    print(e, "Is Solr started?")

_pt = PathTool.PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:
    if not _rh.checkFeedPath(feed):
        print(("Skipping:", feed))
        continue
    try:
        feedDictFactory = FeedDictFactory.FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict != None and feedDict != {}:
    while list(response):
        for match in response:
            scans = match.get('scansion', None)
            if not scans:
                scans = ['']
            if 'before_caesura' in match and 'after_caesura' in match:
                line = '%s // %s' % (match['before_caesura'].strip(),
                                     match['after_caesura'].strip())
            else:
                line = match['line_text']
            print('%-9s %-22s %s' % (match['lineid'], scans[0], line))
            for scan in scans[1:]:
                print('%-9s %-22s %s' % ('', scan, ' alternate scansion'))
        start += ROWS
        response = query.paginate(start=start, rows=ROWS).execute()


if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        print('Usage: %s solr_url word ...' % sys.argv[0])
    solr_url = sys.argv[1]
    solr = sunburnt.SolrInterface(solr_url)
    query = solr  # not really, but it will be after the first iteration of:
    for word in sys.argv[2:]:
        query = query.query(unicode(word, 'utf-8'))
    report_results(query)