Example No. 1
def ssearch_all_count():
    try:
        solr = sunburnt.SolrInterface(settings.SOLR['host'])
        response = solr.query(**{'*': '*'}).field_limit("id").execute()
    except socket.error:
        return {'count': 0}
    return {'count': response.result.numFound}
Example No. 2
def statistics():
    solr_connection = httplib2.Http(disable_ssl_certificate_validation=True)
    solr = sunburnt.SolrInterface(settings.SOLR['host'], http_connection=solr_connection)

    facet_fields = ['fond_sf']
    qkwargs = {'*': '*'}
    solr_searcher = solr.query(**qkwargs).paginate(start=0, rows=0)
    exclude_kwargs = {'system-catalog_s': u"1"}
    solr_searcher = solr_searcher.exclude(**exclude_kwargs)
    solr_searcher = solr_searcher.facet_by(field=facet_fields, limit=30, mincount=1)
    solr_searcher = solr_searcher.field_limit("id")
    response = solr_searcher.execute()
    collections = {}
    for key in response.facet_counts.facet_fields.keys():
        for val in response.facet_counts.facet_fields[key]:
            collections[val[0]] = val[1]

    stats = {
        'collections': collections,
        'count_all': 0,
        'count_last_month': 0,
    }
    now = datetime.datetime.now()
    before_30_now = now - datetime.timedelta(30)
    count_all = Record.objects.using('records').filter(source_id='2').exclude(deleted=True).count()
    count_last_month = Record.objects.using('records').filter(add_date__year=now.year, add_date__month=now.month,
                                                              source_id='2').exclude(deleted=True).count()
    count_last_30 = Record.objects.using('records').filter(add_date__gte=before_30_now, add_date__lte=now,
                                                           source_id='2').exclude(deleted=True).count()
    stats['count_all'] = count_all
    stats['count_last_month'] = count_last_month
    stats['count_last_30'] = count_last_30
    return stats
Example No. 3
 def upload(self,
            network_name=None,
            network_file=None,
            annotations_file=None,
            type="public",
            email="*****@*****.**"):
     #cherrypy.log("Privacy: "+ type)
     #cherrypy.log("annotations: "+ repr(network_file))
     #cherrypy.log("network: "+ repr(annotations_file))
     if cherrypy.request.method == "GET":
         tmpl = lookup.get_template("upload_form.mako")
         return tmpl.render()
     else:
         network_name = pinv.clean(network_name)
         solr_url = '%s/solr/%s' % (settings.SOLR_SERVER, network_name)
         time.sleep(2)
         inv = InteractionNetwork(network_name, "description text",
                                  network_file.file, annotations_file.file)
         time.sleep(2)
         message, result = pinv.create_new_solr(network_name)
         time.sleep(2)
         cherrypy.log("*** UPLOAD. Connecting to: " + solr_url)
         si = sunburnt.SolrInterface(solr_url)
         time.sleep(2)
         inv.upload(si)
         inv.createClusters(si, 3, 7)
         cherrypy.log(str(inv.errors))
         view_key, delete_key = auth.save_key(network_name, email, type)
         if type == "private":
             view_url = "http://biosual.cbio.uct.ac.za/pinViewer.html?core=%(core)s&key=%(key)s" % {
                 'core': network_name,
                 'key': view_key
             }
             delete_url = "http://biosual.cbio.uct.ac.za/solr/admin/cores?action=UNLOAD&deleteIndex=true&core=%(core)s&key=%(key)s" % {
                 'core': network_name,
                 'key': delete_key
             }
         else:
             view_url = "http://biosual.cbio.uct.ac.za/pinViewer.html?core=%(core)s" % {
                 'core': network_name
             }
             delete_url = "http://biosual.cbio.uct.ac.za/solr/admin/cores?action=UNLOAD&deleteIndex=true&core=%(core)s&key=%(key)s" % {
                 'core': network_name,
                 'key': delete_key
             }
         msg = auth.sendmail(email, view_url, delete_url, network_name)
         #cherrypy.log(msg);
         errormessage = "<br/>".join(inv.errors)
         tmpl = lookup.get_template("upload_result.mako")
         return tmpl.render(network_name=network_name,
                            annotation_head="|".join(inv.ahead),
                            annotation_count=len(inv.annotations),
                            annotation_file=annotations_file.filename,
                            network_head="|".join(inv.nhead),
                            network_count=len(inv.network),
                            network_file=network_file.filename,
                            message="",
                            errors="<br/>".join(inv.errors))
Example No. 4
 def connect(self):
     # noinspection PyUnusedLocal
     try:
         return solr.SolrInterface(self.host,
                                   mode="rw",
                                   retry_timeout=self.retry)
     except Exception as e:
         _logger.exception(u"Could not connect to [%s]", self.host)
         return None
Example No. 5
    def __init__(self):
        self.connection = Connection('localhost', 27017)
        self.db = self.connection.nSquared
        self.COLLECTION = 'thumbs'
        self.r = redis.StrictRedis(host='localhost', port=6379, db=0)

        self.SOLR_URL = 'http://10.10.10.31:8443/solr/'
        self.solr = sunburnt.SolrInterface(self.SOLR_URL)
        self.PAGE_LENGTH = 1000
        self.connection, self.cursor = connect_mysql()
Example No. 6
def solr_interface(solr_url=None):
    if not solr_url:
        solr_url = settings.SOLR_SERVER_URL

    http_opts = {}
    if hasattr(settings, 'SOLR_CA_CERT_PATH'):
        http_opts['ca_certs'] = settings.SOLR_CA_CERT_PATH
    if getattr(settings, 'SOLR_DISABLE_CERT_CHECK', False):
        http_opts['disable_ssl_certificate_validation'] = True
    http = httplib2.Http(**http_opts)
    solr = sunburnt.SolrInterface(solr_url, http_connection=http)
    return solr
Example No. 7
def participant_income(request):
    sigla = request.GET.get('sigla', None)
    solr_connection = httplib2.Http(disable_ssl_certificate_validation=True)
    solr = sunburnt.SolrInterface(settings.SOLR['local_records_host'],
                                  http_connection=solr_connection)

    if sigla:
        query = solr.Q(**{'holder-sigla_s': sigla})
    else:
        query = solr.Q(**{'*': '*'})

    solr_searcher = solr.query(query)
    solr_searcher = solr_searcher.field_limit("id")

    solr_searcher = solr_searcher.sort_by('-record-create-date_dts')

    paginator = Paginator(solr_searcher, 20)  # Show 20 results per page

    page = request.GET.get('page')
    try:
        results_page = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        results_page = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        results_page = paginator.page(paginator.num_pages)

    docs = []

    for row in results_page.object_list:
        docs.append(replace_doc_attrs(row))

    doc_ids = []
    for doc in docs:
        doc_ids.append(doc['id'])

    records_dict = {}
    records = list(
        Record.objects.using('local_records').filter(gen_id__in=doc_ids))
    for record in records:
        records_dict[record.gen_id] = etree.tostring(xslt_bib_draw_transformer(
            etree.XML(record.content), abstract='false()'),
                                                     encoding='utf-8')

    for doc in docs:
        doc['record'] = records_dict.get(doc['id'])

    return render(request, 'ssearch/frontend/income.html', {
        'results_page': results_page,
        'docs': docs
    })
Example No. 8
def publishMeta(mdList):
    """ Establish the Solr Instance, add the metadata, and commit it"""
    try:
        # Instantiate the interface to the Solr instance
        si = sunburnt.SolrInterface("http://%s:%s/solr/%s/" %
                                    (solrServer, solrPort, solrInstance))
        # Add the XML metadata to the instance
        si.add(mdList)
    except:
        raise
    finally:
        # Commit/Save the metadata
        si.commit()
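
A minimal usage sketch (assumptions: the module-level solrServer, solrPort and solrInstance values and the metadata field names are not shown in the example and are illustrative only):

solrServer, solrPort, solrInstance = "localhost", "8983", "collection1"  # assumed settings
mdList = [
    {"id": "rec-1", "title": "First metadata record"},   # field names are illustrative
    {"id": "rec-2", "title": "Second metadata record"},
]
publishMeta(mdList)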
Example No. 9
 def indexes(self,*fieldnames):
     try:
         import sunburnt
     except ImportError:
         raise ImportError("Cannot find sunburnt, it is necessary to access Solr")
     self.fieldnames = fieldnames
     if not os.path.exists(self.schema_filename):
         schema='<fields><field name="id" type="int" indexed="true" stored="true" required="true" />%s</fields>' \
             % ''.join('<field name="%s" type="string" />' % name for name in fieldnames)
         open(self.schema_filename, 'w').write(schema)
     try:
         self.interface = sunburnt.SolrInterface(self.url, self.schema_filename)
     except:
         raise RuntimeError("Cannot connect to Solr: %s" % self.url)
Example No. 10
def index_email_address():
    email_loaded_nodes = pickle.load(open(SETTINGS.email_object_file, "rb"))
    sEmail = sunburnt.SolrInterface("http://localhost:8983/solr/emailAddress/")
    docs = []
    for key, value in email_loaded_nodes.iteritems():
        doc = {
            "nodeId": key,
            "address": value._address,
            "fullyObserved": value._fully_observed
        }
        docs.append(doc)

    sEmail.add(docs)
    sEmail.commit()
Example No. 11
def index_person():
    person_loaded_nodes = pickle.load(open(SETTINGS.person_object_file, "rb"))
    sPerson = sunburnt.SolrInterface("http://localhost:8983/solr/person/")
    docs = []
    for key, value in person_loaded_nodes.iteritems():
        doc = {
            "nodeId": key,
            "lastname": value._last_name,
            "firstname": value._first_name,
            "provenance": value._provenance
        }
        docs.append(doc)

    sPerson.add(docs)
    sPerson.commit()
Example No. 12
def participant_income(sigla):
    solr_connection = httplib2.Http(disable_ssl_certificate_validation=True)
    solr = sunburnt.SolrInterface(settings.SOLR['local_records_host'],
                                  http_connection=solr_connection)
    if sigla:
        query = solr.Q(**{'holder-sigla_s': sigla})
    else:
        query = solr.Q(**{'*': '*'})

    solr_searcher = solr.query(query)
    solr_searcher = solr_searcher.field_limit(['id', 'record-create-date_dts'])

    solr_searcher = solr_searcher.sort_by('-record-create-date_dts')

    paginator = Paginator(solr_searcher, 10)  # Show 10 results per page

    # If page is not an integer, deliver first page.
    results_page = paginator.page(1)

    docs = []

    for row in results_page.object_list:
        docs.append(replace_doc_attrs(row))

    doc_ids = []
    for doc in docs:
        doc_ids.append(doc['id'])

    records_dict = {}
    records = list(
        Record.objects.using('local_records').filter(gen_id__in=doc_ids))
    for record in records:
        records_dict[record.gen_id] = rusmarc_template.beautify(
            etree.tostring(xslt_bib_draw_transformer(etree.XML(record.content),
                                                     abstract='false()'),
                           encoding='utf-8'))

    for doc in docs:
        doc['record'] = records_dict.get(doc['id'])

    return {
        # 'results_page': results_page,
        'docs': docs,
        'sigla': sigla
    }
Example No. 13
    def document_delete_index(document, user_id=None):

        import sunburnt

        document = json.loads(document)
        table = s3db.doc_document
        id = document["id"]
        filename = document["filename"]

        si = sunburnt.SolrInterface(settings.get_base_solr_url())

        # Delete and commit the indices of the deleted document
        si.delete(id)
        si.commit()
        # After removing the index, set has_been_indexed value to False in the database
        db(table.id == id).update(has_been_indexed=False)

        db.commit()
Example No. 14
 def adapter_connect(host, mode, retry_timeout, raise_ex=False):
     """
   Connect to backend.
 """
     if host is None:
         utils.exit(
             utils.ENVIRONMENT_ENTRY_NOT_FOUND,
             'settings["backend_host"] missing - please, specify it in settings.py.'
         )
     try:
         _logger.info("Connecting to indexer [%s, %s].", host, mode)
         return solr.SolrInterface(host,
                                   mode=mode,
                                   retry_timeout=retry_timeout)
     except:
         _logger.exception("Connecting to indexer failed.")
         if raise_ex:
             raise
     return None
Example No. 15
def categorize(schema,
               text,
               n_categories=5,
               n_terms=30,
               server='http://localhost:8983/solr',
               terms=False):
    """Categorize a piece of text using a MoreLikeThis query on Solr

    This is basically an approximate k-Nearest Neighbors lookup using the TF-IDF
    similarity of the Solr index. The query is truncated to the top n_terms
    terms with maximum weights for efficiency reasons.
    """
    solr = sunburnt.SolrInterface(server, schema)
    interestingTerms = 'list' if terms else 'none'
    q = solr.mlt_query("text",
                       content=text,
                       maxqt=n_terms,
                       interestingTerms=interestingTerms)
    q = q.paginate(rows=n_categories)
    q = q.field_limit(score=True, all_fields=True)
    return q.execute()
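
A hedged usage sketch; the schema file path, server URL and the 'id'/'score' keys in each hit are assumptions about the target index, not part of the example above:

# Hypothetical call against a local Solr core whose documents carry a 'text' field
results = categorize('schema.xml',
                     'full-text search with faceting and highlighting',
                     n_categories=3,
                     server='http://localhost:8983/solr')
for doc in results:
    print doc.get('id'), doc.get('score')  # score is present because of field_limit(score=True)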
Example No. 16
 def upload(self, network_name, network_file, annotations_file):
     out = """<html>
     <body>
         Network name: %s<br/><br/>
         
         Annotations columns: %s<br/>
         Annotations count: %i<br />
         Annotations filename: %s<br />
         
         Network head: %s<br />
         Network length: %i<br />
         Network filename: %s<br />
         
         %s
                 <br/>
                 Errors:<br/>
                 %s
     </body>
     </html>"""
     network_name = pinv.clean(network_name)
     if settings.TEST and not (network_file.file and annotations_file.file):
         network_file.file = open("../data/test.txt")
         annotations_file.file = open("../data/test_annot.txt")
     solr_url = settings.SOLR_URL
     solr_url = '%s/solr/%s' % (settings.SOLR_SERVER, network_name)
     time.sleep(2)
     inv = InteractionNetwork(network_name, "description text",
                              network_file.file, annotations_file.file)
     time.sleep(2)
     message, result = pinv.create_new_solr(network_name)
     time.sleep(2)
     print "--- UPLOAD. Connecting to:", solr_url
     si = sunburnt.SolrInterface(solr_url)
     time.sleep(2)
     inv.upload(si)
     errormessage = "<br/>".join(inv.errors)
     return out % (network_name, "|".join(inv.ahead), len(
         inv.annotations), repr(annotations_file.filename), "|".join(
             inv.nhead), len(inv.network), repr(
                 network_file.filename), message, errormessage)
Example No. 17
def index_edges():
    edge_loaded_nodes = pickle.load(open(SETTINGS.edge_object_file, "rb"))
    sEdges = sunburnt.SolrInterface("http://localhost:8983/solr/edge/")
    docs = []
    for key, value in edge_loaded_nodes.iteritems():
        doc = {
            "edgeId": key,
            "source": value._source,
            "target": value._target,
            "label": value._label,
            "epochSecs": value._epoch_secs,
            "order": value._order,
            "datetime": value._datetime,
            "edgeType": value._edge_type,
            "startDatetime": value._start_datetime,
            "endDatetime": value._end_datetime,
            "evidenceType": value._evidence_type
        }
        docs.append(doc)

    sEdges.add(docs)
    sEdges.commit()
Example No. 18
def create_ngram_dict():
    word_dict_unigram = {}
    word_dict_bigram = {}
    word_dict_trigram = {}
    # contains the list of results
    result_list = []
    sMessage = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    for result in sMessage.query(text="*").field_limit(["body", "subject"]).paginate(start=0, rows=255636).execute():
        result_list.append(result)
    
    for list_element in result_list:    
        # for body and subject type
        for key, value in list_element.iteritems():
            # Converting unicode to string
            value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
            put_ngram_word(word_dict_unigram, word_dict_bigram, word_dict_trigram, value)
    
    sorted_word_dict_unigram = sorted(word_dict_unigram.iteritems(), key=operator.itemgetter(1), reverse=True)
    sorted_word_dict_bigram = sorted(word_dict_bigram.iteritems(), key=operator.itemgetter(1), reverse=True)
    sorted_word_dict_trigram = sorted(word_dict_trigram.iteritems(), key=operator.itemgetter(1), reverse=True)
    
    return sorted_word_dict_unigram, sorted_word_dict_bigram, sorted_word_dict_trigram
Example No. 19
def get_solr_interface(site):
    """cache the solr interface for an hour at a time so we don't need
    to fetch the schema on every single query."""
    global saved_solr_interface
    global solr_interface_created
    if site not in settings.SOLR_SERVER_URLS:
        raise InvalidQueryError("Unknown site: %s" % site)
    if site not in saved_solr_interface:
        too_old = True
    else:
        age = datetime.now() - solr_interface_created[site]
        too_old = age > timedelta(hours=1)
    if too_old:
        try:
            saved_solr_interface[site] = sunburnt.SolrInterface(
                settings.SOLR_SERVER_URLS[site],
                http_connection=Connection(),
                format='json')
            solr_interface_created[site] = datetime.now()
        except Exception as e:
            logger.error("get_solr_interface: %s" % e, exc_info=True)
            raise SolrUnavailableError('Solr is not responding (using %s )' %
                                       settings.SOLR_SERVER_URLS[site])
    return saved_solr_interface[site]
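
A short sketch of how this cached interface might be used from a view; the 'default' site key and the 'text' field are assumptions about the deployment's settings and schema:

si = get_solr_interface('default')  # 'default' must be a key in settings.SOLR_SERVER_URLS
response = si.query(text='earthquake').paginate(start=0, rows=10).execute()
print response.result.numFound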
Example No. 20
def index_message():
    filter_words = load_filter_words()
    message_loaded_nodes = pickle.load(open(SETTINGS.message_object_file,
                                            "rb"))
    sPerson = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
    docs = []
    for key, value in message_loaded_nodes.iteritems():

        #Checking if the subject or body contains filter words (non-compliant words)
        compliantFlag = True

        #NoneType check
        if value._subject == None:
            text = value._body
        elif value._body == None:
            text = value._subject
        else:
            text = value._subject + value._body

        if is_filter_word_present(text, filter_words):
            compliantFlag = False

        doc = {
            "nodeId": key,
            "datetime": value._datetime,
            "epochSecs": value._epoch_secs,
            "subject": value._subject,
            "body": clean_data(value._body),
            "emailId": value._email_id,
            "compliantFlag": compliantFlag
        }
        #         doc = {"nodeId":key, "datetime":value._datetime, "epochSecs":value._epoch_secs, "subject":value._subject, "body":value._body, "emailId":value._email_id,"compliantFlag":compliantFlag}
        docs.append(doc)

    sPerson.add(docs)
    sPerson.commit()
Example No. 21
import requests
import sunburnt
import MySQLdb

THUMB_URL = 'http://209.17.190.27/rcw_wp/0.51.0/cache_image_lookup.php'
SOLR_URL = 'http://10.10.10.31:8443/solr/'
solr = sunburnt.SolrInterface(SOLR_URL)
MYSQL_SETTINGS = '10.10.10.17', 'vulcan', '', 'linksDBProd'

def find_thumb(urls, domain):
  params = {}
  for url in urls:
    params['image_url'] = url
    params['domain'] = domain
    thumb_request = requests.get(THUMB_URL, params=params)
    if thumb_request.status_code == 200:
      return thumb_request.content
  return None

def get_domain(rssid, connection=None):
  query = ("SELECT keyCode FROM `domains` WHERE rssid = %s", str(rssid))
  domain = execute_fetchone(connection, query)
  if domain and len(domain) > 0:
    domain = domain[0]
  return domain

def get_rssid(domain, connection=None):
  query = ("SELECT rssid FROM `domains` WHERE keyCode = %s", str(domain))
  rssid = execute_fetchone(connection, query)
  if rssid and len(rssid) > 0:
    rssid = rssid[0]
  return rssid
Example No. 22
    def document_create_index(document, user_id=None):

        import os
        from xlrd import open_workbook
        from pyth.plugins.rtf15.reader import Rtf15Reader
        from pyth.plugins.plaintext.writer import PlaintextWriter
        import sunburnt

        document = json.loads(document)
        table = s3db.doc_document
        id = document["id"]

        name = document["name"]
        filename = document["filename"]

        filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                        request.application, filename)

        si = sunburnt.SolrInterface(settings.get_base_solr_url())

        extension = os.path.splitext(filename)[1][1:]

        if extension == "pdf":
            data = os.popen("pdf2txt.py " + filename).read()
        elif extension == "doc":
            data = os.popen("antiword " + filename).read()
        elif extension == "xls":
            wb = open_workbook(filename)
            data = " "
            for s in wb.sheets():
                for row in range(s.nrows):
                    values = []
                    for col in range(s.ncols):
                        values.append(str(s.cell(row, col).value))
                    data = data + ",".join(values) + "\n"
        elif extension == "rtf":
            doct = Rtf15Reader.read(open(filename))
            data = PlaintextWriter.write(doct).getvalue()
        else:
            data = os.popen("strings " + filename).read()

        # The text needs to be in unicode or ascii, with no control characters
        data = str(unicode(data, errors="ignore"))
        data = "".join(c if ord(c) >= 32 else " " for c in data)

        # Put the data according to the Multiple Fields
        # @ToDo: Also, would change this according to requirement of Eden
        document = {
            "id": str(id),  # doc_document.id
            "name": data,  # the data of the file
            "url": filename,  # the encoded file name stored in uploads/
            "filename": name,  # the filename actually uploaded by the user
            "filetype": extension  # x.pdf -> pdf is the extension of the file
        }

        # Add and commit Indices
        si.add(document)
        si.commit()
        # After Indexing, set the value for has_been_indexed to True in the database
        db(table.id == id).update(has_been_indexed=True)

        db.commit()
Example No. 23
def create_feature_vector():
    feature_map = pickle.load(open(SETTINGS.feature_map, "rb"))
    
    print feature_map
    
    feature_vectors = []
    modified_feature_vectors = []
    target_values = []
    # create a list of feature vector along with a list of target value from the data set
    result_list = []
    sMessage = sunburnt.SolrInterface("http://localhost:8983/solr/message/")
#     for result in sMessage.query(text="*").field_limit(["body", "subject", "compliantFlag"]).paginate(start=0, rows=255636).execute():
    for result in sMessage.query(text="*").field_limit(["body", "subject", "compliantFlag"]).paginate(start=0, rows=255636).execute():
        result_list.append(result)
    
    # For each result
    for list_element in result_list:    
        # initialize it to the size of feature selected i.e 5000+500+200
        feature_vector = [0] * 5700
#         print "Subject:" + list_element['subject'].lower()
#         print "Body:" + list_element['body'].lower()
#         print "Compliant Flag:", list_element['compliantFlag']
        for key_feature, value_feature in feature_map.iteritems():
            words_in_feature_key = key_feature.split()
            if (len(words_in_feature_key) == 1):
                if (key_feature.lower() in list_element['subject'].lower().split() or key_feature.lower() in list_element['body'].lower().split()):
                    # Mark the value 1 at that index
                    feature_vector[value_feature] = 1
            else:
                if (key_feature.lower() in list_element['subject'].lower() or key_feature.lower() in list_element['body'].lower()):
                    # Mark the value 1 at that index
                    feature_vector[value_feature] = 1     
        if list_element['compliantFlag'] == False:
            y = 0
        else:
            y = 1
        feature_vectors.append(feature_vector)
        target_values.append(y)
        modified_feature_vector = feature_vector + list(str(y))
        modified_feature_vectors.append(modified_feature_vector)
#         print feature_vector
#         print y
#         print modified_feature_vector
    
    assert_equal(len(feature_vectors), len(target_values))
    
    #Shuffling the data for training and testing test
    random.shuffle(modified_feature_vectors)
    
    #Separating out training and testing data
    train_data = modified_feature_vectors[:178945]
    test_data = modified_feature_vectors[178945:]
    
    #For training data
    train_data_features = []
    train_data_targets = []
    
    for i in range(0,len(train_data)):
        train_data_features.append(train_data[i][:-1])
        train_data_targets.append(train_data[i][-1]) 
    
    #For testing data    
    test_data_features = []
    test_data_targets = []
            
    for i in range(0,len(test_data)):
        test_data_features.append(test_data[i][:-1])
        test_data_targets.append(test_data[i][-1]) 
    
    #Dumping 
    pickle.dump(train_data_features, open(SETTINGS.train_data_features, "wb"))
    pickle.dump(train_data_targets, open(SETTINGS.train_data_targets, "wb"))
    pickle.dump(test_data_features, open(SETTINGS.test_data_features, "wb"))
    pickle.dump(test_data_targets, open(SETTINGS.test_data_targets, "wb"))
Example No. 24
def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" %
                                               os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface(
        "http://%s:%s/solr/%s" %
        (conf["solr"]['host'], conf["solr"]['port'],
         get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server(
        'http://%s:%s' %
        (conf["hyphe-core"]["host"], conf["hyphe-core"]["port"]),
        version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True:
        we = web_entity_pile.get()

        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])

        #setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)

        #getting web pages URLS
        welog.log(logging.INFO,
                  "retrieving pages of web entity %s" % (we["name"]))
        #mainlog.info("DEBUG %s"%(we["id"]))
        web_pages = hyphe_core.store.get_webentity_pages(
            we["id"], True, corpus)
        if (web_pages['code'] == 'fail'):
            mainlog.info(web_pages['message'])
        welog.log(
            logging.INFO, "retrieved %s pages of web entity %s" %
            (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]

        processlog.info("%s: got %s webpages" %
                        (we["name"], len(we["web_pages"])))

        #getting mongo html web page
        urls = [page["url"]
                for page in we["web_pages"]]  #if page["http_status"]!=0]
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info(
            "retrieving + indexing HTML pages from mongo to solr of web entity %s"
            % (we["name"]))

        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(
                coll.find(
                    {
                        "url": {
                            "$in": urls_slice
                        },
                        "status": 200,
                        "content_type": {
                            "$in": accepted_content_types
                        },
                        "body": {
                            "$exists": True
                        }
                    },
                    projection=[
                        "_id", "encoding", "url", "lru", "depth", "body"
                    ]))
            #mainlog.info(str(len(pages_mongo_slice)))
            #local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0

            welog.info(
                "%s %s: got %s pages in slice %s %s" %
                (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))

            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html":body,
                    "text": html2text.textify(body, encoding)
                }

                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    # mainlog.info("ERROR %s: %s %s" %(type(e),e, solr_document))
                    #welog.debug("Exception with document :%s %s %s"%(solr_document["id"],solr_document["url"],solr_document["encoding"]))
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    # import traceback
                    # traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename,
                          "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc,
                              errors_solr_document_json_file,
                              indent=4)
            del (error_solr_doc)
            #log
            welog.info("%s %s: indexed %s pages" %
                       (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s"%(nb_slice_indexed,(we["name"])))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len

        del we["web_pages"]
        del web_pages
        del urls

        welog.log(
            logging.INFO, "'%s' indexed (%s web pages on %s)" %
            (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                mainlog.info("STILL BROKEN, giving up on %s %s" %
                             (we['id'], we['name']))

        # relying on autocommit
        # welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s on %s Html pages" %
                        (we["name"], nb_pages_indexed, nb_pages_mongo))
        #adding we if to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
Example No. 25
                           background=True))
     mainlog.info("index on content_type done")
     # prepare content_type filter
     accepted_content_types = []
     with open(
             conf['mongo']
         ['contenttype_whitelist_filename']) as content_type_whitelist:
         accepted_content_types = content_type_whitelist.read().split("\n")
 except Exception as e:
     sys.stderr.write("%s: %s\n" % (type(e), e))
     sys.stderr.write('ERROR: Could not initiate connection to MongoDB\n')
     sys.exit(1)
 # solr
 try:
     solr = sunburnt.SolrInterface(
         "http://%s:%s/solr/%s" %
         (conf["solr"]['host'], conf["solr"]['port'],
          get_solr_instance_name(conf["solr"]['path'])))
     if args.delete_index:
         solr.delete_all()
         solr.commit()
 except Exception as e:
     sys.stderr.write("%s: %s\n" % (type(e), e))
     sys.stderr.write('ERROR: Could not initiate connection to SOLR node\n')
     sys.exit(1)
 # hyphe core
 try:
     hyphe_core = jsonrpclib.Server(
         'http://%s:%s' %
         (conf["hyphe-core"]["host"], conf["hyphe-core"]["port"]),
         version=1)
 except Exception as e:
Example No. 26
 def __init__(self):
     print 'starting'
     self.si = sunburnt.SolrInterface("http://localhost:8080/solr")
Example No. 27
def _indexing(slug, reset=False):

    sources_index = {}

    print 'getting source'
    sources = list(Source.objects.using('records').all())

    print 'total sources', len(sources)
    for source in sources:
        sources_index[source.id] = source

    try:
        solr_address = settings.SOLR['host']
        db_conf = settings.DATABASES.get(
            settings.SOLR['catalogs'][slug]['database'], None)
    except KeyError:
        raise Exception(u'Catalog not found')

    if not db_conf:
        raise Exception(
            u'Settings do not contain information about the database that holds the records.'
        )

    if db_conf['ENGINE'] != 'django.db.backends.mysql':
        raise Exception(
            u'Only a MySQL database with records is supported.')

    print 'connect to db', db_conf['HOST']
    try:
        conn = MySQLdb.connect(host=db_conf['HOST'],
                               user=db_conf['USER'],
                               passwd=db_conf['PASSWORD'],
                               db=db_conf['NAME'],
                               port=int(db_conf['PORT']),
                               compress=True,
                               charset='utf8',
                               use_unicode=True,
                               cursorclass=MySQLdb.cursors.SSDictCursor)
    except MySQLdb.OperationalError as e:
        conn = MySQLdb.connect(unix_socket=db_conf['HOST'],
                               user=db_conf['USER'],
                               passwd=db_conf['PASSWORD'],
                               db=db_conf['NAME'],
                               port=int(db_conf['PORT']),
                               compress=True,
                               charset='utf8',
                               use_unicode=True,
                               cursorclass=MySQLdb.cursors.SSDictCursor)

    print 'connected to db'

    print 'load holdings'
    holdings_index = _load_holdings(conn)

    print 'load orgs'
    orgs_index = _load_orgs()

    print 'load sources'
    sources_index = _load_sources()

    try:
        index_status = IndexStatus.objects.get(catalog=slug)
    except IndexStatus.DoesNotExist:
        index_status = IndexStatus(catalog=slug)

    print 'index_status', index_status.last_index_date
    # select_query = "SELECT * FROM records where deleted = 0 AND LENGTH(content) > 0 and record_id='ru\\\\nlrt\\\\1359411'"
    select_query = "SELECT * FROM records where deleted = 0 AND LENGTH(content) > 0"
    # if not getattr(index_status, 'last_index_date', None):
    #     select_query = "SELECT * FROM records where deleted = 0 and content != NULL"
    # else:
    #     select_query = "SELECT * FROM records where update_date >= '%s' and deleted = 0" % (
    #         str(index_status.last_index_date))

    solr = sunburnt.SolrInterface(
        solr_address,
        http_connection=httplib2.Http(disable_ssl_certificate_validation=True))
    docs = list()

    start_index_date = datetime.datetime.now()
    print 'execute query', select_query
    conn.query(select_query)
    print 'query executed', select_query

    rows = conn.use_result()

    res = rows.fetch_row(how=1)
    print 'start fetching'

    i = 0
    while res:

        if not res[0]['content']:
            res = rows.fetch_row(how=1)
            continue
        zf = zipfile.ZipFile(io.BytesIO((res[0]['content'])))
        content = zf.read('1.xml').decode('utf-8')
        doc_tree = etree.XML(content)
        doc_tree = xslt_indexing_transformer(doc_tree)
        doc = doc_tree_to_dict(doc_tree)
        doc = add_sort_fields(doc)

        date_of_publication = doc.get('date-of-publication_s')
        if date_of_publication:
            cleaned_date_of_publication = ''.join(
                ONLY_DIGITS_RE.findall(date_of_publication[0]))
            if cleaned_date_of_publication:
                doc['date_of_publication_l'] = [cleaned_date_of_publication]
        # For sorting by volume: extract the string containing the volume number or range and store
        # the computed value in the tom_f field, which is later used for sorting.
        # If the string looks like 't.1', the float 1 is stored;
        # if it looks like 't.1-2', float (1+2) / 2 is stored - the arithmetic mean, so a range sorts by its midpoint.
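        # e.g. (assuming re_t1 / re_t1_t2 capture the digits) u'т.1' -> tom_f = 1.0; u'т.1-2' -> tom_f = (1 + 2) / 2.0 = 1.5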

        tom = doc.get('tom_s', None)

        if tom and isinstance(tom, unicode):
            tom = tom.strip().replace(u' ', u'')
            r = re_t1_t2.search(tom)
            if r:
                groups = r.groups()
                doc['tom_f'] = (int(groups[0]) + int(groups[1])) / 2.0
            else:
                r = re_t1.search(tom)
                if r:
                    doc['tom_f'] = float(r.groups()[0])

        issn = doc.get('issn_t')
        if issn:
            for issn_item in issn:
                new_issn_value = issn_item.replace('-', '').replace(' ', '')
                if new_issn_value != issn_item:
                    doc['issn_t'].append(new_issn_value)

        isbn = doc.get('isbn_t')
        if isbn:
            for isbn_item in isbn:
                new_isbn_value = isbn_item.replace('-', '').replace(' ', '')
                if new_isbn_value != isbn_item:
                    doc['isbn_t'].append(new_isbn_value)

        try:
            record_create_date = doc.get('record-create-date_dt', None)
            # print 'record_create_date1', record_create_date
            if record_create_date:
                doc['record-create-date_dts'] = record_create_date
        except Exception as e:
            print 'Error record-create-date_dt'

        holder_codes = _get_holdings(source_id=res[0]['source_id'],
                                     record_id=res[0]['record_id'],
                                     orgs_index=orgs_index,
                                     holdings_index=holdings_index,
                                     sources_index=sources_index)

        # if holder_codes:
        #     print holder_codes

        if holder_codes:
            doc['system-holder_s'] = holder_codes

            org_types = set()
            for holder_code in holder_codes:
                org_type = orgs_index.get('code',
                                          {}).get(holder_code,
                                                  {}).get('org_type', '')
                if org_type:
                    org_types.add(org_type)

            if org_types:
                doc['org_type_s'] = list(org_types)

        doc['system-add-date_dt'] = res[0]['add_date']
        doc['system-add-date_dts'] = res[0]['add_date']
        doc['system-update-date_dt'] = res[0]['update_date']
        doc['system-update-date_dts'] = res[0]['update_date']
        doc['system-catalog_s'] = res[0]['source_id']
        # doc['source-type_s'] = sources_index[res[0]['source_id']].source_type
        if str(doc['system-catalog_s']) == '2':
            full_text_file = None
            #            doc['system-update-date_dt'] = res[0]['doc-id_s']
            urls = doc.get('doc-id_s', None)
            if urls and type(urls) == list:
                for url in doc.get('doc-id_s', None):
                    if url:
                        full_text_file = url.split('/')[-1]
            else:
                if urls:
                    full_text_file = urls.split('/')[-1]
            if full_text_file:
                text = full_text_extract(full_text_file)
                if text:
                    doc['full-text'] = text

        docs.append(doc)
        i += 1
        if i % 100 == 0:
            print 'indexed', i
        if len(docs) > 100:
            solr.add(docs)
            docs = list()
        res = rows.fetch_row(how=1)

    if docs:
        solr.add(docs)

    solr.commit()
    index_status.indexed = i

    # deletion of records marked as deleted
    records = []

    if getattr(index_status, 'last_index_date', None):
        records = Record.objects.using('records').filter(
            deleted=True,
            update_date__gte=index_status.last_index_date).values('gen_id')
    else:
        records = Record.objects.using('records').filter(deleted=True).values(
            'gen_id', 'update_date')

    record_gen_ids = []
    for record in list(records):
        record_gen_ids.append(record['gen_id'])

    if record_gen_ids:
        solr.delete(record_gen_ids)
        solr.commit()

    index_status.deleted = len(record_gen_ids)
    index_status.last_index_date = start_index_date
    index_status.save()
    conn.query('DELETE FROM records WHERE deleted = 1')
    return True
Example No. 28
def main(global_config, **settings):
    """ This function returns a Pyramid WSGI application.
    """
    config = Configurator(settings=settings)
    config.scan('raggregate.models')
    engine = engine_from_config(settings, 'sqlalchemy.')
    sqlahelper.add_engine(engine)
    initialize_sql(engine)

    session_factory = pyramid_beaker.session_factory_from_settings(settings)

    template_static_asset = "{0}/static".format(settings['mako.directories'])
    settings['template_static_asset'] = template_static_asset

    config = Configurator(settings=settings)
    config.include('pyramid_tm')

    if 'solr.address' in settings:
        import sunburnt
        solr_conn = sunburnt.SolrInterface(settings['solr.address'])
        config.registry.solr_conn = solr_conn

    if 'twitter.app_key' in settings and 'twitter.app_secret' in settings:
        from twython import Twython
        app_twit = Twython(settings['twitter.app_key'],
                           settings['twitter.app_secret'])
        config.registry.app_twit = app_twit

    config.set_session_factory(session_factory)

    # @TODO: the name "mako.directories" implies this could be a list
    # right now we don't care. Someone should fix this.
    config.add_static_view('static', template_static_asset)
    config.add_static_view('user_imgs', settings['user.picture_upload_package'])
    config.add_static_view('section_imgs', settings['section.picture_upload_package'])

    config.add_route('home', '/')
    config.add_route('login', '/login')
    config.add_route('list', '/list')
    config.add_route('post', '/post')
    config.add_route('new_page', '/new_page')
    config.add_route('new_post', '/new_post')
    config.add_route('ban', '/ban')
    config.add_route('vote', '/vote/{way}')
    config.add_route('full', '/full/{sub_id}')
    config.add_route('epistle', '/messages/{box}')
    config.add_route('follow', '/follow')
    config.add_route('save', '/save')
    config.add_route('notify', '/notify')
    config.add_route('search', '/search')
    config.add_route('twit_sign', '/twit_sign')
    config.add_route('user_info', '/user_info')
    config.add_route('user_preferences', '/user_preferences')
    config.add_route('buttons', '/buttons')
    config.add_route('favicon', '/favicon.ico')
    config.add_route('atom_story', '/atom_story.xml')
    config.add_route('atom_self_story', '/atom_self_story.xml')
    config.add_route('atom_combined', '/atom_combined.xml')
    config.add_route('atom_comment', '/atom_comment.xml')
    config.add_route('section', '/section')
    config.add_route('motd', '/motd')
    config.add_route('sublist', '/sublist/{sub_title}')
    config.add_route('sublistc', '/sublist_create')
    config.add_route('lost_password', '/lost_password')

    config.add_subscriber(subscribers.ban, NewResponse)
    config.add_subscriber(subscribers.user_session_handler, BeforeRender)
    config.add_subscriber(subscribers.clear_per_request_session, NewRequest)
    config.add_subscriber(subscribers.clean_inputs, NewRequest)

    config.scan('raggregate.views')


    pyramid_beaker.set_cache_regions_from_settings(settings)

    return config.make_wsgi_app()
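
A hedged sketch of the settings keys this factory reads (values are placeholders; calling main() directly instead of via PasteDeploy is only for illustration):

settings = {
    'sqlalchemy.url': 'sqlite:///raggregate.db',
    'mako.directories': 'raggregate:templates',
    'user.picture_upload_package': 'raggregate:static/user_imgs',
    'section.picture_upload_package': 'raggregate:static/section_imgs',
    'solr.address': 'http://localhost:8983/solr/',
    'twitter.app_key': 'YOUR_APP_KEY',
    'twitter.app_secret': 'YOUR_APP_SECRET',
}
app = main({}, **settings)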
Example No. 29
# -*- coding: utf-8 -*-

import time
import socket
import xml.parsers.expat

import sunburnt

from Resource.ResourceHelper import ResourceHelper
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

# create a connection to a solr server
try:
    solr = sunburnt.SolrInterface("http://localhost:8983/solr/")
except socket.error as e:
    print(e, "Is Solr started?")

_pt = PathTool.PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:

    if not _rh.checkFeedPath(feed):
        print(("Skipping:", feed))
        continue

    try:
        feedDictFactory = FeedDictFactory.FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict != None and feedDict != {}:
Example No. 30
    
    while list(response):
        for match in response:
            scans = match.get('scansion', None)
            if not scans:
                scans = ['']

            if 'before_caesura' in match and 'after_caesura' in match:
                line = '%s // %s' % (match['before_caesura'].strip(),
                                     match['after_caesura'].strip())
            else:
                line = match['line_text']

            print('%-9s %-22s %s' % (match['lineid'], scans[0], line)) 
            for scan in scans[1:]:
                print('%-9s %-22s %s' % ('', scan, '  alternate scansion'))
        start += ROWS
        response = query.paginate(start=start, rows=ROWS).execute()


if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        print('Usage: %s solr_url word ...' % sys.argv[0])
        sys.exit(1)
    solr_url = sys.argv[1]
    solr = sunburnt.SolrInterface(solr_url)
    query = solr # not really, but it will after the first iteration of:
    for word in sys.argv[2:]:
        query = query.query(unicode(word, 'utf-8'))
    report_results(query)