Example #1
class Importer:
    def __init__(self):
        self._solr = Solr(Config(section='solr')['url'])
        self._mh = MediaHaven(buffer_size=100)

    def add(self, item):
        self._solr.add([item])

    def process(self, item):
        if item is None:
            raise Exception("Invalid item passed (None)")

        if type(item) is not str:
            pid = item['externalId']
        else:
            pid = item
            item = self._mh.one('+(externalId:%s)' % pid)

        if not pid:
            raise "No pid for item %s" % (item, )

        language = ''
        try:
            language = item['mdProperties']['language'][0].lower()
        except Exception as e:
            logger.warning('no language found for %s', pid)
            logger.exception(e)

        alto = self._mh.get_alto(item)
        if not alto:
            logger.debug("no alto for pid '%s' " % (pid, ))
            text = ''
        else:
            text = Conversions.normalize(alto.text)
        self.add(dict(id=pid, text=text, language=language))
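A minimal driving loop for this importer might look like the sketch below; the list of PIDs, the error handling, and the final commit are assumptions, since the snippet above does not show how process() is invoked or when the index is committed.

def run_import(pids):
    """Hypothetical driver: index a list of PIDs and commit once at the end."""
    importer = Importer()
    for pid in pids:
        try:
            importer.process(pid)
        except Exception:
            logger.exception('failed to index %s', pid)
    # pysolr only makes the added documents searchable after a commit
    importer._solr.commit()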
Example #2
def update_hub_products():
    try:
        hotel_solr_docs, hotels = _get_hub_products('hotels')
        package_solr_docs, packages = _get_hub_products('packages')

        hotel_city_ids = {item.get('city') for item in hotels}
        package_city_ids = {item.get('city') for item in packages}

        cities = list(hotel_city_ids) + list(package_city_ids)
        city_solr_docs = _get_cities(cities)

        print(f'hotel count: {len(hotels)}')
        print(f'package count: {len(packages)}')
        print(f'cities count: {len(cities)}')

        solr = Solr(f"http://{solr_host}/solr/hub-products")
        solr_drop('hub-products')
        solr.add(hotel_solr_docs + package_solr_docs + city_solr_docs)
        solr_res = solr.commit()
        print(f'SOLR UPDATE RESULT:')
        print(solr_res)
        print('SOLR INDEX UPDATE SUCCESS!')
    except Exception as e:
        print('SOLR INDEX UPDATE FAILED!')
        print(e)
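The solr_drop() helper called above is not shown; a plausible sketch, assuming it simply wipes the named core before the fresh documents are added, is:

def solr_drop(core_name):
    # Assumed helper: clear every document so the new hotel/package/city docs
    # fully replace the previous index contents.
    solr = Solr(f"http://{solr_host}/solr/{core_name}")
    solr.delete(q='*:*')
    solr.commit()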
Example #3
class Processor(object):

    def __init__(self, solr_server_url):
        self.server = Solr(solr_server_url)

    def process(self, fname):
        base, _ = os.path.splitext(os.path.basename(fname))
        url = DOCUMENT_URL + base + '.html'
        fp = open(fname)
        title = None
        while not title:
            title = next(fp).strip()
        content = ''
        for line in fp:
            s = line.strip()
            if s and not s.startswith(('**', '==', '--')):
                content += s
        fp.close()
        document_id = u"%s-%s" % (DOCUMENT_SITE_ID, title)
        logging.info("new document: %s" % (document_id,))
        t = os.path.getmtime(fname)
        doc = {
            'id': hashlib.sha1(document_id.encode('utf-8')).hexdigest(),
            'site': DOCUMENT_SITE_ID,
            'url': url,
            'title': title,
            'content': content,
            'last_modified': datetime.datetime.fromtimestamp(t)
        }
        self.server.add([doc])
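A hypothetical batch run over a directory of exported text files could drive this Processor as follows (the core URL, the glob pattern, and the final commit are assumptions):

import glob

processor = Processor('http://localhost:8983/solr/docs')
for fname in glob.glob('exported_pages/*.txt'):
    processor.process(fname)
# Solr.add() only queues the documents; make them visible with an explicit commit.
processor.server.commit()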
Example #4
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = ("\\", "+", "-", "&&", "||", "!", "(", ")", "{", "}", "[", "]", "^", '"', "~", "*", "?", ":")

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not "URL" in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'." % connection_alias
            )

        self.conn = Solr(connection_options["URL"], timeout=self.timeout)
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        docs = []

        try:
            for obj in iterable:
                docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                self.log.error("Failed to add documents to Solr: %s", e)
Example #5
class SearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
    )

    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)

        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured(
                'You must specify a HAYSTACK_SOLR_URL in your settings.')

        timeout = getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10)
        self.conn = Solr(settings.HAYSTACK_SOLR_URL, timeout=timeout)
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        docs = []

        try:
            for obj in iterable:
                docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                self.log.error("Failed to add documents to Solr: %s", e)
Example #6
def push_hotels(cursor, supplier):
    solr = Solr(f"http://{solr_host}/solr/hotels")
    docs = []
    index = 1
    total = cursor.count()
    for doc in cursor:
        d = {
            "id": str(doc["_id"]),
            "name": doc["name"],
            "name_cn": doc.get("name_cn", ""),
            "supplier": supplier,
            "address": doc["address"],
            "wgstar": doc["wgstar"],
        }

        if "city" in doc:
            d["city"] = doc["city"].get("name", "")
        if "country" in doc:
            country_name = (doc["country"].get("name", ""), )
            country_code = doc["country"].get("code", "")
            if "name" in doc["country"]:
                d["country"] = country_name
            else:
                d["country"] = country_code
        weego_id = doc.get("weego_id")
        if weego_id:
            d["to_c_ref"] = str(weego_id)
        wg_country_id = doc.get("wg_country_id")
        if wg_country_id:
            d["wg_country_id"] = str(wg_country_id)
        wg_province_id = doc.get("wg_province_id")
        if wg_province_id:
            d["wg_province_id"] = str(wg_province_id)
        wg_city_id = doc.get("wg_city_id")
        if wg_city_id:
            d["wg_city_id"] = wg_city_id
        wg_destination_id = doc.get("wg_destination_id")
        if wg_destination_id:
            d["wg_destination_id"] = wg_destination_id
        docs.append(d)
        if index % 500 == 0:
            try:
                print(solr.add(docs, commit=True))
                docs.clear()
                print("Progress of {1}: {0:.2f}%".format(
                    index / total * 100, supplier))
            except Exception as e:
                print("------------------------------------------------------")
                print(index)
                print("------------------------------------------------------")
                raise
        index += 1

    if docs:
        print(solr.add(docs, commit=True))
        print(index)
        print("Progress of {1}: {0:.2f}%".format(index / total * 100,
                                                 supplier))
Example #7
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        self.connection_options = connection_options
        self.conn = Solr(connection_options['URL'], timeout=self.timeout)
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=None):
        docs = []

        if commit is None:
            commit = self.connection_options.get('COMMIT_UPDATES', True)

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
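The COMMIT_UPDATES flag read here comes from the Haystack connection options; a hedged example of the corresponding Django settings (engine path and URL are assumptions) would be:

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.solr_backend.SolrEngine',  # assumed engine path
        'URL': 'http://127.0.0.1:8983/solr/default',             # placeholder core URL
        'COMMIT_UPDATES': False,  # rely on Solr autoCommit instead of committing on every update()
    },
}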
Example #8
def add_to_solr(response, body):
    conn = Solr(settings.SOLR_BASE)
    conn.add(
        [
            dict(
                id="response:%d" % response.id,
                name=response.url.url,
                text=body.encode('utf-8'),
            )
        ])
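For illustration, a stand-in response object is enough to exercise this helper; the namedtuples below are purely hypothetical and only mirror the attributes the function touches:

from collections import namedtuple

FakeUrl = namedtuple('FakeUrl', 'url')
FakeResponse = namedtuple('FakeResponse', 'id url')

resp = FakeResponse(id=42, url=FakeUrl(url='https://example.org/page'))
add_to_solr(resp, body='<html>page text</html>')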
Example #9
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        self.conn = Solr(connection_options['URL'], timeout=self.timeout)
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
Example #10
    def index(self):
        """send csv to Solr index"""
        self.logger.info('Setting up Solr index...')
        solr = Solr("http://{0}:{1}/solr/travel/".format(self.host, self.port),
                    timeout=10000)

        self.logger.info('Indexing %s...' % self.file)
        act = [
            self.format(passage, cid=cid)
            for cid, passage in self.csv_generator()
        ]
        solr.add(act)

        solr.optimize()
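Neither csv_generator() nor format() is shown above; a hypothetical pairing that reads (id, passage) rows from self.file and maps them onto Solr documents might look like this (field names are invented):

import csv

def csv_generator(self):
    # Hypothetical: yield (row id, passage text) pairs from the CSV behind self.file.
    with open(self.file, newline='') as fh:
        for cid, passage in csv.reader(fh):
            yield cid, passage

def format(self, passage, cid):
    # Hypothetical mapping onto the travel core's fields.
    return {'id': cid, 'passage_text': passage}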
Example #11
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        user = connection_options.get("HTTP_AUTH_USERNAME")
        passwd = connection_options.get("HTTP_AUTH_PASSWORD")
        self.conn = Solr(connection_options['URL'], auth=(user,passwd),
                         timeout=self.timeout)
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        docs = []

        try:
            for obj in iterable:
                docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            if not self.silently_fail:
                raise

            self.log.error("Chunk failed.\n")

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
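This variant reads two extra connection options for HTTP basic auth; a hedged settings fragment (key names taken from the code above, values invented) would be:

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.solr_backend.SolrEngine',  # assumed engine path
        'URL': 'https://solr.example.internal/solr/search',      # placeholder core URL
        'HTTP_AUTH_USERNAME': 'indexer',
        'HTTP_AUTH_PASSWORD': 'change-me',
    },
}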
Example #12
class SolrServer(object):
    def __init__(self, url):
        """
        Initialize Solr Server
        :param url: Solr URL
        """
        self.server = Solr(url)
        self.batch = 100

    def search(self, query, **kwargs):
        """
        Query Solr
        :param query: search string
        :param kwargs: other Solr arguments
        :return: list of results
        """
        print('Querying Solr for: ' + query)
        response = self.server.search(query, None, **kwargs)
        print('Number of rows selected: ' + str(len(response.docs)))
        return response.docs

    def update_status(self, docs, status):
        """
        Update document status
        :param doc_id: Unique identifier of a document
        :param status: Document Status (Eg: IN_CDR)
        :return: true/false based on document update
        """
        self.server.add(docs, fieldUpdates={'cdr_status': status})
        pass

    @staticmethod
    def mark_cdr_indexed(ids_list, solr_url, core):
        solr_url = solr_url + core + "/update"
        update_docs = []
        for id in ids_list:
            timestamp = datetime.datetime.utcnow().isoformat() + 'Z'
            doc = {"id": id, "cdr_status": {"set": "CDR_INDEXED"}}
            update_docs.append(doc)
        print(json.dumps(update_docs))
        response = requests.post(solr_url,
                                 data=json.dumps(update_docs),
                                 headers={"content-type": "application/json"})
        return response

    def commit(self):
        self.server.commit()
        pass
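Since mark_cdr_indexed never touches self, it reads like a standalone helper; a hedged call (ids and URLs are placeholders) might be:

ids = ['doc-001', 'doc-002']
base_url = 'http://localhost:8983/solr/'
resp = SolrServer.mark_cdr_indexed(ids, base_url, 'documents')
print(resp.status_code, resp.text[:200])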
Example #13
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = ("\\", "+", "-", "&&", "||", "!", "(", ")", "{", "}", "[", "]", "^", '"', "~", "*", "?", ":")

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not "URL" in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'." % connection_alias
            )

        self.conn = Solr(connection_options["URL"], timeout=self.timeout)
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    u"UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={"data": {"index": index, "object": get_identifier(obj)}},
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
Example #14
    def solr_save(self):
        """Save the item to the solr index. Index will be updated if solr_id already exists in the index."""
        if settings.SOLR['running']:
            con = Solr(settings.SOLR_URL)
            docs = [{
                'solr_id': 'Item_' + str(self.id),
                'id': str(self.id),
                'class': 'Item',
                'title_t': self.title,
                'description_t': self.description,
                'lat_f': str(self.lat),
                'lng_f': str(self.lng)
            }]
            con.add(docs)
            return True
        else:
            return False
Example #15
    def add_tags_to_solr_doc(index_id, document_id, tags):
        '''Adds a list of tags to the Solr document.

        Args:
            index_id: The name of the Solr index.
            document_id: The value of the id field of the Solr document to
                update.
            tags: A list of strings.

        Returns:
            None
        '''
        solr_client = Solr(config['solr']['indexes'][index_id],
                           always_commit=True)

        tags_field = config['solr']['tags_field']
        copy_fields = config['solr']['copy_fields']

        # Get the value of each field to copy to the new document.
        src_fields = map(lambda x: x['src'], copy_fields)
        src_field_values = solr_client.search('id:{}'.format(document_id),
                                              fl=list(src_fields)).docs[0]

        # Only set the fields that are already set on the document.
        existing_src_fields = src_field_values.keys()

        # Add copy fields to the new Solr doc.
        solr_doc = {
            'id': document_id,
            tags_field: tags,
            **{
                copy_field['dst']: src_field_values[copy_field['src']]
                for copy_field in copy_fields if copy_field['src'] in existing_src_fields
            }
        }
        solr_client.add(
            [solr_doc],
            commitWithin='1000',
            fieldUpdates={
                tags_field: 'set',
                **{
                    copy_field['dst']: 'set'
                    for copy_field in copy_fields if copy_field['src'] in existing_src_fields
                }
            },
            overwrite=True)
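The config mapping this function reads is not shown; a hypothetical shape that satisfies every key accessed above (URLs and field names are invented) would be:

config = {
    'solr': {
        'indexes': {
            'works': 'http://localhost:8983/solr/works',
        },
        'tags_field': 'human_readable_tags_ssim',
        'copy_fields': [
            {'src': 'title_tesim', 'dst': 'title_ssim'},
        ],
    },
}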
Example #16
class SearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
    
    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        
        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_SOLR_URL in your settings.')
        
        timeout = getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10)
        self.conn = Solr(settings.HAYSTACK_SOLR_URL, timeout=timeout)
        self.log = logging.getLogger('haystack')
    
    def update(self, index, iterable, commit=True):
        docs = []
        
        try:
            for obj in iterable:
                docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            if not self.silently_fail:
                raise
            
            self.log.error("Chunk failed.\n")
        
        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise
                
                self.log.error("Failed to add documents to Solr: %s", e)
Example #17
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        self.conn = Solr(connection_options['URL'], timeout=self.timeout)
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        docs = []

        try:
            for obj in iterable:
                docs.append(index.full_prepare(obj))
        except UnicodeDecodeError:
            if not self.silently_fail:
                raise

            self.log.error("Chunk failed.\n")

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)
Example #18
    def add_doc(self, id):
        conn = Solr(settings.SOLR_URL)

        self._build_group_tree()
        record_ids = [id]
        media_dict = self._preload_related(Media, record_ids)
        fieldvalue_dict = self._preload_related(FieldValue, record_ids,
                                                related=2)
        groups_dict = self._preload_related(CollectionItem, record_ids)
        core_fields = dict((f, f.get_equivalent_fields())
                           for f in Field.objects.filter(standard__prefix='dc'))
        
        record = Record.objects.filter(id__in=record_ids)[0]
        doc = self._record_to_solr(record, core_fields,
                                   groups_dict.get(record.id, []),
                                   fieldvalue_dict.get(record.id, []),
                                   media_dict.get(record.id, []))
        conn.add([doc])
Example #19
def load_csv(filename: str, solr_url: typing.Optional[str]):
    """Load data from a csv.

    Args:
        filename: A CSV file.
        solr_url: URL of a solr instance.
    """

    solr_client = Solr(solr_url, always_commit=True) if solr_url else None

    data_frame = pandas.read_csv(filename)
    data_frame = data_frame.where(data_frame.notnull(), None)
    collection_rows = data_frame[data_frame["Object Type"] == "Collection"]

    config = {
        "collection_names": {
            row["Item ARK"]: row["Title"]
            for _, row in collection_rows.iterrows()
        },
        "controlled_fields": load_field_config("./fields"),
        "data_frame": data_frame,
    }

    if not solr_client:
        print("[", end="")

    first_row = True
    for _, row in data_frame.iterrows():
        if row["Object Type"] in ("ChildWork", "Page"):
            continue

        if first_row:
            first_row = False
        elif not solr_client:
            print(", ")

        mapped_record = map_record(row, solr_client, config=config)
        if solr_client:
            solr_client.add([mapped_record])
        else:
            print(mapped_record, end="")

    if not solr_client:
        print("]")
Example #20
def reindex_resources(dbname, url=settings.SOLR_URL, printit=False):
    """docstring for reindex_resources"""
    # logger.error("indexing resources:")
    
    if printit:
        print('CLEARING SOLR INDEX:', url)
    conn = Solr(url)
    conn.delete(q='*:*')
    batch_size = getattr(settings, 'SOLR_BATCH_SIZE', 100)
    if printit:
        print('Indexing %s Resources... (batch: %s)' % (Resource.objects.count(), batch_size))
    
    docs = []
    for i, res in enumerate(Resource.objects):
        docs.extend(res.index())
        if i % batch_size == 0:
            conn.add(docs)
            docs = []
    conn.add(docs)
Example #21
def reindex_resources(dbname, url=settings.SOLR_URL, printit=False):
    """docstring for reindex_resources"""
    # logger.error("indexing resources:")

    if printit:
        print('CLEARING SOLR INDEX:', url)
    conn = Solr(url)
    conn.delete(q='*:*')
    batch_size = getattr(settings, 'SOLR_BATCH_SIZE', 100)
    if printit:
        print('Indexing %s Resources... (batch: %s)' % (
            Resource.objects.count(), batch_size))

    docs = []
    for i, res in enumerate(Resource.objects):
        docs.extend(res.index())
        if i % batch_size == 0:
            conn.add(docs)
            docs = []
    conn.add(docs)
Example #22
def copy_repository_field(solr_url):
    solr_connection = Solr(solr_url, always_commit=True)

    n_hits = float('inf')  # but will update from first chunk results
    start = 0
    chunk_size = 1000
    while start < n_hits:
        print(f"{start+1} to {min(start+chunk_size, n_hits)} of {n_hits}")
        chunk = solr_connection.search(
            "!repository_sim:*",
            fq="repository_tesim:*",
            fl="id,repository_tesim",
            defType="lucene",
            start=0,
            rows=100,
        )
        solr_connection.add([process_doc(d) for d in chunk.docs],
                            overwrite=False)
        n_hits = chunk.hits
        start += chunk_size
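process_doc() is not shown; a plausible sketch, assuming it only mirrors the stored repository_tesim value into the missing repository_sim facet field, is:

def process_doc(doc):
    # Hypothetical: copy the tokenized repository value into the facetable field.
    return {
        'id': doc['id'],
        'repository_sim': doc.get('repository_tesim'),
    }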
Example #23
def _index_products(products, request, delete=False):
    """Indexes given products.
    """
    conn = Solr(SOLR_ADDRESS)
    if delete:
        conn.delete(q='*:*')

    temp = []
    for product in products:

        # Just index the default variant of a "Product with Variants"
        if product.is_product_with_variants():
            product = product.get_default_variant()

        if product is None:
            continue

        # Categories
        categories = []
        for category in product.get_categories():
            categories.append(category.name)

        # Manufacturer
        manufacturer = product.manufacturer
        if manufacturer:
            manufacturer_name = manufacturer.name
        else:
            manufacturer_name = ""

        temp.append({
            "id" : product.id,
            "name" : product.get_name(),
            "price" : product.get_price(request),
            "categories" : categories,
            "keywords" : product.get_meta_keywords(),
            "manufacturer" : manufacturer_name,
            "sku_manufacturer" : product.sku_manufacturer,
            "description" : product.description,
        })

    conn.add(temp)
Example #24
def clone_solr_core(source_url, destination_url):
    source_solr = Solr(source_url, always_commit=True)
    destination_solr = Solr(destination_url, always_commit=True)

    if "sinai" in destination_url:
        chunk = source_solr.search(
            "*:*",
            defType="lucene",
            start=0,
            rows=1000,
        )
    else:
        chunk = source_solr.search(
            "has_model_ssim:Collection",
            defType="lucene",
            start=0,
            rows=200,
        )
    destination_solr.add(
        [process_doc(d, source_solr, destination_solr) for d in chunk.docs],
        overwrite=True)
Example #25
def reindex_resources(url=settings.SOLR_URL, printit=False):
    """docstring for reindex_resources"""
    # logger.error("indexing resources:")

    from resources.models import Resource

    if printit:
        print('CLEARING SOLR INDEX for Resources:', url)
    conn = Solr(url)
    conn.delete(q='res_type:%s' % settings.SOLR_RES)
    batch_size = getattr(settings, 'SOLR_BATCH_SIZE', 100)
    if printit:
        print('Indexing %s Resources... (batch: %s)' % (Resource.objects.count(), batch_size))
    
    docs = []
    for i, res in enumerate(Resource.objects):
        entry = res.index()
        if entry:
            docs.extend(entry)
        if i % batch_size == 0:
            conn.add(docs)
            docs = []
    conn.add(docs)
Example #26
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Solr URL and establish a connection.
        """
        if verify_url(url) is False:
            raise SystemError

        self.solr = Solr(url)
        self.unique_key = unique_key
        self.auto_commit = auto_commit

        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        self.solr.add([doc], commit=False)

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]), commit=False)

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def run_auto_commit(self):
        """Periodically commits to the Solr server.
        """
        self.solr.commit()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        result = self.solr.search('*:*', sort='_ts desc', rows=1)

        if len(result) == 0:
            return None

        return result.docs[0]
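A hedged round trip through this manager (URL and document contents are assumptions) could look like:

dm = DocManager('http://localhost:8983/solr/mongo_docs', auto_commit=False)
dm.upsert({'_id': 'abc123', 'title': 'hello', '_ts': 1})
dm.commit()                 # documents only become searchable after a commit
latest = dm.get_last_doc()  # newest document by _ts, or None if the index is empty
dm.stop()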
Example #27
     group_order = 1
   else:
     group_list = ''
     if link.category == 'api':
       category_order = 2
     elif link.category == 'documentation':
       category_order = 6
     elif link.category == 'download':
       category_order = 3
   solr.add([
     {
       'id': index_counter,
       '_text_': str(group_list) + link.title + link.link + description + str(tag_list),
       'database_id': link.id,
       'category': link.category,
       'title': link.title,
       'link': link.link if link.category != 'geoservice' else '',
       'public': link.public,
       'category_order': category_order if 'category_order' in locals() else link.category_order,
       'group_order': group_order if 'group_order' in locals() else link.group_order
     }
   ], commit = True)
 else:
   add = False
   if not link.search_title:
     title = link.group
     add = True
   else:
     title = link.search_title
     add = True
   if add == True:
Example #28
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
            {
                "id": "sn1",
                "cat": "pony",
                "comments": "blue",
                "description": "black",
                "store": "50.03131,10.12135"
            },
            {
                "id": "sn2",
                "cat": "pony",
                "name": "fake unicorn",
                "comments": "yellow",
                "description": "blue",
                "store": "54.23131,10.12135"
            },
            {
                "id": "sn3",
                "cat": "pony",
                "comments": "yellow",
                "description": "red",
                "store": "54.33131,10.12135"
            },
            {
                "id": "sn4",
                "cat": "unicorn",
                "comments": "yellow",
                "description": "blue"
            },
            {
                "id": "sn5",
                "cat": "unicorn",
                "comments": "steel",
                "description": "steel",
                "store": "54.43131,10.12135"
            },
            {
                "id": "sn6",
                "name": "blue pony",
                "cat": "unicorn",
                "comments": "blue",
                "description": "blue",
                "store": "54.33131,10.22135"
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.default_solr.decoder, json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def assertSameIDs(self, docs, expected_ids):
        doc_ids = frozenset([doc['id'] for doc in docs])
        ids_set = frozenset(expected_ids)
        self.assertEqual(doc_ids, ids_set)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''), 'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'), 'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(self.solr._create_full_url(path='/pysolr_tests/select/?whatever=/'), 'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._send_request('POST', 'update/?commit=true', body=xml_body, headers={
            'Content-type': 'text/xml; charset=utf-8',
        })
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get', 'select/?q=doc&wt=json')
        self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']), 3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.", {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1), "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2), "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse('<html><body><pre>Something is broke.</pre></body></html>', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3), "[Reason: Something is broke.]")

    def test__scrape_response(self):
        # Tomcat.
        resp_1 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>')
        self.assertEqual(resp_1, ('messed up.', ''))

        # Jetty.
        resp_2 = self.solr._scrape_response({'server': 'jetty'}, '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_2, ('Something is broke.', u''))

        # Broken Tomcat.
        resp_3 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>')
        self.assertEqual(resp_3, (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'))

        # Other.
        resp_4 = self.solr._scrape_response({'server': 'crapzilla'}, '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_4, ('Wow. Seriously weird.', u''))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z')
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_create_nested_q(self):
        query = self.solr.create_nested_q("dismax", "how now brown cow", **{
            'pf': 'myfield',
            'qf': 'myfield2',
        })
        self.assertEqual(query,
            '_query_:"{!dismax pf=\'myfield\' qf=\'myfield2\'}how now brown cow"')

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search('doc', **{
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_search_with_nested_q(self):
        nested_q = self.solr.create_nested_q('edismax', 'blue', **{
                'qf': 'description comments'
        })
        results = self.solr.search('pony AND {}'.format(nested_q))
        
        self.assertSameIDs(results, ['sn6', 'sn2', 'sn1'])

    def test_disjunction_max(self):
        results = self.solr.disjunction_max('blue', 'description comments')
        
        self.assertSameIDs(results, ['sn6', 'sn4', 'sn2', 'sn1'])

    def test_disjunction_max_with_nested_q(self):
        nested_q = self.solr.create_nested_q('edismax', 'blue', **{
                'qf': 'description comments'
        })
        results = self.solr.disjunction_max('unicorn AND {}'.format(nested_q), 'cat name')
        
        self.assertSameIDs(results, ['sn6', 'sn4', 'sn2'])

    def test_spatial_search(self):
        results = self.solr.spatial_search('pony', 'store', '54.33131,10.12135', '100')
        
        self.assertSameIDs(results, ['sn6', 'sn3', 'sn2'])

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 7)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/update')
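These tests expect a live core at http://localhost:8983/solr/core0; assuming the file is run directly, the usual unittest entry point applies:

if __name__ == '__main__':
    import unittest
    unittest.main()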
Example #29
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.url = url
        self.solr = Solr(url, **kwargs.get('clientOptions', {}))
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    @wrap_exceptions
    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(".*%s\Z" % wc_pattern[1:]))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(
                    re.compile("\A%s.*" % wc_pattern[:-1]))

    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field) for regex in self._dynamic_field_regexes
                )
            return dict((k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                " renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            # Update the key in Solr based on the unique_key mentioned as parameter
            update_spec['_id'] = doc[self.unique_key]
            return update_spec
        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_set):
                    if key == to_set or key[len(to_set)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value
        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_unset):
                    if key == to_unset or key[len(to_unset)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        # Need to escape special characters in the document_id.
        document_id = ''.join(map(
            lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
            u(document_id)
        ))

        query = "%s:%s" % (self.unique_key, document_id)
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=u(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d, namespace, timestamp) for d in docs)
        if self.chunk_size > 0:
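            # Pull documents off the generator in chunk_size batches; the
            # batch comes back empty once ``cleaned`` is exhausted. Note that
            # on Python 3.7+ (PEP 479) the StopIteration raised by next()
            # inside the generator expression becomes a RuntimeError, so
            # itertools.islice(cleaned, self.chunk_size) would be the safer
            # equivalent there.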
            batch = list(next(cleaned) for i in range(self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(next(cleaned)
                             for i in range(self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        params = self._formatter.format_document(f.get_metadata())
        params[self.unique_key] = params.pop('_id')
        params['ns'] = namespace
        params['_ts'] = timestamp
        params = dict(('literal.' + k, v) for k, v in params.items())

        if self.auto_commit_interval == 0:
            params['commit'] = 'true'

        request = Request(os.path.join(
            self.url, "update/extract?%s" % urlencode(params)))

        request.add_header("Content-type", "application/octet-stream")
        request.data = f
        response = urlopen(request)
        logging.debug(response.read())

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=u(document_id),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
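
A minimal sketch (not the library's DocumentFlattener, whose behaviour the
_clean_doc docstring above describes) of the dot-path flattening, reproducing
the docstring example; flatten() and its argument names are made up here:

def flatten(doc, prefix=''):
    """Flatten nested dicts/lists into {dotted.path: value} pairs."""
    flat = {}
    items = doc.items() if isinstance(doc, dict) else enumerate(doc)
    for key, value in items:
        path = '%s.%s' % (prefix, key) if prefix else str(key)
        if isinstance(value, (dict, list)):
            flat.update(flatten(value, path))
        else:
            flat[path] = value
    return flat

# flatten({"a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]})
# -> {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}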
Пример #30
0
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://*****:*****')

    @unittest.skipUnless(HAS_LXML, "Cannot test Tomcat error extraction without lxml")
    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses, which currently require lxml.html to parse"""

        # Tomcat.
        resp_1 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>')
        self.assertEqual(resp_1, ('messed up.', ''))

        # Broken Tomcat.
        resp_2 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>')
        self.assertEqual(resp_2, (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z')
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search('doc', **{
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        originalDocs = self.solr.search('doc')
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append( {'id': doc['id'], 'popularity': 5} )
        self.solr.add(updateList, fieldUpdates={'popularity': 'inc'})

        updatedDocs = self.solr.search('doc')
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['popularity'], originalDoc['popularity'] + 5)
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'popularity']))

        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        originalDocs = self.solr.search('multivalued')
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append( {'id': doc['id'], 'word_ss': ['epsilon', 'gamma']} )
        self.solr.add(updateList, fieldUpdates={'word_ss': 'add'})

        updatedDocs = self.solr.search('multivalued')
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['word_ss'], originalDoc['word_ss'] + ['epsilon', 'gamma'])
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
Пример #31
0
def _solr_add(docs, collection):
    solr = Solr("/".join([settings.SOLR.rstrip("/"), "solr", collection]))
    return solr.add(docs, commit=True)
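# Hypothetical call, assuming settings.SOLR = "http://localhost:8983/":
#   _solr_add([{"id": "doc_1", "title": "Example doc"}], "jobs")
# targets http://localhost:8983/solr/jobs and commits immediately.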
Пример #32
0
def update_solr(buid, download=True, force=True, set_title=False):
    """
    Update the Solr master index with the data contained in a feed file
    for a given buid/jsid.

    This is meant to be a standalone function such that the state of the
    Solr index is not tied to the state of the database.

    Inputs:
    :buid: An integer; the ID for a particular business unit.
    :download: Boolean. If False, this process will not download a new
    feedfile, but instead use the one on disk. Should only be false for
    the purposes of our test suite.
    :force: Boolean. If True, every job seen in the feed file will be
    updated in the index. Otherwise, only the jobs seen in the feed file
    but not seen in the index will be updated. This latter option will
    soon be deprecated.

    Returns:
    A 2-tuple consisting of the number of jobs added and the number deleted.

    Writes/Modifies:
    Job data found in the feed file is used to modify the Solr index. This
    includes adds & deletes. (Solr does not have a discrete equivalent to
    SQL's UPDATE; by adding a document with the same UID as a document in
    the index, the equivalent of an update operation is performed.)

    """
    if download:
        filepath = download_feed_file(buid)
    else:
        filepath = os.path.join(DATA_DIR, FEED_FILE_PREFIX + str(buid) + ".xml")
    jobfeed = xmlparse.DEv2JobFeed(filepath)

    # If the feed file did not pass validation, return. The return value is
    # '(0, 0)' to match what's returned on a successful parse.
    if jobfeed.errors:
        error = jobfeed.error_messages
        logging.error(
            "BUID:%s - Feed file has failed validation on line %s. "
            "Exception: %s" % (error["buid"], error["line"], error["exception"])
        )
        return (0, 0)

    bu = BusinessUnit.objects.get(id=buid)

    # 'set_title' will be True if this feed file is for a BusinessUnit that's
    # been newly created by `helpers.create_businessunit` (called from the
    # `send_sns_confirm` view).
    if set_title or not bu.title:
        bu.title = jobfeed.company
        bu.save()

    # A list of jobListing instances based off the job data in the feed file
    # for the business unit.
    jobs = jobfeed.jobparse()
    # Build a set of all the UIDs for all those instances.
    job_uids = set([long(i.get("uid")) for i in jobs if i.get("uid")])
    conn = Solr(settings.HAYSTACK_CONNECTIONS["default"]["URL"])
    step1 = 1024

    # Get the count of all the results in the Solr index for this BUID.
    hits = conn.search("*:*", fq="buid:%s" % buid, facet="false", mlt="false").hits
    # Create (start-index, stop-index) tuples to facilitate handling results
    # in ``step1``-sized chunks. So if ``hits`` returns 2048 results,
    # ``job_slices`` will look like ``[(0,1024), (1024, 2048)]``. Those
    # values are then used to slice up the total results.
    #
    # This was put in place because part of the logic to figuring out what
    # jobs to delete from and add jobs to the Solr index is using set
    # algebra. We convert the total list of UIDs in the index and the UIDs
    # in the XML feed to sets, then compare them via ``.difference()``
    # (seen below). However for very large feed files, say 10,000+ jobs,
    # this process was taking so long that the connection would time out. To
    # address this problem we break up the comparisons as described above.
    # This results in more requests but it alleviates the connection timeout
    # issue.
    job_slices = slices(range(hits), step=step1)
    results = [_solr_results_chunk(tup, buid, step1) for tup in job_slices]
    solr_uids = reduce(lambda x, y: x | y, results) if results else set()
    # Return the job UIDs that are in the Solr index but not in the feed
    # file.
    solr_del_uids = solr_uids.difference(job_uids)

    if not force:
        # Return the job UIDs that are in the feed file but not in the Solr
        # index.
        solr_add_uids = job_uids.difference(solr_uids)
        # ``jobfeed.solr_jobs()`` yields a list of dictionaries. We want to
        # filter out any dictionaries whose "uid" key is not in
        # ``solr_add_uids``. This is because by default we only want to add
        # new documents (which each ``solr_jobs()`` dictionary represents),
        # not update.
        add_docs = filter(lambda x: int(x.get("uid", 0)) in solr_add_uids, jobfeed.solr_jobs())
    else:
        # This might seem redundant to refer to the same value
        # twice with two different variable names. However, this decision
        # was made during the implementation of the "force Solr update"
        # feature to this function.
        #
        # Instead of adding only the documents with UIDs that are in the feed
        # file but not in the Solr index, we're going to add ALL the documents
        # in the feed file. This will add the new documents of course, but it
        # will also update existing documents with any new data. Uniqueness of
        # the documents is ensured by the ``id`` field defined in the Solr
        # schema (the template for which can be seen in
        # templates/search_configuration/solr.xml). At the very bottom you'll
        # see <uniqueKey>id</uniqueKey>. This serves as the equivalent of the pk
        # (i.e. globally unique) in a database.
        solr_add_uids = job_uids
        add_docs = jobfeed.solr_jobs()

    # Slice up ``add_docs`` in chunks of 4096. This is because the
    # maxBooleanClauses setting in solrconfig.xml is set to 4096. This means
    # if we used any more than that Solr would throw an error and our
    # updates wouldn't get processed.
    add_steps = slices(range(len(solr_add_uids)), step=4096)
    # Same concept as ``add_docs``.
    del_steps = slices(range(len(solr_del_uids)), step=4096)
    # Create a generator that yields 2-tuples with each invocation. The
    # 2-tuples consist of one tuple each from del_steps & add_steps. Any
    # mismatched values (e.g. there are more del_steps than add_steps)
    # will be compensated for with the ``fillvalue``.
    zipped_steps = izip_longest(del_steps, add_steps, fillvalue=(0, 0))

    for tup in zipped_steps:
        update_chunk = add_docs[tup[1][0] : tup[1][1] + 1]

        if update_chunk:
            logging.info("BUID:%s - SOLR - Update chunk: %s" % (buid, [i["uid"] for i in update_chunk]))
            # Pass 'commitWithin' so that Solr doesn't try to commit the new
            # docs right away. This will help relieve some of the resource
            # stress during the daily update. The value is expressed in
            # milliseconds.
            conn.add(update_chunk, commitWithin="30000")

        delete_chunk = _build_solr_delete_query(list(solr_del_uids)[tup[0][0] : tup[0][1] + 1])

        if delete_chunk:
            logging.info("BUID:%s - SOLR - Delete chunk: %s" % (buid, list(solr_del_uids)))
            conn.delete(q=delete_chunk)

    os.remove(filepath)
    logging.info("BUID:%s - Deleted feed file." % buid)
    return len(solr_add_uids), len(solr_del_uids)
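
A toy sketch of the chunk-pairing logic in update_solr above. The project's
slices() helper is assumed to yield inclusive (start, stop) index pairs;
toy_slices() and the sample sizes below are illustrative only:

try:
    from itertools import izip_longest                      # Python 2, as used above
except ImportError:
    from itertools import zip_longest as izip_longest       # Python 3

def toy_slices(seq, step):
    """Yield inclusive (start, stop) index pairs covering seq in step-sized chunks."""
    seq = list(seq)
    for start in range(0, len(seq), step):
        yield (start, min(start + step, len(seq)) - 1)

del_steps = list(toy_slices(range(5), step=2))   # [(0, 1), (2, 3), (4, 4)]
add_steps = list(toy_slices(range(3), step=2))   # [(0, 1), (2, 2)]
# The shorter side is padded with (0, 0), mirroring zipped_steps above:
# [((0, 1), (0, 1)), ((2, 3), (2, 2)), ((4, 4), (0, 0))]
print(list(izip_longest(del_steps, add_steps, fillvalue=(0, 0))))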
Пример #33
0
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.url = url
        self.solr = Solr(url, **kwargs.get('clientOptions', {}))
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

        self._content_type = kwargs.get("content_type", None)
        logging.info("begin to init content_type args ,value is %s" %
                     str(self._content_type))

        if self._content_type is None:
            logging.info("content_type args is none, will receive all type")
            self._receive_all_type = True
        else:
            logging.debug("begin to check content_type args")
            self._receive_all_type = False
            if isinstance(self._content_type, dict):
                self._content_type_list = dict(self._content_type).keys()
                logging.debug("the support type list is %s" %
                              str(self._content_type_list))

            else:
                raise errors.InvalidConfiguration(
                    "args content type is not is dict")

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    @wrap_exceptions
    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % wc_pattern[1:]))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % wc_pattern[:-1]))

    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # Flatten the doc up front.
        doc = self._formatter.format_document(doc)
        # Cap the length of any tag* field values in the doc.
        for k, v in doc.items():
            if (k[0:3] == "tag" and v and isinstance(v, basestring)):
                doc[k] = v[0:9000]

        # Get the MongoDB collection name.
        collection_name = self._get_collection_name(namespace)
        # Handle documents from the user-activity collection.
        if ("b_dynamic" == collection_name):

            logging.info("to process doc from b_dynamic, the doc is %s" %
                         str(doc[self.unique_key]))
            return self._parse_user_dynamic_collection(doc)

        # Handle documents from the user collection.
        if ("T_USER" == collection_name):
            logging.info("to process doc from T_USER, the doc is %s" %
                         str(doc[self.unique_key]))
            return self._parse_t_user_collection(doc)

        # Process content-collection data.
        logging.info("begin to process b_content, the doc is %s" %
                     str(doc[self.unique_key]))
        doctemp = self._parse_content_doc(doc)

        if doctemp is None:
            logging.info("don't send doc to solr, the doc is %s" % str(doc))
            return None

        if (isinstance(doctemp, list) and len(doctemp) == 0):
            logging.info("don't send doc to solr, the doc is %s" % str(doc))
            return None

        if (isinstance(doctemp, list) and len(doctemp) > 1):
            logging.info(
                "to process doc from b_content, result is a list, the doc is %s"
                % str(doc[self.unique_key]))
            flat_doc = []
            for docvalue in doctemp:
                flat_doc.append(self._parse_doc_to_solr_doc(docvalue))

            return flat_doc

        if (isinstance(doctemp, list)):
            logging.info(
                "to process doc from b_content, result is a one-value list, the doc is %s"
                % str(doc[self.unique_key]))
            return self._parse_doc_to_solr_doc(doctemp[0])
        logging.info(
            "to process doc from b_content, result is a single object, the doc is %s"
            % str(doc[self.unique_key]))
        return self._parse_doc_to_solr_doc(doctemp)

    def _get_collection_name(self, namespace):
        '''Return the MongoDB collection name from a namespace string.
        '''
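        # e.g. _get_collection_name("mydb.b_content") -> "b_content"
        # (illustrative namespace).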
        coll = namespace.split('.', 1)[1]
        return coll

    def _parse_user_dynamic_collection(self, doc):
        '''Parse a user-activity document into the structure the search
        engine expects.
        '''
        if doc.get("content"):
            doc["detail"] = doc.pop("content")
        # Populate the author fields.
        if doc.get("createUser.userId"):
            doc["author.id"] = doc.get("createUser.userId")
        if doc.get("createUser.userName"):
            doc["author.name"] = doc.get("createUser.userName")

        if doc.get("target"):
            doc["fkTag.0"] = doc.pop("target")

        # The content itself must not be searchable.
        doc["op"] = "LDEL"
        return self._parse_doc_to_solr_doc(doc)

    def _parse_t_user_collection(self, doc):
        '''Parse a user document into the structure the search engine
        expects.
        '''
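        # Illustrative mapping: a doc with nickName 'alice', website 'alice-w'
        # and isLocked 'N' gains title.0.name/tag.0.name = 'alice',
        # resurl = '/u/alice-w', status = 'released' and type = 'user'.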
        # Map the user's nickname.
        nickName = doc.pop("nickName", None)
        if nickName:
            doc["title.0.name"] = nickName
            doc["tag.0.name"] = nickName
        # Map the user's description.
        description = doc.pop("description", None)
        if description:
            doc["title.1.name"] = description
            doc["tag.1.name"] = description

        figureurl40 = doc.pop("figureurl40", None)
        if figureurl40:
            doc["imgurl"] = figureurl40

        website = doc.pop("website", None)
        if website:
            doc["resurl"] = u"/u/" + str(website)
            doc["title.2.name"] = website
            doc["tag.2.name"] = website
        # If the user is locked, the user must not be searchable.
        isLocked = doc.pop("isLocked", None)
        if isLocked == "N":
            doc["status"] = u"released"
        elif isLocked == "Y":
            doc["status"] = u"draft"

        # Strip sensitive and redundant fields.
        doc.pop("password", None)
        doc.pop("salt", None)
        doc.pop("phoneNum", None)
        doc.pop("userName", None)

        # Add required metadata.
        doc["type"] = u"user"

        return self._parse_doc_to_solr_doc(doc)

    def _parse_doc_to_solr_doc(self, doc):
        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:

            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def _parse_content_doc(self, doc):
        type = doc.get("type")
        if doc.get("releaseTime"):
            doc["createTime"] = doc.get("releaseTime")
        if (type == "product"):
            return self._parse_product(doc)
        # Pictures, videos and other documents no longer need special handling.
        # if (type == "explain"):
        #     return self._parse_explain(doc)
        # elif(type == "video"):
        #     return self._parse_video(doc)
        # elif(type == "picture"):
        #     return self._paser_picture(doc)
        # else:
        return [doc]

    def _parse_product(self, doc):
        """
        处理项目数据,主要是对项目详情进行处理
        """
        spiltflag = False
        resultlist = []

        flat_doc = self._formatter.format_document(doc)

        # Collect the individual address components.
        adlist = []
        country = flat_doc.get("address.country.name")
        if country:
            self._add_list_with_not_empty_string(adlist, country)
        province = flat_doc.get("address.province.name")
        if province:
            self._add_list_with_not_empty_string(adlist, province)
        city = flat_doc.get("address.city.name")
        if city:
            self._add_list_with_not_empty_string(adlist, city)
        area = flat_doc.get("address.area.name")
        if area:
            self._add_list_with_not_empty_string(adlist, area)
        detail = flat_doc.get("address.detail.name")
        if detail:
            self._add_list_with_not_empty_string(adlist, detail)
        # Join the components into the full address string.
        address_str = "".join(adlist)

        if address_str:
            resultlist.append("项目地址:" + address_str)
        # Developer/builder handling.

        dev_str = self._get_flat_array(flat_doc, "devBuilder.", ".name")
        if dev_str:
            resultlist.append("开发建设方:" + dev_str)
        # Lead designer handling.
        design_str = self._get_flat_array(flat_doc, "buildingMainDesigner.",
                                          ".name")
        if design_str:
            resultlist.append("建筑主创设计师:" + design_str)

        # Building area handling.
        buildingArea = doc.get("buildingArea")
        if buildingArea:
            resultlist.append("建筑面积:" + str(buildingArea) + "㎡")

        doc["detail"] = " / ".join(resultlist)
        return [doc]

    def _add_list_with_not_empty_string(self, v_list, value):
        if value:
            v_list.append(str(value))

    def _get_flat_array(self, doc, prefix, suffix):
        """
        获取扁平化的数组并且连接为一体并返回
        """
        r = []
        i = 0
        while (True):
            value = doc.get(prefix + str(i) + suffix)
            if (value):
                r.append(str(value))
                i = i + 1
            else:
                break
        return ",".join(r)

    def _parse_explain(self, doc):
        """parse the content explain to replace the resurl value to be composited of fkTag
        """
        return [doc]
        ''' No special handling is needed for explain (review) documents any more.
        fkTag=doc.get("fkTag")
        if(isinstance(fkTag,list) and len(fkTag) > 0):
            resurl="/detail/"+str(fkTag[0])
            
            logging.info("resurl is replace from %s to %s" % (doc.get("resurl"),resurl))
            doc["resurl"]=u(resurl)
        else:
            logging.error("fail to change resurl(%s) ,because the fkTag(%s) is not valid" % (str(doc.get("resurl")),str(doc.get("fkTag")) ))
        return [doc] 
        '''

    def _parse_video(self, doc):

        return self._parse_content_list_to_serval(doc, "video", "video")

    def _paser_picture(self, doc):
        """parse the picture content to subdoc 
        
        doclist=[doc]
        logging.debug("parse picture ,the raw doc is %s:" % str(doc))
        picture=doc.get("picture")
        
        if(isinstance(picture, list) and len(picture)>0):
            
            for index,value in enumerate(picture):
                doctemp=doc.copy()
                doctemp["s_picture_id"]=u(value.get("id"))
                doctemp["s_pitcure_name"]=u(value.get("name"))
                doctemp["_id"]=u(doctemp.get("_id")+"_"+str(index))
                doctemp["s_parent_id"]=u(doctemp.get("_id"))
                doctemp["type"]="s_picture"
                doclist.append(doctemp)
            #only picture is existed , to replace s_picture attr
            doc["s_picture"]=picture
        
        # !!!!!there is bug when update picture status  
        return doclist
        """
        return self._parse_content_list_to_serval(doc, "picture", "picture")

    def _parse_content_list_to_serval(self, doc, fieldName, type):
        """parse the picture content to subdoc 
        """
        doclist = [doc]
        logging.debug("parse %s ,the raw doc is %s:" % (fieldName, str(doc)))
        picture = doc.get(fieldName)

        if (isinstance(picture, list) and len(picture) > 0):
            s_field_id = "s_" + fieldName + "_id"
            s_field_name = "s_" + fieldName + "_name"
            new_type = "s_" + type
            for index, value in enumerate(picture):
                doctemp = doc.copy()
                doctemp[s_field_id] = u(value.get("id"))
                doctemp[s_field_name] = u(value.get("name"))
                doctemp["_id"] = u(doctemp.get("_id") + "_" + str(index))
                doctemp["s_parent_id"] = u(doctemp.get("_id"))
                doctemp["type"] = new_type
                doclist.append(doctemp)
            # Only when the list exists do we replace the s_<fieldName> attribute.
            doc["s_" + fieldName] = picture

        # NOTE: there is a known bug when updating the picture status.
        return doclist

    def stop(self):
        """ Stops the instance
        """
        pass

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                "renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update_spec contains the new document.
            # Update the key in Solr based on the unique_key mentioned as
            # parameter.
            update_spec['_id'] = doc[self.unique_key]
            return update_spec
        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_set):
                    if key == to_set or key[len(to_set)] == '.':
                        keys_to_pop.append(key)

            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value
        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_unset):
                    if key == to_unset or key[len(to_unset)] == '.':
                        keys_to_pop.append(key)
                tmp_to_unset = "s_" + to_unset
                if key.startswith(tmp_to_unset):
                    if key == tmp_to_unset or key[len(tmp_to_unset)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        # Need to escape special characters in the document_id.
        document_id = ''.join(
            map(lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
                u(document_id)))

        query = "%s:%s" % (self.unique_key, document_id)
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        logging.debug("before insert the raw doc is :(%s)" % str(doc))
        docs = self._clean_doc(doc, namespace, timestamp)
        logging.debug("before insert the processed doc is :(%s)" % str(doc))
        if docs is None:
            return None
        if not isinstance(docs, list):
            docs = [docs]
        docid = doc.get("_id")
        #self.remove(docid, namespace, timestamp)
        #delete the child node about this file, TODO
        # if docid :
        #     logging.info("remove solr document which id is %s _* ,timestamp is %s" % (str(docid), str(timestamp)))
        #     self.solr.delete(q=u("_id:"+docid+"_*"),
        #                      commit=(self.auto_commit_interval == 0))
        # else:
        #     raise errors.OperationFailed("delete solr document error for the id(%s) is not valid" % str(docid));
        try:
            if self.auto_commit_interval is not None:
                self.solr.add(docs,
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=u(self.auto_commit_interval))
            else:
                self.solr.add(docs, commit=False)
            logging.debug("insert into solr docs:(%s)" % str(docs))
        except UnicodeDecodeError:
            logging.exception(
                "Unable to add processed documents due to UnicodeDecodeError: %r"
                % str(docs))

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d, namespace, timestamp) for d in docs)
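        # NOTE: in this subclass _clean_doc may return None or a list of
        # sub-documents; unlike upsert(), this path passes those values to
        # solr.add() unchanged.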
        if self.chunk_size > 0:
            batch = list(next(cleaned) for i in range(self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(next(cleaned) for i in range(self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        params = self._formatter.format_document(f.get_metadata())
        params[self.unique_key] = params.pop('_id')
        params['ns'] = namespace
        params['_ts'] = timestamp
        params = dict(('literal.' + k, v) for k, v in params.items())

        if self.auto_commit_interval == 0:
            params['commit'] = 'true'

        request = Request(
            os.path.join(self.url, "update/extract?%s" % urlencode(params)))

        request.add_header("Content-type", "application/octet-stream")
        request.data = f
        response = urlopen(request)
        logging.debug(response.read())

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        if document_id:
            self.solr.delete(id=u(document_id),
                             commit=(self.auto_commit_interval == 0))
            self.solr.delete(q=u("_id:" + document_id + "_*"),
                             commit=(self.auto_commit_interval == 0))
        else:
            raise errors.OperationFailed(
                "cannot delete Solr document: the id (%s) is not valid" %
                str(document_id))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
Пример #34
0
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """
    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Solr URL and establish a connection.
        """
        if verify_url(url) is False:
            raise SystemError

        self.solr = Solr(url)
        self.unique_key = unique_key
        self.auto_commit = auto_commit

        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        self.solr.add([doc], commit=False)

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]), commit=False)

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def run_auto_commit(self):
        """Periodically commits to the Solr server.
        """
        self.solr.commit()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        result = self.solr.search('*:*', sort='_ts desc', rows=1)

        if len(result) == 0:
            return None

        return result.docs[0]
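
A hypothetical wiring sketch for the DocManager above. The URL is a
placeholder, a reachable Solr core is assumed, and verify_url() must accept
the URL for the constructor to succeed:

dm = DocManager('http://localhost:8983/solr/collection1', auto_commit=False)
dm.upsert({'_id': 'doc_1', '_ts': 1404073184, 'title': 'hello'})
dm.commit()
print(dm.get_last_doc())
dm.stop()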
Пример #35
0
class Indexer(object):
    """Indexer for PRL."""
    def __init__(self, args: Dict[str, Any], config: Dict[str, Any]):

        self.solr = None
        self.s3 = None
        self.record_identifiers = None
        self.harvester_settings = None

        self.args = args
        self.config = config
        self.oai_pmh_cache = {}

    def connect(self):
        """Initializes the interfaces for all third-party services."""

        self._connect_internal_services()
        if not self.args['dry_run']:
            self._connect_external_services()

    def _connect_internal_services(self):
        """Initializes the interfaces for all third-party services instantiated by this module."""

        try:
            self.record_identifiers = plyvel.DB(os.path.expanduser(
                self.config['leveldb']['record_identifiers']['path']),
                                                create_if_missing=True)
            self.harvester_settings = plyvel.DB(os.path.expanduser(
                self.config['leveldb']['harvester_settings']['path']),
                                                create_if_missing=True)
            self.set_harvester_settings()
        except plyvel.IOError as e:
            raise IndexerError(
                'Failed to instantiate LevelDB instance: {}'.format(repr(e)))

    def _connect_external_services(self):
        """Initializes the interfaces for all third-party services NOT instantiated by this module."""

        try:
            solr_base_url = self.config['solr']['base_url']

            # Make sure we can connect to Solr.
            def solr_ping(base_url):
                """Raises an error if we can't connect to Solr."""
                o = urllib.parse.urlsplit(solr_base_url)
                ping_url = urllib.parse.urlunsplit(
                    o[:2] + (os.path.join(o.path, 'admin/ping'), ) + o[3:])
                requests.get(ping_url).raise_for_status()
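                # e.g. 'http://localhost:8983/solr/prl' is pinged at
                # 'http://localhost:8983/solr/prl/admin/ping' (illustrative URL).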

            solr_ping(solr_base_url)

            self.solr = Solr(solr_base_url, always_commit=True)
            self.s3 = boto3.Session(profile_name=self.config['s3']['configure']
                                    ['profile_name']).client('s3')
        except requests.exceptions.RequestException as e:
            raise IndexerError('Connection failed: {}'.format(e))
        except ProfileNotFound as e:
            raise IndexerError('Failed to initialize S3 session: {}'.format(
                repr(e)))

    def disconnect(self):
        """Closes connections with all third-party services."""

        self._disconnect_internal_services()
        if not self.args['dry_run']:
            self._disconnect_external_services()

    def _disconnect_internal_services(self):
        """Closes connections with all third-party services instantiated by this module."""

        try:
            self.record_identifiers.close()
            self.harvester_settings.close()
        except plyvel.Error as e:
            raise IndexerError(
                'Failed to close the connection to LevelDB: {}'.format(e))

    def _disconnect_external_services(self):
        """Closes connections with all third-party services NOT instantiated by this module."""

        self.solr = None
        self.s3 = None

    def get_harvester_settings_path(self) -> str:
        """Gets the full path of the file containing jOAI harvester settings."""

        return os.path.join(
            os.path.expanduser(self.config['leveldb']['harvester_settings']
                               ['source']['base_path']), self.config['leveldb']
            ['harvester_settings']['source']['files']['scheduled_harvests'])

    def get_harvester_settings_key(self, path: str) -> str:
        """
        Returns a relative path with either one or two components.
        
        Intended to be called ONLY on paths representing institution/repository or collection/set directories.
        """
        for harvest_dir_prefix in self.config['filesystem'][
                'harvest_dir_prefixes']:
            if os.path.isabs(path) and os.path.isabs(
                    harvest_dir_prefix) or not os.path.isabs(
                        path) and not os.path.isabs(harvest_dir_prefix):
                common_path = os.path.commonpath([path, harvest_dir_prefix])
                if os.path.normpath(common_path) == os.path.normpath(
                        harvest_dir_prefix):
                    return os.path.relpath(path, common_path)

    def read_harvester_settings_file(self) -> Dict[str, Dict[str, str]]:
        """Returns a dictionary representing the harvester settings.

        First, tries reading the settings as if the source file is UTF-8 encoded JSON of the following form (used for testing):

        {
            "harvester_settings_key_1": {
                "repository_name": "repository_name_1",
                "base_url": "http://example.edu/oai2",
                "set_spec": "set_spec_1",
                "split_by_set": False
            },
            ...
        }

        If that fails, tries reading the settings as if the source file is a serialized java.util.Hashtable instance from jOAI (used for production).
        """

        harvester_settings_path = self.get_harvester_settings_path()

        try:
            # See if it's in JSON already.
            with open(harvester_settings_path, 'r') as harvester_settings_file:
                # Make sure we transform the key before storing.
                return {
                    self.get_harvester_settings_key(key): metadata
                    for key, metadata in json.load(
                        harvester_settings_file).items()
                }
        except JSONDecodeError as e:
            # Invalid JSON.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))
        except UnicodeDecodeError as e:
            logging.debug('Config file is not JSON: {}'.format(e))

            # Open the file in binary mode and try to parse it with javaobj.
            with open(harvester_settings_path,
                      'rb') as harvester_settings_file:
                pobj = JavaObjectUnmarshaller(
                    harvester_settings_file).readObject()

            scheduled_harvest_class = self.config['leveldb'][
                'harvester_settings']['source']['classes']['scheduled_harvest']
            is_scheduled_harvest = lambda h: scheduled_harvest_class in str(h)

            return {
                self.get_harvester_settings_key(pobj_harvest.harvestDir.path):
                {
                    'repository_name': pobj_harvest.repositoryName,
                    'base_url': pobj_harvest.baseURL,
                    'set_spec': pobj_harvest.setSpec,
                    'split_by_set': pobj_harvest.splitBySet
                }
                for pobj_harvest in list(
                    filter(is_scheduled_harvest, pobj.annotations))
            }
        except Exception as e:
            # Something else went wrong.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))

    def set_harvester_settings(self):
        """Updates the harvester_settings LevelDB instance with the data stored in the source file.
        
        Responds to filesystem event on that file.
        """

        new_harvester_settings = self.read_harvester_settings_file()
        deleted_keys = []
        updated_keys = []

        # Remove all keys from LevelDB that aren't in the harvester settings file.
        harvester_settings_iterator = self.harvester_settings.iterator()
        for key, value in harvester_settings_iterator:
            if key.decode() not in new_harvester_settings:
                self.harvester_settings.delete(key)
                deleted_keys.append(key.decode())

        if deleted_keys:
            logging.info('Deleted harvester settings for %s', deleted_keys)

        # Add all keys in the harvester settings file to LevelDB, since some of their values may have changed.
        for harvest_key, harvest_metadata in new_harvester_settings.items():
            key = harvest_key
            value = json.dumps(harvest_metadata)
            self.harvester_settings.put(key.encode(), value.encode())
            updated_keys.append(key)

        if updated_keys:
            logging.info('Updated harvester settings for %s', updated_keys)

    def update_record(self, path: str):
        """Updates a metadata record in PRL.
        
        Responds to IndexerEventHandler.on_modified filesystem event.
        """

        try:
            # Generate a Solr document from the metadata record.
            with open(path, 'r') as record_file:
                prl_solr_document = self.get_solr_document(record_file)
            pysolr_doc = prl_solr_document.get_pysolr_doc()
            record_identifier = prl_solr_document.get_record_identifier()

            if not self.args['dry_run']:
                if prl_solr_document.original_thumbnail_metadata():
                    thumbnail_saved = self.save_thumbnail(prl_solr_document)
                    if not thumbnail_saved:
                        prl_solr_document.discard_incorrect_thumbnail_url()
                try:
                    self.solr.add([pysolr_doc])
                    logging.debug('%s updated in Solr', record_identifier)
                    self.record_identifiers.put(path.encode(),
                                                record_identifier.encode())
                except plyvel.Error as e:
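                    # The Solr add succeeded but recording the path -> id
                    # mapping in LevelDB failed, so roll back the Solr update
                    # to keep the two stores consistent.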
                    self.solr.delete(id=record_identifier)
                    raise IndexerError(
                        'Failed to PUT on LevelDB: {}'.format(e))
                except Exception as e:
                    raise IndexerError(
                        'Failed to update Solr document: {}'.format(e))

                logging.info('%s updated in PRL', record_identifier)
            else:
                logging.info('DRY-RUN: %s updated in PRL', record_identifier)
        except IndexerError as e:
            if self.args['dry_run']:
                # Log the file path: record_identifier may never have been
                # assigned if the failure happened while building the document.
                logging.error('DRY-RUN: %s would not be updated in PRL: %s',
                              path, e)
            else:
                raise e

    def remove_record(self, path: str):
        """Removes a metadata record from PRL.
        
        Responds to IndexerEventHandler.on_deleted filesystem event.
        """

        if not self.args['dry_run']:
            try:
                record_identifier_encoded = self.record_identifiers.get(
                    path.encode())
                if record_identifier_encoded is None:
                    raise IndexerError(
                        'No record identifier stored in LevelDB for {}'.format(
                            path))
                record_identifier = record_identifier_encoded.decode()
                docs = self.solr.search('id:"{0}"'.format(record_identifier))
                if len(docs) == 0:
                    raise IndexerError('Document not found in Solr: {}'.format(
                        record_identifier))
                elif len(docs) > 1:
                    # This should never happen. If it does, probably an issue with the schema.
                    raise IndexerError(
                        'Solr doesn\'t have unique IDs: {} records found with identifier {}'
                        .format(len(docs), record_identifier))
            except plyvel.Error as e:
                raise IndexerError('Failed to GET on LevelDB: {}'.format(e))
            except IndexerError as e:
                raise e
            except Exception as e:
                raise IndexerError(
                    'Failed to search for Solr document {}: {}'.format(
                        record_identifier, e))

            try:
                self.solr.delete(id=record_identifier)
                logging.debug('%s removed from Solr', record_identifier)
                self.record_identifiers.delete(path.encode())
                for doc in docs:
                    if 'thumbnail_url' in doc:
                        self.unsave_thumbnail(doc['thumbnail_url'],
                                              record_identifier,
                                              doc['institutionKey'],
                                              doc['collectionKey'])
                logging.info('%s removed from PRL', record_identifier)
            except plyvel.Error as e:
                raise IndexerError('Failed to DELETE on LevelDB: {}'.format(e))
            except IndexerError as e:
                raise e
            except Exception as e:
                raise IndexerError(
                    'Failed to remove Solr document: {}'.format(e))
        else:
            logging.info('DRY-RUN: Removed %s', path)

    def get_oai_pmh_metadata(self, base_url: str) -> Dict[str, str]:
        """Returns a dictionary containing top-level metadata and set metadata of an OAI-PMH repository."""

        logging.debug(
            'Retrieving repository and set metadata from OAI-PMH repository %s',
            base_url)
        try:
            metadata = {}

            # All repositories should have this metadata.
            repository_metadata = Sickle(base_url, timeout=60).Identify()
            if hasattr(repository_metadata, 'repositoryIdentifier'):
                metadata['repository_identifier'] = (
                    repository_metadata.repositoryIdentifier)
            if hasattr(repository_metadata, 'repositoryName'):
                metadata['repository_name'] = (
                    repository_metadata.repositoryName)

            # Not all repositories will support sets.
            try:
                set_metadata = Sickle(base_url, timeout=60).ListSets()
                metadata.update({
                    'sets': {s.setSpec: s.setName
                             for s in list(set_metadata)}
                })
            except sickle.oaiexceptions.NoSetHierarchy as e:
                logging.debug(
                    'Failed to list sets from OAI-PMH repository %s: %s',
                    base_url, e)

            return metadata

        except requests.RequestException as e:
            raise IndexerError(
                'Failed to get repository metadata from OAI-PMH repository {}: {}'
                .format(base_url, e))

    def get_solr_document(self, file_object: TextIOWrapper) -> PRLSolrDocument:
        """Builds a Solr document for PRL."""
        (identifier, institution_key, institution_name, collection_key,
         collection_name) = self.get_key_record_metadata(file_object.name)

        if self.args['dry_run']:
            s3_domain_name = 'example.com'
        else:
            s3_domain_name = self.config['s3']['sync']['destination'][
                'domain_name']

        dublin_core_config = self.config['metadata']['dublin_core']
        return PRLSolrDocument(
            file_object, identifier, institution_key, institution_name,
            collection_key, collection_name,
            dublin_core_config['solr_mapping'],
            dublin_core_config['external_link_field_patterns'],
            dublin_core_config['thumbnail_field_patterns'],
            s3_domain_name)

    def get_key_record_metadata(self, file_path: str):
        """Determines collection and institution metadata from the filepath of the record.

        Returns a 5-tuple containing the following elements:
            - an identifier for the record
            - an identifier for the institution
            - a human-readable string for the institution
            - an identifier for the collection
            - a human-readable string for the collection

        Side effects:
            - updates local LevelDB cache with OAI-PMH repository metadata
        """

        # ---------------------------------------- #
        # --- Gather all the data we can find. --- #
        # ---------------------------------------- #

        # Get the record identifier from the filename.
        identifier = urllib.parse.unquote(
            os.path.splitext(os.path.basename(file_path))[0])

        try:
            # The harvester settings will tell us how to get the other metadata.
            harvester_settings_key = None

            potential_harvester_settings_keys = map(
                self.get_harvester_settings_key, [
                    os.path.dirname(file_path),
                    os.path.dirname(os.path.dirname(file_path))
                ])
            # Keep track of keys that we tried, but failed.
            tried_keys = []

            for potential_harvester_settings_key in potential_harvester_settings_keys:
                potential_harvester_settings_serialized_encoded = self.harvester_settings.get(
                    potential_harvester_settings_key.encode())

                if potential_harvester_settings_serialized_encoded:
                    # Found it!
                    harvester_settings_key = potential_harvester_settings_key
                    break
                else:
                    tried_keys.append(potential_harvester_settings_key)

            if harvester_settings_key is not None:
                harvester_settings = json.loads(
                    potential_harvester_settings_serialized_encoded.decode())
            else:
                # This should never happen. Harvester settings should represent all harvested files.
                raise IndexerError(
                    'Cannot find harvester settings in LevelDB for {}'.format(
                        tried_keys))

        except plyvel.Error as e:
            # We can't go on without LevelDB.
            raise IndexerError('Failed to GET on LevelDB: {}'.format(e))
        except AttributeError as e:
            # This should never happen. Harvester settings should represent all harvested files.
            raise IndexerError(
                'Cannot find harvester settings in LevelDB for {}'.format(
                    harvester_settings_key))
        except JSONDecodeError as e:
            # This should never happen.
            raise IndexerError(
                'Harvester settings are not valid JSON: {}'.format(e))

        base_url = harvester_settings['base_url']
        institution_name = harvester_settings['repository_name']
        set_spec = harvester_settings['set_spec']
        split_by_set = harvester_settings['split_by_set']

        # Fetch repository metadata, and write to the in-memory cache if necessary.
        if base_url in self.oai_pmh_cache:
            oai_pmh_metadata = self.oai_pmh_cache[base_url]
        else:
            oai_pmh_metadata = self.get_oai_pmh_metadata(base_url)
            self.oai_pmh_cache[base_url] = oai_pmh_metadata

        # ----------------------------------------- #
        # --- Determine which values to return. --- #
        # ----------------------------------------- #

        # This is the most common case: an institution specifies a specific set for us to harvest.
        individual_set_harvest = set_spec != '' and not split_by_set

        # This is the case when an institution wants us to harvest all sets from their repository.
        full_repository_harvest = set_spec == '' and split_by_set

        # This is the case when an institution wants us to treat their entire repository as a PRL "collection".
        single_collection_repository = set_spec == '' and not split_by_set

        # Set the return values.
        if individual_set_harvest:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = set_spec
            collection_name = oai_pmh_metadata['sets'][set_spec]

        elif full_repository_harvest:
            institution_key = harvester_settings_key
            # With splitBySet the record's parent directory is named after the
            # setSpec, so it serves as both the collection key and the lookup
            # key for the set name (set_spec itself is empty in this case).
            collection_key = os.path.basename(os.path.dirname(file_path))
            collection_name = oai_pmh_metadata['sets'][collection_key]

        elif single_collection_repository:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = os.path.basename(harvester_settings_key)
            collection_name = oai_pmh_metadata['repository_name']
        else:
            raise IndexerError(
                'Unable to handle harvest configuration: {}'.format(
                    harvester_settings_key))

        return (identifier, institution_key, institution_name, collection_key,
                collection_name)

    def save_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts thumbnail on the local filesystem and on S3.

        Returns the Boolean value of whether or not a thumbnail was saved."""

        thumbnail_path = self.download_thumbnail(prl_solr_document)
        if thumbnail_path:
            self.upload_thumbnail(prl_solr_document, thumbnail_path)
            logging.debug('%s thumbnail saved',
                          prl_solr_document.get_record_identifier())
            return True
        else:
            return False

    def download_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts the thumbnail file in its place on the file system.

        Returns its path, or None if no thumbnail could be fetched."""

        # TODO: need better exception handling here
        thumbnail_s3_key = prl_solr_document.get_thumbnail_s3_key()
        try:
            filepath = os.path.join(
                os.path.abspath(
                    os.path.expanduser(self.config['s3']['sync']['source'])),
                thumbnail_s3_key)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)

            original_thumbnail_url = (
                prl_solr_document.original_thumbnail_metadata()['url'])
            n_tries = 3
            for try_i in range(1, n_tries + 1):
                try:
                    response = requests.get(original_thumbnail_url,
                                            timeout=30,
                                            stream=True)
                    # Fail on 4xx or 5xx
                    response.raise_for_status()
                    # Make sure the Content-Type is what we expect. Some servers discriminate against robots.
                    if re.match('image/.+',
                                response.headers.get('Content-Type', '')):
                        with open(filepath, 'wb') as image_file:
                            for chunk in response.iter_content(
                                    chunk_size=1024):
                                image_file.write(chunk)
                        logging.debug(
                            '%s thumbnail put on local filesystem at %s',
                            thumbnail_s3_key, filepath)
                        return filepath
                    else:
                        logging.debug('Robots cannot access %s',
                                      original_thumbnail_url)
                        return None
                except requests.Timeout as e:
                    if try_i < n_tries:
                        msg = 'Thumbnail download timed out, retrying...'
                        logging.info(msg)
                        # Continue loop
                    else:
                        # No more tries left, so fail
                        msg = 'Failed to download thumbnail after {} tries: {}'.format(
                            n_tries, str(e))
                        logging.debug(msg)
                        return None
                except (requests.RequestException, IOError) as e:
                    msg = 'Failed to download thumbnail: {}'.format(e)
                    logging.debug(msg)
                    return None
        except Exception as e:
            raise IndexerError(
                'Failed to put thumbnail on local filesystem: {}'.format(e))

    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3."""

        try:
            with open(filepath, 'rb') as thumbnail_file:
                self.s3.put_object(
                    Bucket=self.config['s3']['sync']['destination']['s3_uri'],
                    Key=prl_solr_document.get_thumbnail_s3_key(),
                    Body=thumbnail_file,
                    ContentType=prl_solr_document.original_thumbnail_metadata()
                    ['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            raise IndexerError('Failed to put thumbnail on S3: {}'.format(e))

    def unsave_thumbnail(self, thumbnail_url: str, record_identifier: str,
                         institution_key: str, collection_keys: List[str]):
        """Removes thumbnail from the local filesystem and from S3."""

        try:
            thumbnail_s3_key = os.path.relpath(
                urllib.parse.urlparse(
                    urllib.parse.unquote(thumbnail_url)).path, '/')
            filepath = os.path.join(
                os.path.abspath(
                    os.path.expanduser(self.config['s3']['sync']['source'])),
                thumbnail_s3_key)
            os.remove(filepath)
            logging.debug('%s thumbnail removed from local filesystem at %s',
                          record_identifier, filepath)

            # TODO: clean up empty parent directories
            self.s3.delete_object(
                Bucket=self.config['s3']['sync']['destination']['s3_uri'],
                Key=thumbnail_s3_key)
            logging.debug('%s thumbnail removed from S3', record_identifier)
        except BotoCoreError as e:
            raise IndexerError(
                'Failed to remove thumbnail from S3: {}'.format(e))
        except Exception as e:
            raise IndexerError(
                'Failed to remove thumbnail from local filesystem: {}'.format(
                    e))
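# A minimal standalone sketch (not part of the example above) of the prefix
# matching that get_harvester_settings_key performs: a harvest directory is
# reduced to a one- or two-component path relative to a configured prefix.
# The paths below are hypothetical.
import os


def harvester_settings_key(path, prefixes):
    for prefix in prefixes:
        # Only compare paths of the same kind (both absolute or both relative).
        if os.path.isabs(path) == os.path.isabs(prefix):
            common = os.path.commonpath([path, prefix])
            if os.path.normpath(common) == os.path.normpath(prefix):
                return os.path.relpath(path, common)


assert harvester_settings_key('/var/joai/harvests/institution-a/set-1',
                              ['/var/joai/harvests']) == 'institution-a/set-1'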
Example #36
0
#!/usr/bin/env python

import sys

from pysolr import Solr

from docs import docs

print('number of docs:', len(docs))

solr_url = 'http://localhost:8990/solr'
print('connecting to Solr at', solr_url)

conn = Solr(solr_url)

if '-d' in sys.argv:
    print('deleting docs in Solr')
    conn.delete(q='*:*')
    conn.commit()
else:
    print "use flag '-d' to delete all Solr docs before adding docs"

print('adding docs to Solr')
conn.add(docs)
conn.commit()
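# Hedged follow-up (not part of the original script): the add/commit pair can
# also be collapsed into a single request, since pysolr's Solr.add() accepts a
# commit keyword; a commitWithin value (milliseconds, passed as a string) is
# another option when an immediate hard commit isn't needed.
def add_and_commit(connection, documents):
    # One round trip: index the documents and hard-commit in the same call.
    connection.add(documents, commit=True)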
Example #37
0
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    @wrap_exceptions
    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(".*%s\Z" % wc_pattern[1:]))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(
                    re.compile("\A%s.*" % wc_pattern[:-1]))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = doc.pop("_id")

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field) for regex in self._dynamic_field_regexes
                )
            return dict((k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
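            # Carry over the bookkeeping fields from the existing document so
            # the replacement can still be found by timestamp and namespace.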
            update_spec['_ts'] = doc['_ts']
            update_spec['ns'] = doc['ns']
            return update_spec
        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_set):
                    if key == to_set or key[len(to_set)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value
        for to_unset in update_spec.get("$unset", []):
            doc.pop(to_unset)
        return doc

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        query = "%s:%s" % (self.unique_key, str(doc['_id']))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated)
            return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=str(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc)], commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": self.auto_commit_interval
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d) for d in docs)
        if self.chunk_size > 0:
            # Pull documents off the generator in fixed-size chunks. Using
            # itertools.islice (standard library; needs `import itertools` at
            # module scope) avoids relying on StopIteration escaping a
            # generator expression, which raises RuntimeError on Python 3.7+
            # (PEP 479).
            batch = list(itertools.islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(itertools.islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc["_id"]),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    @wrap_exceptions
    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
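# A minimal sketch of the flattening described in _clean_doc's docstring (the
# real implementation is mongo-connector's DocumentFlattener, not this helper):
# nested dicts become dot-separated keys and list items get their index as the
# final path component.
def flatten(doc, prefix=''):
    flat = {}
    for key, value in doc.items():
        path = '%s.%s' % (prefix, key) if prefix else str(key)
        if isinstance(value, dict):
            flat.update(flatten(value, path))
        elif isinstance(value, list):
            flat.update(flatten(dict(enumerate(value)), path))
        else:
            flat[path] = value
    return flat


assert flatten({'a': 2, 'b': {'c': {'d': 5}}, 'e': [6, 7, 8]}) == {
    'a': 2, 'b.c.d': 5, 'e.0': 6, 'e.1': 7, 'e.2': 8
}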
Example #38
0
class SearchBackend(BaseSearchBackend):
    def __init__(self):
        if not hasattr(settings, "HAYSTACK_SOLR_URL"):
            raise ImproperlyConfigured("You must specify a HAYSTACK_SOLR_URL in your settings.")

        # DRL_TODO: This should handle the connection more graceful, especially
        #           if the backend is down.
        self.conn = Solr(settings.HAYSTACK_SOLR_URL)

    def update(self, index, iterable, commit=True):
        docs = []

        try:
            for obj in iterable:
                doc = {}
                doc["id"] = self.get_identifier(obj)
                doc["django_ct_s"] = "%s.%s" % (obj._meta.app_label, obj._meta.module_name)
                doc["django_id_s"] = force_unicode(obj.pk)
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")
            pass

        self.conn.add(docs, commit=commit)

    def remove(self, obj, commit=True):
        solr_id = self.get_identifier(obj)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models=[], commit=True):
        if not models:
            # *:* matches all docs in Solr
            self.conn.delete(q="*:*", commit=commit)
        else:
            models_to_delete = []

            for model in models:
                models_to_delete.append("django_ct_s:%s.%s" % (model._meta.app_label, model._meta.module_name))

            self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

        # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
    ):
        if len(query_string) == 0:
            return []

        kwargs = {"fl": "* score"}

        if fields:
            kwargs["fl"] = fields

        if sort_by is not None:
            kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            kwargs["rows"] = end_offset

        if highlight is True:
            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()

            for key, value in date_facets.items():
                # Date-based facets in Solr kinda suck.
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get("start_date"))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get("end_date"))
                kwargs["f.%s.facet.date.gap" % key] = value.get("gap")

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = ["%s:%s" % (field, value) for field, value in query_facets.items()]

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        raw_results = self.conn.search(query_string, **kwargs)
        return self._process_results(raw_results, highlight=highlight)

    def more_like_this(self, model_instance):
        from haystack.sites import site, NotRegistered

        index = site.get_index(model_instance.__class__)
        field_name = index.get_content_field()
        raw_results = self.conn.more_like_this("id:%s" % self.get_identifier(model_instance), field_name, fl="*,score")
        return self._process_results(raw_results)

    def _process_results(self, raw_results, highlight=False):
        results = []
        facets = {}

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(
                        zip(facets[key][facet_field][::2],
                            facets[key][facet_field][1::2]))

        for raw_result in raw_results.docs:
            app_label, module_name = raw_result["django_ct_s"].split(".")
            additional_fields = {}

            for key, value in raw_result.items():
                additional_fields[str(key)] = self.conn._to_python(value)

            del (additional_fields["django_ct_s"])
            del (additional_fields["django_id_s"])
            del (additional_fields["score"])

            if raw_result["id"] in getattr(raw_results, "highlighting", {}):
                additional_fields["highlighted"] = raw_results.highlighting[raw_result["id"]]

            result = SearchResult(
                app_label, module_name, raw_result["django_id_s"], raw_result["score"], **additional_fields
            )
            results.append(result)

        return {"results": results, "hits": raw_results.hits, "facets": facets}
Example #39
0
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.default_solr.decoder,
                                   json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request('POST',
                                            'update/?commit=true',
                                            body=xml_body,
                                            headers={
                                                'Content-type':
                                                'text/xml; charset=utf-8',
                                            })
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get',
                          'select/?q=doc&wt=json')
        self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response({
            'server': 'crapzilla'
        }, '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>'
                                            )
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipUnless(HAS_LXML,
                         "Cannot test Tomcat error extraction without lxml")
    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses, which currently require lxml.html to parse"""

        # Tomcat.
        resp_1 = self.solr._scrape_response({
            'server': 'coyote'
        }, '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>'
                                            )
        self.assertEqual(resp_1, ('messed up.', ''))

        # Broken Tomcat.
        resp_2 = self.solr._scrape_response({
            'server': 'coyote'
        }, '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>'
                                            )
        self.assertEqual(resp_2, (
            None,
            u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'
        ))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc',
            **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1),
                          ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue(
            '<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{
            'id': 'doc_6',
            'title': 'Important doc'
        }],
                      boost={'title': 10.0})

        self.solr.add([{
            'id': 'doc_7',
            'title': 'Spam doc doc'
        }],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
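# Hedged aside (not part of the test case above): setUp needs a live Solr core
# at localhost:8983, but pure helpers such as _from_python can be exercised
# without any server, because constructing pysolr.Solr does not send a request
# until a search or update is actually made.
import datetime

from pysolr import Solr

offline = Solr('http://localhost:8983/solr/core0')  # no request is made here
assert offline._from_python(datetime.date(2013, 1, 18)) == '2013-01-18T00:00:00Z'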
Example #40
0
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.default_solr.decoder,
                                   json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request('POST',
                                            'update/?commit=true',
                                            body=xml_body,
                                            headers={
                                                'Content-type':
                                                'text/xml; charset=utf-8',
                                            })
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get',
                          'select/?q=doc&wt=json')
        self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__soft_commit(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body, softCommit=True)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response(
            {'server': 'crapzilla'},
            '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipIf(
        sys.version_info < (2, 7),
        reason=u'Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing')
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n')
        self.assertEqual(resp_3, ("Invalid Date String:'2015-03-23 10:43:33'",
                                  "Invalid Date String:'2015-03-23 10:43:33'"))

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response({'server': 'coyote'},
                                            """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""")
        self.assertEqual(resp_4, (
            u"Internal Server Error",
            u"org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)"
        ))

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>')
        self.assertEqual(resp_0, ('Something broke!', ''))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({'server': 'coyote'},
                                                       bogus_xml)
        self.assertEqual(reason, None)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc',
            **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1),
                          ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue(
            '<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        originalDocs = self.solr.search('doc')
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({'id': doc['id'], 'popularity': 5})
        self.solr.add(updateList, fieldUpdates={'popularity': 'inc'})

        updatedDocs = self.solr.search('doc')
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc,
                updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['popularity'],
                             originalDoc['popularity'] + 5)
            self.assertTrue(
                all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys()
                    if k not in ['_version_', 'popularity']))

        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        originalDocs = self.solr.search('multivalued')
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({
                'id': doc['id'],
                'word_ss': ['epsilon', 'gamma']
            })
        self.solr.add(updateList, fieldUpdates={'word_ss': 'add'})

        updatedDocs = self.solr.search('multivalued')
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc,
                updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['word_ss'],
                             originalDoc['word_ss'] + ['epsilon', 'gamma'])
            self.assertTrue(
                all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys()
                    if k not in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{'id': 'doc_6', 'title': 'Newly added doc'}],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{'id': 'doc_6', 'title': 'Newly added doc'}],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
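
These tests assume a `setUp` fixture that points `self.solr` at a local core seeded with a handful of sample documents. A minimal sketch of that wiring follows; the URL, core name, and sample docs are assumptions and will not reproduce every assertion above exactly.

import unittest

from pysolr import Solr


class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        # Assumed local core; point this at your own Solr instance.
        self.solr = Solr('http://localhost:8983/solr/core0')
        self.solr.delete(q='*:*')
        # A few docs matching 'doc', two of which also match 'example'.
        self.solr.add([
            {'id': 'doc_1', 'title': 'Example doc 1', 'price': 12.59, 'popularity': 10},
            {'id': 'doc_2', 'title': 'Another example doc 2', 'price': 13.69, 'popularity': 7},
            {'id': 'doc_4', 'title': 'A boring doc rock thing', 'price': 2.65, 'popularity': 2},
        ])
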
Example #41
0
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.field_list = []
        self._build_fields()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
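        # ADMIN_URL (defined elsewhere) is assumed to point at a schema/Luke handler
        # whose decoded response looks like
        # {'schema': {'fields': {'id': {...}, ...}, 'dynamicFields': {'*_txt': {...}, ...}}};
        # _parse_fields() keeps only the field names.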
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % wc_pattern[1:]))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % wc_pattern[:-1]))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        def flattened(doc):
            def flattened_kernel(doc, path):
                for k, v in doc.items():
                    path.append(k)
                    if isinstance(v, dict):
                        for inner_k, inner_v in flattened_kernel(v, path):
                            yield inner_k, inner_v
                    elif isinstance(v, list):
                        for li, lv in enumerate(v):
                            path.append(str(li))
                            if isinstance(lv, dict):
                                for dk, dv in flattened_kernel(lv, path):
                                    yield dk, dv
                            else:
                                yield ".".join(path), lv
                            path.pop()
                    else:
                        yield ".".join(path), v
                    path.pop()

            return dict(flattened_kernel(doc, []))

        # Translate the _id field to whatever unique key we're using
        doc[self.unique_key] = doc["_id"]
        flat_doc = flattened(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:

            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        try:
            if self.auto_commit_interval is not None:
                self.solr.add([self._clean_doc(doc)],
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=str(self.auto_commit_interval))
            else:
                self.solr.add([self._clean_doc(doc)], commit=False)
        except SolrError:
            raise errors.OperationFailed("Could not insert %r into Solr" %
                                         bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        try:
            cleaned = (self._clean_doc(d) for d in docs)
            if self.auto_commit_interval is not None:
                self.solr.add(cleaned,
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=str(self.auto_commit_interval))
            else:
                self.solr.add(cleaned, commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]),
                         commit=(self.auto_commit_interval == 0))

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
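
A brief usage sketch of this DocManager; the Solr URL, the ObjectId, and the document fields below are illustrative assumptions, and fields not present in the target schema will be dropped by `_clean_doc()`.

# Illustrative only: URL, commit interval and document contents are assumptions.
dm = DocManager('http://localhost:8983/solr/mongo_connector',
                auto_commit_interval=0,   # 0 means commit on every add
                unique_key='_id')

# A nested Mongo-style document; _clean_doc() flattens it to
# {'_id': ..., 'user.name': 'alice', 'tags.0': 'a', 'tags.1': 'b', '_ts': ...}
dm.upsert({'_id': '507f191e810c19729de860ea',
           'user': {'name': 'alice'},
           'tags': ['a', 'b'],
           '_ts': 6345911622950912})

print(dm.get_last_doc())   # most recently indexed document, sorted by _ts
dm.remove({'_id': '507f191e810c19729de860ea'})
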
Example #42
0
class SearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
    
    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        
        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_SOLR_URL in your settings.')
        
        timeout = getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10)
        self.conn = Solr(settings.HAYSTACK_SOLR_URL, timeout=timeout)
    
    def update(self, index, iterable, commit=True):
        docs = []
        
        try:
            for obj in iterable:
                doc = {}
                doc['id'] = self.get_identifier(obj)
                doc['django_ct'] = "%s.%s" % (obj._meta.app_label, obj._meta.module_name)
                doc['django_id'] = force_unicode(obj.pk)
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")
        
        self.conn.add(docs, commit=commit)

    def remove(self, obj_or_string, commit=True):
        solr_id = self.get_identifier(obj_or_string)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models=[], commit=True):
        if not models:
            # *:* matches all docs in Solr
            self.conn.delete(q='*:*', commit=commit)
        else:
            models_to_delete = []
            
            for model in models:
                models_to_delete.append("django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))
            
            self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)
        
        # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, collapse_field=None, collapse_max=None, collapse_type=None, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        
        kwargs = {
            'fl': '* score',
        }
        
        if fields:
            kwargs['fl'] = fields
        
        if sort_by is not None:
            kwargs['sort'] = sort_by
        
        if start_offset is not None:
            kwargs['start'] = start_offset
        
        if end_offset is not None:
            kwargs['rows'] = end_offset
        
        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '70'
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1
        
        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets
        
        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            
            for key, value in date_facets.items():
                # Date-based facets in Solr kinda suck.
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_string = value.get('gap_by').upper()
                
                if value.get('gap_amount') != 1:
                    gap_string = "%d%sS" % (value.get('gap_amount'), gap_string)
                
                kwargs["f.%s.facet.date.gap" % key] = "/%s" % gap_string
        
        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets.items()]
        
        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)
        
        if collapse_field is not None:
            #http://wiki.apache.org/solr/FieldCollapsing
            #&collapse.field=url&collapse.max=1&collapse.type=normal
            kwargs['collapse'] = 'true'
            kwargs['collapse.type'] = collapse_type
            kwargs['collapse.max'] = collapse_max  # assumed to be an int
            kwargs['collapse.field'] = collapse_field
        
        raw_results = self.conn.search(query_string, **kwargs)
        return self._process_results(raw_results, highlight=highlight)
    
    def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, **kwargs):
        index = self.site.get_index(model_instance.__class__)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }
        
        if start_offset is not None:
            params['start'] = start_offset
        
        if end_offset is not None:
            params['rows'] = end_offset
        
        if additional_query_string:
            params['fq'] = additional_query_string
        
        raw_results = self.conn.more_like_this("id:%s" % self.get_identifier(model_instance), field_name, **params)
        return self._process_results(raw_results)
    
    def _process_results(self, raw_results, highlight=False):
        from haystack import site
        results = []
        hits = raw_results.hits
        facets = {}
        spelling_suggestion = None
        
        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }
            
            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = zip(facets[key][facet_field][::2], facets[key][facet_field][1::2])
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
        
        indexed_models = site.get_indexed_models()
        
        for raw_result in raw_results.docs:
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)
            
            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)
                    
                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)
                
                del(additional_fields['django_ct'])
                del(additional_fields['django_id'])
                del(additional_fields['score'])
                
                if raw_result['id'] in getattr(raw_results, 'highlighting', {}):
                    additional_fields['highlighted'] = raw_results.highlighting[raw_result['id']]
                
                result = SearchResult(app_label, model_name, raw_result['django_id'], raw_result['score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1
        
        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
    
    def build_schema(self, fields):
        content_field_name = ''
        schema_fields = []
        
        for field_name, field_class in fields.items():
            field_data = {
                'field_name': field_name,
                'type': 'text',
                'indexed': 'true',
                'multi_valued': 'false',
            }
            
            if field_class.document is True:
                content_field_name = field_name
            
            if field_class.indexed is False:
                field_data['indexed'] = 'false'
            
            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            # DRL_FIXME: Also think about removing `isinstance` and replacing
            #            it with a method call/string returned (like 'text' or
            #            'date').
            if isinstance(field_class, (DateField, DateTimeField)):
                field_data['type'] = 'date'
            elif isinstance(field_class, IntegerField):
                field_data['type'] = 'slong'
            elif isinstance(field_class, FloatField):
                field_data['type'] = 'sfloat'
            elif isinstance(field_class, BooleanField):
                field_data['type'] = 'boolean'
            elif isinstance(field_class, MultiValueField):
                field_data['multi_valued'] = 'true'
            
            schema_fields.append(field_data)
        
        return (content_field_name, schema_fields)
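
The `(content_field_name, schema_fields)` pair returned by `build_schema()` is normally rendered into a schema template. A hedged sketch of that last step, where `backend` is an instance of the class above and `fields` is the site's field mapping (both assumed):

content_field_name, schema_fields = backend.build_schema(fields)

# Render each field_data dict into a <field/> declaration; the surrounding
# schema.xml template (field types, copyField rules, etc.) is assumed.
field_xml = "\n".join(
    '<field name="%(field_name)s" type="%(type)s" indexed="%(indexed)s" '
    'multiValued="%(multi_valued)s"/>' % field_data
    for field_data in schema_fields
)
print(field_xml)
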
Example #43
0
                output[header] = newTime.isoformat() + 'Z'
            except Exception:
                # print(col)
                print(lineno)
                validRow = False
            # print newTime.isoformat()
        elif header == 'fb_assoc':
            output[header] = col.strip().split(' ')
        elif header == 'geoloc':
            try:
                cleanCol = col.replace('geolocation{latitude=','').replace('longitude=','').replace('}','').replace(', ',',')
                # print cleanCol
                if cleanCol != 'null':
                    output[header] = cleanCol
            except Exception:
                print(lineno)
                validRow = False
        else:
            output[header] = col
    if validRow:
        data.append(output)

    # update the index every 10000 documents (reduces overhead)
    if i > (10000*index):
        conn.add(data)
        data = []
        index = index + 1
    i = i + 1

if data:
    conn.add(data)
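
The fragment above buffers rows and calls `conn.add()` once per 10,000 documents to cut request overhead. A self-contained sketch of the same batching pattern; the CSV path, Solr URL, and final commit are assumptions.

import csv

from pysolr import Solr


def batch_index(csv_path, solr_url, batch_size=10000):
    """Stream rows from a CSV file into Solr, adding them in fixed-size batches."""
    conn = Solr(solr_url)
    batch = []
    with open(csv_path, newline='') as fh:
        for row in csv.DictReader(fh):
            batch.append(row)
            if len(batch) >= batch_size:
                conn.add(batch)   # one HTTP request per batch
                batch = []
    if batch:                     # flush the final partial batch
        conn.add(batch)
    conn.commit()


# batch_index('events.csv', 'http://localhost:8983/solr/events')  # assumed paths
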
Example #44
0
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        "/",
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if "URL" not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias
            )

        self.collate = connection_options.get("COLLATE_SPELLING", True)

        self.conn = Solr(
            connection_options["URL"],
            timeout=self.timeout,
            **connection_options.get("KWARGS", {})
        )
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={"data": {"index": index, "object": get_identifier(obj)}},
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e, exc_info=True)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {"commit": commit, "id": solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Solr: %s",
                solr_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q="*:*", commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Solr index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Solr index: %s", e, exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to query Solr using '%s': %s", query_string, e, exc_info=True
            )
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get("highlight"),
            result_class=kwargs.get("result_class", SearchResult),
            distance_point=kwargs.get("distance_point"),
        )

    def build_search_kwargs(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        within=None,
        dwithin=None,
        distance_point=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        stats=None,
        collate=None,
        **extra_kwargs
    ):

        index = haystack.connections[self.connection_alias].get_unified_index()

        kwargs = {"fl": "* score", "df": index.document_field}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs["fl"] = fields

        if sort_by is not None:
            if sort_by in ["distance asc", "distance desc"] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point["point"].coords
                kwargs["sfield"] = distance_point["field"]
                kwargs["pt"] = "%s,%s" % (lat, lng)

                if sort_by == "distance asc":
                    kwargs["sort"] = "geodist() asc"
                else:
                    kwargs["sort"] = "geodist() desc"
            else:
                if sort_by.startswith("distance "):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            kwargs["rows"] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:

            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

            if isinstance(highlight, dict):
                # autoprefix highlighter options with 'hl.', all of them start with it anyway
                # this makes option dicts shorter: {'maxAnalyzedChars': 42}
                # and lets some of options be used as keyword arguments: `.highlight(preserveMulti=False)`
                kwargs.update(
                    {
                        key if key.startswith("hl.") else ("hl." + key): highlight[key]
                        for key in highlight.keys()
                    }
                )
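                # For example, an assumed call like .highlight(snippets=3, fragsize=100)
                # reaches this backend as {'snippets': 3, 'fragsize': 100} and is
                # forwarded to Solr as {'hl.snippets': 3, 'hl.fragsize': 100},
                # overriding the defaults set above.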

        if collate is None:
            collate = self.collate
        if self.include_spelling is True:
            kwargs["spellcheck"] = "true"
            kwargs["spellcheck.collate"] = str(collate).lower()
            kwargs["spellcheck.count"] = 1

            if spelling_query:
                kwargs["spellcheck.q"] = spelling_query

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs[
                        "f.%s.facet.%s" % (facet_field, key)
                    ] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()
            kwargs["facet.date.other"] = "none"

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get("start_date")
                )
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get("end_date")
                )
                gap_by_string = value.get("gap_by").upper()
                gap_string = "%d%s" % (value.get("gap_amount"), gap_by_string)

                if value.get("gap_amount") != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = "+%s/%s" % (
                    gap_string,
                    gap_by_string,
                )

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True
            )

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(model_choices)))

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        if stats:
            kwargs["stats"] = "true"

            for k in stats.keys():
                kwargs["stats.field"] = k

                for facet in stats[k]:
                    kwargs["f.%s.stats.facet" % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault("fq", [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within["point_1"], within["point_2"]
            )
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = "%s:[%s,%s TO %s,%s]" % (
                within["field"],
                min_lat,
                min_lng,
                max_lat,
                max_lng,
            )
            kwargs["fq"].append(bbox)

        if dwithin is not None:
            kwargs.setdefault("fq", [])
            lng, lat = dwithin["point"].coords
            geofilt = "{!geofilt pt=%s,%s sfield=%s d=%s}" % (
                lat,
                lng,
                dwithin["field"],
                dwithin["distance"].km,
            )
            kwargs["fq"].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)
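        # Illustrative (assumed) result: build_search_kwargs('title:django',
        # facets={'author': {}}) would return roughly
        #   {'fl': '* score', 'df': <document field>, 'facet': 'on',
        #    'facet.field': dict_keys(['author']), 'fq': ['django_ct:( ... )']},
        # which is then passed straight to pysolr's Solr.search().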

        return kwargs

    def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs
    ):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = (
            connections[self.connection_alias]
            .get_unified_index()
            .get_index(model_klass)
        )
        field_name = index.get_content_field()
        params = {"fl": "*,score"}

        if start_offset is not None:
            params["start"] = start_offset

        if end_offset is not None:
            params["rows"] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True
            )

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params["fq"] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True,
            )
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(
        self, raw_results, highlight=False, result_class=None, distance_point=None
    ):
        from haystack import connections

        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = spelling_suggestions = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, "stats"):
            stats = raw_results.stats.get("stats_fields", {})

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(
                        zip(
                            facets[key][facet_field][::2],
                            facets[key][facet_field][1::2],
                        )
                    )

        if self.include_spelling and hasattr(raw_results, "spellcheck"):
            try:
                spelling_suggestions = self.extract_spelling_suggestions(raw_results)
            except Exception as exc:
                self.log.error(
                    "Error extracting spelling suggestions: %s",
                    exc,
                    exc_info=True,
                    extra={"data": {"spellcheck": raw_results.spellcheck}},
                )

                if not self.silently_fail:
                    raise

                spelling_suggestions = None

            if spelling_suggestions:
                # Maintain compatibility with older versions of Haystack which returned a single suggestion:
                spelling_suggestion = spelling_suggestions[-1]
                assert isinstance(spelling_suggestion, six.string_types)
            else:
                spelling_suggestion = None

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(
                        index.fields[string_key], "convert"
                    ):
                        additional_fields[string_key] = index.fields[
                            string_key
                        ].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])
                del (additional_fields["score"])

                if raw_result[ID] in getattr(raw_results, "highlighting", {}):
                    additional_fields["highlighted"] = raw_results.highlighting[
                        raw_result[ID]
                    ]

                if distance_point:
                    additional_fields["_point_of_origin"] = distance_point

                    if raw_result.get("__dist__"):
                        from django.contrib.gis.measure import Distance

                        additional_fields["_distance"] = Distance(
                            km=float(raw_result["__dist__"])
                        )
                    else:
                        additional_fields["_distance"] = None

                result = result_class(
                    app_label,
                    model_name,
                    raw_result[DJANGO_ID],
                    raw_result["score"],
                    **additional_fields
                )
                results.append(result)
            else:
                hits -= 1

        return {
            "results": results,
            "hits": hits,
            "stats": stats,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
            "spelling_suggestions": spelling_suggestions,
        }

    def extract_spelling_suggestions(self, raw_results):
        # There are many different formats for Legacy, 6.4, and 6.5 e.g.
        # https://issues.apache.org/jira/browse/SOLR-3029 and depending on the
        # version and configuration the response format may be a dict of dicts,
        # a list of dicts, or a list of strings.

        collations = raw_results.spellcheck.get("collations", None)
        suggestions = raw_results.spellcheck.get("suggestions", None)

        # We'll collect multiple suggestions here. For backwards
        # compatibility with older versions of Haystack we'll still return
        # only a single suggestion but in the future we can expose all of
        # them.

        spelling_suggestions = []

        if collations:
            if isinstance(collations, dict):
                # Solr 6.5
                collation_values = collations["collation"]
                if isinstance(collation_values, six.string_types):
                    collation_values = [collation_values]
                elif isinstance(collation_values, dict):
                    # spellcheck.collateExtendedResults changes the format to a dictionary:
                    collation_values = [collation_values["collationQuery"]]
            elif isinstance(collations[1], dict):
                # Solr 6.4
                collation_values = collations
            else:
                # Older versions of Solr
                collation_values = collations[-1:]

            for i in collation_values:
                # Depending on the options the values are either simple strings or dictionaries:
                spelling_suggestions.append(
                    i["collationQuery"] if isinstance(i, dict) else i
                )
        elif suggestions:
            if isinstance(suggestions, dict):
                for i in suggestions.values():
                    for j in i["suggestion"]:
                        if isinstance(j, dict):
                            spelling_suggestions.append(j["word"])
                        else:
                            spelling_suggestions.append(j)
            elif isinstance(suggestions[0], six.string_types) and isinstance(
                suggestions[1], dict
            ):
                # Solr 6.4 uses a list of paired (word, dictionary) pairs:
                for suggestion in suggestions:
                    if isinstance(suggestion, dict):
                        for i in suggestion["suggestion"]:
                            if isinstance(i, dict):
                                spelling_suggestions.append(i["word"])
                            else:
                                spelling_suggestions.append(i)
            else:
                # Legacy Solr
                spelling_suggestions.append(suggestions[-1])

        return spelling_suggestions

    def build_schema(self, fields):
        content_field_name = ""
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                "field_name": field_class.index_fieldname,
                "type": "text_en",
                "indexed": "true",
                "stored": "true",
                "multi_valued": "false",
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ["date", "datetime"]:
                field_data["type"] = "date"
            elif field_class.field_type == "integer":
                field_data["type"] = "long"
            elif field_class.field_type == "float":
                field_data["type"] = "float"
            elif field_class.field_type == "boolean":
                field_data["type"] = "boolean"
            elif field_class.field_type == "ngram":
                field_data["type"] = "ngram"
            elif field_class.field_type == "edge_ngram":
                field_data["type"] = "edge_ngram"
            elif field_class.field_type == "location":
                field_data["type"] = "location"

            if field_class.is_multivalued:
                field_data["multi_valued"] = "true"

            if field_class.stored is False:
                field_data["stored"] = "false"

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data["indexed"] = "false"

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, "facet_for"):
                # If it's text, it ought to be a string.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj, **kwargs):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj, **kwargs)
        except Exception as e:
            self.log.warning(
                "Unable to extract file contents: %s",
                e,
                exc_info=True,
                extra={"data": {"file": file_obj}},
            )
            return None
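
The backend above spends most of its spellcheck handling on the fact that Solr changed the JSON layout of the spellcheck section between releases. Below is a minimal, standalone sketch of just that parsing, assuming the two most common layouts; the sample payloads are illustrative, not captured from a live Solr instance.

def extract_suggestions(spellcheck):
    """Pull plain suggestion strings out of a Solr spellcheck section.

    Handles the paired-list layout used by Solr 4 and the keyword-mapped
    layout used by newer releases; collations are left to the caller.
    """
    out = []
    raw = spellcheck.get("suggestions", [])
    if isinstance(raw, dict):
        # Newer Solr: {"misspelled": {"suggestion": ["word", ...]}, ...}
        entries = raw.values()
    else:
        # Solr 4: ["misspelled", {"suggestion": [...]}, "other", {...}]
        entries = [e for e in raw if isinstance(e, dict)]
    for entry in entries:
        for item in entry.get("suggestion", []):
            # With extended results each item is a dict carrying the word.
            out.append(item["word"] if isinstance(item, dict) else item)
    return out

print(extract_suggestions({"suggestions": {"serch": {"suggestion": ["search"]}}}))
print(extract_suggestions({"suggestions": ["serch", {"suggestion": ["search"]}]}))
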
Example #45
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to docs alone is so that
    multiple updates to the same doc reflect the most up-to-date version, as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit=False, unique_key='_id', **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        self.auto_commit = auto_commit
        self.field_list = []
        self.dynamic_field_list = []
        self.build_fields()

        if auto_commit:
            self.run_auto_commit()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    def build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')
        self.dynamic_field_list = self._parse_fields(result, 'dynamicFields')

    def clean_doc(self, doc):
        """ Cleans a document passed in to be compliant with the Solr as
        used by Solr. This WILL remove fields that aren't in the schema, so
        the document may actually get altered.
        """
        if not self.field_list:
            return doc

        fixed_doc = {}
        doc[self.unique_key] = doc["_id"]
        for key, value in doc.items():
            if key in self.field_list:
                fixed_doc[key] = value

            # Dynamic strings. * can occur only at beginning and at end
            else:
                for field in self.dynamic_field_list:
                    if field[0] == '*':
                        regex = re.compile(r'\w%s\b' % (field))
                    else:
                        regex = re.compile(r'\b%s\w' % (field))
                    if regex.match(key):
                        fixed_doc[key] = value

        return fixed_doc

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        try:
            self.solr.add([self.clean_doc(doc)], commit=True)
        except SolrError:
            raise errors.OperationFailed(
                "Could not insert %r into Solr" % bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        try:
            cleaned = (self.clean_doc(d) for d in docs)
            self.solr.add(cleaned, commit=True)
        except SolrError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]), commit=True)

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*')

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def run_auto_commit(self):
        """Periodically commits to the Solr server.
        """
        self.solr.commit()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
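
A hedged usage sketch for the DocManager above: it assumes the snippet's own module-level names (Solr, ADMIN_URL, decoder, errors, retry_until_ok, bsjson) are importable and that a Solr core is reachable at the hypothetical URL below, so nothing here runs without a live instance.

doc_manager = DocManager("http://localhost:8983/solr/mongo_connector",  # hypothetical core URL
                         auto_commit=False, unique_key="_id")

# upsert() commits on every call, which is simple but slow for volume ...
doc_manager.upsert({"_id": "42", "title": "hello", "_ts": 1})

# ... so bulk_upsert() is the cheaper path for many documents: one add, one commit.
doc_manager.bulk_upsert({"_id": str(i), "_ts": i} for i in range(2, 10))

# Query a time window on _ts, then fetch the newest document.
for doc in doc_manager.search(0, 100):
    print(doc)
print(doc_manager.get_last_doc())
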
Example #46
				Entry.date_added >= lastIndexDateTime,
				Entry.date_modified >= lastIndexDateTime
			)
		).order_by(Entry.id).values(Entry.id)
#	entry_ids = [156805]
	
	docs = []
	num = 0
	for entry_id in entry_ids:
		entry = store.find(Entry, Entry.id==entry_id)[0]
		
		log("Adding Entry with id %d" % entry.id)
		docs.append(createdoc(entry))
		num += 1
		if num % entriesPerPost == 0:
			solr.add(docs, False)
			docs = []
		if num % transactionSize == 0:
			log("Committing...")
			try:
				solr.commit()
			except pysolr.SolrError as e:
				log('%s, skipping last batch...' % str(e).strip())
			
	
	if len(docs) > 0:
		solr.add(docs)
	
	solr.commit()

	end_time = util.now()
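
The truncated script above posts documents in fixed-size batches and commits on a coarser interval than it posts. A compact, generic restatement of that pattern against a pysolr.Solr connection (batch_size and commit_every are illustrative names, not taken from the script):

def index_in_batches(solr, docs, batch_size=100, commit_every=1000):
    """Post docs to Solr in fixed-size batches, committing periodically."""
    buffered, since_commit = [], 0
    for doc in docs:
        buffered.append(doc)
        if len(buffered) >= batch_size:
            solr.add(buffered, commit=False)   # post without forcing a commit
            since_commit += len(buffered)
            buffered = []
            if since_commit >= commit_every:
                solr.commit()                  # make the posted batches searchable
                since_commit = 0
    if buffered:
        solr.add(buffered, commit=False)       # flush the remainder
    solr.commit()                              # final commit covers everything

Batching the adds keeps request sizes bounded, while the separate commit interval avoids paying Solr's commit cost on every batch.
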
Example #47
class SolrSearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '/',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        self.conn = Solr(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {}))
        self.log = logging.getLogger('haystack')

    def get_schema_admin(self):
        '''
        SolrSchemaAdmin singleton
        '''
        if not hasattr(self, '_schema_admin'):
            self._schema_admin = SolrSchemaAdmin(self.conn.url, self.conn.session)
        return self._schema_admin

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {
                'commit': commit,
                'id': solr_id
            }
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)

    def clear(self, models=None, commit=True):
        try:
            if not models:
                # *:* matches all docs in Solr
                self.conn.delete(q='*:*', commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models:
                self.log.error("Failed to clear Solr index of models '%s': %s", ','.join(models_to_delete), e)
            else:
                self.log.error("Failed to clear Solr index: %s", e)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s", query_string, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, highlight=kwargs.get('highlight'), result_class=kwargs.get('result_class', SearchResult), distance_point=kwargs.get('distance_point'))

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None,
                            fields='', highlight=False, facets=None,
                            date_facets=None, query_facets=None,
                            narrow_queries=None, spelling_query=None,
                            within=None, dwithin=None, distance_point=None,
                            models=None, limit_to_registered_models=None,
                            result_class=None, stats=None):
        kwargs = {'fl': '* score'}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs['fl'] = fields

        if sort_by is not None:
            if sort_by in ['distance asc', 'distance desc'] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point['point'].get_coords()
                kwargs['sfield'] = distance_point['field']
                kwargs['pt'] = '%s,%s' % (lat, lng)

                if sort_by == 'distance asc':
                    kwargs['sort'] = 'geodist() asc'
                else:
                    kwargs['sort'] = 'geodist() desc'
            else:
                if sort_by.startswith('distance '):
                    warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.")

                # Regular sorting.
                kwargs['sort'] = sort_by

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        if stats:
            kwargs['stats'] = "true"

            for k in stats.keys():
                kwargs['stats.field'] = k

                for facet in stats[k]:
                    kwargs['f.%s.stats.facet' % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault('fq', [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2'])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng)
            kwargs['fq'].append(bbox)

        if dwithin is not None:
            kwargs.setdefault('fq', [])
            lng, lat = dwithin['point'].get_coords()
            geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km)
            kwargs['fq'].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        return kwargs

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self, raw_results, highlight=False, result_class=None, distance_point=None):
        from haystack import connections
        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, 'stats'):
            stats = raw_results.stats.get('stats_fields', {})

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }

            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2]))

        if self.include_spelling is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)

                del(additional_fields[DJANGO_CT])
                del(additional_fields[DJANGO_ID])
                del(additional_fields['score'])

                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                    additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]

                if distance_point:
                    additional_fields['_point_of_origin'] = distance_point

                    if raw_result.get('__dist__'):
                        from haystack.utils.geo import Distance
                        additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
                    else:
                        additional_fields['_distance'] = None

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'stats': stats,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def build_schema(self, fields):
        content_field_name = ''
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                'name': field_class.index_fieldname,
                'type': 'text_en',
                'indexed': 'true',
                'stored': 'true',
                'multiValued': 'false',
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_data['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_data['type'] = 'long'
            elif field_class.field_type == 'float':
                field_data['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_data['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_data['type'] = 'ngram'
            elif field_class.field_type == 'edge_ngram':
                field_data['type'] = 'edge_ngram'
            elif field_class.field_type == 'location':
                field_data['type'] = 'location'

            if field_class.is_multivalued:
                field_data['multiValued'] = 'true'

            if field_class.stored is False:
                field_data['stored'] = 'false'

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data['indexed'] = 'false'

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, 'facet_for'):
                # If it's text, it ought to be a string.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj)
        except Exception as e:
            self.log.warning(u"Unable to extract file contents: %s", e,
                             exc_info=True, extra={"data": {"file": file_obj}})
            return None
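
One of the less obvious pieces of the backend above is the date-facet branch, which turns a (gap_amount, gap_by) pair into Solr's date-math gap syntax. A standalone restatement of just that formatting, with illustrative inputs:

def date_facet_gap(gap_amount, gap_by):
    """Return a Solr date-math gap such as '+7DAYS/DAY' or '+1MONTH/MONTH'."""
    unit = gap_by.upper()
    gap = "%d%s" % (gap_amount, unit)
    if gap_amount != 1:
        gap += "S"                      # pluralise the unit: 7DAY -> 7DAYS
    return "+%s/%s" % (gap, unit)

print(date_facet_gap(1, "month"))       # +1MONTH/MONTH
print(date_facet_gap(7, "day"))         # +7DAYS/DAY
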
Example #48
class SolrSearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '/',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias,
                                                **connection_options)

        if not 'URL' in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias)

        self.conn = Solr(connection_options['URL'],
                         timeout=self.timeout,
                         **connection_options.get('KWARGS', {}))
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    u"UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    })

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s",
                               e,
                               exc_info=True)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {'commit': commit, 'id': solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Solr: %s",
                           solr_id,
                           e,
                           exc_info=True)

    def clear(self, models=None, commit=True):
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q='*:*', commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" %
                                            (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete),
                                 commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error("Failed to clear Solr index of models '%s': %s",
                               ','.join(models_to_delete),
                               e,
                               exc_info=True)
            else:
                self.log.error("Failed to clear Solr index: %s",
                               e,
                               exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s",
                           query_string,
                           e,
                           exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get('highlight'),
            result_class=kwargs.get('result_class', SearchResult),
            distance_point=kwargs.get('distance_point'))

    def build_search_kwargs(self,
                            query_string,
                            sort_by=None,
                            start_offset=0,
                            end_offset=None,
                            fields='',
                            highlight=False,
                            facets=None,
                            date_facets=None,
                            query_facets=None,
                            narrow_queries=None,
                            spelling_query=None,
                            within=None,
                            dwithin=None,
                            distance_point=None,
                            models=None,
                            limit_to_registered_models=None,
                            result_class=None,
                            stats=None,
                            **extra_kwargs):
        kwargs = {'fl': '* score'}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs['fl'] = fields

        if sort_by is not None:
            if sort_by in ['distance asc', 'distance desc'] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point['point'].get_coords()
                kwargs['sfield'] = distance_point['field']
                kwargs['pt'] = '%s,%s' % (lat, lng)

                if sort_by == 'distance asc':
                    kwargs['sort'] = 'geodist() asc'
                else:
                    kwargs['sort'] = 'geodist() desc'
            else:
                if sort_by.startswith('distance '):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs['sort'] = sort_by

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:

            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'

            if isinstance(highlight, dict):
                kwargs.update(highlight)

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' %
                           (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" %
                       key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' %
                               (DJANGO_CT, ' OR '.join(model_choices)))

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        if stats:
            kwargs['stats'] = "true"

            for k in stats.keys():
                kwargs['stats.field'] = k

                for facet in stats[k]:
                    kwargs['f.%s.stats.facet' % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault('fq', [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within['point_1'], within['point_2'])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng,
                                            max_lat, max_lng)
            kwargs['fq'].append(bbox)

        if dwithin is not None:
            kwargs.setdefault('fq', [])
            lng, lat = dwithin['point'].get_coords()
            geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (
                lat, lng, dwithin['field'], dwithin['distance'].km)
            kwargs['fq'].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[
            self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' %
                               (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self,
                         raw_results,
                         highlight=False,
                         result_class=None,
                         distance_point=None):
        from haystack import connections
        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, 'stats'):
            stats = raw_results.stats.get('stats_fields', {})

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }

            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(
                        zip(facets[key][facet_field][::2],
                            facets[key][facet_field][1::2]))

        if self.include_spelling and hasattr(raw_results, 'spellcheck'):
            # Solr 5+ changed the JSON response format so the suggestions will be key-value mapped rather
            # than simply paired elements in a list, which is a nice improvement but incompatible with
            # Solr 4: https://issues.apache.org/jira/browse/SOLR-3029
            if len(raw_results.spellcheck.get('collations', [])):
                spelling_suggestion = raw_results.spellcheck['collations'][-1]
            elif len(raw_results.spellcheck.get('suggestions', [])):
                spelling_suggestion = raw_results.spellcheck['suggestions'][-1]

            assert spelling_suggestion is None or isinstance(
                spelling_suggestion, six.string_types)

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(
                            value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])
                del (additional_fields['score'])

                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                    additional_fields[
                        'highlighted'] = raw_results.highlighting[
                            raw_result[ID]]

                if distance_point:
                    additional_fields['_point_of_origin'] = distance_point

                    if raw_result.get('__dist__'):
                        from haystack.utils.geo import Distance
                        additional_fields['_distance'] = Distance(
                            km=float(raw_result['__dist__']))
                    else:
                        additional_fields['_distance'] = None

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID],
                                      raw_result['score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'stats': stats,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def build_schema(self, fields):
        content_field_name = ''
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                'field_name': field_class.index_fieldname,
                'type': 'text_en',
                'indexed': 'true',
                'stored': 'true',
                'multi_valued': 'false',
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_data['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_data['type'] = 'long'
            elif field_class.field_type == 'float':
                field_data['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_data['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_data['type'] = 'ngram'
            elif field_class.field_type == 'edge_ngram':
                field_data['type'] = 'edge_ngram'
            elif field_class.field_type == 'location':
                field_data['type'] = 'location'

            if field_class.is_multivalued:
                field_data['multi_valued'] = 'true'

            if field_class.stored is False:
                field_data['stored'] = 'false'

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data['indexed'] = 'false'

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, 'facet_for'):
                # If it's text, it ought to be a string.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj)
        except Exception as e:
            self.log.warning(u"Unable to extract file contents: %s",
                             e,
                             exc_info=True,
                             extra={"data": {
                                 "file": file_obj
                             }})
            return None
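
The extract_file_contents() method above only returns Tika's output; it never touches the index. A hedged sketch of how the extract-only result might be folded into a document of our own before indexing (backend is an instance of the class above, and the file name and field names are illustrative):

with open("report.pdf", "rb") as file_obj:             # hypothetical input file
    extracted = backend.extract_file_contents(file_obj)

if extracted is not None:
    doc = {
        "id": "report.pdf",
        "text": extracted["contents"],                 # full text pulled out by Tika
    }
    backend.conn.add([doc], commit=True)               # index it through the same connection
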
Example #49
class SolrSearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        "/",
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias,
                                                **connection_options)

        if "URL" not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias)

        self.collate = connection_options.get("COLLATE_SPELLING", True)

        self.conn = Solr(connection_options["URL"],
                         timeout=self.timeout,
                         **connection_options.get("KWARGS", {}))
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    },
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s",
                               e,
                               exc_info=True)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {"commit": commit, "id": solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Solr: %s",
                solr_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q="*:*", commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" %
                                            (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete),
                                 commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Solr index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Solr index: %s",
                               e,
                               exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s",
                           query_string,
                           e,
                           exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get("highlight"),
            result_class=kwargs.get("result_class", SearchResult),
            distance_point=kwargs.get("distance_point"),
        )

    def build_search_kwargs(self,
                            query_string,
                            sort_by=None,
                            start_offset=0,
                            end_offset=None,
                            fields="",
                            highlight=False,
                            facets=None,
                            date_facets=None,
                            query_facets=None,
                            narrow_queries=None,
                            spelling_query=None,
                            within=None,
                            dwithin=None,
                            distance_point=None,
                            models=None,
                            limit_to_registered_models=None,
                            result_class=None,
                            stats=None,
                            collate=None,
                            **extra_kwargs):

        index = haystack.connections[self.connection_alias].get_unified_index()

        kwargs = {"fl": "* score", "df": index.document_field}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs["fl"] = fields

        if sort_by is not None:
            if sort_by in ["distance asc", "distance desc"] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point["point"].coords
                kwargs["sfield"] = distance_point["field"]
                kwargs["pt"] = "%s,%s" % (lat, lng)

                if sort_by == "distance asc":
                    kwargs["sort"] = "geodist() asc"
                else:
                    kwargs["sort"] = "geodist() desc"
            else:
                if sort_by.startswith("distance "):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            kwargs["rows"] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:

            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

            if isinstance(highlight, dict):
                # autoprefix highlighter options with 'hl.', all of them start with it anyway
                # this makes option dicts shorter: {'maxAnalyzedChars': 42}
                # and lets some of options be used as keyword arguments: `.highlight(preserveMulti=False)`
                kwargs.update({
                    key if key.startswith("hl.") else ("hl." + key):
                    highlight[key]
                    for key in highlight.keys()
                })

        if collate is None:
            collate = self.collate
        if self.include_spelling is True:
            kwargs["spellcheck"] = "true"
            kwargs["spellcheck.collate"] = str(collate).lower()
            kwargs["spellcheck.count"] = 1

            if spelling_query:
                kwargs["spellcheck.q"] = spelling_query

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs["f.%s.facet.%s" %
                           (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()
            kwargs["facet.date.other"] = "none"

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get("start_date"))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get("end_date"))
                gap_by_string = value.get("gap_by").upper()
                gap_string = "%d%s" % (value.get("gap_amount"), gap_by_string)

                if value.get("gap_amount") != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = "+%s/%s" % (
                    gap_string,
                    gap_by_string,
                )

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" %
                               (DJANGO_CT, " OR ".join(model_choices)))

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        if stats:
            kwargs["stats"] = "true"

            for k in stats.keys():
                kwargs["stats.field"] = k

                for facet in stats[k]:
                    kwargs["f.%s.stats.facet" % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault("fq", [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within["point_1"], within["point_2"])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = "%s:[%s,%s TO %s,%s]" % (
                within["field"],
                min_lat,
                min_lng,
                max_lat,
                max_lng,
            )
            kwargs["fq"].append(bbox)

        if dwithin is not None:
            kwargs.setdefault("fq", [])
            lng, lat = dwithin["point"].coords
            geofilt = "{!geofilt pt=%s,%s sfield=%s d=%s}" % (
                lat,
                lng,
                dwithin["field"],
                dwithin["distance"].km,
            )
            kwargs["fq"].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs
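        # For illustration (added here, not part of the original backend): with
        # the defaults, a call like build_search_kwargs('hello', highlight=True)
        # yields roughly
        #   {'fl': '* score', 'df': <document field>, 'start': 0,
        #    'hl': 'true', 'hl.fragsize': '200',
        #    'fq': ['django_ct:(<registered model types>)']}
        # and pysolr passes these straight through as Solr query parameters.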

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = (connections[
            self.connection_alias].get_unified_index().get_index(model_klass))
        field_name = index.get_content_field()
        params = {"fl": "*,score"}

        if start_offset is not None:
            params["start"] = start_offset

        if end_offset is not None:
            params["rows"] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" %
                               (DJANGO_CT, " OR ".join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params["fq"] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True,
            )
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self,
                         raw_results,
                         highlight=False,
                         result_class=None,
                         distance_point=None):
        from haystack import connections

        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = spelling_suggestions = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, "stats"):
            stats = raw_results.stats.get("stats_fields", {})

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(
                        zip(
                            facets[key][facet_field][::2],
                            facets[key][facet_field][1::2],
                        ))

        if self.include_spelling and hasattr(raw_results, "spellcheck"):
            try:
                spelling_suggestions = self.extract_spelling_suggestions(
                    raw_results)
            except Exception as exc:
                self.log.error(
                    "Error extracting spelling suggestions: %s",
                    exc,
                    exc_info=True,
                    extra={"data": {
                        "spellcheck": raw_results.spellcheck
                    }},
                )

                if not self.silently_fail:
                    raise

                spelling_suggestions = None

            if spelling_suggestions:
                # Maintain compatibility with older versions of Haystack which returned a single suggestion:
                spelling_suggestion = spelling_suggestions[-1]
                assert isinstance(spelling_suggestion, six.string_types)
            else:
                spelling_suggestion = None

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], "convert"):
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(
                            value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])
                del (additional_fields["score"])

                if raw_result[ID] in getattr(raw_results, "highlighting", {}):
                    additional_fields[
                        "highlighted"] = raw_results.highlighting[
                            raw_result[ID]]

                if distance_point:
                    additional_fields["_point_of_origin"] = distance_point

                    if raw_result.get("__dist__"):
                        from haystack.utils.geo import Distance

                        additional_fields["_distance"] = Distance(
                            km=float(raw_result["__dist__"]))
                    else:
                        additional_fields["_distance"] = None

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID],
                                      raw_result["score"], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            "results": results,
            "hits": hits,
            "stats": stats,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
            "spelling_suggestions": spelling_suggestions,
        }

    def extract_spelling_suggestions(self, raw_results):
        # There are many different formats for Legacy, 6.4, and 6.5 e.g.
        # https://issues.apache.org/jira/browse/SOLR-3029 and depending on the
        # version and configuration the response format may be a dict of dicts,
        # a list of dicts, or a list of strings.

        collations = raw_results.spellcheck.get("collations", None)
        suggestions = raw_results.spellcheck.get("suggestions", None)

        # We'll collect multiple suggestions here. For backwards
        # compatibility with older versions of Haystack we'll still return
        # only a single suggestion but in the future we can expose all of
        # them.

        spelling_suggestions = []

        if collations:
            if isinstance(collations, dict):
                # Solr 6.5
                collation_values = collations["collation"]
                if isinstance(collation_values, six.string_types):
                    collation_values = [collation_values]
                elif isinstance(collation_values, dict):
                    # spellcheck.collateExtendedResults changes the format to a dictionary:
                    collation_values = [collation_values["collationQuery"]]
            elif isinstance(collations[1], dict):
                # Solr 6.4
                collation_values = collations
            else:
                # Older versions of Solr
                collation_values = collations[-1:]

            for i in collation_values:
                # Depending on the options the values are either simple strings or dictionaries:
                spelling_suggestions.append(
                    i["collationQuery"] if isinstance(i, dict) else i)
        elif suggestions:
            if isinstance(suggestions, dict):
                for i in suggestions.values():
                    for j in i["suggestion"]:
                        if isinstance(j, dict):
                            spelling_suggestions.append(j["word"])
                        else:
                            spelling_suggestions.append(j)
            elif isinstance(suggestions[0], six.string_types) and isinstance(
                    suggestions[1], dict):
                # Solr 6.4 uses a flat list that alternates words and suggestion dictionaries:
                for suggestion in suggestions:
                    if isinstance(suggestion, dict):
                        for i in suggestion["suggestion"]:
                            if isinstance(i, dict):
                                spelling_suggestions.append(i["word"])
                            else:
                                spelling_suggestions.append(i)
            else:
                # Legacy Solr
                spelling_suggestions.append(suggestions[-1])

        return spelling_suggestions

    def build_schema(self, fields):
        content_field_name = ""
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                "field_name": field_class.index_fieldname,
                "type": "text_en",
                "indexed": "true",
                "stored": "true",
                "multi_valued": "false",
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ["date", "datetime"]:
                field_data["type"] = "date"
            elif field_class.field_type == "integer":
                field_data["type"] = "long"
            elif field_class.field_type == "float":
                field_data["type"] = "float"
            elif field_class.field_type == "boolean":
                field_data["type"] = "boolean"
            elif field_class.field_type == "ngram":
                field_data["type"] = "ngram"
            elif field_class.field_type == "edge_ngram":
                field_data["type"] = "edge_ngram"
            elif field_class.field_type == "location":
                field_data["type"] = "location"

            if field_class.is_multivalued:
                field_data["multi_valued"] = "true"

            if field_class.stored is False:
                field_data["stored"] = "false"

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data["indexed"] = "false"

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, "facet_for"):
                # If it's text, it ought to be a string.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)
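        # For illustration (added here, not part of the original backend): given
        # a hypothetical field set such as
        #   fields = {'text': CharField(document=True), 'pub_date': DateTimeField()}
        # this would return roughly ('text', [
        #   {'field_name': 'text', 'type': 'text_en', 'indexed': 'true',
        #    'stored': 'true', 'multi_valued': 'false'},
        #   {'field_name': 'pub_date', 'type': 'date', 'indexed': 'true',
        #    'stored': 'true', 'multi_valued': 'false'}])
        # and is typically consumed when generating the Solr schema.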

    def extract_file_contents(self, file_obj, **kwargs):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj, **kwargs)
        except Exception as e:
            self.log.warning(
                "Unable to extract file contents: %s",
                e,
                exc_info=True,
                extra={"data": {
                    "file": file_obj
                }},
            )
            return None
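The extract-only workflow described in the docstring above is normally driven from a SearchIndex's prepare() method, which renders the Tika output into the document template before the record is indexed. The sketch below is illustrative rather than part of the backend: the Note model, its attachment FileField, and the template path are assumptions.

from django.template import loader
from haystack import connections, indexes

from myapp.models import Note  # hypothetical model with a FileField 'attachment'


class NoteIndex(indexes.SearchIndex, indexes.Indexable):
    text = indexes.CharField(document=True, use_template=True)

    def get_model(self):
        return Note

    def prepare(self, obj):
        data = super().prepare(obj)
        backend = connections['default'].get_backend()
        # Run Tika in extract-only mode; nothing is added to the index here.
        with obj.attachment.open('rb') as file_obj:
            extracted = backend.extract_file_contents(file_obj)
        if extracted:
            template = loader.get_template('search/indexes/myapp/note_text.txt')
            data['text'] = template.render({'object': obj,
                                            'extracted': extracted})
        return data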
Example #50
class Indexer(object):
    """Indexer for PRL."""
    def __init__(self, args: Dict[str, Any]):

        self.solr = None
        self.s3 = None
        self.harvester_settings = None
        self.record_sets = None

        self.args = args
        self.oai_pmh_cache = {}

    def connect(self):
        """Initializes the interfaces for all third-party services."""

        self._connect_internal_services()
        if not self.args['dry_run']:
            self._connect_external_services()

    def _connect_internal_services(self):
        """Initializes the interfaces for all third-party services instantiated by this module."""

        try:
            self.harvester_settings = plyvel.DB(os.path.expanduser(
                os.environ.get('LEVELDB_HARVESTER_SETTINGS_DIRECTORY')),
                                                create_if_missing=True)
            self.record_sets = plyvel.DB(os.path.expanduser(
                os.environ.get('LEVELDB_RECORD_SETS_DIRECTORY')),
                                         create_if_missing=True)
            self.set_harvester_settings()
        except plyvel.IOError as e:
            raise IndexerError(
                'Failed to instantiate LevelDB instance: {}'.format(repr(e)))

    def _connect_external_services(self):
        """Initializes the interfaces for all third-party services NOT instantiated by this module."""

        try:
            solr_base_url = 'http://{}:{}/solr/{}'.format(
                os.environ.get('SOLR_HOST'), os.environ.get('SOLR_PORT'),
                os.environ.get('SOLR_CORE_NAME'))

            # Make sure we can connect to Solr.
            def solr_ping(base_url):
                """Raises an error if we can't connect to Solr."""
                o = urllib.parse.urlsplit(base_url)
                ping_url = urllib.parse.urlunsplit(
                    o[:2] + (os.path.join(o.path, 'admin/ping'), ) + o[3:])
                requests.get(ping_url).raise_for_status()

            solr_ping(solr_base_url)

            self.solr = Solr(solr_base_url, always_commit=True)
            self.s3 = boto3.Session(
                aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
                aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
                region_name=os.environ.get('AWS_DEFAULT_REGION')).client('s3')
        except requests.exceptions.RequestException as e:
            raise IndexerError('Connection failed: {}'.format(e))
        except BotoCoreError as e:
            raise IndexerError('Failed to initialize S3 session: {}'.format(
                repr(e)))

    def disconnect(self):
        """Closes connections with all third-party services."""

        self._disconnect_internal_services()
        if not self.args['dry_run']:
            self._disconnect_external_services()

    def _disconnect_internal_services(self):
        """Closes connections with all third-party services instantiated by this module."""

        try:
            self.harvester_settings.close()
            self.record_sets.close()
        except plyvel.Error as e:
            raise IndexerError(
                'Failed to close the connection to LevelDB: {}'.format(e))

    def _disconnect_external_services(self):
        """Closes connections with all third-party services NOT instantiated by this module."""

        self.solr = None
        self.s3 = None

    def get_harvester_settings_path(self) -> str:
        """Gets the full path of the file containing jOAI harvester settings."""

        return os.path.join(
            os.path.expanduser(
                os.environ.get('JOAI_HARVESTER_SETTINGS_DIRECTORY')),
            JOAI_SCHEDULED_HARVESTS_FILENAME)

    def get_harvester_settings_key(self, path: str) -> str:
        """
        Returns a relative path with either one or two components.
        
        Intended to be called ONLY on paths representing institution/repository or collection/set directories.
        """
        harvest_dir_prefix = os.environ.get('JOAI_DATA_DIRECTORY')

        return os.path.relpath(path, harvest_dir_prefix)
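        # Illustrative example (not in the original code): with
        # JOAI_DATA_DIRECTORY=/joai/data, a set directory
        # /joai/data/some_institution/some_set maps to the key
        # 'some_institution/some_set', while a repository directory
        # /joai/data/some_institution maps to 'some_institution'.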

    def read_harvester_settings_file(self,
                                     path: str) -> Dict[str, Dict[str, str]]:
        """Returns a dictionary representing the harvester settings.

        First, tries reading the settings as if the source file is UTF-8 encoded JSON of the following form (used for testing):

        {
            "harvester_settings_key_1": {
                "repository_name": "repository_name_1",
                "base_url": "http://example.edu/oai2",
                "set_spec": "set_spec_1",
                "split_by_set": False
            },
            ...
        }

        If that fails, tries reading the settings as if the source file is a serialized java.util.Hashtable instance from jOAI (used for production).
        """

        try:
            # See if it's in JSON already.
            with open(path, 'r') as harvester_settings_file:
                # Make sure we transform the key before storing.
                return {
                    self.get_harvester_settings_key(key): metadata
                    for key, metadata in json.load(
                        harvester_settings_file).items()
                }
        except JSONDecodeError as e:
            # Invalid JSON.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))
        except FileNotFoundError as e:
            # This file won't exist when no harvests have been scheduled, so it's probably fine.
            logging.debug(
                'Scheduled harvests settings file does not exist: {}'.format(
                    path))
            return {}
        except UnicodeDecodeError as e:
            logging.debug('Config file is not JSON: {}'.format(e))

            # Open the file in binary mode and try to parse it with javaobj.
            with open(path, 'rb') as harvester_settings_file:
                pobj = javaobj.loads(harvester_settings_file.read())

            def is_scheduled_harvest(h):
                return JOAI_SCHEDULED_HARVEST_CLASSNAME in str(h)

            return {
                self.get_harvester_settings_key(pobj_harvest.harvestDir.path):
                {
                    'repository_name': pobj_harvest.repositoryName,
                    'base_url': pobj_harvest.baseURL,
                    'set_spec': pobj_harvest.setSpec,
                    'split_by_set': pobj_harvest.splitBySet
                }
                for pobj_harvest in list(
                    filter(is_scheduled_harvest, pobj.annotations))
            }
        except Exception as e:
            # Something else went wrong.
            raise IndexerError(
                'Cannot load scheduled harvests settings: {}'.format(e))

    def set_harvester_settings(self):
        """Updates the harvester_settings LevelDB instance with the data stored in the source file.
        
        Responds to filesystem event on that file.
        """

        harvester_settings_path = self.get_harvester_settings_path()
        new_harvester_settings = self.read_harvester_settings_file(
            harvester_settings_path)
        deleted_keys = []
        updated_keys = []

        # Remove all keys from LevelDB that aren't in the harvester settings file.
        harvester_settings_iterator = self.harvester_settings.iterator()
        for key, value in harvester_settings_iterator:
            if key.decode() not in new_harvester_settings:
                self.harvester_settings.delete(key)
                deleted_keys.append(key)

        if deleted_keys:
            logging.info('Deleted harvester settings for %s', deleted_keys)

        # Add all keys in the harvester settings file to LevelDB, since some of their values may have changed.
        for harvest_key, harvest_metadata in new_harvester_settings.items():
            key = harvest_key
            value = json.dumps(harvest_metadata)
            self.harvester_settings.put(key.encode(), value.encode())
            updated_keys.append(key)

        if updated_keys:
            logging.info('Updated harvester settings for %s', updated_keys)

    def update_record(self, path: str):
        """Updates a metadata record in PRL.
        
        Responds to IndexerEventHandler.on_modified filesystem event.
        """
        if not self.args['dry_run']:

            record_metadata = self.get_key_record_metadata(path)
            record_identifier = record_metadata[0]
            record_sets_serialized_encoded = self.record_sets.get(
                record_identifier.encode())

            # Generate a Solr document from the metadata record.
            with open(path, 'r', encoding='utf-8') as record_file:
                prl_solr_document = self.get_solr_document(record_file)

            # If there is a thumbnail, save it to the system.
            if prl_solr_document.original_thumbnail_metadata():
                self.save_thumbnail(prl_solr_document)

            record_identifier = prl_solr_document.id

            # Determine whether or not this is a create or an update.
            if record_sets_serialized_encoded is None:
                action = 'create'
            else:
                action = 'update'
                # If we've processed this record in the past, make sure we don't completely overwrite the collectionKey or collectionName fields.
                # We save these locally in LevelDB.
                record_sets = json.loads(
                    record_sets_serialized_encoded.decode())
                prl_solr_document.complete_collection_list(
                    record_sets['collectionKey'],
                    record_sets['collectionName'])

            pysolr_doc = prl_solr_document.get_pysolr_doc()
            collection_key = pysolr_doc['collectionKey']
            collection_name = pysolr_doc['collectionName']

            try:
                self.solr.add([pysolr_doc], overwrite=True)
                logging.debug('%s %sd in Solr', record_identifier, action)

                self.record_sets.put(
                    record_identifier.encode(),
                    json.dumps({
                        'collectionKey': collection_key,
                        'collectionName': collection_name
                    }).encode())
                logging.info('%s %sd in PRL', record_identifier, action)
            except plyvel.Error as e:
                self.solr.delete(id=record_identifier)
                raise IndexerError('Failed to PUT on LevelDB: {}'.format(e))
            except Exception as e:
                raise IndexerError(
                    'Failed to update Solr document: {}'.format(e))
        else:
            logging.info('DRY-RUN: %s updated in PRL', path)

    def remove_record(self, path: str):
        """Removes a metadata record from PRL.
        
        Responds to IndexerEventHandler.on_deleted filesystem event.
        """
        if not self.args['dry_run']:
            try:
                record_metadata = self.get_key_record_metadata(path)
                record_identifier = record_metadata[0]
                # We're certain that our serialized JSON is valid.
                record_sets = json.loads(
                    self.record_sets.get(record_identifier.encode()).decode())
            except plyvel.Error as e:
                raise IndexerError('Failed to GET on LevelDB: {}'.format(e))

            # Either remove the record from the system, or update it.
            if len(record_sets['collectionKey']) == 1:
                # Remove the thumbnail if there is one.
                try:
                    pysolr_doc = self.solr.search(
                        'id:"{0}"'.format(record_identifier)).docs[0]
                except Exception as e:
                    raise IndexerError('Failed to GET {} from Solr: {}'.format(
                        record_identifier, e))
                if 'thumbnail_url' in pysolr_doc:
                    self.unsave_thumbnail(pysolr_doc['thumbnail_url'],
                                          record_identifier)

                # Remove the document from Solr.
                try:
                    self.solr.delete(id=record_identifier)
                except Exception as e:
                    raise IndexerError(
                        'Failed to DELETE {} from Solr: {}'.format(
                            record_identifier, e))
                logging.debug('%s removed from Solr', record_identifier)

                try:
                    self.record_sets.delete(record_identifier.encode())
                except plyvel.Error as e:
                    raise IndexerError(
                        'Failed to DELETE on LevelDB: {}'.format(e))

                logging.info('%s removed from PRL', record_identifier)
            else:
                # Update the list of collections that the record belongs to.
                # This is the case when a record belongs to more than one OAI-PMH set.
                collection_key = list(
                    filter(lambda x: x != record_metadata[3],
                           record_sets['collectionKey']))
                collection_name = list(
                    filter(lambda x: x != record_metadata[4],
                           record_sets['collectionName']))

                pysolr_doc = {
                    'id': record_identifier,
                    'collectionKey': collection_key,
                    'collectionName': collection_name
                }

                try:
                    self.solr.add([pysolr_doc],
                                  fieldUpdates={
                                      'collectionKey': 'set',
                                      'collectionName': 'set'
                                  },
                                  overwrite=True)
                except Exception as e:
                    raise IndexerError('Failed to POST {} on Solr: {}'.format(
                        record_identifier, e))
                logging.debug(
                    '%s updated in Solr (removed from collection %s)',
                    record_identifier, record_metadata[3])

                try:
                    self.record_sets.put(
                        record_identifier.encode(),
                        json.dumps({
                            'collectionKey': collection_key,
                            'collectionName': collection_name
                        }).encode())
                except plyvel.Error as e:
                    raise IndexerError(
                        'Failed to PUT on LevelDB: {}'.format(e))

                logging.info('%s updated in PRL (removed from collection %s)',
                             record_identifier, record_metadata[3])
        else:
            logging.info('DRY-RUN: Removed %s', path)

    def get_oai_pmh_metadata(self, base_url: str) -> Dict[str, str]:
        """Returns a dictionary containing top-level metadata and set metadata of an OAI-PMH repository."""

        logging.debug(
            'Retrieving repository and set metadata from OAI-PMH repository %s',
            base_url)
        try:
            metadata = {}

            # All repositories should have this metadata.
            repository_metadata = Sickle(base_url, timeout=60).Identify()
            if hasattr(repository_metadata, 'repositoryIdentifier'):
                metadata[
                    'repository_identifier'] = repository_metadata.repositoryIdentifier
            if hasattr(repository_metadata, 'repositoryName'):
                metadata[
                    'repository_name'] = repository_metadata.repositoryName

            # Not all repositories will support sets.
            try:
                set_metadata = Sickle(base_url, timeout=60).ListSets()
                metadata.update({
                    'sets': {s.setSpec: s.setName
                             for s in list(set_metadata)}
                })
            except sickle.oaiexceptions.NoSetHierarchy as e:
                logging.debug(
                    'Failed to list sets from OAI-PMH repository %s: %s',
                    base_url, e)

            return metadata

        except requests.RequestException as e:
            raise IndexerError(
                'Failed to get repository metadata from OAI-PMH repository {}: {}'
                .format(base_url, e))

    def get_solr_document(self, file_object: TextIOWrapper) -> PRLSolrDocument:
        """Builds a Solr document for PRL."""
        identifier, institution_key, institution_name, collection_key, collection_name = self.get_key_record_metadata(
            file_object.name)

        if self.args['dry_run']:
            s3_domain_name = 'example.com'
        else:
            s3_domain_name = os.environ.get('AWS_S3_BUCKET_DOMAIN_NAME')

        return PRLSolrDocument(file_object, identifier, institution_key,
                               institution_name, collection_key,
                               collection_name, s3_domain_name)

    def get_key_record_metadata(self, file_path: str):
        """Determines collection and institution metadata from the filepath of the record.

        Returns a 5-tuple containing the following elements:
            - an identifier for the record
            - an identifier for the institution
            - a human-readable string for the institution
            - an identifier for the collection
            - a human-readable string for the collection

        Side effects:
            - updates the in-memory OAI-PMH metadata cache (self.oai_pmh_cache)
        """

        # ---------------------------------------- #
        # --- Gather all the data we can find. --- #
        # ---------------------------------------- #

        # Get the record identifier from the filename.
        identifier = urllib.parse.unquote(
            os.path.splitext(os.path.basename(file_path))[0])

        try:
            # The harvester settings will tell us how to get the other metadata.
            harvester_settings_key = None

            potential_harvester_settings_keys = map(
                self.get_harvester_settings_key, [
                    os.path.dirname(file_path),
                    os.path.dirname(os.path.dirname(file_path))
                ])
            # Keep track of keys that we tried, but failed.
            tried_keys = []

            for potential_harvester_settings_key in potential_harvester_settings_keys:
                potential_harvester_settings_serialized_encoded = self.harvester_settings.get(
                    potential_harvester_settings_key.encode())

                if potential_harvester_settings_serialized_encoded:
                    # Found it!
                    harvester_settings_key = potential_harvester_settings_key
                    break
                else:
                    tried_keys.append(potential_harvester_settings_key)

            if harvester_settings_key is not None:
                harvester_settings_serialized_encoded = potential_harvester_settings_serialized_encoded
                harvester_settings_serialized = harvester_settings_serialized_encoded.decode(
                )
                harvester_settings = json.loads(harvester_settings_serialized)
            else:
                # This should never happen. Harvester settings should represent all harvested files.
                raise IndexerError(
                    'Cannot find harvester settings in LevelDB for {}'.format(
                        tried_keys))

        except plyvel.Error as e:
            # We can't go on without LevelDB.
            raise IndexerError('Failed to GET on LevelDB: {}'.format(e))
        except AttributeError as e:
            # This should never happen. Harvester settings should represent all harvested files.
            raise IndexerError(
                'Cannot find harvester settings in LevelDB for {}'.format(
                    harvester_settings_key))
        except JSONDecodeError as e:
            # This should never happen.
            raise IndexerError(
                'Harvester settings are not valid JSON: {}'.format(e))

        base_url = harvester_settings['base_url']
        institution_name = harvester_settings['repository_name']
        set_spec = harvester_settings['set_spec']
        split_by_set = harvester_settings['split_by_set']

        # Fetch repository metadata, and write to the in-memory cache if necessary.
        if base_url in self.oai_pmh_cache:
            oai_pmh_metadata = self.oai_pmh_cache[base_url]
        else:
            oai_pmh_metadata = self.get_oai_pmh_metadata(base_url)
            self.oai_pmh_cache[base_url] = oai_pmh_metadata

        # ----------------------------------------- #
        # --- Determine which values to return. --- #
        # ----------------------------------------- #

        # This is the most common case: an institution specifies a specific set for us to harvest.
        individual_set_harvest = set_spec != '' and not split_by_set

        # This is the case when an institution wants us to harvest all sets from their repository.
        full_repository_harvest = set_spec == '' and split_by_set

        # This is the case when an institution wants us to treat their entire repository as a PRL "collection".
        single_collection_repository = set_spec == '' and not split_by_set

        # Set the return values.
        if individual_set_harvest:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = set_spec
            collection_name = oai_pmh_metadata['sets'][set_spec]

        elif full_repository_harvest:
            institution_key = harvester_settings_key
            collection_key = os.path.basename(os.path.dirname(file_path))
            collection_name = oai_pmh_metadata['sets'][collection_key]

        elif single_collection_repository:
            institution_key = os.path.dirname(harvester_settings_key)
            collection_key = os.path.basename(harvester_settings_key)
            collection_name = oai_pmh_metadata['repository_name']
        else:
            raise IndexerError(
                'Unable to handle harvest configuration: {}'.format(
                    harvester_settings_key))

        return (identifier, institution_key, institution_name, collection_key,
                collection_name)
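        # Illustrative example (not in the original code): for an
        # individual-set harvest stored under the settings key 'inst/coll'
        # with set_spec='coll', a record file named 'oai%3Aexample%3A1.xml'
        # in that directory would yield roughly
        # ('oai:example:1', 'inst', <repository_name>, 'coll',
        #  <set name reported by the repository's ListSets response>).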

    def save_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts thumbnail on the local filesystem and on S3.

        Returns the Boolean value of whether or not a thumbnail was saved."""

        thumbnail_path = self.download_thumbnail(prl_solr_document)
        if thumbnail_path:
            self.upload_thumbnail(prl_solr_document, thumbnail_path)
            logging.debug('%s thumbnail saved',
                          prl_solr_document.get_record_identifier())
            return True
        else:
            return False

    def download_thumbnail(self, prl_solr_document: PRLSolrDocument):
        """Puts the thumbnail file in its place on the file system.

        Returns its path, or None if no thumbnail could be fetched."""

        # TODO: need better exception handling here
        thumbnail_s3_key = prl_solr_document.get_thumbnail_s3_key()
        try:
            filepath = os.path.join(
                os.path.abspath(os.environ.get('THUMBNAILS_DIRECTORY')),
                thumbnail_s3_key)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)

            original_thumbnail_url = prl_solr_document.original_thumbnail_metadata(
            )['url']
            n_tries = 3
            for try_i in range(1, n_tries + 1):
                try:
                    response = requests.get(original_thumbnail_url,
                                            timeout=30,
                                            stream=True)
                    # Fail on 4xx or 5xx
                    response.raise_for_status()
                    # Make sure the Content-Type is what we expect and that the server doesn't disallow robots
                    response_content_type = response.headers.get(
                        'Content-Type')
                    if re.match(re.compile('image/.+'), response_content_type):
                        with open(filepath, 'wb') as image_file:
                            for chunk in response.iter_content(
                                    chunk_size=1024):
                                image_file.write(chunk)
                        logging.debug(
                            '%s thumbnail put on local filesystem at %s',
                            thumbnail_s3_key, filepath)

                        if not prl_solr_document.has_thumbnail_format():
                            # Determine the format and rename the image file to use the newly-determined filetype ext
                            prl_solr_document.set_thumbnail_format(
                                response_content_type)
                            new_filepath = os.path.join(
                                os.path.abspath(
                                    os.path.expanduser(
                                        os.environ.get(
                                            'THUMBNAILS_DIRECTORY'))),
                                prl_solr_document.get_thumbnail_s3_key())
                            logging.debug('renaming %s -> %s', filepath,
                                          new_filepath)
                            os.rename(filepath, new_filepath)
                            return new_filepath
                        else:
                            return filepath
                    else:
                        logging.debug('Robots cannot access %s',
                                      original_thumbnail_url)
                        return None
                except requests.Timeout as e:
                    if try_i < n_tries:
                        msg = 'Thumbnail download timed out, retrying...'
                        logging.info(msg)
                        # Continue loop
                    else:
                        # No more tries left, so fail
                        msg = 'Failed to download thumbnail after {} tries: {}'.format(
                            n_tries, str(e))
                        logging.debug(msg)
                        return None
                except (requests.RequestException, IOError) as e:
                    msg = 'Failed to download thumbnail: {}'.format(e)
                    logging.debug(msg)
                    return None
        except Exception as e:
            raise IndexerError(
                'Failed to put thumbnail on local filesystem: {}'.format(e))

    def upload_thumbnail(self, prl_solr_document: PRLSolrDocument,
                         filepath: str):
        """Puts the thumbnail on S3."""

        # Determine a URL for the thumbnail now that we've downloaded it and know the image format
        prl_solr_document.add_thumbnail_url()

        try:
            self.s3.put_object(
                Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                Key=prl_solr_document.get_thumbnail_s3_key(),
                Body=open(filepath, 'rb'),
                ContentType=prl_solr_document.original_thumbnail_metadata()
                ['content-type'])
            logging.debug('%s thumbnail put on S3',
                          prl_solr_document.get_record_identifier())
        except BotoCoreError as e:
            raise IndexerError('Failed to put thumbnail on S3: {}'.format(
                e.msg))

    def unsave_thumbnail(self, thumbnail_url: str, record_identifier: str):
        """Removes thumbnail from the local filesystem and from S3."""

        try:
            thumbnail_s3_key = os.path.relpath(
                urllib.parse.urlparse(
                    urllib.parse.unquote(thumbnail_url)).path, '/')
            filepath = os.path.join(
                os.path.abspath(os.environ.get('THUMBNAILS_DIRECTORY')),
                thumbnail_s3_key)
            os.remove(filepath)
            logging.debug('%s thumbnail removed from local filesystem at %s',
                          record_identifier, filepath)

            # TODO: clean up empty parent directories
            self.s3.delete_object(Bucket=os.environ.get('AWS_S3_BUCKET_NAME'),
                                  Key=thumbnail_s3_key)
            logging.debug('%s thumbnail removed from S3', record_identifier)
        except BotoCoreError as e:
            raise IndexerError('Failed to remove thumbnail from S3: {}'.format(
                e.msg))
        except Exception as e:
            raise IndexerError(
                'Failed to remove thumbnail from local filesystem: {}'.format(
                    e))
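The update_record and remove_record methods above are documented as responding to filesystem events from an IndexerEventHandler, which is not included in this example. Below is a minimal sketch of what such a driver could look like, assuming the watchdog library; the watched directory and the dry_run flag are placeholders.

import time

from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer


class IndexerEventHandler(FileSystemEventHandler):
    """Dispatches filesystem events to an Indexer instance (sketch only)."""

    def __init__(self, indexer):
        self.indexer = indexer

    def on_modified(self, event):
        if not event.is_directory:
            self.indexer.update_record(event.src_path)

    def on_deleted(self, event):
        if not event.is_directory:
            self.indexer.remove_record(event.src_path)


if __name__ == '__main__':
    indexer = Indexer({'dry_run': True})
    indexer.connect()
    observer = Observer()
    observer.schedule(IndexerEventHandler(indexer), '/joai/data', recursive=True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    finally:
        observer.stop()
        observer.join()
        indexer.disconnect()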
Example #51
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.url = url
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    @wrap_exceptions
    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % wc_pattern[1:]))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % wc_pattern[:-1]))

    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:

            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                " renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            update_spec['_id'] = doc['_id']
            return update_spec
        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_set):
                    if key == to_set or key[len(to_set)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value
        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_unset):
                    if key == to_unset or key[len(to_unset)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
        return doc
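        # A worked example (added for illustration, not in the original code):
        # given a document that _clean_doc has already flattened,
        #
        #   doc         = {'_id': '1', 'a.b': 2, 'a.c': 3}
        #   update_spec = {'$set': {'a': {'b': 9}}}
        #
        # apply_update pops every key under the dotted path 'a' ('a.b' and
        # 'a.c') and then sets doc['a'] = {'b': 9}, so the result is
        # {'_id': '1', 'a': {'b': 9}}; upsert() re-flattens it on the next add.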

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        query = "%s:%s" % (self.unique_key, u(document_id))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=u(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d, namespace, timestamp) for d in docs)
        if self.chunk_size > 0:
            # islice avoids a StopIteration escaping the generator
            # expression, which raises RuntimeError on Python 3.7+ (PEP 479).
            from itertools import islice
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        params = self._formatter.format_document(f.get_metadata())
        params[self.unique_key] = params.pop('_id')
        params['ns'] = namespace
        params['_ts'] = timestamp
        params = dict(('literal.' + k, v) for k, v in params.items())

        if self.auto_commit_interval == 0:
            params['commit'] = 'true'

        request = Request(
            os.path.join(self.url, "update/extract?%s" % urlencode(params)))

        request.add_header("Content-type", "application/octet-stream")
        request.data = f
        response = urlopen(request)
        logging.debug(response.read())

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes documents from Solr

        The input is the id of the mongo document to be removed.
        """
        self.solr.delete(id=u(document_id),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
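
A minimal standalone sketch of the flat-document "$set" handling in apply_update above: because keys are dot-separated paths, setting a field first has to drop the field itself and every key nested under it. The flat_set helper below is illustrative only, not part of the doc manager API.

def flat_set(doc, field, value):
    # Drop the field and any dotted keys nested under it before assigning.
    for key in [k for k in doc if k == field or k.startswith(field + '.')]:
        doc.pop(key)
    doc[field] = value
    return doc

flat = {'a': 1, 'b.c': 2, 'b.d': 3}
print(flat_set(flat, 'b', 7))  # {'a': 1, 'b': 7}
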
Example #52
0
class JobFeedTestCase(TestCase):

    def setUp(self):
        super(JobFeedTestCase, self).setUp()
        self.businessunit = BusinessUnitFactory.build()
        self.businessunit.save()
        self.buid_id = self.businessunit.id
        self.numjobs = 4
        self.testdir = os.path.abspath(os.path.dirname(__file__))
        self.conn = Solr("http://127.0.0.1:8983/solr/")
        self.emptyfeed = os.path.join(self.testdir, "dseo_feed_0.no_jobs.xml")

        #Ensures DATA_DIR used by import_jobs.download_feed_file exists
        data_path = settings.DATA_DIR
        if not os.path.exists(data_path):
            os.mkdir(data_path)
        
    def test_dev2_feed(self):
        filepath = import_jobs.download_feed_file(self.buid_id)
        results = xmlparse.DEv2JobFeed(filepath)
        jobs = results.jobparse()
        self.assertEqual(results.jsid, self.buid_id)
        self.assertEqual(results.company, self.businessunit.title)
        self.assertEqual(len(jobs), self.numjobs)
        # Test for the presence of every non-calculated field on the jobListing
        # model. (That is, all slugfields and 'location' are left out.)
        self.assertEqual(set(jobs[0].keys()), set(['buid_id', 'city', 'country',
                                                   'country_short', 'date_new',
                                                   'date_updated', 'description',
                                                   'hitkey', 'link', 'onet_id',
                                                   'reqid', 'state',
                                                   'state_short', 'title',
                                                   'uid', 'zipcode']))

    def test_mocids(self):
        """
        Tests that mocid fields exist when jobs are imported from a feed and
        added to a Solr connection.
        
        """
        filepath = import_jobs.download_feed_file(self.buid_id)
        results = xmlparse.DEv2JobFeed(filepath)
        jobs = results.solr_jobs()
        # Since we're going to be adding/updating data in the Solr index, we're
        # hardcoding in the local Solr instance so that we don't accidentally
        # alter production data.
        self.conn.add(jobs)
        num_hits = self.conn.search(q="*:*",
                                    fq="buid:%s -mocid:[* TO *]" % self.buid_id)
        self.assertEqual(num_hits.hits, self.numjobs)
        for job in jobs:
            self.assertTrue('mocid' in job)

    def test_empty_feed(self):
        """
        Test that the v2 DirectEmployers feed file schema
        allows for empty feed files.
        
        """
        results = xmlparse.DEv2JobFeed(self.emptyfeed)
        # If the schema is such that empty feed files are considered invalid,
        # trying to run jobparse() will throw an exception.
        self.assertEqual(len(results.jobparse()), 0)

    def test_empty_solr(self):
        """
        Tests for the proper behavior when encountering a job-less, but
        otherwise valid, feed file. The proper behavior is to delete any
        jobs associated with that BusinessUnit from the Solr index.

        """
        # Normal download-and-parse operation on a feed file with jobs.
        import_jobs.update_solr(self.buid_id)
        results = self.conn.search(q="*:*", fq="buid:%s" % self.buid_id)
        self.assertEqual(results.hits, self.numjobs)

        # Download-and-parse operation on a feed file with no jobs. Expected
        # behavior is to delete all jobs.
        self._get_feedfile()
        import_jobs.update_solr(self.buid_id, download=False)
        results = self.conn.search(q="*:*", fq="buid:%s" % self.buid_id)
        self.assertEqual(results.hits, 0)

    def test_empty_db(self):
        """
        Tests for the proper behavior when encountering a job-less, but
        otherwise valid, feed file. The proper behavior is to delete any
        jobs associated with that BusinessUnit from the database.

        """
        # Normal download-and-parse operation on a feed file with jobs.
        import_jobs.refresh_bunit_jobs(self.buid_id)
        dbjobs = jobListing.objects.filter(buid=self.buid_id).count()
        self.assertEqual(dbjobs, self.numjobs)
        
        # Download-and-parse operation on a feed file with no jobs. Expected
        # behavior is to delete all jobs.
        self._get_feedfile()
        import_jobs.refresh_bunit_jobs(self.buid_id, download=False)
        dbjobs = jobListing.objects.filter(buid=self.buid_id).count()
        self.assertEqual(dbjobs, 0)

    def test_zipcode(self):
        """
        Tests to ensure proper behavior of zipcode field in being entered both
        in the database and Solr.

        """
        filepath = import_jobs.download_feed_file(self.buid_id)
        dbresults = xmlparse.DEv2JobFeed(filepath)
        solrresults = dbresults.solr_jobs()

        zips_from_feedfile = ["28243", "10095", "90212", "30309"]
        solrzips = [i['zipcode'] for i in solrresults]
        dbzips = [i['zipcode'] for i in dbresults.jobparse()]
        
        for coll in [solrzips, dbzips]:
            self.assertItemsEqual(zips_from_feedfile, coll)

    def test_salt_date(self):
        """
        Test to ensure that job postings show up in a quasi-random
        fashion by sorting by the `salted_date` attribute in the index
        rather than strictly by `date_new`.
        
        """
        filepath = import_jobs.download_feed_file(self.buid_id)
        jobs = xmlparse.DEv2JobFeed(filepath)
        solrjobs = jobs.solr_jobs()
        self.conn.add(solrjobs)
        results = self.conn.search(q="*:*", sort="salted_date asc")
        self.assertEqual(self.numjobs, results.hits)
        # We can't really test for inequality between the two result sets,
        # since sometimes results.docs will equal results2.docs.
        results2 = self.conn.search(q="*:*", sort="date_new asc")
        self.assertItemsEqual(results2.docs, results.docs)

    def test_date_updated(self):
        """
        Test to ensure proper behavior of date updated field when added to
        Solr.

        """
        filepath = import_jobs.download_feed_file(self.buid_id)
        jobs = xmlparse.DEv2JobFeed(filepath)
        solrjobs = jobs.solr_jobs()
        self.conn.add(solrjobs)
        date_updated = datetime.datetime.strptime("5/17/2012 12:01:05 PM",
                                                  "%m/%d/%Y %I:%M:%S %p")
        solr_dates = [i['date_updated'] for i in solrjobs]

        for solr_date in solr_dates:
            self.assertEqual(solr_date, date_updated)
        
    def _get_feedfile(self):
        # Download the 'real' feed file then copy the empty feed file in its
        # place.
        realfeed = import_jobs.download_feed_file(self.buid_id)
        shutil.copyfile(realfeed, "%s.bak" % realfeed)
        shutil.copyfile(self.emptyfeed, realfeed)
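
The assertions above rely on pysolr's Results object, whose hits attribute is the total match count while docs holds the rows actually returned. A hedged sketch of that counting pattern against a local core; the URL and buid value are placeholders:

from pysolr import Solr

conn = Solr("http://127.0.0.1:8983/solr/")        # placeholder local core
results = conn.search(q="*:*", fq="buid:12345")   # placeholder business unit
print(results.hits)        # total number of matching documents
print(len(results.docs))   # documents returned on this page of results
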
Example #53
0
class SearchEngine(BaseSearchEngine):
    def __init__(self):
        args = [settings.SOLR_URL]
        self.conn = Solr(*args)

    def _models_query(self, models):
        def qt(model):
            return 'django_ct_s:"%s.%s"' % (model._meta.app_label,
                                            model._meta.module_name)

        return ' OR '.join([qt(model) for model in models])

    def update(self, indexer, iterable, commit=True):
        docs = []
        try:
            for obj in iterable:
                doc = {}
                doc['id'] = self.get_identifier(obj)
                doc['django_ct_s'] = "%s.%s" % (obj._meta.app_label,
                                                obj._meta.module_name)
                doc['django_id_s'] = force_unicode(obj.pk)
                doc['text'] = indexer.flatten(obj)
                for name, value in indexer.get_indexed_fields(obj):
                    doc[name] = value
                docs.append(doc)
        except UnicodeDecodeError:
            print "Chunk failed."
            pass
        self.conn.add(docs, commit=commit)

    def remove(self, obj, commit=True):
        solr_id = self.get_identifier(obj)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models, commit=True):
        # *:* matches all docs in Solr
        self.conn.delete(q='*:*', commit=commit)

    def _result_callback(self, result):
        app_label, model_name = result['django_ct_s'].split('.')
        return (app_label, model_name, result['django_id_s'], None)

    def search(self,
               q,
               models=None,
               order_by=RELEVANCE,
               limit=None,
               offset=None):
        if len(q) == 0:
            return SearchResults(q, [], 0, lambda x: x)
        original_query = q
        q = convert_query(original_query, SolrQueryConverter)

        if models is not None:
            models_clause = self._models_query(models)
            final_q = '(%s) AND (%s)' % (q, models_clause)
        else:
            final_q = q

        kwargs = {}
        if order_by != RELEVANCE:
            if order_by[0] == '-':
                kwargs['sort'] = '%s desc' % order_by[1:]
            else:
                kwargs['sort'] = '%s asc' % order_by

        if limit is not None:
            kwargs['rows'] = limit
        if offset is not None:
            kwargs['start'] = offset

        results = self.conn.search(final_q, **kwargs)
        return SearchResults(final_q, iter(results.docs), results.hits,
                             self._result_callback)
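
The order_by handling above follows the Django convention of a leading '-' for descending order. A small self-contained sketch of that mapping; order_to_solr_sort is an illustrative name, not part of the backend:

def order_to_solr_sort(order_by):
    # Django-style "-field" means descending; a bare "field" means ascending.
    if order_by.startswith('-'):
        return '%s desc' % order_by[1:]
    return '%s asc' % order_by

print(order_to_solr_sort('-created'))  # created desc
print(order_to_solr_sort('title'))     # title asc
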
Example #54
0
class SearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
    
    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        
        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_SOLR_URL in your settings.')
        
        timeout = getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10)
        self.conn = Solr(settings.HAYSTACK_SOLR_URL, timeout=timeout)
    
    def update(self, index, iterable, commit=True):
        docs = []
        
        try:
            for obj in iterable:
                doc = {}
                doc['id'] = self.get_identifier(obj)
                doc['django_ct'] = "%s.%s" % (obj._meta.app_label, obj._meta.module_name)
                doc['django_id'] = force_unicode(obj.pk)
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")
        
        self.conn.add(docs, commit=commit)

    def remove(self, obj_or_string, commit=True):
        solr_id = self.get_identifier(obj_or_string)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models=[], commit=True):
        if not models:
            # *:* matches all docs in Solr
            self.conn.delete(q='*:*', commit=commit)
        else:
            models_to_delete = []
            
            for model in models:
                models_to_delete.append("django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))
            
            self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)
        
        # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, **kwargs):
        if len(query_string) == 0:
            return []
        
        kwargs = {
            'fl': '* score',
        }
        
        if fields:
            kwargs['fl'] = fields
        
        if sort_by is not None:
            kwargs['sort'] = sort_by
        
        if start_offset is not None:
            kwargs['start'] = start_offset
        
        if end_offset is not None:
            kwargs['rows'] = end_offset
        
        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1
        
        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets
        
        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            
            for key, value in date_facets.items():
                # Date-based facets in Solr kinda suck.
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                kwargs["f.%s.facet.date.gap" % key] = value.get('gap')
        
        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets.items()]
        
        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)
        
        raw_results = self.conn.search(query_string, **kwargs)
        return self._process_results(raw_results, highlight=highlight)
    
    def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, **kwargs):
        index = self.site.get_index(model_instance.__class__)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }
        
        if start_offset is not None:
            params['start'] = start_offset
        
        if end_offset is not None:
            params['rows'] = end_offset
        
        if additional_query_string:
            params['fq'] = additional_query_string
        
        raw_results = self.conn.more_like_this("id:%s" % self.get_identifier(model_instance), field_name, **params)
        return self._process_results(raw_results)
    
    def _process_results(self, raw_results, highlight=False):
        from haystack import site
        results = []
        hits = raw_results.hits
        facets = {}
        spelling_suggestion = None
        
        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }
            
            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to two-tuples; Solr's JSON format returns facet
                    # counts as a flat list of alternating values and counts.
                    facets[key][facet_field] = zip(facets[key][facet_field][::2], facets[key][facet_field][1::2])
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
        
        indexed_models = site.get_indexed_models()
        
        for raw_result in raw_results.docs:
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            
            for key, value in raw_result.items():
                additional_fields[str(key)] = self.conn._to_python(value)
            
            del(additional_fields['django_ct'])
            del(additional_fields['django_id'])
            del(additional_fields['score'])
            
            if raw_result['id'] in getattr(raw_results, 'highlighting', {}):
                additional_fields['highlighted'] = raw_results.highlighting[raw_result['id']]
            
            model = get_model(app_label, model_name)
            
            if model:
                if model in indexed_models:
                    result = SearchResult(app_label, model_name, raw_result['django_id'], raw_result['score'], **additional_fields)
                    results.append(result)
                else:
                    hits -= 1
            else:
                hits -= 1
        
        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
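
Solr returns facet_fields counts as a flat list of alternating values and counts, which is why _process_results above zips every other element into pairs. A minimal sketch of that conversion on sample data:

# Solr's JSON facet_fields format: [value1, count1, value2, count2, ...]
flat = ['10', 2, '7', 1, '2', 0]
pairs = list(zip(flat[::2], flat[1::2]))
print(pairs)  # [('10', 2), ('7', 1), ('2', 0)]
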
Example #55
0
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs, as opposed to docs alone, is so that
    multiple updates to the same doc reflect the most up-to-date version,
    rather than multiple slightly different versions of a doc.
    """
    def __init__(self, url, auto_commit=False, unique_key='_id'):
        """Verify Solr URL and establish a connection.
        """
        if verify_url(url) is False:
            raise SystemError

        self.solr = Solr(url)
        self.unique_key = unique_key
        self.auto_commit = auto_commit
        self.field_list = []
        self.dynamic_field_list = []
        self.build_fields()

        if auto_commit:
            self.run_auto_commit()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    def build_fields(self):
        """ Builds a list of valid fields
        """
        try:
            declared_fields = self.solr._send_request('get', ADMIN_URL)
        except SolrError:
            # Schema is not accessible; leave the field lists empty.
            return
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')
        self.dynamic_field_list = self._parse_fields(result, 'dynamicFields')

    def clean_doc(self, doc):
        """ Cleans a document passed in to be compliant with the Solr as
        used by Solr. This WILL remove fields that aren't in the schema, so
        the document may actually get altered.
        """
        if not self.field_list:
            return doc

        fixed_doc = {}
        for key, value in doc.items():
            if key in self.field_list:
                fixed_doc[key] = value

            # Dynamic field names: '*' can occur only at the beginning or the end
            else:
                for field in self.dynamic_field_list:
                    if field[0] == '*':
                        regex = re.compile(r'\w%s\b' % (field))
                    else:
                        regex = re.compile(r'\b%s\w' % (field))
                    if regex.match(key):
                        fixed_doc[key] = value

        return fixed_doc

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        try:
            self.solr.add([self.clean_doc(doc)], commit=True)
        except SolrError:
            logging.error("Could not insert %r into Solr" % (doc, ))

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]), commit=True)

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*')

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def run_auto_commit(self):
        """Periodically commits to the Solr server.
        """
        self.solr.commit()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
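
clean_doc above keeps a key only if it is declared in the schema or matches a dynamic field pattern such as '*_s'. A hedged sketch of that filtering idea, written with explicit anchored regexes rather than the exact expressions used above; the patterns and document are made up for illustration:

import re

# Dynamic field patterns carry a single '*', at the start or the end.
patterns = ['*_s', 'attr_*']
regexes = [re.compile('.*%s$' % re.escape(p[1:])) if p.startswith('*')
           else re.compile('^%s.*' % re.escape(p[:-1]))
           for p in patterns]

doc = {'title_s': 'hello', 'attr_color': 'red', 'unmapped': 1}
kept = {k: v for k, v in doc.items() if any(r.match(k) for r in regexes)}
print(kept)  # {'title_s': 'hello', 'attr_color': 'red'}
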
Example #56
0
class DocManager:
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs, as opposed to docs alone, is so that
    multiple updates to the same doc reflect the most up-to-date version,
    rather than multiple slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL, unique_key="_id", **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.field_list = []
        self._build_fields()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get("schema", {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request("get", ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, "fields")

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, "dynamicFields"):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(re.compile("\w%s\Z" % wc_pattern))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(re.compile("\A%s\w*" % wc_pattern[:-1]))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """
        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        def flattened(doc):
            def flattened_kernel(doc, path):
                for k, v in doc.items():
                    path.append(k)
                    if isinstance(v, dict):
                        for inner_k, inner_v in flattened_kernel(v, path):
                            yield inner_k, inner_v
                    elif isinstance(v, list):
                        for li, lv in enumerate(v):
                            path.append(str(li))
                            if isinstance(lv, dict):
                                for dk, dv in flattened_kernel(lv, path):
                                    yield dk, dv
                            else:
                                yield ".".join(path), lv
                            path.pop()
                    else:
                        yield ".".join(path), v
                    path.pop()

            return dict(flattened_kernel(doc, []))

        # Translate the _id field to whatever unique key we're using
        doc[self.unique_key] = doc["_id"]
        flat_doc = flattened(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:

            def include_field(field):
                return field in self.field_list or any(regex.match(field) for regex in self._dynamic_field_regexes)

            return dict((k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        try:
            if self.auto_commit_interval is not None:
                self.solr.add(
                    [self._clean_doc(doc)],
                    commit=(self.auto_commit_interval == 0),
                    commitWithin=str(self.auto_commit_interval),
                )
            else:
                self.solr.add([self._clean_doc(doc)], commit=False)
        except SolrError:
            raise errors.OperationFailed("Could not insert %r into Solr" % bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        try:
            cleaned = (self._clean_doc(d) for d in docs)
            if self.auto_commit_interval is not None:
                self.solr.add(
                    cleaned, commit=(self.auto_commit_interval == 0), commitWithin=str(self.auto_commit_interval)
                )
            else:
                self.solr.add(cleaned, commit=False)
        except SolrError:
            raise errors.OperationFailed("Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]), commit=(self.auto_commit_interval == 0))

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q="*:*", commit=(self.auto_commit_interval == 0))

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = "_ts: [%s TO %s]" % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search("*:*", sort="_ts desc", rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
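
A standalone sketch of the flattening described in the _clean_doc docstring above, reproducing its example output. The flatten function here is an illustrative re-implementation, not the internal helper:

def flatten(doc, path=()):
    # Recursively turn nested dicts and lists into dot-separated keys.
    out = {}
    for key, value in doc.items():
        new_path = path + (str(key),)
        if isinstance(value, dict):
            out.update(flatten(value, new_path))
        elif isinstance(value, list):
            out.update(flatten(dict(enumerate(value)), new_path))
        else:
            out['.'.join(new_path)] = value
    return out

print(flatten({"a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]}))
# {'a': 2, 'b.c.d': 5, 'e.0': 6, 'e.1': 7, 'e.2': 8}
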
Example #57
0
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs, as opposed to docs alone, is so that
    multiple updates to the same doc reflect the most up-to-date version,
    rather than multiple slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Verify Solr URL and establish a connection.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """ If Schema access, parse fields and build respective lists
        """
        field_list = []
        for key, value in result.get('schema', {}).get(field_name, {}).items():
            if key not in field_list:
                field_list.append(key)
        return field_list

    @wrap_exceptions
    def _build_fields(self):
        """ Builds a list of valid fields
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            if wc_pattern[0] == "*":
                self._dynamic_field_regexes.append(
                    re.compile(".*%s\Z" % wc_pattern[1:]))
            elif wc_pattern[-1] == "*":
                self._dynamic_field_regexes.append(
                    re.compile("\A%s.*" % wc_pattern[:-1]))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = doc.pop("_id")

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field) for regex in self._dynamic_field_regexes
                )
            return dict((k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if not '$set' in update_spec and not '$unset' in update_spec:
            # update spec contains the new document
            update_spec['_ts'] = doc['_ts']
            update_spec['ns'] = doc['ns']
            update_spec['_id'] = doc['_id']
            return update_spec
        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_set):
                    if key == to_set or key[len(to_set)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value
        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_unset):
                    if key == to_unset or key[len(to_unset)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        query = "%s:%s" % (self.unique_key, str(doc['_id']))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated)
            return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=str(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc)], commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d) for d in docs)
        if self.chunk_size > 0:
            # islice avoids a StopIteration escaping the generator
            # expression, which raises RuntimeError on Python 3.7+ (PEP 479).
            from itertools import islice
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc["_id"]),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    @wrap_exceptions
    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.
        """
        #search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
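
The auto_commit_interval handling above maps onto pysolr's commit/commitWithin arguments: 0 forces an immediate commit, a positive value becomes a commitWithin window in milliseconds, and None leaves committing to the server configuration. A hedged usage sketch; the core URL, interval, and document are placeholders:

from pysolr import Solr

solr = Solr("http://127.0.0.1:8983/solr/test_core")  # placeholder core
auto_commit_interval = 5 * 1000                      # 5 seconds, in ms

docs = [{"id": "doc_1", "title": "example"}]
if auto_commit_interval is not None:
    # Commit immediately only when the interval is 0; otherwise ask Solr
    # to commit within the given number of milliseconds.
    solr.add(docs,
             commit=(auto_commit_interval == 0),
             commitWithin=str(auto_commit_interval))
else:
    solr.add(docs, commit=False)
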
Example #58
0
            except:
                # print col
                print lineno
                validRow = False
            # print newTime.isoformat()
        elif header == 'fb_assoc':
            output[header] = col.strip().split(' ')
        elif header == 'geoloc':
            try:
                cleanCol = col.replace('geolocation{latitude=', '').replace(
                    'longitude=', '').replace('}', '').replace(', ', ',')
                # print cleanCol
                if cleanCol != 'null':
                    output[header] = cleanCol
            except:
                print lineno
                validRow = False
        else:
            output[header] = col
    if validRow:
        data.append(output)

    # update the index every 10000 documents (reduces overhead)
    if i > (10000 * index):
        conn.add(data)
        data = []
        index = index + 1
    i = i + 1

if data:
    conn.add(data)
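
The loop above buffers parsed rows and only calls conn.add() once per 10,000 documents, then flushes whatever is left after the loop. A compact sketch of the same batching pattern; the Solr URL and generated documents are placeholders:

from pysolr import Solr

conn = Solr("http://127.0.0.1:8983/solr/")  # placeholder URL
BATCH_SIZE = 10000

data = []
for doc in ({"id": str(n)} for n in range(25000)):  # placeholder documents
    data.append(doc)
    # Send a full batch to Solr to keep per-request overhead low.
    if len(data) >= BATCH_SIZE:
        conn.add(data)
        data = []

# Index whatever is left over after the loop.
if data:
    conn.add(data)
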
Example #59
-1
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://*****:*****')

    @unittest.skipIf(sys.version_info < (2, 7),
                     reason=u'Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing')
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response({'server': 'coyote'}, '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n')
        self.assertEqual(resp_3, ("Invalid Date String:'2015-03-23 10:43:33'", "Invalid Date String:'2015-03-23 10:43:33'"))

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response({'server': 'coyote'}, """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""")
        self.assertEqual(resp_4, (u"Internal Server Error", u"org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)"))

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>')
        self.assertEqual(resp_0, ('Something broke!', ''))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({'server': 'coyote'}, bogus_xml)
        self.assertEqual(reason, None)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))


    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z')
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)
        # search should default to 'select' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select/?'))

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search('doc', **{
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

        # search should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.search('doc', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)
        # more_like_this should default to 'mlt' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('mlt/?'))

        # more_like_this should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.more_like_this('id:doc_1', 'text', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})
        # suggest_terms should default to 'mlt' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('terms/?'))

        # suggest_terms should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.suggest_terms('title', '', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])
        # add should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

        # add should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.add([], handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        originalDocs = self.solr.search('doc')
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append( {'id': doc['id'], 'popularity': 5} )
        self.solr.add(updateList, fieldUpdates={'popularity': 'inc'})

        updatedDocs = self.solr.search('doc')
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['popularity'], originalDoc['popularity'] + 5)
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'popularity']))

        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        originalDocs = self.solr.search('multivalued')
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append( {'id': doc['id'], 'word_ss': ['epsilon', 'gamma']} )
        self.solr.add(updateList, fieldUpdates={'word_ss': 'add'})

        updatedDocs = self.solr.search('multivalued')
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['word_ss'], originalDoc['word_ss'] + ['epsilon', 'gamma'])
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        # delete should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))

        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

        # delete should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.delete(id='doc_1', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        # commit should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))
        self.assertEqual(len(self.solr.search('doc')), 4)

        # commit should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.commit(handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        # optimize should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))
        self.assertEqual(len(self.solr.search('doc')), 4)

        # optimize should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.optimize(handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_extract(self):
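        # extract() posts the file to Solr's 'update/extract' handler (the
        # Tika-backed extracting handler) and returns a dict exposing the
        # 'contents' and 'metadata' keys checked below.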
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)
        # extract should default to 'update/extract' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/extract'))

        # extract should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.extract(fake_f, handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
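        # _create_full_url() builds the final request URL from self.solr.url and a
        # path; joining a base URL with no trailing slash and a path with a leading
        # one must not produce '.../core0//update'.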
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')

    def test_request_handler(self):
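        # Two ways to route to a custom handler: with use_qt_param=True the handler
        # name is passed as a 'qt' parameter on the default 'select' path; with
        # use_qt_param=False the handler name replaces the request path itself.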
        before_test_use_qt_param = self.solr.use_qt_param
        before_test_search_handler = self.solr.search_handler

        self.solr.use_qt_param = True

        response = self.solr.search('my query')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select'))

        response = self.solr.search('my', handler='/autocomplete')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select'))
        self.assertIn("qt=%2Fautocomplete", args[1])

        self.solr.search_handler = '/autocomplete'

        response = self.solr.search('my')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select'))
        self.assertIn("qt=%2Fautocomplete", args[1])

        self.solr.use_qt_param = False
        # will change the path, so expect a 404
        with self.assertRaises(SolrError):
            response = self.solr.search('my')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('/autocomplete'))
        self.assertNotIn("qt=%2Fautocomplete", args[1])

        # reset the values to what they were before the test
        self.solr.use_qt_param = before_test_use_qt_param
        self.solr.search_handler = before_test_search_handler