Example #1
    def commit(self):
        try:
            conn = make_connection()
            conn.commit(wait_searcher=False)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
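All of the Solr-backed examples in this listing (commit, clear_index, delete_package, index_package, run) assume a shared make_connection() helper that the snippets themselves never define. A minimal sketch of what it might look like, assuming the solrpy library and a local Solr core; the URL and the helper body are illustrative guesses, not any project's actual code:

# Hypothetical sketch of the Solr helper assumed by the examples in this listing.
# solrpy's SolrConnection provides the add_many(), commit(), delete_query(),
# query() and raw_query() methods these snippets call.
import solr

SOLR_URL = 'http://127.0.0.1:8983/solr'  # assumed location of the core

def make_connection():
    return solr.SolrConnection(SOLR_URL)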
Example #2
def main():
    single_action = (len(sys.argv) == 2) and (sys.argv[1] == '--single-action')

    conn = make_connection()
    try:
        with conn.cursor() as cur:
            parser = PolyParser(single_action, conn, cur)
            try:
                while not parser.is_done():
                    act_inc(cur)
                    parser.parse_all()
                    global_live = act_dec(cur)
                    if single_action:
                        break
                    else:
                        future_live = parser.cond_notify()
                        if global_live or future_live:
                            parser.wait()
                        else:
                            parser.do_notify()
                            print("all done", file=sys.stderr)
                            break
            finally:
                parser.close()
    finally:
        conn.close()
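The database-oriented main() examples in this listing instead expect make_connection() to return a DB-API connection whose cursors work as context managers, matching the `with conn.cursor() as cur:` pattern above. A minimal sketch, assuming psycopg2 and a local PostgreSQL database; the connection parameters are placeholders:

# Hypothetical sketch of the database helper assumed by the main() examples.
# psycopg2 cursors support the with statement, so conn.cursor() can be used
# as a context manager exactly as in the snippets here.
import psycopg2

def make_connection():
    # placeholder connection parameters, adjust for a real deployment
    return psycopg2.connect(dbname='crawl', user='crawler', host='localhost')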
Example #3
def contacts():
    """Shows us the contact person in specified school"""

    conn = common.make_connection()
    rows = query.contacts(conn)
    conn.close()
    return render_template('contacts.html', rows=rows)
Example #4
def all_school():
    """Lists all schools with their mentors - even if there's no mentor in specified school"""

    conn = common.make_connection()
    rows = query.all_school(conn)
    conn.close()
    return render_template('all_school.html', rows=rows)
Example #5
def applicants_and_mentors():
    """Shows us applicants name, code and mentor's name"""

    conn = common.make_connection()
    rows = query.applicants_and_mentors(conn)
    conn.close()
    return render_template("applicants_and_mentors.html", rows=rows)
Example #6
def applicants():
    """Lists the applicants name, code and the application's date if it was greater than 2016-01-01"""

    conn = common.make_connection()
    rows = query.applicants(conn)
    conn.close()
    return render_template('applicants.html', rows=rows)
Example #7
    def commit(self):
        try:
            conn = make_connection()
            conn.commit(waitSearcher=False)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #8
def main():
    dump_header = False
    if (len(sys.argv) > 1) and (sys.argv[1] == '-H'):
        dump_header = True
        del sys.argv[1]

    conn = make_connection()
    try:
        with conn.cursor() as cur:
            dumper = Dumper(cur, dump_header)
            try:
                for url in sys.argv[1:]:
                    cur.execute(
                        """select id, checkd
from field
where url=%s""", (url, ))
                    row = cur.fetchone()
                    if not row:
                        print(url + " not found", file=sys.stderr)
                    else:
                        if row[1] is None:
                            print(url + " not downloaded", file=sys.stderr)
                        else:
                            dumper.dump(url, row[0])
            finally:
                dumper.close()
    finally:
        conn.close()
Example #9
def mentors_by_country():
    """ Shows us the number of mentors by country"""

    conn = common.make_connection()
    rows = query.mentors_by_country(conn)
    conn.close()
    return render_template('mentors_by_country.html', rows=rows)
Example #10
def mentors_and_schools():
    " Lists all mentors with thair school's name and country ordered by mentor id"

    conn = common.make_connection()
    rows = query.mentors(conn)
    conn.close()
    return render_template('mentors.html', rows=rows)
Example #11
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            inner_select = """select url_id
from download_error"""

            cur.execute("""update field
set checkd=null
where id in (%s)""" % inner_select)

            cur.execute("""delete from locality
where url_id in (%s)""" % inner_select)
            cur.execute("""delete from content
where url_id in (%s)""" % inner_select)

            cur.execute("delete from download_error")

            kicker = Kicker(cur)
            kicker.run()

            seeder = Seeder(cur)
            seeder.seed_queue()
    finally:
        conn.close()
Example #12
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            retriever = Retriever(cur)
            retriever.retrieve_all()
    finally:
        conn.close()
Example #13
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            decompressor = Decompressor(cur)
            decompressor.decompress()
    finally:
        conn.close()
Example #14
    def index_package(self, pkg_dict):
        if pkg_dict is None:  
            return 

        if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        conn = make_connection()
        index_fields = RESERVED_FIELDS + pkg_dict.keys()
            
        # include the extras in the main namespace
        extras = pkg_dict.get('extras', {})
        for (key, value) in extras.items():
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        if 'extras' in pkg_dict:
            del pkg_dict['extras']

        # flatten the structure for indexing: 
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'),
                                 ('url', 'res_url')]:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        if 'resources' in pkg_dict:
            del pkg_dict['resources']
        
        # index relationships as <type>:<object>
        rel_dict = {}
        rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
        for rel in pkg_dict.get('relationships', []):
            _type = rel.get('type', 'rel')
            if (_type in pkg_dict.keys()) or (_type not in rel_types): 
                continue
            rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
        
        pkg_dict.update(rel_dict)
        
        if 'relationships' in pkg_dict:
            del pkg_dict['relationships']

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
        pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])
        
        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')
        
        # send to solr:  
        try:
            conn.add_many([pkg_dict])
            conn.commit(wait_flush=False, wait_searcher=False)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #15
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            builder = Builder(cur)
            builder.prepare()
            builder.process()
    finally:
        conn.close()
Example #16
def clear_index():
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete_query(query)
        conn.commit()
    except socket.error as e:
        log.error('Could not connect to SOLR: %r' % e)
        raise
Example #17
def clear_index():
    conn = make_connection()
    query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
    try:
        conn.delete(q=query)
    except socket.error as e:
        err = 'Could not connect to SOLR %r: %r' % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
Example #18
    def delete_package(self, pkg_dict):
        conn = make_connection()
        query = "+%s:%s AND +(id:\"%s\" OR name:\"%s\") AND +site_id:\"%s\"" % \
                (TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'), config.get('ckan.site_id'))
        try:
            commit = asbool(config.get('ckan.search.solr_commit', 'true'))
            conn.delete(q=query, commit=commit)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #19
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            compressor = Compressor(cur)
            try:
                compressor.compress_all()
            finally:
                compressor.close()
    finally:
        conn.close()
Example #20
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            # maybe check here whether download and/or parse is running? it shouldn't...
            act_reset(cur)

            kicker = Kicker(cur)
            kicker.run()
    finally:
        conn.close()
Example #21
    def delete_package(self, pkg_dict):
        conn = make_connection()
        query = "+%s:%s (+id:\"%s\" OR +name:\"%s\") +site_id:\"%s\"" % (
            TYPE_FIELD, PACKAGE_TYPE, pkg_dict.get('id'), pkg_dict.get('id'),
            config.get('ckan.site_id'))
        try:
            conn.delete_query(query)
            conn.commit()
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #22
    def delete_package(self, pkg_dict):
        conn = make_connection()
        query = "+%s:%s (+id:\"%s\" OR +name:\"%s\") +site_id:\"%s\"" % (TYPE_FIELD, PACKAGE_TYPE,
                                                                         pkg_dict.get('id'), pkg_dict.get('id'),
                                                                         config.get('ckan.site_id'))
        try:
            commit = asbool(config.get('ckan.search.solr_commit', 'true'))
            conn.delete(q=query, commit=commit)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #23
    def delete_package(self, pkg_dict):
        conn = make_connection()
        query = "+%s:%s +id:\"%s\" +site_id:\"%s\"" % (TYPE_FIELD, PACKAGE_TYPE,
                                                       pkg_dict.get('id'),
                                                       config.get('ckan.site_id'))
        try:
            conn.delete_query(query)
            conn.commit()
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #24
def clear_index():
    import solr.core

    conn = make_connection()
    query = '+site_id:"%s"' % (config.get("ckan.site_id"))
    try:
        conn.delete_query(query)
        conn.commit()
    except socket.error as e:
        err = "Could not connect to SOLR %r: %r" % (conn.url, e)
        log.error(err)
        raise SearchIndexError(err)
Example #25
    def get_index(self, reference):
        query = {
            'rows': 1,
            'q': 'name:%s OR id:%s' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')}

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e.reason))
Example #26
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            builder = Builder(cur)
            cur.execute(
                """select from_id, to_id from redirect order by from_id, to_id"""
            )
            rows = cur.fetchall()
            for row in rows:
                builder.add(*row)

            builder.dump()
    finally:
        conn.close()
Example #27
    def get_all_entity_ids(self, max_results=1000):
        """
        Return a list of the IDs of all indexed packages.
        """
        query = "*:*"
        fq = "+site_id:\"%s\" " % config.get('ckan.site_id')
        fq += "+state:active "

        conn = make_connection()
        try:
            data = conn.query(query, fq=fq, rows=max_results, fields='id')
        finally:
            conn.close()

        return [r.get('id') for r in data.results]
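As a usage illustration only: get_all_entity_ids() is handy for checking what the index currently holds, for example to diff it against the database. The caller below is hypothetical; PackageSearchIndex is an assumed name for the class defining the methods in this listing:

# Hypothetical caller; PackageSearchIndex is an assumed class name.
index = PackageSearchIndex()
indexed_ids = set(index.get_all_entity_ids(max_results=10000))
print("%d packages currently indexed" % len(indexed_ids))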
Example #28
    def delete_package(self, pkg_dict):
        conn = make_connection()
        query = '+%s:%s (+id:"%s" OR +name:"%s") +site_id:"%s"' % (
            TYPE_FIELD,
            PACKAGE_TYPE,
            pkg_dict.get("id"),
            pkg_dict.get("id"),
            config.get("ckan.site_id"),
        )
        try:
            conn.delete_query(query)
            conn.commit()
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #29
    def get_index(self, reference):
        query = {
            'rows': 1,
            'q': 'name:%s OR id:%s' % (reference, reference),
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError(
                'SOLR returned an error running query: %r Error: %r' %
                (query, e.reason))
Example #30
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            tracer = Tracer(cur)
            try:
                cur.execute("""select url, id
from field
where checkd is not null
order by url""")
                rows = cur.fetchall()
                for row in rows:
                    tracer.parse(*row)
            finally:
                tracer.close()
    finally:
        conn.close()
Example #31
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            squisher = Squisher(cur)
            cur.execute("""select url_id
from nodes
order by url_id""")
            rows = cur.fetchall()
            idx = 0
            for row in rows:
                squisher.add(row[0])
                idx += 1
                if not (idx % 10000):
                    print("id %d..." % (idx, ))
    finally:
        conn.close()
Example #32
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            builder = Builder(cur)
            try:
                cur.execute("""select url, id
from field
left join nodes on id=url_id
where checkd is not null and (url_id is null or depth=0)
order by url""")
                rows = cur.fetchall()
                for row in rows:
                    builder.add(*row)
            finally:
                builder.close()
    finally:
        conn.close()
Example #33
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            extender = Extender(cur)
            try:
                cur.execute("""select url, id
from field
left join extra on field.id=extra.url_id
where checkd is not null and has_body is null
order by url""")
                rows = cur.fetchall()
                for row in rows:
                    extender.extend(*row)
            finally:
                extender.close()
    finally:
        conn.close()
Example #34
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            purger = Purger(cur)
            for url in sys.argv[1:]:
                cur.execute("""select id
from field
where url=%s""", (url,))
                row = cur.fetchone()
                if not row:
                    print(url + " not found", file=sys.stderr)
                else:
                    purger.purge_fast(row[0])

            purger.purge_rest()
    finally:
        conn.close()
Example #35
def main():
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            bfs = BreathFirstSearch(cur)
            # graph.py actually doesn't fill the root (it only fills
            # nodes with parents) - it should be done by seed.py
            bfs.check_pre()
            depth = 0
            count = 1
            while count:
                count = bfs.step(depth)
                depth += 1
                print("found %d nodes at depth %d" % (count, depth))

            bfs.check_post()
    finally:
        conn.close()
Example #36
def main():
    paydirt = get_mandatory_option('paydirt_rx')
    conn = make_connection()
    try:
        with conn.cursor() as cur:
            adder = Adder(cur)
            cur.execute(
                """select url, url_id, depth
from nodes
join field
on url_id=id
where url ~ %s
and depth is not null
order by url""", (paydirt, ))
            rows = cur.fetchall()
            for row in rows:
                adder.add(*row)
    finally:
        conn.close()
Example #37
def main():
    single_action = (len(sys.argv) == 2) and (sys.argv[1] == '--single-action')

    conn = make_connection()
    try:
        with conn.cursor() as cur:
            retriever = Retriever(single_action, conn, cur)
            while True:
                act_inc(cur)
                retriever.retrieve_all()
                global_live = act_dec(cur)
                if single_action:
                    break
                else:
                    future_live = retriever.cond_notify()
                    if global_live or future_live or retriever.has_holds():
                        retriever.wait()
                    else:
                        retriever.last_notify()
                        print("all done", file=sys.stderr)
                        break
    finally:
        conn.close()
Example #38
    def run(self, query):
        '''
        Performs a dataset search using the given query.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count

        May raise SearchQueryError or SearchError.
        '''
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
            invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results
        rows_to_return = min(1000, int(query.get('rows', 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query['rows'] = rows_to_query

        # order by score if no 'sort' term given
        order_by = query.get('sort')
        if order_by == 'rank' or order_by is None:
            query['sort'] = 'score desc, name asc'

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if not '+site_id:' in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status
        if not '+state:' in fq:
            fq += " +state:active"
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')

        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # If the query has a colon in it then consider it a fielded search and do not use dismax.
        if ':' not in query['q']:
            query['defType'] = 'dismax'
            query['tie'] = '0.1'
            query['mm'] = '1'
            query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e.reason))
Example #39
    def index_package(self, pkg_dict):
        if pkg_dict is None:
            return

        # add to string field for sorting
        title = pkg_dict.get("title")
        if title:
            pkg_dict["title_string"] = title

        if (not pkg_dict.get("state")) or ("active" not in pkg_dict.get("state")):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get("extras", [])
        for extra in extras:
            key, value = extra["key"], json.loads(extra["value"])
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = "".join([c for c in key if c in KEY_CHARS])
            pkg_dict["extras_" + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop("extras", None)

        # Add tags and groups
        tags = pkg_dict.pop("tags", [])
        pkg_dict["tags"] = [tag["name"] for tag in tags]

        groups = pkg_dict.pop("groups", [])

        # Capacity is different to the default only if using organizations
        # where the dataset is only in one group. We will add the capacity
        # from the single group that it is a part of if we have a group
        if len(groups):
            pkg_dict["capacity"] = groups[0].get("capacity", "public")
        else:
            pkg_dict["capacity"] = "public"

        pkg_dict["groups"] = [group["name"] for group in groups]

        # tracking
        tracking_summary = pkg_dict.pop("tracking_summary", None)
        if tracking_summary:
            pkg_dict["views_total"] = tracking_summary["total"]
            pkg_dict["views_recent"] = tracking_summary["recent"]

        # flatten the structure for indexing:
        for resource in pkg_dict.get("resources", []):
            for (okey, nkey) in [("description", "res_description"), ("format", "res_format"), ("url", "res_url")]:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u"")]
        pkg_dict.pop("resources", None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel["type"])
            rel_dict[type].append(model.Package.get(rel["subject_package_id"]).name)
        for rel in subjects:
            type = rel["type"]
            rel_dict[type].append(model.Package.get(rel["object_package_id"]).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        pkg_dict = dict([(k.encode("ascii", "ignore"), v) for (k, v) in pkg_dict.items()])

        for k in ("title", "notes", "title_string"):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO
        # dates with UTC time, i.e. a trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict["metadata_created"] += "Z"
        pkg_dict["metadata_modified"] += "Z"

        # mark this CKAN instance as data source:
        pkg_dict["site_id"] = config.get("ckan.site_id")

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ["title"]:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib

        pkg_dict["index_id"] = hashlib.md5("%s%s" % (pkg_dict["id"], config.get("ckan.site_id"))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, "Plugin must return non empty package dict on index"

        # send to solr:
        try:
            conn = make_connection()
            conn.add_many([pkg_dict])
            conn.commit(wait_flush=False, wait_searcher=False)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #40
    def index_package(self, pkg_dict):
        if pkg_dict is None:  
            return 

        if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        conn = make_connection()
        index_fields = RESERVED_FIELDS + pkg_dict.keys()
            
        # include the extras in the main namespace
        extras = pkg_dict.get('extras', {})
        for (key, value) in extras.items():
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        if 'extras' in pkg_dict:
            del pkg_dict['extras']

        # flatten the structure for indexing: 
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'),
                                 ('url', 'res_url'),
                                 ('name', 'res_name')]:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        if 'resources' in pkg_dict:
            del pkg_dict['resources']
        
        # index relationships as <type>:<object>
        rel_dict = {}
        rel_types = list(itertools.chain(RELATIONSHIP_TYPES))
        for rel in pkg_dict.get('relationships', []):
            _type = rel.get('type', 'rel')
            if (_type in pkg_dict.keys()) or (_type not in rel_types): 
                continue
            rel_dict[_type] = rel_dict.get(_type, []) + [rel.get('object')]
        
        pkg_dict.update(rel_dict)
        
        if 'relationships' in pkg_dict:
            del pkg_dict['relationships']

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE
        pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO
        # dates with UTC time, i.e. a trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')
        
        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:  
        try:
            conn.add_many([pkg_dict])
            conn.commit(wait_flush=False, wait_searcher=False)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #41
    def run(self, query):
        '''
        Performs a dataset search using the given query.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count
        
        May raise SearchQueryError or SearchError.
        '''
        from solr import SolrException
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
            invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results
        query['rows'] = min(1000, int(query.get('rows', 10)))

        # order by score if no 'sort' term given
        order_by = query.get('sort')
        if order_by == 'rank' or order_by is None: 
            query['sort'] = 'score desc, name asc'

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if not '+site_id:' in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status       
        if not '+state:' in fq:
            fq += " +state:active"
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')
        
        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # query field weighting: disabled for now as solr 3.* is required for 
        # the 'edismax' query parser, our current Ubuntu version only has
        # packages for 1.4
        #
        # query['defType'] = 'edismax'
        # query['tie'] = '0.5'
        # query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        log.debug('Package query: %r' % query)
        
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError('SOLR returned an error running query: %r Error: %r' %
                              (query, e.reason))
Example #42
    def run(self, query):
        """
        Performs a dataset search using the given query.

        @param query - dictionary with keys like: q, fq, sort, rows, facet
        @return - dictionary with keys results and count
        
        May raise SearchQueryError or SearchError.
        """
        from solr import SolrException

        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
            invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # default query is to return all documents
        q = query.get("q")
        if not q or q == '""' or q == "''":
            query["q"] = "*:*"

        # number of results
        rows_to_return = min(1000, int(query.get("rows", 10)))
        if rows_to_return > 0:
            # #1683 Work around problem of last result being out of order
            #       in SOLR 1.4
            rows_to_query = rows_to_return + 1
        else:
            rows_to_query = rows_to_return
        query["rows"] = rows_to_query

        # order by score if no 'sort' term given
        order_by = query.get("sort")
        if order_by == "rank" or order_by is None:
            query["sort"] = "score desc, name asc"

        # show only results from this CKAN instance
        fq = query.get("fq", "")
        if not "+site_id:" in fq:
            fq += ' +site_id:"%s"' % config.get("ckan.site_id")

        # filter for package status
        if not "+state:" in fq:
            fq += " +state:active"
        query["fq"] = fq

        # faceting
        query["facet"] = query.get("facet", "true")
        query["facet.limit"] = query.get("facet.limit", config.get("search.facets.limit", "50"))
        query["facet.mincount"] = query.get("facet.mincount", 1)

        # return the package ID and search scores
        query["fl"] = query.get("fl", "name")

        # return results as json encoded string
        query["wt"] = query.get("wt", "json")

        # If the query has a colon in it then consider it a fielded search and do not use dismax.
        if ":" not in query["q"]:
            query["defType"] = "dismax"
            query["tie"] = "0.1"
            query["mm"] = "1"
            query["qf"] = query.get("qf", QUERY_FIELDS)

        conn = make_connection()
        log.debug("Package query: %r" % query)
        try:
            solr_response = conn.raw_query(**query)
        except SolrException as e:
            raise SearchError("SOLR returned an error running query: %r Error: %r" % (query, e.reason))
Example #43
    def index_package(self, pkg_dict):
        if pkg_dict is None:
            return
        pkg_dict['data_dict'] = json.dumps(pkg_dict)

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], json.loads(extra['value'])
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        #Add tags and groups
        tags = pkg_dict.pop('tags', [])
        pkg_dict['tags'] = [tag['name'] for tag in tags]

        groups = pkg_dict.pop('groups', [])

        # Capacity is different to the default only if using organizations
        # where the dataset is only in one group. We will add the capacity
        # from the single group that it is a part of if we have a group
        if len(groups):
            pkg_dict['capacity'] = groups[0].get('capacity', 'public')
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # tracking
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        if tracking_summary:
            pkg_dict['views_total'] = tracking_summary['total']
            pkg_dict['views_recent'] = tracking_summary['recent']

        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'),
                                 ('url', 'res_url')]:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
            rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO
        # dates with UTC time, i.e. a trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],config.get('ckan.site_id'))).hexdigest()


        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn = make_connection()
            conn.add_many([pkg_dict])
            conn.commit(wait_flush=False, wait_searcher=False)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #44
    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return

        data_dict_json = json.dumps(pkg_dict)

        if config.get("ckan.cache_validated_datasets", True):
            package_plugin = lib_plugins.lookup_package_plugin(pkg_dict.get("type"))

            schema = package_plugin.show_package_schema()
            validated_pkg_dict, errors = lib_plugins.plugin_validate(
                package_plugin, {"model": model, "session": model.Session}, pkg_dict, schema, "package_show"
            )
            pkg_dict["validated_data_dict"] = json.dumps(
                validated_pkg_dict, cls=ckan.lib.navl.dictization_functions.MissingNullEncoder
            )

        pkg_dict["data_dict"] = data_dict_json

        # add to string field for sorting
        title = pkg_dict.get("title")
        if title:
            pkg_dict["title_string"] = title

        if (not pkg_dict.get("state")) or ("active" not in pkg_dict.get("state")):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get("extras", [])
        for extra in extras:
            key, value = extra["key"], extra["value"]
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = "".join([c for c in key if c in KEY_CHARS])
            pkg_dict["extras_" + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop("extras", None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop("tags", [])
        context = {"model": model}

        for tag in tags:
            if tag.get("vocabulary_id"):
                data = {"id": tag["vocabulary_id"]}
                vocab = logic.get_action("vocabulary_show")(context, data)
                key = u"vocab_%s" % vocab["name"]
                if key in pkg_dict:
                    pkg_dict[key].append(tag["name"])
                else:
                    pkg_dict[key] = [tag["name"]]
            else:
                non_vocab_tag_names.append(tag["name"])

        pkg_dict["tags"] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop("groups", [])

        # we use the capacity to make things private in the search index
        if pkg_dict["private"]:
            pkg_dict["capacity"] = "private"
        else:
            pkg_dict["capacity"] = "public"

        pkg_dict["groups"] = [group["name"] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict.get("organization"):
            pkg_dict["organization"] = pkg_dict["organization"]["name"]
        else:
            pkg_dict["organization"] = None

        # tracking
        tracking_summary = pkg_dict.pop("tracking_summary", None)
        if tracking_summary:
            pkg_dict["views_total"] = tracking_summary["total"]
            pkg_dict["views_recent"] = tracking_summary["recent"]

        resource_fields = [
            ("name", "res_name"),
            ("description", "res_description"),
            ("format", "res_format"),
            ("url", "res_url"),
            ("resource_type", "res_type"),
        ]
        resource_extras = [(e, "res_extras_" + e) for e in model.Resource.get_extra_columns()]
        # flatten the structure for indexing:
        for resource in pkg_dict.get("resources", []):
            for (okey, nkey) in resource_fields + resource_extras:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u"")]
        pkg_dict.pop("resources", None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel["type"])
            rel_dict[type].append(model.Package.get(rel["subject_package_id"]).name)
        for rel in subjects:
            type = rel["type"]
            rel_dict[type].append(model.Package.get(rel["object_package_id"]).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        # Save dataset type
        pkg_dict["dataset_type"] = pkg_dict["type"]

        # clean the dict fixing keys and dates
        # FIXME where are we getting these dirty keys from?  can we not just
        # fix them in the correct place or is this something that always will
        # be needed?  For my data not changing the keys seems to not cause a
        # problem.
        new_dict = {}
        bogus_date = datetime.datetime(1, 1, 1)
        for key, value in pkg_dict.items():
            key = key.encode("ascii", "ignore")
            if key.endswith("_date"):
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        value = date.isoformat() + "Z"
                    else:
                        # The date field was empty, so dateutil filled it with
                        # the default bogus date
                        value = None
                except ValueError:
                    continue
            new_dict[key] = value
        pkg_dict = new_dict

        for k in ("title", "notes", "title_string"):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO
        # dates with UTC time, i.e. a trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict["metadata_created"] += "Z"
        pkg_dict["metadata_modified"] += "Z"

        # mark this CKAN instance as data source:
        pkg_dict["site_id"] = config.get("ckan.site_id")

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ["title"]:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib

        pkg_dict["index_id"] = hashlib.md5("%s%s" % (pkg_dict["id"], config.get("ckan.site_id"))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, "Plugin must return non empty package dict on index"

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get("ckan.search.solr_commit", "true")):
                commit = False
            conn.add_many([pkg_dict], _commit=commit)
        except solr.core.SolrException as e:
            msg = "Solr returned an error: {0} {1} - {2}".format(
                e.httpcode, e.reason, e.body[:1000]  # limit huge responses
            )
            raise SearchIndexError(msg)
Example #45
    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return

        # tracking summary values will be stale, never store them
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        for r in pkg_dict.get('resources', []):
            r.pop('tracking_summary', None)

        data_dict_json = json.dumps(pkg_dict)

        if config.get('ckan.cache_validated_datasets', True):
            package_plugin = lib_plugins.lookup_package_plugin(
                pkg_dict.get('type'))

            schema = package_plugin.show_package_schema()
            validated_pkg_dict, errors = lib_plugins.plugin_validate(
                package_plugin, {'model': model, 'session': model.Session},
                pkg_dict, schema, 'package_show')
            pkg_dict['validated_data_dict'] = json.dumps(validated_pkg_dict,
                cls=ckan.lib.navl.dictization_functions.MissingNullEncoder)

        pkg_dict['data_dict'] = data_dict_json

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        # delete the package if there is no state, or the state is `deleted`
        if (not pkg_dict.get('state') or 'deleted' in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], extra['value']
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop('tags', [])
        context = {'model': model}

        for tag in tags:
            if tag.get('vocabulary_id'):
                data = {'id': tag['vocabulary_id']}
                vocab = logic.get_action('vocabulary_show')(context, data)
                key = u'vocab_%s' % vocab['name']
                if key in pkg_dict:
                    pkg_dict[key].append(tag['name'])
                else:
                    pkg_dict[key] = [tag['name']]
            else:
                non_vocab_tag_names.append(tag['name'])

        pkg_dict['tags'] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop('groups', [])

        # we use the capacity to make things private in the search index
        if pkg_dict['private']:
            pkg_dict['capacity'] = 'private'
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict.get('organization'):
            pkg_dict['organization'] = pkg_dict['organization']['name']
        else:
            pkg_dict['organization'] = None

        # tracking
        if not tracking_summary:
            tracking_summary = model.TrackingSummary.get_for_package(
                pkg_dict['id'])
        pkg_dict['views_total'] = tracking_summary['total']
        pkg_dict['views_recent'] = tracking_summary['recent']

        resource_fields = [('name', 'res_name'),
                           ('description', 'res_description'),
                           ('format', 'res_format'),
                           ('url', 'res_url'),
                           ('resource_type', 'res_type')]
        resource_extras = [(e, 'res_extras_' + e) for e
                            in model.Resource.get_extra_columns()]
        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in resource_fields + resource_extras:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
            rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        # Save dataset type
        pkg_dict['dataset_type'] = pkg_dict['type']

        # clean the dict fixing keys and dates
        # FIXME where are we getting these dirty keys from?  can we not just
        # fix them in the correct place or is this something that always will
        # be needed?  For my data not changing the keys seems to not cause a
        # problem.
        new_dict = {}
        bogus_date = datetime.datetime(1, 1, 1)
        for key, value in pkg_dict.items():
            key = key.encode('ascii', 'ignore')
            if key.endswith('_date'):
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        value = date.isoformat() + 'Z'
                    else:
                        # The date field was empty, so dateutil filled it with
                        # the default bogus date
                        value = None
                except ValueError:
                    continue
            new_dict[key] = value
        pkg_dict = new_dict

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO
        # dates with UTC time, i.e. a trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get('ckan.search.solr_commit', 'true')):
                commit = False
            conn.add(docs=[pkg_dict], commit=commit)
        except pysolr.SolrError as e:
            msg = 'Solr returned an error: {0}'.format(
                str(e)[:1000]  # limit huge responses
            )
            raise SearchIndexError(msg)
Example #46
    def index_package(self, pkg_dict, defer_commit=False):
        if pkg_dict is None:
            return
        pkg_dict['data_dict'] = json.dumps(pkg_dict)

        # add to string field for sorting
        title = pkg_dict.get('title')
        if title:
            pkg_dict['title_string'] = title

        if (not pkg_dict.get('state')) or ('active' not in pkg_dict.get('state')):
            return self.delete_package(pkg_dict)

        index_fields = RESERVED_FIELDS + pkg_dict.keys()

        # include the extras in the main namespace
        extras = pkg_dict.get('extras', [])
        for extra in extras:
            key, value = extra['key'], extra['value']
            if isinstance(value, (tuple, list)):
                value = " ".join(map(unicode, value))
            key = ''.join([c for c in key if c in KEY_CHARS])
            pkg_dict['extras_' + key] = value
            if key not in index_fields:
                pkg_dict[key] = value
        pkg_dict.pop('extras', None)

        # add tags, removing vocab tags from 'tags' list and adding them as
        # vocab_<tag name> so that they can be used in facets
        non_vocab_tag_names = []
        tags = pkg_dict.pop('tags', [])
        context = {'model': model}

        for tag in tags:
            if tag.get('vocabulary_id'):
                data = {'id': tag['vocabulary_id']}
                vocab = logic.get_action('vocabulary_show')(context, data)
                key = u'vocab_%s' % vocab['name']
                if key in pkg_dict:
                    pkg_dict[key].append(tag['name'])
                else:
                    pkg_dict[key] = [tag['name']]
            else:
                non_vocab_tag_names.append(tag['name'])

        pkg_dict['tags'] = non_vocab_tag_names

        # add groups
        groups = pkg_dict.pop('groups', [])

        # we use the capacity to make things private in the search index
        if pkg_dict['private']:
            pkg_dict['capacity'] = 'private'
        else:
            pkg_dict['capacity'] = 'public'

        pkg_dict['groups'] = [group['name'] for group in groups]

        # if there is an owner_org we want to add this to groups for index
        # purposes
        if pkg_dict['owner_org']:
            pkg_dict['groups'].append(pkg_dict['organization']['name'])


        # tracking
        tracking_summary = pkg_dict.pop('tracking_summary', None)
        if tracking_summary:
            pkg_dict['views_total'] = tracking_summary['total']
            pkg_dict['views_recent'] = tracking_summary['recent']

        # flatten the structure for indexing:
        for resource in pkg_dict.get('resources', []):
            for (okey, nkey) in [('description', 'res_description'),
                                 ('format', 'res_format'),
                                 ('url', 'res_url')]:
                pkg_dict[nkey] = pkg_dict.get(nkey, []) + [resource.get(okey, u'')]
        pkg_dict.pop('resources', None)

        rel_dict = collections.defaultdict(list)
        subjects = pkg_dict.pop("relationships_as_subject", [])
        objects = pkg_dict.pop("relationships_as_object", [])
        for rel in objects:
            type = model.PackageRelationship.forward_to_reverse_type(rel['type'])
            rel_dict[type].append(model.Package.get(rel['subject_package_id']).name)
        for rel in subjects:
            type = rel['type']
            rel_dict[type].append(model.Package.get(rel['object_package_id']).name)
        for key, value in rel_dict.iteritems():
            if key not in pkg_dict:
                pkg_dict[key] = value

        pkg_dict[TYPE_FIELD] = PACKAGE_TYPE

        # Save dataset type
        pkg_dict['dataset_type'] = pkg_dict['type']

        pkg_dict = dict([(k.encode('ascii', 'ignore'), v) for (k, v) in pkg_dict.items()])

        for k in ('title', 'notes', 'title_string'):
            if k in pkg_dict and pkg_dict[k]:
                pkg_dict[k] = escape_xml_illegal_chars(pkg_dict[k])

        # modify dates (SOLR is quite picky with dates, and only accepts ISO dates
        # with UTC time (i.e trailing Z)
        # See http://lucene.apache.org/solr/api/org/apache/solr/schema/DateField.html
        pkg_dict['metadata_created'] += 'Z'
        pkg_dict['metadata_modified'] += 'Z'

        # mark this CKAN instance as data source:
        pkg_dict['site_id'] = config.get('ckan.site_id')

        # Strip a selection of the fields.
        # These fields are possible candidates for sorting search results on,
        # so we strip leading spaces because solr will sort " " before "a" or "A".
        for field_name in ['title']:
            try:
                value = pkg_dict.get(field_name)
                if value:
                    pkg_dict[field_name] = value.lstrip()
            except KeyError:
                pass

        # add a unique index_id to avoid conflicts
        import hashlib
        pkg_dict['index_id'] = hashlib.md5('%s%s' % (pkg_dict['id'],config.get('ckan.site_id'))).hexdigest()

        for item in PluginImplementations(IPackageController):
            pkg_dict = item.before_index(pkg_dict)

        assert pkg_dict, 'Plugin must return non empty package dict on index'

        # send to solr:
        try:
            conn = make_connection()
            commit = not defer_commit
            if not asbool(config.get('ckan.search.solr_commit', 'true')):
                commit = False
            conn.add_many([pkg_dict], _commit=commit)
        except Exception as e:
            log.exception(e)
            raise SearchIndexError(e)
Example #47
    def run(self, query):
        assert isinstance(query, (dict, MultiDict))
        # check that query keys are valid
        if not set(query.keys()) <= VALID_SOLR_PARAMETERS:
            invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS]
            raise SearchQueryError("Invalid search parameters: %s" % invalid_params)

        # default query is to return all documents
        q = query.get('q')
        if not q or q == '""' or q == "''":
            query['q'] = "*:*"

        # number of results
        query['rows'] = min(1000, int(query.get('rows', 10)))

        # order by score if no 'sort' term given
        order_by = query.get('sort')
        if order_by == 'rank' or order_by is None: 
            query['sort'] = 'score desc, name asc'

        # show only results from this CKAN instance
        fq = query.get('fq', '')
        if not '+site_id:' in fq:
            fq += ' +site_id:"%s"' % config.get('ckan.site_id')

        # filter for package status       
        if not '+state:' in fq:
            fq += " +state:active"
        query['fq'] = fq

        # faceting
        query['facet'] = query.get('facet', 'true')
        query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50'))
        query['facet.mincount'] = query.get('facet.mincount', 1)

        # return the package ID and search scores
        query['fl'] = query.get('fl', 'name')
        
        # return results as json encoded string
        query['wt'] = query.get('wt', 'json')

        # query field weighting: disabled for now as solr 3.* is required for 
        # the 'edismax' query parser, our current Ubuntu version only has
        # packages for 1.4
        #
        # query['defType'] = 'edismax'
        # query['tie'] = '0.5'
        # query['qf'] = query.get('qf', QUERY_FIELDS)

        conn = make_connection()
        try:
            log.debug('Package query: %r' % query)
            data = json.loads(conn.raw_query(**query))
            response = data['response']
            self.count = response.get('numFound', 0)
            self.results = response.get('docs', [])

            # if just fetching the id or name, return a list instead of a dict
            if query.get('fl') in ['id', 'name']:
                self.results = [r.get(query.get('fl')) for r in self.results]

            # get facets and convert facets list to a dict
            self.facets = data.get('facet_counts', {}).get('facet_fields', {})
            for field, values in self.facets.iteritems():
                self.facets[field] = dict(zip(values[0::2], values[1::2]))
        except Exception as e:
            log.exception(e)
            raise SearchError(e)