def search(request, pagename):
    from ductus.index import get_indexing_mongo_database
    indexing_db = get_indexing_mongo_database()
    if indexing_db is None:
        raise Http404("indexing database is not available")
    collection = indexing_db.urn_index

    # figure out target language (if given).
    # fixme: this probably doesn't belong here
    target_language_tags = [tag for tag in request.GET.getlist('tag')
                            if tag.startswith('target-language:')]
    target_language_code = None
    target_language_description = None
    if target_language_tags:
        target_language_code = target_language_tags[0].partition(':')[2]
        from ductus.utils.bcp47 import language_tag_to_description
        try:
            target_language_description = language_tag_to_description(target_language_code)
        except KeyError:
            pass

    # return results to the user
    return render_to_response('special/search.html', {
        'target_language_code': target_language_code,
        'target_language_description': target_language_description,
    }, RequestContext(request))
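# Illustrative sketch only (not part of the codebase): exercising the search
# view with Django's test client.  The /special/search path is assumed from
# the hrefs generated by otics_front_page below; the expected description
# value is an assumption based on ductus.utils.bcp47.
from django.test import Client

def _search_view_smoke_test():
    client = Client()
    response = client.get('/special/search', {'tag': 'target-language:fr'})
    # The view takes the first target-language:* value from ?tag=..., so the
    # template context should carry target_language_code == 'fr' and (assuming
    # the bcp47 table knows the code) target_language_description == 'French'.
    assert response.status_code == 200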
def otics_front_page(request, pagename=None):
    from ductus.index import get_indexing_mongo_database
    indexing_db = get_indexing_mongo_database()

    languages = {}
    if indexing_db is not None:
        collection = indexing_db.urn_index
        relevant_pages = collection.find({
            "tags": {"$regex": "^target-language:"},
            "current_wikipages": {"$not": {"$size": 0}},
        }, {"tags": 1})
        for page in relevant_pages:
            for tag in page["tags"]:
                if tag.startswith("target-language:"):
                    lang_code = tag[len("target-language:"):]
                    languages[lang_code] = languages.get(lang_code, 0) + 1

    total_lesson_count = sum(languages.values())

    language_tag_cloud = []
    for lang_code, count in sorted(six.iteritems(languages)):
        if count < 2:
            # XXX: until the tag cloud is fixed, don't display languages with
            # only one lesson
            continue
        try:
            descr = language_tag_to_description(lang_code)
        except KeyError:
            pass
        else:
            # XXX: temporary overrides
            if lang_code == 'el':
                descr = u'Greek'
            elif lang_code == 'km':
                descr = u'Khmer'
            language_tag_cloud.append(TagCloudElement(
                count, label=descr,
                href=(u"/special/search?tag=target-language:%s" % lang_code),
                data=lang_code))
    prepare_tag_cloud(language_tag_cloud, min_percent=70, max_percent=150)

    return render_to_response('otics/front_page.html', {
        'language_tag_cloud': language_tag_cloud,
        'total_lesson_count': total_lesson_count,
        'total_language_count': len(languages),
    }, RequestContext(request))
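# Illustrative sketch only: the shape of a urn_index document that the query
# above would match.  Field names come from the update_index command below;
# the concrete values are made-up placeholders.
_example_indexed_doc = {
    "urn": "urn:<hash_name>:<digest>",              # placeholder, not a real urn
    "tags": ["target-language:fr"],                 # matches {"$regex": "^target-language:"}
    "current_wikipages": ["some-wiki-page-name"],   # non-empty, so it survives the
                                                    # {"$not": {"$size": 0}} filter,
                                                    # which excludes orphaned resources
}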
def handle_noargs(self, **options):
    from ductus.index import get_indexing_mongo_database
    indexing_db = get_indexing_mongo_database()
    if indexing_db is None:
        raise Exception("indexing database is not available")
    collection = indexing_db.urn_index

    def perform_upsert(urn, obj, ignore=None):
        # REMEMBER that dictionary order matters in mongodb; we just ignore it
        # fixme: first inspect element to see if things might already be
        # right.  also check to make sure there aren't any unexpected
        # attributes on the toplevel element.  and do the same thing for
        # blobs too.
        obj = dict(obj)
        obj["urn"] = urn
        collection.update({"urn": urn}, obj, upsert=True, safe=True)
        verified_urns.add(urn)

    logging.basicConfig(level=logging.INFO)  # FIXME

    # create the mongodb indexes
    collection.ensure_index("urn", unique=True, drop_dups=True)
    collection.ensure_index("parents", sparse=True)
    collection.ensure_index("tags", sparse=True)
    collection.ensure_index("links")
    collection.ensure_index("recursive_links")

    # Begin actual code
    from lxml import etree
    from ductus.resource import get_resource_database, UnexpectedHeader, hash_name
    from ductus.wiki.models import WikiPage

    resource_database = get_resource_database()
    verified_urns = set()
    current_wikipages_map = {}
    operations = {None: 0}

    def verify(urn):
        """Updates a urn's indexing info and returns the set of its recursive links"""
        operations[None] += 1
        logger.info("operation %d: processing %s", operations[None], urn)

        if urn in verified_urns:
            q = collection.find_one({"urn": urn}, {"recursive_links": 1})
            try:
                return set(q["recursive_links"])
            except KeyError:
                return set()

        try:
            tree = resource_database.get_xml_tree(urn)
        except UnexpectedHeader:
            # it must be a blob
            perform_upsert(urn, {"fqn": None})
            return set()

        # collect outgoing xlink:href references, ignoring the parents element
        links = set()
        for event, element in etree.iterwalk(tree):
            if ('{http://www.w3.org/1999/xlink}href' in element.attrib
                    and element.getparent().tag != '{http://ductus.us/ns/2009/ductus}parents'):
                link = element.attrib['{http://www.w3.org/1999/xlink}href']
                if link.startswith('urn:%s:' % hash_name):
                    links.add(link)

        recursive_links = set(links)
        for link in links:
            additional_links = verify(link)
            recursive_links.update(additional_links)

        resource = resource_database.get_resource_object(urn)
        assert resource.fqn is not None
        obj = {
            "fqn": resource.fqn,
            "links": list(links),
            "recursive_links": sorted(recursive_links),
            "current_wikipages": sorted(current_wikipages_map.get(urn, ())),
        }
        try:
            obj["parents"] = sorted([parent.href for parent in resource.common.parents])
            obj["tags"] = sorted([tag.value for tag in resource.tags])
        except AttributeError:
            pass
        perform_upsert(urn, obj)
        return recursive_links

    for wikipage in WikiPage.objects.all():
        revision = wikipage.get_latest_revision()
        if revision is not None and revision.urn:
            urn = 'urn:' + revision.urn
            current_wikipages_map.setdefault(urn, set()).add(wikipage.name)

    n_attempted = n_successful = 0
    for key in resource_database:
        n_attempted += 1
        try:
            verify(key)
        except Exception:
            logger.warning("Key failed: %s", key)
        else:
            n_successful += 1

    logger.info("Successfully processed %d of %d keys", n_successful, n_attempted)
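# Note: ensure_index, the safe= keyword, drop_dups, and Collection.update are
# legacy PyMongo APIs that were deprecated and removed in newer releases.  A
# rough modernization sketch (my assumption, not part of the repo) of the
# equivalent index creation and upsert; the connection URI and database name
# are placeholders.
from pymongo import MongoClient

def _modern_index_and_upsert(urn, obj, mongo_uri="mongodb://localhost", db_name="ductus"):
    collection = MongoClient(mongo_uri)[db_name].urn_index
    # create_index is idempotent like ensure_index was; drop_dups no longer
    # exists, so duplicate urns must be cleaned up before the unique index builds.
    collection.create_index("urn", unique=True)
    collection.create_index("parents", sparse=True)
    collection.create_index("tags", sparse=True)
    collection.create_index("links")
    collection.create_index("recursive_links")
    # Write acknowledgement comes from the client's write concern rather than
    # safe=True, and update(..., upsert=True) becomes replace_one.
    obj = dict(obj, urn=urn)
    collection.replace_one({"urn": urn}, obj, upsert=True)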