Exemplo n.º 1
0
def test_search():
    """Smoke-test ``repository_service.search`` with a fully populated query.

    Builds a SearchQuery dict (class/date/language/type filters, sort and
    two title search terms), prints it through ``jsonbson.dumps_json``, runs
    the search on the default corpus (``None``) and prints the result
    through ``jsonbson.dumps_bson``. Nothing is asserted; the output is
    meant to be inspected by hand.
    """
    search_query = {
        "filter_class": "Document",
        "filter_date_begin": "2010",
        "filter_date_end": "2013",
        "filter_languages": ["en", "fr"],
        "filter_types": ["Book", "BookPart"],
        "rec_class": "SearchQuery",
        "rec_metajson": 1,
        "result_batch_size": 100,
        "result_bibliographic_styles": ["mla"],
        "result_offset": 0,
        "result_sorts": [{"field": "rec_type", "order": "asc"}],
        "search_terms": [
            {"index": "title", "operator": "and", "value": "Cheyenne"},
            {"index": "title", "operator": "or", "value": "technique"},
        ],
    }

    # Single-argument print(...) is valid on both Python 2 and Python 3,
    # unlike the print statement.
    print("search_query:")
    print(jsonbson.dumps_json(search_query, True))

    search_result = repository_service.search(None, search_query)

    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
Exemplo n.º 2
0
def load_dict(meta_dict):
    """Wrap a raw metadata dict in the class named by its ``rec_class`` key.

    Args:
        meta_dict: mapping that may carry a ``"rec_class"`` entry.

    Returns:
        An instance of the class matching ``rec_class`` (Document, Person,
        Orgunit, Project, Event, Family, Field, Resource, Target, Type or
        Collection). A ``Common`` instance is returned when the key is
        missing or its value is not one of those names; the latter case is
        also logged as a warning.
    """
    if "rec_class" not in meta_dict:
        return Common(meta_dict)

    rec_class = meta_dict["rec_class"]
    if rec_class == "Document":
        return Document(meta_dict)
    elif rec_class == "Person":
        return Person(meta_dict)
    elif rec_class == "Orgunit":
        return Orgunit(meta_dict)
    elif rec_class == "Project":
        return Project(meta_dict)
    elif rec_class == "Event":
        return Event(meta_dict)
    elif rec_class == "Family":
        return Family(meta_dict)
    elif rec_class == "Field":
        return Field(meta_dict)
    elif rec_class == "Resource":
        return Resource(meta_dict)
    elif rec_class == "Target":
        return Target(meta_dict)
    elif rec_class == "Type":
        return Type(meta_dict)
    elif rec_class == "Collection":
        return Collection(meta_dict)

    # Unknown class: log it and fall back to the generic wrapper.
    logging.debug(jsonbson.dumps_bson(meta_dict))
    # The placeholder must be the positional index {0}; the previous "{O}"
    # (letter O) was a named field and made format() raise KeyError, so the
    # fallback below was never reached.
    logging.warning("Unknown rec_class: {0}".format(rec_class))
    return Common(meta_dict)
def test_search():
    """Smoke-test ``repository_service.search`` with a fully populated query.

    Builds a SearchQuery dict (class/date/language/type filters, sort and
    two title search terms), prints it through ``jsonbson.dumps_json``, runs
    the search on the default corpus (``None``) and prints the result
    through ``jsonbson.dumps_bson``. Nothing is asserted; the output is
    meant to be inspected by hand.
    """
    search_query = {
        "filter_class": "Document",
        "filter_date_begin": "2010",
        "filter_date_end": "2013",
        "filter_languages": ["en", "fr"],
        "filter_types": ["Book", "BookPart"],
        "rec_class": "SearchQuery",
        "rec_metajson": 1,
        "result_batch_size": 100,
        "result_bibliographic_styles": ["mla"],
        "result_offset": 0,
        "result_sorts": [{"field": "rec_type", "order": "asc"}],
        "search_terms": [
            {"index": "title", "operator": "and", "value": "Cheyenne"},
            {"index": "title", "operator": "or", "value": "technique"},
        ],
    }

    # Single-argument print(...) is valid on both Python 2 and Python 3,
    # unlike the print statement.
    print("search_query:")
    print(jsonbson.dumps_json(search_query, True))

    search_result = repository_service.search(None, search_query)

    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
Exemplo n.º 4
0
def test_search_mongo():
    """Smoke-test ``repository_service.search_mongo`` with a raw Mongo query.

    The query selects records whose title matches "Cheyenne" or "technique"
    and whose publishers match "press" (all case-insensitive regexes). It is
    run on the default corpus (``None``) and the result is printed through
    ``jsonbson.dumps_bson``. Nothing is asserted.
    """
    # Title alternative, built once and embedded in the final query (it used
    # to be assigned to mongo_query and then immediately overwritten).
    title_matches = {
        "$or": [
            {"title": {"$options": "i", "$regex": "Cheyenne"}},
            {"title": {"$options": "i", "$regex": "technique"}},
        ]
    }
    mongo_query = {
        "$and": [
            title_matches,
            {"publishers": {"$regex": "press", "$options": 'i'}},
        ]
    }

    search_result = repository_service.search_mongo(None, mongo_query)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
Exemplo n.º 5
0
def write_json(item, output_file_path):
    """Serialise ``item`` with ``jsonbson.dumps_bson`` and write it to a file.

    The file at ``output_file_path`` is opened in write mode before the
    item is serialised, so it is created or truncated even when the
    serialised text turns out to be empty and nothing is written.
    """
    with open(output_file_path, "w") as handle:
        serialized = jsonbson.dumps_bson(item, True)
        if not serialized:
            return
        handle.write(serialized)
Exemplo n.º 6
0
def print_document(document):
    """Log ``document``, serialised by ``jsonbson.dumps_bson``, at INFO level."""
    serialized = jsonbson.dumps_bson(document)
    logging.info("document : {}".format(serialized))
Exemplo n.º 7
0
# Indexes whose value is split into words. Every word is matched with a
# case-insensitive regex against every listed field and all the resulting
# criteria are OR-ed together.
_SEARCH_WORD_INDEX_FIELDS = {
    "all": (
        "rec_id",
        "identifiers.value",
        "title",
        "title_sub",
        "publishers",
        "is_part_ofs.title",
        "is_part_ofs.is_part_ofs.title",
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
        "rec_type",
    ),
    "is_part_of": (
        "is_part_ofs.title",
        "is_part_ofs.is_part_ofs.title",
    ),
    "creator": (
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
    ),
    "publisher": (
        "publishers",
        "is_part_ofs.publishers",
        "is_part_ofs.is_part_ofs.publishers",
    ),
}

# Indexes matched with one case-insensitive regex on the whole, unsplit value.
_SEARCH_PHRASE_INDEX_FIELD = {
    "affiliation": "creators.affiliation.name",
    "keyword": "keywords.value",
    "classification": "classifications.value",
    "research_area": "research_areas.value",
    "subject": "subjects.value",
    "set": "sets.value",
}

# Indexes matched by equality on the whole value.
_SEARCH_EXACT_INDEX_FIELD = {
    "creator_id": "creators.agent.rec_id",
    "affiliation_id": "creators.affiliation.rec_id",
}


def _search_regex_criterion(field, pattern):
    """Return a case-insensitive Mongo regex criterion on ``field``."""
    return {field: {"$regex": pattern, "$options": 'i'}}


def _search_term_criterion(index, value, words):
    """Translate one search term into a Mongo criterion.

    ``value`` is the raw term value and ``words`` the same value split on
    commas/whitespace. Returns None when the term contributes nothing
    (unknown index, or a word-based index with no words).
    """
    if index in _SEARCH_WORD_INDEX_FIELDS:
        if not words:
            return None
        fields = _SEARCH_WORD_INDEX_FIELDS[index]
        return {"$or": [_search_regex_criterion(field, word)
                        for word in words
                        for field in fields]}

    if index == "title":
        # Every word must appear in the title. An empty $and is rejected by
        # MongoDB, so a value without words is skipped.
        if not words:
            return None
        return {"$and": [_search_regex_criterion("title", word)
                         for word in words]}

    if index == "identifier":
        try:
            # Try the *value* as a Mongo ObjectId (the index name, which is
            # always "identifier", was parsed here before and never matched).
            return {"_id": ObjectId(value)}
        except (InvalidId, TypeError):
            return {"$or": [_search_regex_criterion("rec_id", value),
                            _search_regex_criterion("identifiers.value", value)]}

    if index in _SEARCH_PHRASE_INDEX_FIELD:
        return _search_regex_criterion(_SEARCH_PHRASE_INDEX_FIELD[index], value)

    if index in _SEARCH_EXACT_INDEX_FIELD:
        return {_SEARCH_EXACT_INDEX_FIELD[index]: value}

    return None


def search(corpus, search_query):
    """Run a structured search query against the documents collection.

    Args:
        corpus: corpus name; a falsy value selects ``default_corpus``.
        search_query: mapping describing the search. Keys read here:
            ``filter_class`` (mandatory), ``filter_date_begin``,
            ``filter_date_end``, ``filter_languages``, ``filter_types``,
            ``filter_status`` and ``search_terms`` (a list of mappings with
            ``index`` and ``value``). The ``operator`` of a term and the
            ``result_*`` keys are currently ignored: all criteria are AND-ed
            and the sort is fixed to title then rec_type, ascending.

    Returns:
        A ``SearchResponse`` with ``records``, ``records_total_count``,
        ``result_batch_size`` (equal to the total count), ``result_offset``
        (always 0) and the original ``search_query``.

    Raises:
        The error built by ``exceptions.metajsonprc_error(40)`` when
        ``search_query`` is None, when ``filter_class`` is missing or not
        supported, or when a search term has a value but no ``index``.
    """
    if not corpus:
        corpus = default_corpus

    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    collection = None
    if "filter_class" not in search_query or search_query["filter_class"] not in [
            "Document", "Agent", "Person", "OrgUnit", "Event", "Family"]:
        raise exceptions.metajsonprc_error(40)
    elif search_query["filter_class"] == "Document":
        collection = DOCUMENTS
    # NOTE(review): the other accepted classes have no collection mapped yet,
    # so ``collection`` stays None for them and is used as-is below.

    # other filters
    # todo: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        filter_date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": filter_date_end}})
    if "filter_date_begin" in search_query:
        filter_date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": filter_date_begin}})
    if "filter_languages" in search_query:
        filter_query.append({"languages": {"$in": search_query["filter_languages"]}})
    if "filter_types" in search_query:
        filter_query.append({"rec_type": {"$in": search_query["filter_types"]}})
    if "filter_status" in search_query:
        # "private", "pending", "rejected", "published", "deleted"
        filter_query.append({"rec_status": {"$in": search_query["filter_status"]}})

    # search_terms
    # TODO: honour the per-term "operator" ("or", "and", "not"). Intended
    # grouping: a, and b, or c -> or(and(a,b),c); a, or b, and c -> and(or(a,b),c).
    search_indexes = []
    if "search_terms" in search_query:
        for search_term in search_query["search_terms"]:
            if "value" not in search_term or search_term["value"] is None:
                # NOTE(review): a term without value stops the processing of
                # all following terms (break, not continue) - confirm intent.
                break

            # split value
            words = search_term["value"].replace(",", " ").split()

            if "index" not in search_term:
                raise exceptions.metajsonprc_error(40)

            criterion = _search_term_criterion(
                search_term["index"], search_term["value"], words)
            if criterion is not None:
                search_indexes.append(criterion)

    # result_sorts : how to with this index ? ...
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes
    mongo_args = filter_query + search_indexes

    # Generate the mongo query
    if mongo_args:
        mongo_query = {"$and": mongo_args}
    else:
        # search all
        mongo_query = {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(
        mongo_query).sort(sort)
    logging.debug(mongo_response)
    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query

    return search_response
def test_search_mongo():
    """Smoke-test ``repository_service.search_mongo`` with a raw Mongo query.

    The query selects records whose title matches "Cheyenne" or "technique"
    and whose publishers match "press" (all case-insensitive regexes). It is
    run on the default corpus (``None``) and the result is printed through
    ``jsonbson.dumps_bson``. Nothing is asserted.
    """
    # Title alternative, built once and embedded in the final query (it used
    # to be assigned to mongo_query and then immediately overwritten).
    title_matches = {
        "$or": [
            {"title": {"$options": "i", "$regex": "Cheyenne"}},
            {"title": {"$options": "i", "$regex": "technique"}},
        ]
    }
    mongo_query = {
        "$and": [
            title_matches,
            {"publishers": {"$regex": "press", "$options": 'i'}},
        ]
    }

    search_result = repository_service.search_mongo(None, mongo_query)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("search_result:")
    print(jsonbson.dumps_bson(search_result, True))
# Indexes whose value is split into words. Every word is matched with a
# case-insensitive regex against every listed field and all the resulting
# criteria are OR-ed together.
_SEARCH_WORD_INDEX_FIELDS = {
    "all": (
        "rec_id",
        "identifiers.value",
        "title",
        "title_sub",
        "publishers",
        "is_part_ofs.title",
        "is_part_ofs.is_part_ofs.title",
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
        "rec_type",
    ),
    "is_part_of": (
        "is_part_ofs.title",
        "is_part_ofs.is_part_ofs.title",
    ),
    "creator": (
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
    ),
    "publisher": (
        "publishers",
        "is_part_ofs.publishers",
        "is_part_ofs.is_part_ofs.publishers",
    ),
}

# Indexes matched with one case-insensitive regex on the whole, unsplit value.
_SEARCH_PHRASE_INDEX_FIELD = {
    "affiliation": "creators.affiliation.name",
    "keyword": "keywords.value",
    "classification": "classifications.value",
    "research_area": "research_areas.value",
    "subject": "subjects.value",
    "set": "sets.value",
}

# Indexes matched by equality on the whole value.
_SEARCH_EXACT_INDEX_FIELD = {
    "creator_id": "creators.agent.rec_id",
    "affiliation_id": "creators.affiliation.rec_id",
}


def _search_regex_criterion(field, pattern):
    """Return a case-insensitive Mongo regex criterion on ``field``."""
    return {field: {"$regex": pattern, "$options": 'i'}}


def _search_term_criterion(index, value, words):
    """Translate one search term into a Mongo criterion.

    ``value`` is the raw term value and ``words`` the same value split on
    commas/whitespace. Returns None when the term contributes nothing
    (unknown index, or a word-based index with no words).
    """
    if index in _SEARCH_WORD_INDEX_FIELDS:
        if not words:
            return None
        fields = _SEARCH_WORD_INDEX_FIELDS[index]
        return {"$or": [_search_regex_criterion(field, word)
                        for word in words
                        for field in fields]}

    if index == "title":
        # Every word must appear in the title. An empty $and is rejected by
        # MongoDB, so a value without words is skipped.
        if not words:
            return None
        return {"$and": [_search_regex_criterion("title", word)
                         for word in words]}

    if index == "identifier":
        try:
            # Try the *value* as a Mongo ObjectId (the index name, which is
            # always "identifier", was parsed here before and never matched).
            return {"_id": ObjectId(value)}
        except (InvalidId, TypeError):
            return {"$or": [_search_regex_criterion("rec_id", value),
                            _search_regex_criterion("identifiers.value", value)]}

    if index in _SEARCH_PHRASE_INDEX_FIELD:
        return _search_regex_criterion(_SEARCH_PHRASE_INDEX_FIELD[index], value)

    if index in _SEARCH_EXACT_INDEX_FIELD:
        return {_SEARCH_EXACT_INDEX_FIELD[index]: value}

    return None


def search(corpus, search_query):
    """Run a structured search query against the documents collection.

    Args:
        corpus: corpus name; a falsy value selects ``default_corpus``.
        search_query: mapping describing the search. Keys read here:
            ``filter_class`` (mandatory), ``filter_date_begin``,
            ``filter_date_end``, ``filter_languages``, ``filter_types``,
            ``filter_status`` and ``search_terms`` (a list of mappings with
            ``index`` and ``value``). The ``operator`` of a term and the
            ``result_*`` keys are currently ignored: all criteria are AND-ed
            and the sort is fixed to title then rec_type, ascending.

    Returns:
        A ``SearchResponse`` with ``records``, ``records_total_count``,
        ``result_batch_size`` (equal to the total count), ``result_offset``
        (always 0) and the original ``search_query``.

    Raises:
        The error built by ``exceptions.metajsonprc_error(40)`` when
        ``search_query`` is None, when ``filter_class`` is missing or not
        supported, or when a search term has a value but no ``index``.
    """
    if not corpus:
        corpus = default_corpus

    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    collection = None
    if "filter_class" not in search_query or search_query["filter_class"] not in [
            "Document", "Agent", "Person", "OrgUnit", "Event", "Family"]:
        raise exceptions.metajsonprc_error(40)
    elif search_query["filter_class"] == "Document":
        collection = DOCUMENTS
    # NOTE(review): the other accepted classes have no collection mapped yet,
    # so ``collection`` stays None for them and is used as-is below.

    # other filters
    # todo: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        filter_date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": filter_date_end}})
    if "filter_date_begin" in search_query:
        filter_date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": filter_date_begin}})
    if "filter_languages" in search_query:
        filter_query.append({"languages": {"$in": search_query["filter_languages"]}})
    if "filter_types" in search_query:
        filter_query.append({"rec_type": {"$in": search_query["filter_types"]}})
    if "filter_status" in search_query:
        # "private", "pending", "rejected", "published", "deleted"
        filter_query.append({"rec_status": {"$in": search_query["filter_status"]}})

    # search_terms
    # TODO: honour the per-term "operator" ("or", "and", "not"). Intended
    # grouping: a, and b, or c -> or(and(a,b),c); a, or b, and c -> and(or(a,b),c).
    search_indexes = []
    if "search_terms" in search_query:
        for search_term in search_query["search_terms"]:
            if "value" not in search_term or search_term["value"] is None:
                # NOTE(review): a term without value stops the processing of
                # all following terms (break, not continue) - confirm intent.
                break

            # split value
            words = search_term["value"].replace(",", " ").split()

            if "index" not in search_term:
                raise exceptions.metajsonprc_error(40)

            criterion = _search_term_criterion(
                search_term["index"], search_term["value"], words)
            if criterion is not None:
                search_indexes.append(criterion)

    # result_sorts : how to with this index ? ...
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes
    mongo_args = filter_query + search_indexes

    # Generate the mongo query
    if mongo_args:
        mongo_query = {"$and": mongo_args}
    else:
        # search all
        mongo_query = {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(
        mongo_query).sort(sort)
    logging.debug(mongo_response)
    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query

    return search_response