Пример #1
0
def enhance_metajson(document):
    """Fill in the default record fields of a MetaJSON document.

    A plain ``dict`` is first converted with ``load_dict``.  The following
    fields are then set when they are missing or ``None``:

    * ``rec_id``: a new ``uuid.uuid1()`` rendered as a string;
    * ``rec_status``: ``constants.REC_STATUS_PRIVATE``;
    * ``rec_created_date`` / ``rec_modified_date``: the current local time
      in ISO 8601 format.

    ``rec_deleted_date`` is set to the current time when the record status
    is ``constants.REC_STATUS_DELETED`` and the key is not present yet.

    For records whose ``rec_class`` is ``constants.REC_CLASS_DOCUMENT`` the
    citations are added through ``citations_manager`` and ``date_sort`` is
    computed from ``document.get_date()`` with ``date_service.parse_date``.

    Args:
        document: a plain dict or an already loaded MetaJSON object.

    Returns:
        The enhanced document (the object returned by ``load_dict`` when a
        dict was passed in, otherwise the object received).
    """
    if isinstance(document, dict):
        document = load_dict(document)

    # rec_id
    if "rec_id" not in document or document["rec_id"] is None:
        document["rec_id"] = str(uuid.uuid1())

    # language
    # todo use language_service

    # title_non_sort
    manage_title_non_sort(document)

    # rec_status
    if "rec_status" not in document or document["rec_status"] is None:
        document["rec_status"] = constants.REC_STATUS_PRIVATE

    # rec_created_date
    if "rec_created_date" not in document or document["rec_created_date"] is None:
        document["rec_created_date"] = datetime.now().isoformat()

    # rec_modified_date
    if "rec_modified_date" not in document or document["rec_modified_date"] is None:
        document["rec_modified_date"] = datetime.now().isoformat()

    # rec_deleted_date
    # rec_status is guaranteed to be set at this point (defaulted above).
    # The membership test must look at the document itself: the previous
    # code referenced an undefined name here and crashed for deleted records.
    if document["rec_status"] == constants.REC_STATUS_DELETED and "rec_deleted_date" not in document:
        document["rec_deleted_date"] = datetime.now().isoformat()

    # For "rec_class": "Document" only
    if "rec_class" in document and document["rec_class"] == constants.REC_CLASS_DOCUMENT:
        # citations
        citations_manager.add_citations_to_metadata(document, None, None)
        # date_sort
        date_iso = document.get_date()
        document["date_sort"] = date_service.parse_date(date_iso)

    return document
Пример #2
0
def search(corpus, search_query):
    """Search the records of a corpus and return a populated SearchResponse.

    The MongoDB query is the ``$and`` of every filter clause and every
    search-term clause; with no clause at all, every record is returned.
    Results are sorted by ``title`` then ``rec_type``, both ascending.

    Args:
        corpus: corpus identifier; ``default_corpus`` is used when falsy.
        search_query: mapping describing the search.  ``filter_class`` is
            mandatory and must be one of "Document", "Agent", "Person",
            "OrgUnit", "Event", "Family".  Optional keys:
            ``filter_date_begin``, ``filter_date_end``, ``filter_languages``,
            ``filter_types``, ``filter_status`` and ``search_terms`` (a list
            of mappings with "index" and "value").  The "operator" key of a
            search term is currently ignored.

    Returns:
        The SearchResponse with "records", "records_total_count",
        "result_batch_size", "result_offset" and "search_query" set.

    Raises:
        The error built by ``exceptions.metajsonprc_error(40)`` when
        ``search_query`` is None, ``filter_class`` is missing or unknown, or
        a search term that has a value has no "index".
    """
    if not corpus:
        corpus = default_corpus

    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    known_classes = ["Document", "Agent", "Person", "OrgUnit", "Event", "Family"]
    if "filter_class" not in search_query or search_query["filter_class"] not in known_classes:
        raise exceptions.metajsonprc_error(40)
    # NOTE(review): only "Document" is mapped to a collection; the other
    # accepted classes leave ``collection`` as None, which is then used as
    # the collection key below -- confirm whether an agents collection
    # should be wired in here.
    collection = None
    if search_query["filter_class"] == "Document":
        collection = DOCUMENTS

    # other filters
    # todo: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": date_end}})
    if "filter_date_begin" in search_query:
        date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": date_begin}})
    # filter_status values: "private", "pending", "rejected", "published", "deleted"
    for query_key, field in (("filter_languages", "languages"),
                             ("filter_types", "rec_type"),
                             ("filter_status", "rec_status")):
        if query_key in search_query:
            filter_query.append({field: {"$in": search_query[query_key]}})

    def regex_clause(field, pattern):
        # Case-insensitive regex match on one field.
        # NOTE(review): the pattern is passed to MongoDB unescaped, so regex
        # metacharacters typed by the caller are interpreted as such.
        return {field: {"$regex": pattern, "$options": 'i'}}

    creator_fields = (
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
    )
    # index name -> (operator, fields): the value is split into words and
    # one regex clause is generated per (word, field) pair.
    tokenized_indexes = (
        ("all", "$or", (
            "rec_id",
            "identifiers.value",
            "title",
            "title_sub",
            "publishers",
            "is_part_ofs.title",
            "is_part_ofs.is_part_ofs.title",
        ) + creator_fields + ("rec_type",)),
        ("title", "$and", ("title",)),
        ("is_part_of", "$or", (
            "is_part_ofs.title",
            "is_part_ofs.is_part_ofs.title",
        )),
        ("creator", "$or", creator_fields),
        ("publisher", "$or", (
            "publishers",
            "is_part_ofs.publishers",
            "is_part_ofs.is_part_ofs.publishers",
        )),
    )
    # index name -> field matched with a regex built from the whole value
    whole_value_indexes = (
        ("affiliation", "creators.affiliation.name"),
        ("keyword", "keywords.value"),
        ("classification", "classifications.value"),
        ("research_area", "research_areas.value"),
        ("subject", "subjects.value"),
        ("set", "sets.value"),
    )
    # index name -> field compared for equality with the whole value
    exact_match_indexes = (
        ("creator_id", "creators.agent.rec_id"),
        ("affiliation_id", "creators.affiliation.rec_id"),
    )

    # search_terms
    # todo: combine the terms according to their "operator" ("and", "or",
    # "not"); for now the operator is ignored and all terms are and-ed:
    #   a, and b, or c -> or(and(a,b),c)
    #   a, or b, and c -> and(or(a,b),c)
    search_indexes = []
    search_terms = search_query["search_terms"] if "search_terms" in search_query else []
    for search_term in search_terms:
        # value
        if "value" not in search_term or search_term["value"] is None:
            # NOTE(review): this stops at the first term without a value and
            # drops every following term -- confirm ``continue`` was not meant.
            break
        raw_value = search_term["value"]

        # split value on commas and whitespace
        values = raw_value.replace(",", " ").split()

        # index
        if "index" not in search_term:
            raise exceptions.metajsonprc_error(40)
        index = search_term["index"]

        if index == "identifier":
            # A value that parses as an ObjectId is looked up by _id; any
            # other value is matched against rec_id and identifiers.
            try:
                obid = ObjectId(raw_value)
            except (InvalidId, TypeError):
                search_indexes.append({"$or": [
                    regex_clause("rec_id", raw_value),
                    regex_clause("identifiers.value", raw_value),
                ]})
            else:
                search_indexes.append({"_id": obid})

        # The index names are unique across the three tables, so at most
        # one of the comparisons below succeeds.
        for name, operator, fields in tokenized_indexes:
            # MongoDB rejects an empty $and/$or array, so a value without
            # any word contributes no clause.
            if index == name and values:
                search_indexes.append({
                    operator: [regex_clause(field, value)
                               for value in values
                               for field in fields]
                })
        for name, field in whole_value_indexes:
            if index == name:
                search_indexes.append(regex_clause(field, raw_value))
        for name, field in exact_match_indexes:
            if index == name:
                search_indexes.append({field: raw_value})

    # result_sorts : how to with this index ? ...
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes
    mongo_args = filter_query + search_indexes

    # Generate the mongo query; an empty query matches every record
    mongo_query = {"$and": mongo_args} if mongo_args else {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(
        mongo_query).sort(sort)
    logging.debug(mongo_response)
    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query

    return search_response
def search(corpus, search_query):
    """Search the records of a corpus and return a populated SearchResponse.

    The MongoDB query is the ``$and`` of every filter clause and every
    search-term clause; with no clause at all, every record is returned.
    Results are sorted by ``title`` then ``rec_type``, both ascending.

    Args:
        corpus: corpus identifier; ``default_corpus`` is used when falsy.
        search_query: mapping describing the search.  ``filter_class`` is
            mandatory and must be one of "Document", "Agent", "Person",
            "OrgUnit", "Event", "Family".  Optional keys:
            ``filter_date_begin``, ``filter_date_end``, ``filter_languages``,
            ``filter_types``, ``filter_status`` and ``search_terms`` (a list
            of mappings with "index" and "value").  The "operator" key of a
            search term is currently ignored.

    Returns:
        The SearchResponse with "records", "records_total_count",
        "result_batch_size", "result_offset" and "search_query" set.

    Raises:
        The error built by ``exceptions.metajsonprc_error(40)`` when
        ``search_query`` is None, ``filter_class`` is missing or unknown, or
        a search term that has a value has no "index".
    """
    if not corpus:
        corpus = default_corpus

    search_response = SearchResponse()

    # empty search_query
    if search_query is None:
        raise exceptions.metajsonprc_error(40)

    # filter_class -> collection
    known_classes = ["Document", "Agent", "Person", "OrgUnit", "Event", "Family"]
    if "filter_class" not in search_query or search_query["filter_class"] not in known_classes:
        raise exceptions.metajsonprc_error(40)
    # NOTE(review): only "Document" is mapped to a collection; the other
    # accepted classes leave ``collection`` as None, which is then used as
    # the collection key below -- confirm whether an agents collection
    # should be wired in here.
    collection = None
    if search_query["filter_class"] == "Document":
        collection = DOCUMENTS

    # other filters
    # todo: filter_peer_review, filter_with_full_text, filter_favorite
    filter_query = []
    if "filter_date_end" in search_query:
        date_end = date_service.parse_date(search_query["filter_date_end"])
        filter_query.append({"date_sort": {"$lte": date_end}})
    if "filter_date_begin" in search_query:
        date_begin = date_service.parse_date(search_query["filter_date_begin"])
        filter_query.append({"date_sort": {"$gte": date_begin}})
    # filter_status values: "private", "pending", "rejected", "published", "deleted"
    for query_key, field in (("filter_languages", "languages"),
                             ("filter_types", "rec_type"),
                             ("filter_status", "rec_status")):
        if query_key in search_query:
            filter_query.append({field: {"$in": search_query[query_key]}})

    def regex_clause(field, pattern):
        # Case-insensitive regex match on one field.
        # NOTE(review): the pattern is passed to MongoDB unescaped, so regex
        # metacharacters typed by the caller are interpreted as such.
        return {field: {"$regex": pattern, "$options": 'i'}}

    creator_fields = (
        "creators.agent.name_family",
        "creators.agent.name_given",
        "creators.agent.name",
        "creators.agent.title",
    )
    # index name -> (operator, fields): the value is split into words and
    # one regex clause is generated per (word, field) pair.
    tokenized_indexes = (
        ("all", "$or", (
            "rec_id",
            "identifiers.value",
            "title",
            "title_sub",
            "publishers",
            "is_part_ofs.title",
            "is_part_ofs.is_part_ofs.title",
        ) + creator_fields + ("rec_type",)),
        ("title", "$and", ("title",)),
        ("is_part_of", "$or", (
            "is_part_ofs.title",
            "is_part_ofs.is_part_ofs.title",
        )),
        ("creator", "$or", creator_fields),
        ("publisher", "$or", (
            "publishers",
            "is_part_ofs.publishers",
            "is_part_ofs.is_part_ofs.publishers",
        )),
    )
    # index name -> field matched with a regex built from the whole value
    whole_value_indexes = (
        ("affiliation", "creators.affiliation.name"),
        ("keyword", "keywords.value"),
        ("classification", "classifications.value"),
        ("research_area", "research_areas.value"),
        ("subject", "subjects.value"),
        ("set", "sets.value"),
    )
    # index name -> field compared for equality with the whole value
    exact_match_indexes = (
        ("creator_id", "creators.agent.rec_id"),
        ("affiliation_id", "creators.affiliation.rec_id"),
    )

    # search_terms
    # todo: combine the terms according to their "operator" ("and", "or",
    # "not"); for now the operator is ignored and all terms are and-ed:
    #   a, and b, or c -> or(and(a,b),c)
    #   a, or b, and c -> and(or(a,b),c)
    search_indexes = []
    search_terms = search_query["search_terms"] if "search_terms" in search_query else []
    for search_term in search_terms:
        # value
        if "value" not in search_term or search_term["value"] is None:
            # NOTE(review): this stops at the first term without a value and
            # drops every following term -- confirm ``continue`` was not meant.
            break
        raw_value = search_term["value"]

        # split value on commas and whitespace
        values = raw_value.replace(",", " ").split()

        # index
        if "index" not in search_term:
            raise exceptions.metajsonprc_error(40)
        index = search_term["index"]

        if index == "identifier":
            # A value that parses as an ObjectId is looked up by _id; any
            # other value is matched against rec_id and identifiers.
            try:
                obid = ObjectId(raw_value)
            except (InvalidId, TypeError):
                search_indexes.append({"$or": [
                    regex_clause("rec_id", raw_value),
                    regex_clause("identifiers.value", raw_value),
                ]})
            else:
                search_indexes.append({"_id": obid})

        # The index names are unique across the three tables, so at most
        # one of the comparisons below succeeds.
        for name, operator, fields in tokenized_indexes:
            # MongoDB rejects an empty $and/$or array, so a value without
            # any word contributes no clause.
            if index == name and values:
                search_indexes.append({
                    operator: [regex_clause(field, value)
                               for value in values
                               for field in fields]
                })
        for name, field in whole_value_indexes:
            if index == name:
                search_indexes.append(regex_clause(field, raw_value))
        for name, field in exact_match_indexes:
            if index == name:
                search_indexes.append({field: raw_value})

    # result_sorts : how to with this index ? ...
    sort = [("title", pymongo.ASCENDING), ("rec_type", pymongo.ASCENDING)]

    # combine filter_query and search_indexes
    mongo_args = filter_query + search_indexes

    # Generate the mongo query; an empty query matches every record
    mongo_query = {"$and": mongo_args} if mongo_args else {}
    logging.debug("mongo_query:")
    logging.debug(jsonbson.dumps_bson(mongo_query, True))

    mongo_response = mongodb[database_name(corpus)][collection].find(mongo_query).sort(sort)
    logging.debug(mongo_response)
    if mongo_response:
        records = metajson_service.load_dict_list(mongo_response)
        records_total_count = len(records)
    else:
        records = []
        records_total_count = 0

    search_response["records"] = records
    search_response["records_total_count"] = records_total_count
    search_response["result_batch_size"] = records_total_count
    search_response["result_offset"] = 0
    search_response["search_query"] = search_query

    return search_response