Exemplo n.º 1
0
 def _get_elasticsearch_item_key_query(self, collection_name, item_key):
     if collection_name == "persons":
         tree_num, tree_version, person_id = item_key
         return {
             "bool": {
                 "must": [{
                     "term": {
                         "tree_num": tree_num
                     }
                 }, {
                     "term": {
                         "tree_version": tree_version
                     }
                 }, {
                     "term": {
                         "person_id": person_id
                     }
                 }]
             }
         }
     else:
         return {
             "term": {
                 get_collection_id_field(collection_name): item_key
             }
         }
Exemplo n.º 2
0
def parse_n_update(row, collection_name):
    """Parse ``row`` into a document, log it and queue its update.

    The row is parsed with ``parse_doc``, the value of the collection's id
    field is logged, and the update is dispatched through
    ``update_row.delay``. Returns the parsed document.
    """
    parsed = parse_doc(row, collection_name)
    key_name = get_collection_id_field(collection_name)
    message = '{}:Updating {}: {}'.format(
        collection_name, key_name, parsed[key_name])
    logger.info(message)
    update_row.delay(parsed, collection_name)
    return parsed
Exemplo n.º 3
0
 def _get_elasticsearch_item_key_query(self, collection_name, item_key):
     if collection_name == "persons":
         tree_num, tree_version, person_id = item_key
         return {"bool": {"must": [{"term": {"tree_num": tree_num}},
                                   {"term": {"tree_version": tree_version}},
                                   {"term": {"person_id": person_id}}]}}
     else:
         return {"term": {get_collection_id_field(collection_name): item_key}}
Exemplo n.º 4
0
def parse_n_update(row, collection_name, dryrun=False):
    """Parse ``row`` into a document, log it and (unless ``dryrun``) queue its update.

    The log line includes the value of the collection's id field and the
    document's ``UpdateDate`` ('?' when absent). With ``dryrun`` truthy the
    document is only parsed and logged; otherwise it is dispatched through
    ``update_row.delay``. Returns the parsed document in both cases.
    """
    parsed = parse_doc(row, collection_name)
    key_name = get_collection_id_field(collection_name)
    update_date = parsed.get('UpdateDate', '?')
    message = '{}:Updating {}: {}, updated {}'.format(
        collection_name, key_name, parsed[key_name], update_date)
    logger.info(message)
    if dryrun:
        return parsed
    update_row.delay(parsed, collection_name)
    return parsed
Exemplo n.º 5
0
 def _get_mongo_items(self, collection_name, key):
     if key:
         if collection_name == "persons":
             raise NotImplementedError("persons does not support updating by key yet")
         else:
             items = self.app.data_db[collection_name].find({get_collection_id_field(collection_name): key})
     else:
         items = self.app.data_db[collection_name].find()
     items = self._limit(items)
     return items
Exemplo n.º 6
0
def update_doc(collection, document):
    """Save ``document`` into ``collection`` and, when enabled, into Elasticsearch.

    Behaviour depends on ``collection.name``:

    * ``places``: ``document['geometry']`` is first set from
      ``get_place_geo(document)``; the document is then saved like any
      other non-person document.
    * ``persons``: the tree version is resolved with ``find_version`` from
      the tree's versions list (read from redis under
      ``tree_vers_<tree_num>``, falling back to the ``trees`` collection,
      or version 0 when the tree is not found). The document is saved
      under the ``tree_num`` / ``id`` / ``tree_version`` query and gets an
      English slug.
    * anything else: the document is saved under its collection id field.
      When ``get_doc_id`` returns a falsy id an error is logged and
      nothing is written.

    When the module-level ``MIGRATE_ES`` equals ``'1'`` the document is
    also passed to ``update_es``; a failure there is logged, not raised.
    """
    # update place items with geojson
    if collection.name == 'places':
        document['geometry'] = get_place_geo(document)

    # family trees get special treatment
    if collection.name == 'persons':
        tree_num = document['tree_num']
        person_id = document['id']
        tree_key = 'tree_vers_' + str(tree_num)
        query = {'tree_num': tree_num, 'id': person_id}
        tree_vers = current_app.redis.get(tree_key)
        if tree_vers:
            tree_vers = json.loads(tree_vers)
            tree_version = find_version(tree_vers, document['tree_file_id'])
        else:
            tree = current_app.data_db['trees'].find_one({'num': tree_num})
            if tree:
                tree_vers = tree['versions']
                # cache the versions list in redis (300 is presumably the
                # expiry in seconds -- TODO confirm against the redis client)
                current_app.redis.set(tree_key, json.dumps(tree_vers), 300)
                tree_version = find_version(tree_vers,
                                            document['tree_file_id'])
            else:
                current_app.logger.info(
                    "didn't find tree number {} using version 0 for {}"
                    .format(tree_num, person_id))
                tree_version = 0

        document['tree_version'] = tree_version
        query['tree_version'] = tree_version

        # we have to create it here as at the moment create_slug function requires Header to create slug
        # TODO: move this logic to create_slug function
        document['Slug'] = {'En': 'person_{};{}.{}'.format(
            tree_num, tree_version, person_id)}
        created = update_collection(collection, query, document)
        if MIGRATE_ES == '1':
            is_ok, msg = update_es(collection.name, document, created)
            if not is_ok:
                current_app.logger.error(msg)
        current_app.logger.info('Updated person: {}.{}'
                                .format(tree_num, person_id))
    else:
        doc_id = get_doc_id(collection.name, document)
        if doc_id:
            # get_collection_id_field is keyed by collection *name*; passing
            # the collection object would never match a name-specific id field
            query = {get_collection_id_field(collection.name): doc_id}
            created = update_collection(collection, query, document)
            if MIGRATE_ES == '1':
                is_ok, msg = update_es(collection.name, document, created)
                if not is_ok:
                    current_app.logger.error(msg)
            slug = document.get("Slug", {}).get("En")
            current_app.logger.info('Updated {} {}, Slug: {}'.format(
                collection.name, doc_id, slug))
        else:
            current_app.logger.error(
                'update failed because of id {}'.format(collection.name))
Exemplo n.º 7
0
def update_doc(collection, document):
    """Save ``document`` into ``collection`` and, when enabled, into Elasticsearch.

    Behaviour depends on ``collection.name``:

    * ``places``: ``document['geometry']`` is first set from
      ``get_place_geo(document)``; the document is then saved like any
      other non-person document.
    * ``persons``: the tree version is resolved with ``find_version`` from
      the tree's versions list (read from redis under
      ``tree_vers_<tree_num>``, falling back to the ``trees`` collection,
      or version 0 when the tree is not found). The document is saved
      under the ``tree_num`` / ``id`` / ``tree_version`` query and gets an
      English slug.
    * anything else: the document is saved under its collection id field.
      When ``get_doc_id`` returns a falsy id an error is logged and
      nothing is written.

    When the module-level ``MIGRATE_ES`` equals ``'1'`` the document is
    also passed to ``update_es``; a failure there is logged, not raised.
    """
    # update place items with geojson
    if collection.name == 'places':
        document['geometry'] = get_place_geo(document)

    # family trees get special treatment
    if collection.name == 'persons':
        tree_num = document['tree_num']
        person_id = document['id']
        tree_key = 'tree_vers_' + str(tree_num)
        query = {'tree_num': tree_num, 'id': person_id}
        tree_vers = current_app.redis.get(tree_key)
        if tree_vers:
            tree_vers = json.loads(tree_vers)
            tree_version = find_version(tree_vers, document['tree_file_id'])
        else:
            tree = current_app.data_db['trees'].find_one({'num': tree_num})
            if tree:
                tree_vers = tree['versions']
                # cache the versions list in redis (300 is presumably the
                # expiry in seconds -- TODO confirm against the redis client)
                current_app.redis.set(tree_key, json.dumps(tree_vers), 300)
                tree_version = find_version(tree_vers,
                                            document['tree_file_id'])
            else:
                current_app.logger.info(
                    "didn't find tree number {} using version 0 for {}".format(
                        tree_num, person_id))
                tree_version = 0

        document['tree_version'] = tree_version
        query['tree_version'] = tree_version

        # we have to create it here as at the moment create_slug function requires Header to create slug
        # TODO: move this logic to create_slug function
        document['Slug'] = {'En': 'person_{};{}.{}'.format(
            tree_num, tree_version, person_id)}
        created = update_collection(collection, query, document)
        if MIGRATE_ES == '1':
            is_ok, msg = update_es(collection.name, document, created)
            if not is_ok:
                current_app.logger.error(msg)
        current_app.logger.info(
            'Updated person: {}.{}'.format(tree_num, person_id))
    else:
        doc_id = get_doc_id(collection.name, document)
        if doc_id:
            # get_collection_id_field is keyed by collection *name*; passing
            # the collection object would never match a name-specific id field
            query = {get_collection_id_field(collection.name): doc_id}
            created = update_collection(collection, query, document)
            if MIGRATE_ES == '1':
                is_ok, msg = update_es(collection.name, document, created)
                if not is_ok:
                    current_app.logger.error(msg)
            slug = document.get("Slug", {}).get("En")
            current_app.logger.info('Updated {} {}, Slug: {}'.format(
                collection.name, doc_id, slug))
        else:
            current_app.logger.error('update failed because of id {}'.format(
                collection.name))
Exemplo n.º 8
0
 def _get_mongo_items(self, collection_name, key):
     if key:
         if collection_name == "persons":
             raise NotImplementedError(
                 "persons does not support updating by key yet")
         else:
             items = self.app.data_db[collection_name].find(
                 {get_collection_id_field(collection_name): key})
     else:
         items = self.app.data_db[collection_name].find()
     items = self._limit(items)
     return items
Exemplo n.º 9
0
 def _get_elasticsearch_item_key(self, collection_name, es_item):
     if collection_name == "persons":
         person_id = es_item.get("person_id", None)
         tree_num = es_item.get("tree_num", None)
         tree_version = es_item.get("tree_version", None)
         if person_id is not None and tree_num is not None and tree_version is not None:
             item_key = int(tree_num), int(tree_version), str(person_id)
         else:
             item_key = None
     else:
         id_field = get_collection_id_field(collection_name)
         item_key = es_item.get(id_field, None)
     return item_key
Exemplo n.º 10
0
 def _get_elasticsearch_item_key(self, collection_name, es_item):
     if collection_name == "persons":
         person_id = es_item.get("person_id", None)
         tree_num = es_item.get("tree_num", None)
         tree_version = es_item.get("tree_version", None)
         if person_id is not None and tree_num is not None and tree_version is not None:
             item_key = int(tree_num), int(tree_version), str(person_id)
         else:
             item_key = None
     else:
         id_field = get_collection_id_field(collection_name)
         item_key = es_item.get(id_field, None)
     return item_key
Exemplo n.º 11
0
 def _get_mongo_item_key(self, collection_name, mongo_item):
     if collection_name == "persons":
         person_id = mongo_item.get("id", None)
         if self.args.legacy and not person_id:
             person_id = mongo_item.get("ID", None)
         tree_num = mongo_item.get("tree_num", None)
         tree_version = mongo_item.get("tree_version", None)
         if person_id is not None and tree_num is not None and tree_version is not None:
             item_key = int(tree_num), int(tree_version), str(person_id)
         else:
             item_key = None
     else:
         id_field = get_collection_id_field(collection_name)
         item_key = mongo_item.get(id_field, None)
     return item_key
Exemplo n.º 12
0
 def _get_mongo_item_key(self, collection_name, mongo_item):
     if collection_name == "persons":
         person_id = mongo_item.get("id", None)
         if self.args.legacy and not person_id:
             person_id = mongo_item.get("ID", None)
         tree_num = mongo_item.get("tree_num", None)
         tree_version = mongo_item.get("tree_version", None)
         if person_id is not None and tree_num is not None and tree_version is not None:
             item_key = int(tree_num), int(tree_version), str(person_id)
         else:
             item_key = None
     else:
         id_field = get_collection_id_field(collection_name)
         item_key = mongo_item.get(id_field, None)
     return item_key
 def _get_index_body(self):
     """Build the index body holding the mappings of every searchable collection.

     Every collection in ``SEARCHABLE_COLLECTIONS`` gets a ``Header``
     property mapped with ``self.header_mapping``. ``familyNames``
     additionally gets a ``dm_soundex`` completion field, "persons" gets its
     search-related fields, and every other collection gets its id field
     mapped as a keyword.
     """
     mappings = {}
     for collection in SEARCHABLE_COLLECTIONS:
         mappings[collection] = {
             "properties": {"Header": self.header_mapping}
         }

     family_names_properties = mappings["familyNames"]["properties"]
     family_names_properties["dm_soundex"] = {
         "type": "completion",
         "max_input_length": 20,
         "contexts": [{
             "name": "collection",
             "type": "CATEGORY",
             "path": "_type"
         }]
     }

     # persons specific mappings
     # ensure all fields relevant for search are properly indexed
     persons_field_types = (
         ("tree_num", "integer"),
         ("tree_version", "integer"),
         ("person_id", "keyword"),
         ("birth_year", "integer"),
         ("death_year", "integer"),
         ("marriage_years", "integer"),
         # these are updated in bhs_api.item.update_es functions
         ("first_name_lc", "text"),
         ("last_name_lc", "text"),
         ("BIRT_PLAC_lc", "text"),
         ("MARR_PLAC_lc", "text"),
         ("DEAT_PLAC_lc", "text"),
         ("gender", "keyword"),
     )

     for collection_name, mapping in mappings.items():
         properties = mapping["properties"]
         if collection_name != "persons":
             id_field = get_collection_id_field(collection_name)
             properties[id_field] = {"type": "keyword"}
             continue
         for field, es_type in persons_field_types:
             properties[field] = {"type": es_type}

     return {"mappings": mappings}
Exemplo n.º 14
0
 def _get_item_log_identifier(self, item_key, collection_name):
     if collection_name == "persons":
         return "(tree_num,version,id={},{},{})".format(*item_key)
     else:
         return "{}={}".format(get_collection_id_field(collection_name),
                               item_key)
Exemplo n.º 15
0
 def _get_item_log_identifier(self, item_key, collection_name):
     if collection_name == "persons":
         return "(tree_num,version,id={},{},{})".format(*item_key)
     else:
         return "{}={}".format(get_collection_id_field(collection_name), item_key)
Exemplo n.º 16
0

    for c_name in SEARCHABLE_COLLECTIONS:
        if c_name != "persons":
            # TODO: add support for persons, at the moment it's not working due to the persons not having a single unique id field
            print("starting work on " + c_name)
            # in the process we might create duplicate index so remove them for now
            try:
                todb[c_name].drop_index('Slug.He_1')
            except pymongo.errors.OperationFailure:
                pass
            try:
                todb[c_name].drop_index('Slug.En_1')
            except pymongo.errors.OperationFailure:
                pass
            id_field = get_collection_id_field(c_name)
            # loop on all docs with a slug
            for from_doc in fromdb[c_name].find({'Slug': {'$exists': True,
                                                         '$ne': {}}}):
                to_doc = app.data_db[c_name].find_one(
                    {id_field: from_doc[id_field]})
                if not to_doc:
                    print("missing {}".format(get_item_slug(from_doc)))
                    continue
                if from_doc['Slug'] != to_doc['Slug']:
                    try:
                        todb[c_name].update_one({'_id': to_doc['_id']},
                                                    {'$set':
                                                        {'Slug': from_doc['Slug']}
                                                    })
                    except pymongo.errors.DuplicateKeyError as e:
Exemplo n.º 17
0
def reslugify(collection, document):
    """Append the document id to every non-empty slug to ensure uniqueness.

    ``document['Slug']`` is modified in place: each truthy slug gets
    ``-<id>`` appended, where the id is read from the collection's id
    field. Empty slugs are left untouched.
    """
    slugs = document['Slug']
    for lang, slug in slugs.items():
        if not slug:
            continue
        id_field = get_collection_id_field(collection.name)
        slugs[lang] += '-' + str(document[id_field])
Exemplo n.º 18
0
    app, conf = create_app()
    fromdb = app.client_data_db[args.fromdb]
    todb = app.data_db

    for c_name in SEARCHABLE_COLLECTIONS:
        print("starting work on " + c_name)
        # in the process we might create duplicate index so remove them for now
        try:
            todb[c_name].drop_index('Slug.He_1')
        except pymongo.errors.OperationFailure:
            pass
        try:
            todb[c_name].drop_index('Slug.En_1')
        except pymongo.errors.OperationFailure:
            pass
        id_field = get_collection_id_field(c_name)
        # loop on all docs with a slug
        for from_doc in fromdb[c_name].find(
            {'Slug': {
                '$exists': True,
                '$ne': {}
            }}):
            to_doc = app.data_db[c_name].find_one(
                {id_field: from_doc[id_field]})
            if not to_doc:
                print("missing {}".format(get_item_slug(from_doc)))
                continue
            if from_doc['Slug'] != to_doc['Slug']:
                try:
                    todb[c_name].update_one(
                        {'_id': to_doc['_id']},
Exemplo n.º 19
0
def update_doc(collection, document):
    """Save ``document`` into ``collection``; non-person documents also go to Elasticsearch.

    Behaviour depends on ``collection.name``:

    * ``places``: ``document['geometry']`` is first set from
      ``get_place_geo(document)``; the document is then saved like any
      other non-person document.
    * ``persons``: the tree version is resolved with ``find_version`` from
      the tree's versions list (read from redis under
      ``tree_vers_<tree_num>``, falling back to the ``trees`` collection,
      or version 0 when the tree is not found). The document is saved
      under the ``tree_num`` / ``id`` / ``tree_version`` query and gets an
      English slug. ``update_es`` is not called for persons.
    * anything else: the document is saved under its collection id field
      and passed to ``update_es``. When the document lacks the id field an
      error is logged and nothing is written.
    """
    # update place items with geojson
    if collection.name == 'places':
        document['geometry'] = get_place_geo(document)

    # family trees get special treatment
    if collection.name == 'persons':
        tree_num = document['tree_num']
        person_id = document['id']
        tree_key = 'tree_vers_' + str(tree_num)
        query = {'tree_num': tree_num, 'id': person_id}
        tree_vers = current_app.redis.get(tree_key)
        if tree_vers:
            tree_vers = json.loads(tree_vers)
            tree_version = find_version(tree_vers, document['tree_file_id'])
        else:
            tree = current_app.data_db['trees'].find_one({'num': tree_num})
            if tree:
                tree_vers = tree['versions']
                # cache the versions list in redis (300 is presumably the
                # expiry in seconds -- TODO confirm against the redis client)
                current_app.redis.set(tree_key, json.dumps(tree_vers), 300)
                tree_version = find_version(tree_vers,
                                            document['tree_file_id'])
            else:
                current_app.logger.info(
                    "didn't find tree number {} using version 0 for {}"
                    .format(tree_num, person_id))
                tree_version = 0

        document['tree_version'] = tree_version
        query['tree_version'] = tree_version

        document['Slug'] = {'En': 'person_{};{}.{}'.format(
            tree_num, tree_version, person_id)}
        update_collection(collection, query, document)
        current_app.logger.info('Updated person: {}.{}'
                                .format(tree_num, person_id))
        return

    # post parsing: add _id
    doc_id_field = get_collection_id_field(collection.name)
    try:
        doc_id = document[doc_id_field]
    except KeyError:
        current_app.logger.error('update failed because of id {} {}'
                                 .format(collection.name, doc_id_field))
        # without an id there is nothing to query by; falling through
        # would hit an unbound ``doc_id``
        return
    if doc_id:
        document['_id'] = doc_id

    query = {doc_id_field: doc_id}
    update_collection(collection, query, document)

    update_es(collection.name, document, doc_id)

    try:
        slug = document['Slug']['En']
    except KeyError:
        slug = 'None'

    current_app.logger.info('Updated {} {}: {}, Slug: {}'.format(
        collection.name,
        doc_id_field,
        doc_id,
        slug))
Exemplo n.º 20
0
def reslugify(collection, document):
    """Make slugs unique by suffixing each non-empty one with the document id.

    Works in place on ``document['Slug']``: every language whose slug is
    truthy gets ``-<id>`` appended, the id being the value of the
    collection's id field in ``document``. Falsy slugs stay as they are.
    """
    for lang in document['Slug']:
        if document['Slug'][lang]:
            id_field = get_collection_id_field(collection.name)
            suffix = '-' + str(document[id_field])
            document['Slug'][lang] += suffix