def checkIfImport(key):
    """
    Look up a publication in the database by its researchr key.

    @type key: string
    @param key: Key of publication (compared against the researchr_key column).
    @return: Result of fetch_one() for the query selecting the publication id.
    """
    query = FluentSQLQuery()
    selection = query.select("id")
    selection.from_table("publication")
    query.where("researchr_key=", key)
    # Calling the query object executes it.
    query()
    return query.fetch_one()
def lookup(self, obj, level=None):
    """
    More sophisticated RRSDatabase.contains().

    This method does not call RRSDatabase.contains() explicitly; it checks
    other entities and tries to find relationships between them. The search
    is driven by the lookup rules held in self._lookup_rules.

    Level 0 rules match on columns of the object's own table. Level 1 rules
    match on entities related to the object; those are looked up recursively
    with level - 1.

    @param obj: database entity to search for; on success obj['id'] is set.
    @param level: recursion budget, defaults to self.lookup_level.
    @returns True if found (the object now carries the ID)
             False if no rules are registered for the type, or a fake
             junction table is encountered (not implemented)
             None if level < 0, or (in the code visible here) when no rule
             produced a match
    @raises TypeError: obj is not a _RRSDatabaseEntity.
    @raises RRSDatabaseEntityError: obj belongs to a "_meta" table.
    """
    # NOTE(review): nesting below was reconstructed from whitespace-flattened
    # source - confirm against the repository version.
    if level is None:
        level = self.lookup_level
    if level < 0:
        # recursion budget exhausted; returns None, which callers treat as falsy
        return
    if not isinstance(obj, _RRSDatabaseEntity):
        raise TypeError('lookup() method can be called only on database '\
                        'entity objects.')
    if obj._table_name.endswith("_meta"):
        raise RRSDatabaseEntityError('lookup() method cannot be called on meta-tables.')
    q = FluentSQLQuery()
    # LEVEL 0 rules
    try:
        lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
        return False

    for rule in lvl_zero_rules:
        attr_present = [item for item in rule.entities if item in obj]
        # if there are no such attributes or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(attr_present):
            continue
        self._db.refresh()
        # try the largest attribute combinations first, then smaller ones
        # down to rule.reqcount
        for cnt in reversed(range(rule.reqcount, len(attr_present)+1)):
            for attr_comb in combinations(attr_present, cnt):
                # now select them
                q.cleanup()
                q.select("id").from_table(obj._table_name)
                for attr in attr_comb:
                    # presumably where() raises once a WHERE clause exists,
                    # in which case the condition is chained with AND
                    try:
                        q.where("%s=" % attr, obj[attr])
                    except FluentSQLQueryError:
                        q.and_("%s=" % attr, obj[attr])
                q()
                res = q.fetch_all()
                if q.count() > 1:
                    # there shouldn't be more results than one
                    # NOTE(review): logger is not checked for None here,
                    # unlike in the KeyError handlers
                    self.logger.warning("There are more than one identical "\
                                        "%ss. List of ID's: %s" % (obj._table_name, str([x[0] for x in res])))
                if not res or res is None:
                    continue
                # first returned row wins
                obj['id'] = res[0][0]
                return True

    # LEVEL 1 rules
    try:
        lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
        return False

    # returns type of entity mapped in ent_id_map
    # (ent_id_map keys are (name, type) pairs; returns None if name is absent)
    def getetype(ent_id_map, ent):
        for k in ent_id_map.keys():
            e, et = k
            if ent == e:
                return et

    # these are objects which really are present in the entity
    for rule in lvl_one_rules:
        ent_present = [item for item in rule.entities if item in obj]
        # if there are no such entities or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(ent_present):
            continue
        # get all those identifiers:
        # maps (attribute name, value type) -> list of found entities/values
        ent_id_map = {}
        for ent_name in ent_present:
            target = obj[ent_name]
            if type(target) is list and target:
                # list of relationship objects
                key = (ent_name, type(target[0]))
                ent_id_map[key] = []
                for rel_obj in target:
                    assert len(rel_obj.get_entities()) > 0
                    e = rel_obj.get_entities()[0]
                    # keep only related entities that are themselves found
                    if self.lookup(e, level-1):
                        if not key in ent_id_map:
                            ent_id_map[key] = []
                        ent_id_map[key].append(e)
                # none of the related entities was found - drop the key
                if not ent_id_map[key]:
                    del ent_id_map[key]
            elif isinstance(target, _RRSDatabaseEntity):
                # this is FK - @target is RRS*** object
                if self.lookup(target, level-1):
                    ent_id_map[(ent_name, type(target))] = [target]
            else:
                # anything else is stored as a plain attribute value
                # (note: an empty list also ends up here)
                ent_id_map[(ent_name, type(target))] = [target]
        # if we did not find as many as the rule requests, continue
        if rule.reqcount > len(ent_id_map):
            continue
        # try to catch some data from the minimum count of requested entities
        # to match, probably 2
        # if this select spits out too many results (>100), the reqcount level 2
        # is omitted and the process starts again from 3.
        # There has to be a flag, which indicates, that the level 2
        # requested entities returned too many results
        next_reqcount_lvl = False
        ent_keys = [x[0] for x in ent_id_map.keys()]
        for cnt in range(rule.reqcount, len(ent_id_map)+1):
            next_reqcount_lvl = False
            for entity_comb in combinations(ent_keys, cnt):
                if next_reqcount_lvl:
                    break
                self._db.refresh() # re-create cursors to drop the loaded data
                # construct the query
                q.cleanup()
                tg_tbl = obj._table_name
                from_lst = [tg_tbl]
                q.select("%s.id" % tg_tbl)
                # recognition of the same table in the query
                # (suffix that makes every table alias unique)
                tablecounter = 1
                for ent in entity_comb:
                    etype = getetype(ent_id_map, ent)
                    # now we have key to the object -> ent_id_map[(ent, etype)]
                    # @ent is instance of RRS****** - 1:N relationship
                    # the object contains id of this entity
                    if issubclass(etype, _RRSDatabaseEntity):
                        o = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s_id=" % (tg_tbl, ent), o['id'])
                        except FluentSQLQueryError:
                            q.and_("%s.%s_id=" % (tg_tbl, ent), o['id'])
                    # @ent is fake junction table - it means, that it's
                    # the second side of 1:N relationship - N:1.
                    elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                        # TODO
                        # not implemented: the whole lookup gives up here
                        return False
                    # @ent is true junction table - this M:N relationship.
                    elif issubclass(etype, _RRSDbEntityRelationship) and not etype._fake_table:
                        j_tbl_uniq_as = None
                        # storage of all aliases of junction tables
                        j_tbl_uniq_as_list = []
                        o = None
                        # join together all the found entities - for example:
                        # given publication, two persons (authors), both found
                        # in db so create query which selects ID of publication
                        # which has both - the first AND the second person.
                        for o in ent_id_map[(ent, etype)]:
                            j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                            j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                            e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                            # add table to the list of tables we are joining together
                            from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                            from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                            try:
                                q.where("%s.id=" % e_tbl_uniq_as, o['id'])
                            except FluentSQLQueryError:
                                q.and_("%s.id=" % e_tbl_uniq_as, o['id'])
                            # presumably the trailing True marks the value as
                            # a column reference rather than a literal - confirm
                            q.and_("%s.%s_id=" % (j_tbl_uniq_as, o._table_name), "%s.id" % e_tbl_uniq_as, True)
                            tablecounter += 1
                        # add the condition that all the junction table ID's of
                        # the entity we are looking for has to be the same - we
                        # are looking not for union, but intersection of them
                        for i in range(0, len(j_tbl_uniq_as_list)):
                            try:
                                j1 = j_tbl_uniq_as_list[i]
                                j2 = j_tbl_uniq_as_list[i+1]
                                q.and_("%s.%s_id=" % (j1, tg_tbl), "%s.%s_id" % (j2, tg_tbl), True)
                            except IndexError:
                                # the last alias has no successor to pair with
                                break
                        # bind junction table.entity_id to id of entity we are looking for
                        q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)
                    # @ent is attribute (int, basestring etc.)
                    else:
                        attr = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s=" % (tg_tbl, ent), attr)
                        except FluentSQLQueryError:
                            q.and_("%s.%s=" % (tg_tbl, ent), attr)
                q.from_table(from_lst)
                q()
                # remember the SQL for the log messages below (q is reused)
                search_sql_query = q._sql
                # now if the total count of probably identical files is higher
                # than 100, we need to specify it more, so we jump to next
                # request count level (probably 1->2 or 2->3).
                if q.count() > 100:
                    next_reqcount_lvl = True
                    continue
                res = q.fetch_all()
                if not res:
                    continue
                elif len(res) == 1:
                    obj['id'] = res[0][0]
                    self.logger.info("Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s" % \
                                     (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                    return True
                else:
                    # do some magic stuff here
                    # intelligently compare the attributes of all returned results
                    # and choose the most similar
                    q.cleanup()
                    id_list = [x[0] for x in res]
                    attrunion = set(["id"])
                    lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
                    # make a list of attributes needed to accomplish the rules
                    # (these are all which are present in rules)
                    # NOTE: this loop rebinds `rule`, shadowing the level 1 rule
                    for rule in lvl_zero_rules:
                        attrunion = attrunion.union(set(rule.entities))
                    # construct query which loads all needed attributes of all returned ID's
                    q.select(list(attrunion)).from_table(obj._table_name)
                    for _id in id_list:
                        try:
                            q.where("id=", _id)
                        except FluentSQLQueryError:
                            q.or_("id=", _id)
                    q() # perform the query
                    loaded_data = q.fetch_all()
                    # cache of SequenceMatcher ratios keyed by (row id, attribute)
                    similarity = {}
                    # every rule tells us what attributes have to be similar
                    # (or identical)
                    for rule in lvl_zero_rules:
                        attrs = [item for item in rule.entities if item in obj]
                        if rule.reqcount > len(attrs):
                            continue
                        # count every row's similarity (the result is sum of
                        # similarities of their attributes)
                        # maps similarity score -> row id; rows with an equal
                        # score overwrite each other
                        sim_lst = {}
                        for d in loaded_data:
                            row_similarity = 0.0
                            for attr in attrs:
                                if attr not in d or d[attr] is None or attr not in obj:
                                    continue
                                if (d['id'], attr) not in similarity:
                                    s = SequenceMatcher(None, d[attr], obj[attr])
                                    similarity[(d['id'], attr)] = s.ratio()
                                row_similarity += similarity[(d['id'], attr)]
                            sim_lst[row_similarity] = d['id']
                        # get the most similar row to the object
                        # NOTE(review): placement inside the rule loop (return
                        # on the first applicable rule) is reconstructed - confirm
                        obj['id'] = sim_lst[max(sim_lst.keys())]
                        self.logger.info("Found more than one result for lookup: %s, params: %s, "\
                                         "Choosen ID: %s, SQL: %s" % (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                        return True
def _bind_entity_to_name(self, namedentity, source_module):
    """
    This method creates the connection between an entity and its name,
    which is stored in another database table. These tables are:
     - person vs person_name
     - organization vs organization_name
     - event vs event_name

    @param namedentity: entity (person, event or organization) which already
                        carries its database ID.
    @param source_module: passed to self._rrsdb.insert() when a new
                          event/organization name row is inserted.
    @returns for event/organization: ID of the row in the name-table, or
             False when the entity has no title or the new name object is
             empty. The person branch has no explicit return value (None).
    @raises TypeError: namedentity is not a _RRSDatabaseEntity.
    @raises DatabaseError: namedentity carries no ID.
    @raises RRSDatabaseEntityError: the entity's table is not a named one.
    """
    # NOTE(review): nesting below was reconstructed from whitespace-flattened
    # source - confirm against the repository version.
    ACRONYM = 'acronym' # to be easily changed to abbreviation or whatever needed..
    TITLE = 'title' # will be name? or what?
    if not isinstance(namedentity, _RRSDatabaseEntity):
        raise TypeError("Named object has to be instance of subclass of _RRSDatabaseEntity")
    if not 'id' in namedentity:
        raise DatabaseError("Named object has to contain ID!")
    q = FluentSQLQuery()

    if namedentity._table_name == "person":
        # act like person and handle person_name
        # this is slightly different because there is N:N relationship
        # create new person name object
        pname = RRSPerson_name()
        for attr in ('first_name', 'middle_name', 'last_name', 'full_name'):
            if attr in namedentity:
                pname[attr] = namedentity[attr]
        # create relationship object
        rel_obj = RRSRelationshipPersonPerson_name()
        rel_obj.set_entity(pname)
        namedentity['person_name'] = rel_obj
        # look for this name in database
        if self.lookup(pname):
            # it is already in the db, just check if the relationship exists
            q.select("person_id").from_table(("j__person__person_name"))
            q.where("person_id=", namedentity['id']).and_("person_name_id=", pname['id'])
            q()
            if not q.count():
                # if the relationship doesn't exist, create new one
                self._rrsdb.relationship("person_name", rel_obj)
            elif q.count() > 1:
                self.logger.warning("There are more than one relationship "\
                                    "entries in table 'j__person__person_name"\
                                    " between person.id=%s and person_name.id=%s" \
                                    % (namedentity['id'], pname['id']))
        else:
            # insert new person_name and create the relationship
            # NOTE(review): uses self.module, while the event/organization
            # branch uses the source_module parameter - confirm intent
            self._rrsdb.insert(pname, self.module)
            self._rrsdb.relationship("person_name", rel_obj)
        # get the reference out of which this name was extracted and assign
        # the person name to the reference (j__person_name__reference)
        try:
            refe = namedentity['publication'][0].get_entities()[0]['reference_reference'][0].get_entities()[0]
        except (KeyError, TypeError, IndexError):
            # no reference reachable through the first publication - nothing to bind
            pass
        else:
            pname_ref_rel = RRSRelationshipPerson_nameReference()
            pname_ref_rel.set_entity(refe)
            pname['reference'] = pname_ref_rel
            try:
                self._rrsdb.relationship('reference', pname_ref_rel)
            except DatabaseError:
                # hand the same call over to self._queue (presumably it is
                # retried later - confirm against the queue implementation)
                self._queue.wait(WQEntry(self._rrsdb.relationship, ('reference', pname_ref_rel)))

    elif namedentity._table_name in ("event", "organization"):
        if TITLE not in namedentity:
            # this violates constraint... raise exception?? Or return false?
            return False
        name_tbl = "%s_name" % namedentity._table_name
        # if there is no title like this in the database, insert it
        q.select(("id", "%s_id" % namedentity._table_name, ACRONYM, TITLE)).from_table(name_tbl)
        q.where("%s=" % TITLE, namedentity[TITLE])
        if ACRONYM in namedentity:
            q.or_("%s=" % ACRONYM, namedentity[ACRONYM])
        q()
        if q.count():
            # check the parent id if it matches
            for row in q.fetch_all():
                # row[1] is the "<table>_id" column (second selected column)
                if namedentity['id'] == row[1]:
                    # if it matched on acronym, check the titles if they are the same
                    if row[TITLE] != namedentity[TITLE]:
                        # if not, check the rest and maybe add new row into table
                        continue
                    # add the missing acronym if needed
                    # NOTE(review): namedentity[ACRONYM] is read without the
                    # "ACRONYM in namedentity" check used above - confirm it
                    # cannot raise for entities without an acronym
                    if row[ACRONYM] is None and namedentity[ACRONYM] is not None:
                        # update the row
                        q.cleanup()
                        q.update(name_tbl, {ACRONYM: namedentity[ACRONYM]})
                        q.where("id=", row['id'])
                        q()
                    return row['id']
        # if nothing matched, insert new name
        name_obj = self._table_to_class_map[name_tbl]()
        for attr in (TITLE, ACRONYM):
            if attr in namedentity:
                name_obj[attr] = namedentity[attr]
        if name_obj.empty():
            return False
        # link the name row back to its parent entity
        name_obj[namedentity._table_name] = namedentity
        self._rrsdb.insert(name_obj, source_module)
        return name_obj['id']
    else:
        raise RRSDatabaseEntityError("%s is not a named entity." % type(namedentity))
    # only reached from the person branch; the other branches return or raise
    self._db.refresh()
def lookup(self, obj, level=None):
    """
    More sophisticated RRSDatabase.contains().

    This method does not call RRSDatabase.contains() explicitly; it checks
    other entities and tries to find relationships between them. The search
    is driven by the lookup rules held in self._lookup_rules.

    Level 0 rules match on columns of the object's own table. Level 1 rules
    match on entities related to the object; those are looked up recursively
    with level - 1.

    @param obj: database entity to search for; on success obj["id"] is set.
    @param level: recursion budget, defaults to self.lookup_level.
    @returns True if found (the object now carries the ID)
             False if no rules are registered for the type, or a fake
             junction table is encountered (not implemented)
             None if level < 0, or (in the code visible here) when no rule
             produced a match
    @raises TypeError: obj is not a _RRSDatabaseEntity.
    @raises RRSDatabaseEntityError: obj belongs to a "_meta" table.
    """
    # NOTE(review): nesting below was reconstructed from whitespace-flattened
    # source - confirm against the repository version.
    if level is None:
        level = self.lookup_level
    if level < 0:
        # recursion budget exhausted; returns None, which callers treat as falsy
        return
    if not isinstance(obj, _RRSDatabaseEntity):
        raise TypeError("lookup() method can be called only on database " "entity objects.")
    if obj._table_name.endswith("_meta"):
        raise RRSDatabaseEntityError("lookup() method cannot be called on meta-tables.")
    q = FluentSQLQuery()
    # LEVEL 0 rules
    try:
        lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
        return False

    for rule in lvl_zero_rules:
        attr_present = [item for item in rule.entities if item in obj]
        # if there are no such attributes or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(attr_present):
            continue
        self._db.refresh()
        # try the largest attribute combinations first, then smaller ones
        # down to rule.reqcount
        for cnt in reversed(range(rule.reqcount, len(attr_present) + 1)):
            for attr_comb in combinations(attr_present, cnt):
                # now select them
                q.cleanup()
                q.select("id").from_table(obj._table_name)
                for attr in attr_comb:
                    # presumably where() raises once a WHERE clause exists,
                    # in which case the condition is chained with AND
                    try:
                        q.where("%s=" % attr, obj[attr])
                    except FluentSQLQueryError:
                        q.and_("%s=" % attr, obj[attr])
                q()
                res = q.fetch_all()
                if q.count() > 1:
                    # there shouldn't be more results than one
                    # NOTE(review): logger is not checked for None here,
                    # unlike in the KeyError handlers
                    self.logger.warning(
                        "There are more than one identical "
                        "%ss. List of ID's: %s" % (obj._table_name, str([x[0] for x in res]))
                    )
                if not res or res is None:
                    continue
                # first returned row wins
                obj["id"] = res[0][0]
                return True

    # LEVEL 1 rules
    try:
        lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
    except KeyError:
        if self.logger is not None:
            self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
        return False

    # returns type of entity mapped in ent_id_map
    # (ent_id_map keys are (name, type) pairs; returns None if name is absent)
    def getetype(ent_id_map, ent):
        for k in ent_id_map.keys():
            e, et = k
            if ent == e:
                return et

    # these are objects which really are present in the entity
    for rule in lvl_one_rules:
        ent_present = [item for item in rule.entities if item in obj]
        # if there are no such entities or not the requested count of them,
        # continue to the next rule
        if rule.reqcount > len(ent_present):
            continue
        # get all those identifiers:
        # maps (attribute name, value type) -> list of found entities/values
        ent_id_map = {}
        for ent_name in ent_present:
            target = obj[ent_name]
            if type(target) is list and target:
                # list of relationship objects
                key = (ent_name, type(target[0]))
                ent_id_map[key] = []
                for rel_obj in target:
                    assert len(rel_obj.get_entities()) > 0
                    e = rel_obj.get_entities()[0]
                    # keep only related entities that are themselves found
                    if self.lookup(e, level - 1):
                        if not key in ent_id_map:
                            ent_id_map[key] = []
                        ent_id_map[key].append(e)
                # none of the related entities was found - drop the key
                if not ent_id_map[key]:
                    del ent_id_map[key]
            elif isinstance(target, _RRSDatabaseEntity):
                # this is FK - @target is RRS*** object
                if self.lookup(target, level - 1):
                    ent_id_map[(ent_name, type(target))] = [target]
            else:
                # anything else is stored as a plain attribute value
                # (note: an empty list also ends up here)
                ent_id_map[(ent_name, type(target))] = [target]
        # if we did not find as many as the rule requests, continue
        if rule.reqcount > len(ent_id_map):
            continue
        # try to catch some data from the minimum count of requested entities
        # to match, probably 2
        # if this select spits out too many results (>100), the reqcount level 2
        # is omitted and the process starts again from 3.
        # There has to be a flag, which indicates, that the level 2
        # requested entities returned too many results
        next_reqcount_lvl = False
        ent_keys = [x[0] for x in ent_id_map.keys()]
        for cnt in range(rule.reqcount, len(ent_id_map) + 1):
            next_reqcount_lvl = False
            for entity_comb in combinations(ent_keys, cnt):
                if next_reqcount_lvl:
                    break
                self._db.refresh()  # re-create cursors to drop the loaded data
                # construct the query
                q.cleanup()
                tg_tbl = obj._table_name
                from_lst = [tg_tbl]
                q.select("%s.id" % tg_tbl)
                # recognition of the same table in the query
                # (suffix that makes every table alias unique)
                tablecounter = 1
                for ent in entity_comb:
                    etype = getetype(ent_id_map, ent)
                    # now we have key to the object -> ent_id_map[(ent, etype)]
                    # @ent is instance of RRS****** - 1:N relationship
                    # the object contains id of this entity
                    if issubclass(etype, _RRSDatabaseEntity):
                        o = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s_id=" % (tg_tbl, ent), o["id"])
                        except FluentSQLQueryError:
                            q.and_("%s.%s_id=" % (tg_tbl, ent), o["id"])
                    # @ent is fake junction table - it means, that it's
                    # the second side of 1:N relationship - N:1.
                    elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                        # TODO
                        # not implemented: the whole lookup gives up here
                        return False
                    # @ent is true junction table - this M:N relationship.
                    elif issubclass(etype, _RRSDbEntityRelationship) and not etype._fake_table:
                        j_tbl_uniq_as = None
                        # storage of all aliases of junction tables
                        j_tbl_uniq_as_list = []
                        o = None
                        # join together all the found entities - for example:
                        # given publication, two persons (authors), both found
                        # in db so create query which selects ID of publication
                        # which has both - the first AND the second person.
                        for o in ent_id_map[(ent, etype)]:
                            j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                            j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                            e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                            # add table to the list of tables we are joining together
                            from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                            from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                            try:
                                q.where("%s.id=" % e_tbl_uniq_as, o["id"])
                            except FluentSQLQueryError:
                                q.and_("%s.id=" % e_tbl_uniq_as, o["id"])
                            # presumably the trailing True marks the value as
                            # a column reference rather than a literal - confirm
                            q.and_("%s.%s_id=" % (j_tbl_uniq_as, o._table_name), "%s.id" % e_tbl_uniq_as, True)
                            tablecounter += 1
                        # add the condition that all the junction table ID's of
                        # the entity we are looking for has to be the same - we
                        # are looking not for union, but intersection of them
                        for i in range(0, len(j_tbl_uniq_as_list)):
                            try:
                                j1 = j_tbl_uniq_as_list[i]
                                j2 = j_tbl_uniq_as_list[i + 1]
                                q.and_("%s.%s_id=" % (j1, tg_tbl), "%s.%s_id" % (j2, tg_tbl), True)
                            except IndexError:
                                # the last alias has no successor to pair with
                                break
                        # bind junction table.entity_id to id of entity we are looking for
                        q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)
                    # @ent is attribute (int, basestring etc.)
                    else:
                        attr = ent_id_map[(ent, etype)][0]
                        try:
                            q.where("%s.%s=" % (tg_tbl, ent), attr)
                        except FluentSQLQueryError:
                            q.and_("%s.%s=" % (tg_tbl, ent), attr)
                q.from_table(from_lst)
                q()
                # remember the SQL for the log messages below (q is reused)
                search_sql_query = q._sql
                # now if the total count of probably identical files is higher
                # than 100, we need to specify it more, so we jump to next
                # request count level (probably 1->2 or 2->3).
                if q.count() > 100:
                    next_reqcount_lvl = True
                    continue
                res = q.fetch_all()
                if not res:
                    continue
                elif len(res) == 1:
                    obj["id"] = res[0][0]
                    self.logger.info(
                        "Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s"
                        % (obj._table_name, str(entity_comb), obj["id"], search_sql_query)
                    )
                    return True
                else:
                    # do some magic stuff here
                    # intelligently compare the attributes of all returned results
                    # and choose the most similar
                    q.cleanup()
                    id_list = [x[0] for x in res]
                    attrunion = set(["id"])
                    lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
                    # make a list of attributes needed to accomplish the rules
                    # (these are all which are present in rules)
                    # NOTE: this loop rebinds `rule`, shadowing the level 1 rule
                    for rule in lvl_zero_rules:
                        attrunion = attrunion.union(set(rule.entities))
                    # construct query which loads all needed attributes of all returned ID's
                    q.select(list(attrunion)).from_table(obj._table_name)
                    for _id in id_list:
                        try:
                            q.where("id=", _id)
                        except FluentSQLQueryError:
                            q.or_("id=", _id)
                    q()  # perform the query
                    loaded_data = q.fetch_all()
                    # cache of SequenceMatcher ratios keyed by (row id, attribute)
                    similarity = {}
                    # every rule tells us what attributes have to be similar
                    # (or identical)
                    for rule in lvl_zero_rules:
                        attrs = [item for item in rule.entities if item in obj]
                        if rule.reqcount > len(attrs):
                            continue
                        # count every row's similarity (the result is sum of
                        # similarities of their attributes)
                        # maps similarity score -> row id; rows with an equal
                        # score overwrite each other
                        sim_lst = {}
                        for d in loaded_data:
                            row_similarity = 0.0
                            for attr in attrs:
                                if attr not in d or d[attr] is None or attr not in obj:
                                    continue
                                if (d["id"], attr) not in similarity:
                                    s = SequenceMatcher(None, d[attr], obj[attr])
                                    similarity[(d["id"], attr)] = s.ratio()
                                row_similarity += similarity[(d["id"], attr)]
                            sim_lst[row_similarity] = d["id"]
                        # get the most similar row to the object
                        # NOTE(review): placement inside the rule loop (return
                        # on the first applicable rule) is reconstructed - confirm
                        obj["id"] = sim_lst[max(sim_lst.keys())]
                        self.logger.info(
                            "Found more than one result for lookup: %s, params: %s, "
                            "Choosen ID: %s, SQL: %s" % (obj._table_name, str(entity_comb), obj["id"], search_sql_query)
                        )
                        return True
def _bind_entity_to_name(self, namedentity, source_module):
    """
    This method creates the connection between an entity and its name,
    which is stored in another database table. These tables are:
     - person vs person_name
     - organization vs organization_name
     - event vs event_name

    @param namedentity: entity (person, event or organization) which already
                        carries its database ID.
    @param source_module: passed to self._rrsdb.insert() when a new
                          event/organization name row is inserted.
    @returns for event/organization: ID of the row in the name-table, or
             False when the entity has no title or the new name object is
             empty. The person branch has no explicit return value (None).
    @raises TypeError: namedentity is not a _RRSDatabaseEntity.
    @raises DatabaseError: namedentity carries no ID.
    @raises RRSDatabaseEntityError: the entity's table is not a named one.
    """
    # NOTE(review): nesting below was reconstructed from whitespace-flattened
    # source - confirm against the repository version.
    ACRONYM = "acronym"  # to be easily changed to abbreviation or whatever needed..
    TITLE = "title"  # will be name? or what?
    if not isinstance(namedentity, _RRSDatabaseEntity):
        raise TypeError("Named object has to be instance of subclass of _RRSDatabaseEntity")
    if not "id" in namedentity:
        raise DatabaseError("Named object has to contain ID!")
    q = FluentSQLQuery()

    if namedentity._table_name == "person":
        # act like person and handle person_name
        # this is slightly different because there is N:N relationship
        # create new person name object
        pname = RRSPerson_name()
        for attr in ("first_name", "middle_name", "last_name", "full_name"):
            if attr in namedentity:
                pname[attr] = namedentity[attr]
        # create relationship object
        rel_obj = RRSRelationshipPersonPerson_name()
        rel_obj.set_entity(pname)
        namedentity["person_name"] = rel_obj
        # look for this name in database
        if self.lookup(pname):
            # it is already in the db, just check if the relationship exists
            q.select("person_id").from_table(("j__person__person_name"))
            q.where("person_id=", namedentity["id"]).and_("person_name_id=", pname["id"])
            q()
            if not q.count():
                # if the relationship doesn't exist, create new one
                self._rrsdb.relationship("person_name", rel_obj)
            elif q.count() > 1:
                self.logger.warning(
                    "There are more than one relationship "
                    "entries in table 'j__person__person_name"
                    " between person.id=%s and person_name.id=%s" % (namedentity["id"], pname["id"])
                )
        else:
            # insert new person_name and create the relationship
            # NOTE(review): uses self.module, while the event/organization
            # branch uses the source_module parameter - confirm intent
            self._rrsdb.insert(pname, self.module)
            self._rrsdb.relationship("person_name", rel_obj)
        # get the reference out of which this name was extracted and assign
        # the person name to the reference (j__person_name__reference)
        try:
            refe = namedentity["publication"][0].get_entities()[0]["reference_reference"][0].get_entities()[0]
        except (KeyError, TypeError, IndexError):
            # no reference reachable through the first publication - nothing to bind
            pass
        else:
            pname_ref_rel = RRSRelationshipPerson_nameReference()
            pname_ref_rel.set_entity(refe)
            pname["reference"] = pname_ref_rel
            try:
                self._rrsdb.relationship("reference", pname_ref_rel)
            except DatabaseError:
                # hand the same call over to self._queue (presumably it is
                # retried later - confirm against the queue implementation)
                self._queue.wait(WQEntry(self._rrsdb.relationship, ("reference", pname_ref_rel)))

    elif namedentity._table_name in ("event", "organization"):
        if TITLE not in namedentity:
            # this violates constraint... raise exception?? Or return false?
            return False
        name_tbl = "%s_name" % namedentity._table_name
        # if there is no title like this in the database, insert it
        q.select(("id", "%s_id" % namedentity._table_name, ACRONYM, TITLE)).from_table(name_tbl)
        q.where("%s=" % TITLE, namedentity[TITLE])
        if ACRONYM in namedentity:
            q.or_("%s=" % ACRONYM, namedentity[ACRONYM])
        q()
        if q.count():
            # check the parent id if it matches
            for row in q.fetch_all():
                # row[1] is the "<table>_id" column (second selected column)
                if namedentity["id"] == row[1]:
                    # if it matched on acronym, check the titles if they are the same
                    if row[TITLE] != namedentity[TITLE]:
                        # if not, check the rest and maybe add new row into table
                        continue
                    # add the missing acronym if needed
                    # NOTE(review): namedentity[ACRONYM] is read without the
                    # "ACRONYM in namedentity" check used above - confirm it
                    # cannot raise for entities without an acronym
                    if row[ACRONYM] is None and namedentity[ACRONYM] is not None:
                        # update the row
                        q.cleanup()
                        q.update(name_tbl, {ACRONYM: namedentity[ACRONYM]})
                        q.where("id=", row["id"])
                        q()
                    return row["id"]
        # if nothing matched, insert new name
        name_obj = self._table_to_class_map[name_tbl]()
        for attr in (TITLE, ACRONYM):
            if attr in namedentity:
                name_obj[attr] = namedentity[attr]
        if name_obj.empty():
            return False
        # link the name row back to its parent entity
        name_obj[namedentity._table_name] = namedentity
        self._rrsdb.insert(name_obj, source_module)
        return name_obj["id"]
    else:
        raise RRSDatabaseEntityError("%s is not a named entity." % type(namedentity))
    # only reached from the person branch; the other branches return or raise
    self._db.refresh()
class ResearchrPublicationFeeder:
    """
    Fetches one publication from the researchr API and imports it, together
    with its authors, editors, publisher, type and series, into the database
    through RRSXMLImporter.
    """

    # researchr API fields copied to the RPublication attribute of the same name
    __SAME_NAME_FIELDS = frozenset((
        'abstract', 'address', 'authors', 'booktitle', 'conference',
        'conferenceYear', 'doi', 'editors', 'firstpage', 'key', 'issuenumber',
        'journal', 'lastpage', 'month', 'note', 'number', 'organization',
        'publisher', 'series', 'title', 'url', 'volume', 'volumenumber',
        'year',
    ))

    def __init__(self, config, importer_kwargs):
        """
        @param config: not used by this class (kept for interface compatibility).
        @type importer_kwargs: dict
        @param importer_kwargs: Settings handed to RRSXMLImporter.
        """
        # data obtained from the API (set by __FillRPublication)
        self.rPublication = None
        # RRSPublication object which is imported into the db once it is filled
        self.publication = None
        # settings for the importer
        self.importer_kwargs = importer_kwargs
        # bounds (in seconds) of the random pause after every API request
        self.LimitMin = 0.1
        self.LimitMax = 0.2
        # object for building SQL queries
        self.q = FluentSQLQuery()
        # researchr API
        self.researchrClass = ResearchrClass()
        # top layer, used for loading objects by id
        self.rrsdb = RRSDatabase()
        # normalizer
        self.norm = Normalize()
        # importer
        self.importer = RRSXMLImporter(self.importer_kwargs)

    @staticmethod
    def __ExtractDoi(doi):
        """
        Return the part of doi that follows "http://dx.doi.org/", or None
        when doi is None or does not contain that prefix.
        """
        prefix = "http://dx.doi.org/"
        if doi is None or prefix not in doi:
            return None
        # str.strip(prefix) would remove any of the prefix's *characters* from
        # both ends and damage DOIs such as ".../thing"; cut the prefix instead.
        return doi.partition(prefix)[2]

    @staticmethod
    def __DigitsToInt(text):
        """
        Convert a string made only of digits to int; return None for None,
        an empty string or anything that is not purely digits.
        """
        if text is None or text == "" or not text.isdigit():
            return None
        return int(text)

    @staticmethod
    def __MonthNumber(name):
        """
        Return the month number (1-12) for a month name, judged by its first
        three characters ('%b' format), or None when it cannot be parsed.
        """
        if not name:
            return None
        try:
            return strptime(name[:3], '%b').tm_mon
        except (ValueError, TypeError):
            return None

    def __ReportImportFailure(self, label, detail):
        """
        Print and log (as warning) a failed import of the current publication.
        """
        message = '%s - %s, %s' % (label, self.rPublication.key, detail)
        print(message)
        logging.warning(message)

    def __FillType(self):
        """
        Transform rPublication.type to publication.type
        """
        _id = self.__GetId("publication_type", "type=", self.rPublication.publication_type)
        if _id is not None:
            self.publication["type"] = self.rrsdb.load("publication_type", _id)

    def __FillSeries(self):
        """
        Add rPublication.series to publication_series table
        """
        title = self.rPublication.series
        if title is None or title == "":
            return
        _id = self.__GetId("publication_series", "title=", title)
        # NOTE: keeps importing until the row can be found; this never ends
        # if the import silently fails.
        while _id is None:
            self.importer.import_model(RRSPublication_series(title=title))
            _id = self.__GetId("publication_series", "title=", title)
        self.publication["series"] = self.rrsdb.load("publication_series", _id)

    def __GetId(self, _from, where, _is):
        """
        Try to find ID in table and return it

        @type _from: string
        @param _from: Name of table.
        @type where: string
        @param where: Name of column (including the operator, e.g. "title=").
        @type _is: string
        @param _is: What it is equal.
        @rtype: int
        @return: Id of selected entry, None if there is no such entry.
        """
        try:
            self.q.select("id").from_table(_from)
            self.q.where(where, _is)
            self.q()
            data = self.q.fetch_one()
        finally:
            # self.q is shared by all lookups; reset it even when the query
            # fails so the next lookup does not inherit a half-built query
            self.q.cleanup()
        if data is not None:
            return data[0]
        return None

    def __FillPublisher(self):
        """
        Add rPublication.publisher to organization table
        """
        publisher = self.rPublication.publisher
        if publisher is None or publisher == "":
            return
        normalized_title = self.norm.organization(publisher)
        _id = self.__GetId("organization", "title_normalized=", normalized_title)
        # NOTE: keeps importing until the row can be found (see __FillSeries)
        while _id is None:
            organization = RRSOrganization(title=publisher, title_normalized=normalized_title)
            self.importer.import_model(organization)
            _id = self.__GetId("organization", "title_normalized=", normalized_title)
        self.publication["publisher"] = self.rrsdb.load("organization", _id)

    def __FillAuthors(self, authorData, isEditor):
        """
        FillAuthor
        Add persons to the db (if they are not there yet) and bind them to
        the current publication. Only the url and the full name of every
        entry are used.

        @type authorData: list
        @param authorData: List of authors data (person, alias); None or an
                           empty list means there is nothing to do.
        @type isEditor: bool
        @param isEditor: True if authors are editors of this publication.
        """
        if not authorData:
            return
        # rank: first author gets 1, second 2 and so on
        for rank, author in enumerate(authorData, 1):
            # NOTE(review): the test looks for an 'author' key although the
            # branch reads author["person"] - confirm against the API payload
            if 'author' in author:
                rFullname = author["person"]["fullname"]
                rUrl = author["person"]["url"]
            else:
                rFullname = author["alias"]["name"]
                rUrl = author["alias"]["url"]
            personUrl = RRSRelationshipPersonUrl()
            self.__FillUrl(personUrl, rUrl)
            self.__FillPerson(personUrl, rFullname, rank, isEditor)

    def __FillUrl(self, personUrl, rUrl):
        """
        This function adds the url to the db and binds it to the person-url
        relationship.

        @type personUrl: RRSRelationshipPersonUrl
        @param personUrl: Relationship object to add url into it.
        @type rUrl: string
        @param rUrl: rPublication.(person/alias) url, url of author/editor.
        """
        _id = self.__GetId("url", "link=", rUrl)
        # NOTE: keeps importing until the row can be found (see __FillSeries)
        while _id is None:
            url = RRSUrl(link=rUrl)
            url["type"] = self.rrsdb.load("url_type", "1")
            self.importer.import_model(url)
            _id = self.__GetId("url", "link=", rUrl)
        url = self.rrsdb.load("url", _id)
        personUrl.set_entity(url)

    def __FillPerson(self, personUrl, rFullname, rank, isEditor):
        """
        Make sure the person exists in the db and bind it to the current
        publication as author or editor.

        @type personUrl: RRSRelationshipPersonUrl
        @param personUrl: Relationship object to bind to person["url"].
        @type rFullname: string
        @param rFullname: Fullname of author.
        @type rank: int
        @param rank: Rank of author, first author get 1, second 2 and so on.
        @type isEditor: bool
        @param isEditor: True if person is editor of this publication.
        """
        _id = self.__GetId("person", "full_name=", rFullname)
        # NOTE: keeps importing until the row can be found (see __FillSeries)
        while _id is None:
            person = RRSPerson()
            person["full_name"] = rFullname
            person["url"] = personUrl
            self.__SetPersonNames(person, rFullname)
            person["full_name_ascii"] = unicodedata.normalize('NFKD', rFullname).encode('ascii', 'ignore')
            self.importer.import_model(person)
            _id = self.__GetId("person", "full_name=", rFullname)
        publicationPerson = RRSRelationshipPersonPublication(author_rank=rank, editor=isEditor)
        publicationPerson.set_entity(self.rrsdb.load("person", _id))
        self.publication['person'] = publicationPerson

    def __SetPersonNames(self, person, rFullname):
        """
        This function tries to fill first name, middle name and last name of
        the person. Only names made of exactly two or three
        whitespace-separated parts are split; other names are left alone.

        @type person: RRSPerson
        @param person: Object of author of publication.
        @type rFullname: string
        @param rFullname: Fullname of author.
        """
        splitName = rFullname.split()
        if len(splitName) == 3:
            person["first_name"] = splitName[0]
            person["middle_name"] = splitName[1]
            person["last_name"] = splitName[2]
        elif len(splitName) == 2:
            person["first_name"] = splitName[0]
            person["last_name"] = splitName[1]

    def FillPublication(self, key):
        """
        This function calls all private functions with prefix Fill: it loads
        the data into the rPublication structure, assigns them to publication
        (RRSPublication) and imports the publication. Import errors are
        printed and logged, not raised.

        @type key: string
        @param key: Key of the publication.
        """
        self.__FillRPublication(key)
        rpub = self.rPublication
        self.publication = RRSPublication()
        self.__FillAuthors(rpub.authors, False)
        self.__FillAuthors(rpub.editors, True)
        self.__FillPublisher()
        self.__FillType()
        self.__FillSeries()
        self.publication["title"] = rpub.title
        self.publication["title_normalized"] = self.norm.publication(rpub.title)
        if rpub.year is not None and rpub.year != "":
            self.publication["year"] = int(rpub.year)  # "2000" -> 2000
        if rpub.month is not None and rpub.month != "":
            month = self.__MonthNumber(rpub.month)
            if month is None:
                # an unknown month must not abort the whole publication
                logging.warning('Unrecognized month %r - %s' % (rpub.month, rpub.key))
            else:
                self.publication["month"] = month
        volume = self.__DigitsToInt(rpub.volume)
        if volume is not None:
            self.publication["volume"] = volume
        # validated on its own value (the volume used to be checked here)
        number = self.__DigitsToInt(rpub.number)
        if number is not None:
            self.publication["number"] = number
        if rpub.abstract is not None and rpub.abstract != "":
            self.publication["abstract"] = rpub.abstract
        doi = self.__ExtractDoi(rpub.doi)
        if doi is not None:
            self.publication["doi"] = doi
        if (rpub.firstpage is not None and rpub.lastpage is not None
                and rpub.firstpage != "" and rpub.lastpage != ""):
            self.publication["pages"] = str(rpub.firstpage) + " - " + str(rpub.lastpage)
        self.publication["language"] = self.rrsdb.load('language', 1)
        self.publication.set("researchr_key", rpub.key, strict=False)
        try:
            self.importer.import_model(self.publication)
        except RRSDatabaseEntityError as e:
            self.__ReportImportFailure('RRSDatabaseEntityError', str(e))
        except DatabaseError as e:
            self.__ReportImportFailure('DatabaseError', str(e))
        except TypeError as e:
            self.__ReportImportFailure('TypeError', str(e))
        except Exception:
            # Exception instead of a bare except, so that KeyboardInterrupt
            # and SystemExit still stop the feeder
            self.__ReportImportFailure('Unexpected error', sys.exc_info()[0])

    def __FillRPublication(self, key):
        """
        Fill rPublication object with the data the researchr API returns for
        the publication. Fields unknown to this class are ignored.

        @type key: string
        @param key: Key of the publication.
        """
        self.rPublication = RPublication()
        publicationData = self.researchrClass.getPublication(key)
        # short random pause after the request
        time.sleep(random.uniform(self.LimitMin, self.LimitMax))
        for field, value in publicationData.items():
            if field == 'type':
                # the only field stored under a different attribute name
                self.rPublication.publication_type = value
            elif field in self.__SAME_NAME_FIELDS:
                setattr(self.rPublication, field, value)