# Example #1
def checkIfImport(key):
	"""
	Check if publication is in database.

	@type  key: string
	@param key: Researchr key of the publication.
	@return: Whatever C{fetch_one()} yields for the row whose
		C{researchr_key} equals C{key}.
	"""
	q = FluentSQLQuery()
	# NOTE(review): the excerpt this function was recovered from had no
	# select()/from_table() call; the "publication" table and the "id"
	# column are assumptions -- confirm against the upstream source.
	q.select("id").from_table("publication")
	q.where("researchr_key=", key)
	return q.fetch_one()
# Example #2
    def lookup(self, obj, level=None):
        """
        More sophisticated RRSDatabase.contains(). This method doesn't call
        RRSDatabase.contains() explicitly, it checks other entities and tries
        to find relationship between them. This method uses list of lookup rules.

        Level 0 rules try to match the object through its own attributes.
        Level 1 rules try to match it through the entities it refers to;
        those are looked up recursively with C{level-1}.

        NOTE(review): this method was recovered from an excerpt in which
        several lines (try/else/continue keywords and some statements) were
        missing. Every statement that had to be reconstructed is marked with
        NOTE(review) below and should be checked against the upstream source.

        @param obj: database entity to look for
        @param level: remaining recursion depth; defaults to self.lookup_level
        @returns True if found (the object now carries the ID)
                 False if not found
        @raise TypeError: if obj is not an _RRSDatabaseEntity
        @raise RRSDatabaseEntityError: if obj belongs to a "_meta" table
        """
        if level is None:
            level = self.lookup_level
        if level < 0:
            # NOTE(review): the body of this guard was missing; an exhausted
            # recursion depth is treated as "not found".
            return False
        if not isinstance(obj, _RRSDatabaseEntity):
            raise TypeError('lookup() method can be called only on database '
                            'entity objects.')
        if obj._table_name.endswith("_meta"):
            raise RRSDatabaseEntityError('lookup() method cannot be called on meta-tables.')

        q = FluentSQLQuery()
        # LEVEL 0 rules
        try:
            lvl_zero_rules = self._lookup_rules.get_rules(type(obj), 0)
        except KeyError:
            if self.logger is not None:
                self.logger.error("Level 0 rules for '%s' not found." % obj._table_name)
            return False
        for rule in lvl_zero_rules:
            attr_present = [item for item in rule.entities if item in obj]
            # if there are no such attributes or not the requested count of them,
            # continue to the next rule
            if rule.reqcount > len(attr_present):
                continue
            # the largest attribute combinations are tried first
            for cnt in reversed(range(rule.reqcount, len(attr_present)+1)):
                for attr_comb in combinations(attr_present, cnt):
                    # now select them
                    # NOTE(review): the select/from call was missing; "id" is
                    # assumed because res[0][0] is stored as the ID below.
                    # Whether the query must be reset between combinations
                    # is not visible here -- confirm.
                    q.select("id").from_table(obj._table_name)
                    for attr in attr_comb:
                        try:
                            q.where("%s=" % attr, obj[attr])
                        except FluentSQLQueryError:
                            # presumably raised when a WHERE clause already
                            # exists, so the condition is chained with AND
                            q.and_("%s=" % attr, obj[attr])
                    res = q.fetch_all()
                    # there shouldn't be more results than one
                    if q.count() > 1 and self.logger is not None:
                        self.logger.warning("There are more than one identical "\
                        "%ss. List of ID's: %s" % (obj._table_name, str([x[0] for x in res])))
                    if not res:
                        continue
                    obj['id'] = res[0][0]
                    return True

        # LEVEL 1 rules
        try:
            lvl_one_rules = self._lookup_rules.get_rules(type(obj), 1)
        except KeyError:
            if self.logger is not None:
                self.logger.error("Level 1 rules for '%s' not found." % obj._table_name)
            return False

        # returns type of entity mapped in ent_id_map
        def getetype(ent_id_map, ent):
            for e, et in ent_id_map:
                if ent == e:
                    return et

        for rule in lvl_one_rules:
            # these are objects which really are present in the entity
            ent_present = [item for item in rule.entities if item in obj]
            # if there are no such entities or not the requested count of them,
            # continue to the next rule
            if rule.reqcount > len(ent_present):
                continue

            # get all those identifiers; keys are (name, type) pairs
            ent_id_map = {}
            for ent_name in ent_present:
                target = obj[ent_name]
                if type(target) is list and target:
                    # list of relationship objects
                    key = (ent_name, type(target[0]))
                    ent_id_map[key] = []
                    for rel_obj in target:
                        assert len(rel_obj.get_entities()) > 0
                        e = rel_obj.get_entities()[0]
                        if self.lookup(e, level-1):
                            # NOTE(review): the append was missing; without it
                            # the list stays empty and the key is always
                            # deleted below.
                            ent_id_map[key].append(e)
                    if not ent_id_map[key]:
                        del ent_id_map[key]
                elif isinstance(target, _RRSDatabaseEntity):
                    # this is FK - @target is RRS*** object
                    if self.lookup(target, level-1):
                        ent_id_map[(ent_name, type(target))] = [target]
                else:
                    # plain attribute value
                    ent_id_map[(ent_name, type(target))] = [target]

            # if we did not find as much as the rule requests, continue
            if rule.reqcount > len(ent_id_map):
                continue
            # try to catch some data from the minimum count of requested entities
            # to match, probably 2
            # if this select spits out too many results (>100), the reqcount level 2
            # is omitted and the process starts again from 3.
            # There has to be a flag, which indicates, that the level 2
            # requested entities returned too many results
            ent_keys = [x[0] for x in ent_id_map.keys()]
            for cnt in range(rule.reqcount, len(ent_id_map)+1):
                next_reqcount_lvl = False
                for entity_comb in combinations(ent_keys, cnt):
                    if next_reqcount_lvl:
                        break
                    self._db.refresh() # re-create cursors to drop the loaded data
                    # construct the query
                    tg_tbl = obj._table_name
                    from_lst = [tg_tbl]
                    q.select("%s.id" % tg_tbl)
                    # recognition of the same table in the query
                    tablecounter = 1
                    for ent in entity_comb:
                        etype = getetype(ent_id_map, ent)
                        # now we have key to the object -> ent_id_map[(ent, etype)]

                        # @ent is instance of RRS****** - 1:N relationship
                        # the object contains id of this entity
                        if issubclass(etype, _RRSDatabaseEntity):
                            o = ent_id_map[(ent, etype)][0]
                            try:
                                q.where("%s.%s_id=" % (tg_tbl, ent), o['id'])
                            except FluentSQLQueryError:
                                q.and_("%s.%s_id=" % (tg_tbl, ent), o['id'])

                        # @ent is fake junction table - it means, that it's
                        # the second side of 1:N relationship - N:1.
                        elif issubclass(etype, _RRSDbEntityRelationship) and etype._fake_table:
                            # TODO
                            return False

                        # @ent is true junction table - this M:N relationship.
                        elif issubclass(etype, _RRSDbEntityRelationship) and not etype._fake_table:
                            j_tbl_uniq_as = None
                            # storage of all aliases of junction tables
                            j_tbl_uniq_as_list = []

                            # join together all the found entities - for example:
                            # given publication, two persons (authors), both found
                            # in db so create query which selects ID of publication
                            # which has both - the first AND the second person.
                            for o in ent_id_map[(ent, etype)]:
                                j_tbl_uniq_as = "%s%s" % (etype._table_name, tablecounter)
                                e_tbl_uniq_as = "%s%s" % (o._table_name, tablecounter)
                                # NOTE(review): this append was missing; the
                                # list is consumed by the pairwise loop below.
                                j_tbl_uniq_as_list.append(j_tbl_uniq_as)
                                # add table to the list of tables we are joining together
                                from_lst.append("%s AS %s" % (etype._table_name, j_tbl_uniq_as))
                                from_lst.append("%s AS %s" % (o._table_name, e_tbl_uniq_as))
                                try:
                                    q.where("%s.id=" % e_tbl_uniq_as, o['id'])
                                except FluentSQLQueryError:
                                    q.and_("%s.id=" % e_tbl_uniq_as, o['id'])
                                q.and_("%s.%s_id=" % (j_tbl_uniq_as, o._table_name), "%s.id" % e_tbl_uniq_as, True)
                                tablecounter += 1

                            # add the condition that all the junction table ID's of
                            # the entity we are looking for has to be the same - we
                            # are looking not for union, but intersection of them
                            for j1, j2 in zip(j_tbl_uniq_as_list, j_tbl_uniq_as_list[1:]):
                                q.and_("%s.%s_id=" % (j1, tg_tbl), "%s.%s_id" % (j2, tg_tbl), True)
                            # bind junction table.entity_id to id of entity we are looking for
                            q.and_("%s.%s_id=" % (j_tbl_uniq_as, tg_tbl), "%s.id" % tg_tbl, True)

                        # @ent is attribute (int, basestring etc.)
                        else:
                            attr = ent_id_map[(ent, etype)][0]
                            try:
                                q.where("%s.%s=" % (tg_tbl, ent), attr)
                            except FluentSQLQueryError:
                                q.and_("%s.%s=" % (tg_tbl, ent), attr)

                    # NOTE(review): from_lst was built but never handed to the
                    # query in the excerpt; passing the list to from_table()
                    # is an assumption -- confirm.
                    q.from_table(from_lst)
                    search_sql_query = q._sql
                    # now if the total count of probably identical files is higher
                    # than 100, we need to specify it more, so we jump to next
                    # request count level (probably 1->2 or 2->3).
                    if q.count() > 100:
                        next_reqcount_lvl = True
                        # NOTE(review): "continue" assumed from the comment
                        # above; the flag then breaks out of this loop.
                        continue
                    res = q.fetch_all()
                    if not res:
                        continue
                    elif len(res) == 1:
                        obj['id'] = res[0][0]
                        if self.logger is not None:
                            self.logger.info("Found exactly one result for lookup: %s, params: %s, found ID: %s, SQL: %s" % \
                                            (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                        return True
                    else:
                        # several candidates: compare the attributes of all
                        # returned rows with the object and choose the most
                        # similar one
                        id_list = [x[0] for x in res]
                        attrunion = set(["id"])
                        # make a list of attributes needed to accomplish the rules
                        # (these are all which are present in rules)
                        for zero_rule in lvl_zero_rules:
                            attrunion = attrunion.union(set(zero_rule.entities))
                        # construct query which loads all needed attributes of all returned ID's
                        # NOTE(review): the select/from call was missing;
                        # reconstructed from attrunion -- confirm.
                        q.select(tuple(attrunion)).from_table(tg_tbl)
                        for _id in id_list:
                            try:
                                q.where("id=", _id)
                            except FluentSQLQueryError:
                                q.or_("id=", _id)
                        q() # perform the query
                        loaded_data = q.fetch_all()

                        # cache of SequenceMatcher ratios keyed by (row id, attribute)
                        similarity = {}
                        # every rule tell us what attributes have to be similar
                        # (or identical)
                        for zero_rule in lvl_zero_rules:
                            attrs = [item for item in zero_rule.entities if item in obj]
                            if zero_rule.reqcount > len(attrs):
                                continue

                            # count every row's similarity (the result is sum of
                            # similarities of their attributes)
                            sim_lst = {}
                            for d in loaded_data:
                                row_similarity = 0.0
                                for attr in attrs:
                                    if attr not in d or d[attr] is None or attr not in obj:
                                        continue
                                    if (d['id'], attr) not in similarity:
                                        s = SequenceMatcher(None, d[attr], obj[attr])
                                        similarity[(d['id'], attr)] = s.ratio()
                                    row_similarity += similarity[(d['id'], attr)]
                                sim_lst[row_similarity] = d['id']
                            if not sim_lst:
                                # nothing was loaded; max() of an empty
                                # sequence would raise ValueError
                                continue
                            # get the most similar row to the object
                            obj['id'] = sim_lst[max(sim_lst.keys())]
                            if self.logger is not None:
                                self.logger.info("Found more than one result for lookup: %s, params: %s, "\
                                                 "Choosen ID: %s, SQL: %s" % (obj._table_name, str(entity_comb), obj['id'], search_sql_query))
                            return True
        # no rule produced a match
        return False
# Example #3
    def _bind_entity_to_name(self, namedentity, source_module):
        """
        This method creates connection between entity and its name, which is stored
        in other database table. These tables are:
         - person vs person_name
         - organization vs organization_name
         - event vs event_name

        NOTE(review): this method was recovered from an excerpt in which
        several lines (try/else/continue keywords and some statements) were
        missing. Every statement that had to be reconstructed is marked with
        NOTE(review) below and should be checked against the upstream source.

        @param namedentity: entity (person, event or organization) carrying an 'id'
        @param source_module: passed to self._rrsdb.insert() for new name rows
        @returns ID of the row in the name-table for events and organizations,
                 False if the entity has no title or the name object is empty.
                 NOTE(review): the person branch has no visible return
                 statement, so it returns None.
        @raise TypeError: if namedentity is not an _RRSDatabaseEntity
        @raise DatabaseError: if namedentity has no 'id'
        @raise RRSDatabaseEntityError: if namedentity is not a named entity
        """
        ACRONYM = 'acronym' # to be easily changed to abbreviation or whatever needed..
        TITLE = 'title' # will be name? or what?
        if not isinstance(namedentity, _RRSDatabaseEntity):
            raise TypeError("Named object has to be instance of subclass of _RRSDatabaseEntity")
        if 'id' not in namedentity:
            raise DatabaseError("Named object has to contain ID!")
        q = FluentSQLQuery()
        if namedentity._table_name == "person":
            # act like person and handle person_name
            # this is slightly different because there is N:N relationship

            # create new person name object
            pname = RRSPerson_name()
            for attr in ('first_name', 'middle_name', 'last_name', 'full_name'):
                if attr in namedentity:
                    pname[attr] = namedentity[attr]
            # create relationship object
            rel_obj = RRSRelationshipPersonPerson_name()
            # NOTE(review): nothing in the excerpt attached pname to the
            # relationship; set_entity() is an assumption -- confirm.
            rel_obj.set_entity(pname)
            namedentity['person_name'] = rel_obj
            # look for this name in database
            if self.lookup(pname):
                # it is in db already, just check if rel exists
                # NOTE(review): no select()/from_table() call is visible for
                # this query (the warning below names j__person__person_name).
                q.where("person_id=", namedentity['id']).and_("person_name_id=", pname['id'])
                if not q.count():
                    # if the relationship doesn't exist, create new one
                    self._rrsdb.relationship("person_name", rel_obj)
                elif q.count() > 1:
                    self.logger.warning("There are more than one relationship "\
                                        "entries in table 'j__person__person_name'"\
                                        " between person.id=%s and person_name.id=%s" \
                                        % (namedentity['id'], pname['id']))
            else:
                # insert new person_name and create the relationship
                # NOTE(review): uses self.module although source_module is
                # passed in -- confirm which one is intended.
                self._rrsdb.insert(pname, self.module)
                self._rrsdb.relationship("person_name", rel_obj)

            # get the reference out of which is this name extracted and assign
            # the person name to the reference (j__person_name__reference)
            try:
                refe = namedentity['publication'][0].get_entities()[0]['reference_reference'][0].get_entities()[0]
            except (KeyError, TypeError, IndexError):
                # NOTE(review): handler body was missing; assumed that a
                # person without a reachable reference is simply skipped.
                pass
            else:
                pname_ref_rel = RRSRelationshipPerson_nameReference()
                # NOTE(review): refe was otherwise unused; set_entity() is an
                # assumption -- confirm.
                pname_ref_rel.set_entity(refe)
                pname['reference'] = pname_ref_rel
                try:
                    self._rrsdb.relationship('reference', pname_ref_rel)
                except DatabaseError:
                    # could not be stored now; hand it to the wait queue
                    self._queue.wait(WQEntry(self._rrsdb.relationship, ('reference', pname_ref_rel)))

        elif namedentity._table_name in ("event", "organization"):
            if TITLE not in namedentity:
                # this violates constraint... raise exception?? Or return false?
                return False
            name_tbl = "%s_name" % namedentity._table_name
            # if there in the database is no title like this, insert it
            q.select(("id", "%s_id" % namedentity._table_name, ACRONYM, TITLE)).from_table(name_tbl)
            q.where("%s=" % TITLE, namedentity[TITLE])
            if ACRONYM in namedentity:
                q.or_("%s=" % ACRONYM, namedentity[ACRONYM])
            if q.count():
                # check the parent id if it matches
                for row in q.fetch_all():
                    if namedentity['id'] != row[1]:
                        continue
                    # if it matched on acronym, check the titles if they are the same
                    if row[TITLE] != namedentity[TITLE]:
                        # if not, check the rest and maybe add new row into table
                        continue
                    # add the missing acronym if needed
                    if (row[ACRONYM] is None and ACRONYM in namedentity
                            and namedentity[ACRONYM] is not None):
                        # update the row
                        # NOTE(review): no call executing this update is
                        # visible in the excerpt -- confirm.
                        q.update(name_tbl, {ACRONYM: namedentity[ACRONYM]})
                        q.where("id=", row['id'])
                    return row['id']
                # if nothing matched, insert new name
            name_obj = self._table_to_class_map[name_tbl]()
            for attr in (TITLE, ACRONYM):
                if attr in namedentity:
                    name_obj[attr] = namedentity[attr]
            if name_obj.empty():
                return False
            name_obj[namedentity._table_name] = namedentity
            self._rrsdb.insert(name_obj, source_module)
            return name_obj['id']
        else:
            raise RRSDatabaseEntityError("%s is not a named entity." % type(namedentity))
# Example #6
class ResearchrPublicationFeeder:
	"""
	Load one publication from the researchr API, convert it to an
	RRSPublication and hand it to the importer.

	NOTE(review): this class was recovered from an excerpt in which lines
	were missing (docstring delimiters, try/else/continue keywords and
	some calls). Every statement that had to be reconstructed is marked
	with NOTE(review) and should be checked against the upstream source.
	"""

	def __init__(self, config, importer_kwargs):
		"""
		@param config: not used by the code visible here.
		@param importer_kwargs: settings passed to RRSXMLImporter.
		"""
		# data obtained from the API
		self.rPublication = None
		# RRSPublication object that is filled in and then imported into the db
		self.publication = None

		# settings for the importer
		self.importer_kwargs = importer_kwargs

		# bounds of the random pause made after each API call
		self.LimitMin = 0.1
		self.LimitMax = 0.2

		# object for building SQL queries
		self.q = FluentSQLQuery()

		# researchr API
		self.researchrClass = ResearchrClass()

		# top layer, used to load objects by id
		self.rrsdb = RRSDatabase()

		self.norm = Normalize()
		self.importer = RRSXMLImporter(self.importer_kwargs)

	@staticmethod
	def __HasValue(value):
		"""Return True when value is neither None nor an empty string."""
		return value is not None and value != ""

	@staticmethod
	def __StripDoiPrefix(doi):
		"""
		Return the part of doi that follows "http://dx.doi.org/".

		str.strip() must not be used for this: it removes any of the given
		characters from both ends and so eats into the DOI itself.

		@type  doi: string
		@param doi: DOI as delivered by the API.
		@rtype:   string
		@return:  Bare DOI, or None when the resolver prefix is not present.
		"""
		prefix = "http://dx.doi.org/"
		if prefix not in doi:
			return None
		return doi.split(prefix, 1)[1].strip()

	def __ImportModel(self, model):
		"""
		Store a freshly built RRS object through the shared importer.

		NOTE(review): the call that persisted new objects was missing from
		the excerpt (only the commented-out "importer = RRSXMLImporter(...)"
		line survived). The method name import_model is an assumption; it
		is kept in this one place so that it is easy to correct.

		@param model: RRS object to be stored.
		"""
		self.importer.import_model(model)

	def __FillType(self):
		"""
		Transform rPublication.type to publication.type
		"""
		_id = self.__GetId("publication_type", "type=", self.rPublication.publication_type)
		if _id is not None:
			self.publication["type"] = self.rrsdb.load("publication_type", _id)

	def __FillSeries(self):
		"""
		Add rPublication.series to publication_series table
		"""
		series_title = self.rPublication.series
		if not self.__HasValue(series_title):
			return
		_id = None
		# look the series up; create it first when it is not stored yet.
		# The loop ends only once the lookup succeeds.
		while _id is None:
			_id = self.__GetId("publication_series", "title=", series_title)
			if _id is None:
				series = RRSPublication_series(title=series_title)
				self.__ImportModel(series)
		self.publication["series"] = self.rrsdb.load("publication_series", _id)

	def __GetId(self, _from, where, _is):
		"""
		Try to find ID in table and return it

		@type  _from: string
		@param _from: Name of table.
		@type  where: string
		@param where: Name of column followed by the operator, e.g. "title=".
		@type  _is: string
		@param _is: What it is equal.
		@rtype:   int
		@return:  Id of selected entry, None when nothing was fetched.
		"""
		# NOTE(review): the select/from call was missing from the excerpt,
		# which left _from unused; reconstructed from the parameter docs.
		# Whether the shared query object must be reset between calls is
		# not visible here -- confirm.
		self.q.select("id").from_table(_from)
		self.q.where(where, _is)
		data = self.q.fetch_one()
		if data is not None:
			return data[0]
		return None

	def __FillPublisher(self):
		"""
		Add rPublication.publisher to organization table
		"""
		publisher = self.rPublication.publisher
		if not self.__HasValue(publisher):
			return
		_id = None
		normalized_title = self.norm.organization(publisher)
		while _id is None:
			_id = self.__GetId("organization", "title_normalized=", normalized_title)
			if _id is None:
				# NOTE(review): the constructor call was cut off after
				# "title=...,"; title_normalized is assumed because the
				# lookup above uses that column.
				organization = RRSOrganization(title=publisher,
					title_normalized=normalized_title)
				self.__ImportModel(organization)
		self.publication["publisher"] = self.rrsdb.load("organization", _id)

	def __FillAuthors(self, authorData, isEditor):
		"""
		Add persons to db (if they are not there yet) and connect them
		with the actual publication. For each entry only the person's
		url and fullname are taken.

		@type  authorData: list
		@param authorData: List of authors data (person, alias)
		@type  isEditor: bool
		@param isEditor: True if authors are editors of this publication.
		"""
		if not authorData:
			return
		for rank, author in enumerate(authorData, 1):
			# NOTE(review): the condition tests the key 'author' although
			# the branch reads author["person"] -- confirm the API schema.
			if 'author' in author:
				rFullname = author["person"]["fullname"]
				rUrl = author["person"]["url"]
			else:
				rFullname = author["alias"]["name"]
				rUrl = author["alias"]["url"]
			personUrl = RRSRelationshipPersonUrl()
			self.__FillUrl(personUrl, rUrl)
			self.__FillPerson(personUrl, rFullname, rank, isEditor)

	def __FillUrl(self, personUrl, rUrl):
		"""
		This function adds url to db and binds url to person

		@type  personUrl: RRSRelationshipPersonUrl
		@param personUrl: Relationship object to add url into it.
		@type  rUrl: string
		@param rUrl: rPublication.(person/alias) url, url of author/editor.
		"""
		_id = None
		while _id is None:
			_id = self.__GetId("url", "link=", rUrl)
			if _id is None:
				url = RRSUrl(link=rUrl)
				url["type"] = self.rrsdb.load("url_type", "1")
				self.__ImportModel(url)
		url = self.rrsdb.load("url", _id)
		# NOTE(review): the loaded url was not used in the excerpt; binding
		# it with set_entity() is an assumption -- confirm.
		personUrl.set_entity(url)

	def __FillPerson(self, personUrl, rFullname, rank, isEditor):
		"""
		This function adds the person to db (if missing) and binds the
		person to the actual publication.

		@type  personUrl: RRSRelationshipPersonUrl
		@param personUrl: Relationship object to bind to person["url"].
		@type  rFullname: string
		@param rFullname: Fullname of author.
		@type  rank: int
		@param rank: Rank of author, first author get 1, second 2 and so on.
		@type  isEditor: bool
		@param isEditor: True if person is editor of this publication.
		"""
		_id = None
		while _id is None:
			_id = self.__GetId("person", "full_name=", rFullname)
			if _id is None:
				person = RRSPerson()
				person["full_name"] = rFullname
				person["url"] = personUrl
				self.__SetPersonNames(person, rFullname)
				person["full_name_ascii"] = unicodedata.normalize('NFKD', rFullname).encode('ascii', 'ignore')
				self.__ImportModel(person)
		publicationPerson = RRSRelationshipPersonPublication(author_rank=rank, editor=isEditor)
		publicationPerson.set_entity(self.rrsdb.load("person", _id))
		self.publication['person'] = publicationPerson

	def __SetPersonNames(self, person, rFullname):
		"""
		This function tries to fill first name, middle name, last name of person.
		Only names made of exactly two or three words are split; any other
		name leaves person untouched.

		@type  person: RRSPerson
		@param person: Object of author of publication.
		@type  rFullname: string
		@param rFullname: Fullname of author.
		"""
		splitName = rFullname.split()
		if len(splitName) == 3:
			person["first_name"] = splitName[0]
			person["middle_name"] = splitName[1]
			person["last_name"] = splitName[2]
		elif len(splitName) == 2:
			person["first_name"] = splitName[0]
			person["last_name"] = splitName[1]

	def FillPublication(self, key):
		"""
		This function calls all private functions with prefix Fill,
		this function loads data to rPublication structure and then
		assigns data from rPublication to publication(RRSPublication).
		Errors raised while importing are printed and logged, not re-raised.

		@type  key: string
		@param key: Key of the publication.
		"""
		# NOTE(review): this call was missing from the excerpt; without it
		# rPublication is still None and key is unused.
		self.__FillRPublication(key)
		rPublication = self.rPublication
		self.publication = RRSPublication()
		self.__FillAuthors(rPublication.authors, False)
		self.__FillAuthors(rPublication.editors, True)
		self.publication["title"] = rPublication.title
		self.publication["title_normalized"] = self.norm.publication(rPublication.title)

		if self.__HasValue(rPublication.year):
			self.publication["year"] = int(rPublication.year) # "2000" -> 2000

		if self.__HasValue(rPublication.month):
			# only the first three letters are parsed as a month abbreviation
			self.publication["month"] = int(strptime(rPublication.month[:3],'%b').tm_mon)

		if self.__HasValue(rPublication.volume) and rPublication.volume.isdigit():
			self.publication["volume"] = int(rPublication.volume)

		# the digit test has to look at number itself, not at volume
		if self.__HasValue(rPublication.number) and rPublication.number.isdigit():
			self.publication["number"] = int(rPublication.number)

		if self.__HasValue(rPublication.abstract):
			self.publication["abstract"] = rPublication.abstract

		if rPublication.doi is not None:
			doi = self.__StripDoiPrefix(rPublication.doi)
			if doi is not None:
				self.publication["doi"] = doi

		if self.__HasValue(rPublication.firstpage) and self.__HasValue(rPublication.lastpage):
			self.publication["pages"] = str(rPublication.firstpage) + " - " + str(rPublication.lastpage)

		# NOTE(review): no call to these three helpers survived in the
		# excerpt; their position here is an assumption -- confirm.
		self.__FillType()
		self.__FillSeries()
		self.__FillPublisher()

		self.publication["language"] = self.rrsdb.load('language', 1)
		self.publication.set("researchr_key", rPublication.key, strict=False)
		try:
			self.__ImportModel(self.publication)
		except RRSDatabaseEntityError as e:
			print('RRSDatabaseEntityError - %s, %s' % (rPublication.key, str(e)))
			logging.warning('RRSDatabaseEntityError - %s, %s' % (rPublication.key, str(e)))
		except DatabaseError as e:
			print('DatabaseError - %s, %s' % (rPublication.key, str(e)))
			logging.warning('DatabaseError - %s, %s' % (rPublication.key, str(e)))
		except TypeError as e:
			print('TypeError - %s, %s' % (rPublication.key, str(e)))
			logging.warning('TypeError - %s, %s' % (rPublication.key, str(e)))
		except Exception:
			# NOTE(review): the clause line was missing; Exception is used
			# so that KeyboardInterrupt/SystemExit still propagate.
			print('Unexpected error - %s, %s' % (rPublication.key, sys.exc_info()[0]))
			logging.warning('Unexpected error - %s, %s' % (rPublication.key, sys.exc_info()[0]))

	def __FillRPublication(self, key):
		"""
		Fill rPublication object.

		Fetches the publication from the researchr API, pauses for a random
		time between LimitMin and LimitMax, and copies every known field of
		the response to the attribute of the same name. The 'type' field is
		stored as publication_type; unknown fields are ignored.

		@type  key: string
		@param key: Key of publication.
		"""
		copied_fields = (
			'abstract', 'address', 'authors', 'booktitle', 'conference',
			'conferenceYear', 'doi', 'editors', 'firstpage', 'issuenumber',
			'journal', 'key', 'lastpage', 'month', 'note', 'number',
			'organization', 'publisher', 'series', 'title', 'url',
			'volume', 'volumenumber', 'year',
		)
		self.rPublication = RPublication()
		publicationData = self.researchrClass.getPublication(key)
		time.sleep(random.uniform(self.LimitMin, self.LimitMax))
		for field, value in publicationData.items():
			if field == 'type':
				self.rPublication.publication_type = value
			elif field in copied_fields:
				setattr(self.rPublication, field, value)