示例#1
0
	def run(self):
		try:
			data_run_out = False
			while not data_run_out:
				if self.idList is None or len(self.idList) == 0:
					walker = PersonWalkThroughOrderByPubcount(self.extractor.generation,
							processer=self.person_processer, fetch_size=100, fix_person_ext=True)
				else:
					walker = PersonWalkThroughByGivenIDList(self.extractor.generation,
							processer=self.person_processer, pids=self.idList, fix_person_ext=True);
				walker.walk()
				
				if self.personUpdater.isAllFinished(self.extractor.generation) or self.settings.byid:
					print "All data finished. Ended Provider."
					data_run_out = True
				print "All person walked, reload all"
				time.sleep(10)

			# reach here if all persons loaded
			self.extractor.waiting_to_finish = True
			print "$mgr/provider:> All person added to Queue, waiting for stop."
		except Exception, e:
			ExceptionHelper.print_exec(e)
			print '-' * 100
			print 'BIG Exception, and can\'t continue.'
			print '-' * 100
			sys.exit()
示例#2
0
	def walk(self):
		''' Walk through all persons in database. '''
		""" this place put while true inner """
		while True:
			try:
				lastId = -1
				#former while true here
	#			while True:
				conn = DB.pool().getConnection()
				cursor = conn.cursor()
				#print self.sql % (self.gen, start_id, self.limit)
				cursor.execute(self.sql, (self.update_generation, lastId, self.fetch_size))
				data = cursor.fetchall()
				print "&[Walker]> walk through na_person, %s items" % cursor.rowcount
				if cursor.rowcount == 0:
					break
				id = 0
				for id, names, pubcount in data:
					# fix 
					if self.fix_person_ext and pubcount is None:
						self.person_update_tool.insertPersonExt(id, self.update_generation, pubcount)

					lastId = id
					namelist = names.split(",")
					for name in namelist:
						name = name.strip()
					# Call callback
					self.processer(Person(id, namelist, pubcount))
				cursor.close()
				conn.close()

			except MySQLdb.Error, e: #@UndefinedVariable
				ExceptionHelper.print_exec(e)
#				raise
			return data
示例#3
0
	def walk(self):
		''' Walkthrough all persons in db. '''
		while True:
			try:
				lastId = -1
	#			while True:
				conn = DB.pool().getConnection()
				cursor = conn.cursor()
				#print self.sql % (self.gen, start_id, self.limit)
				cursor.execute(self.sql, (self.gen, lastId, self.fetch_size))
				data = cursor.fetchall()
				print "&-walker-:> walk person(citation) %s items" % cursor.rowcount
				if cursor.rowcount == 0:
					break
				id = 0
				for id, fullname, pubcount in data:
					lastId = id
					person = Person(id, fullname, pubcount)
					self.processer(person)
				cursor.close()
				conn.close()
				# sleep 10 minutes next loop
				time.sleep(20)

			except MySQLdb.Error, e: #@UndefinedVariable
				ExceptionHelper.print_exec(e)
#				raise
			return data
示例#4
0
    def walk(self):
        ''' Walkthrough all persons in db. '''
        while True:
            try:
                lastId = -1
                #			while True:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                #print self.sql % (self.gen, start_id, self.limit)
                cursor.execute(self.sql, (self.gen, lastId, self.fetch_size))
                data = cursor.fetchall()
                print "&-walker-:> walk person(citation) %s items" % cursor.rowcount
                if cursor.rowcount == 0:
                    break
                id = 0
                for id, fullname, pubcount in data:
                    lastId = id
                    person = Person(id, fullname, pubcount)
                    self.processer(person)
                cursor.close()
                conn.close()
                # sleep 10 minutes next loop
                time.sleep(20)

            except MySQLdb.Error, e:  #@UndefinedVariable
                ExceptionHelper.print_exec(e)
#				raise
            return data
示例#5
0
	def execute(self, sql, *args):
		"""Run one SQL statement and return the cursor's ``rowcount``.

		``args`` are handed to the cursor as the statement parameters.  The
		cursor and connection are closed whether or not the statement succeeds.
		Any exception is reported through ``ExceptionHelper.print_exec`` and
		``None`` is returned in that case.
		"""
		try:
			conn = DB.pool().getConnection()
			try:
				cursor = conn.cursor()
				try:
					cursor.execute(sql, args)
					# Read rowcount while the cursor is still open.
					return cursor.rowcount
				finally:
					cursor.close()
			finally:
				conn.close()
		except Exception as e:
			ExceptionHelper.print_exec(e)
示例#6
0
 def getPersonTotalCount(self):
     """Return the number of rows in ``na_person``.

     The query is retried until it succeeds: each ``MySQLdb.Error`` is
     reported through ``ExceptionHelper.print_exec`` and the loop starts over.
     The cursor and connection are closed on every attempt, so a failed
     attempt no longer leaves them open.
     """
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute("select count(*) from na_person")
                     row = cursor.fetchone()
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return row[0]
         except MySQLdb.Error as e:  # @UndefinedVariable
             ExceptionHelper.print_exec(e)
示例#7
0
 def getTotalCount(self):
     """Return the number of rows in ``publication``.

     The query is retried until it succeeds: each ``MySQLdb.Error`` is
     reported through ``ExceptionHelper.print_exec`` and the loop starts over.
     The cursor and connection are closed on every attempt, so a failed
     attempt no longer leaves them open.
     """
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute("select count(*) from publication")
                     row = cursor.fetchone()
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return row[0]
         except MySQLdb.Error as e:
             ExceptionHelper.print_exec(e)
示例#8
0
	def insertPersonExt(self, person_id, update_generation=0, pubcount=0):
		"""Insert a ``person_update_ext`` row for ``person_id``; return ``rowcount``.

		The row is written with ``u_citation_gen = update_generation`` and a
		``pubcount`` of 0.

		NOTE(review): the ``pubcount`` argument is accepted but never used -- 0
		is always stored.  Left unchanged; confirm whether that is intended.

		The insert is retried until it succeeds: each ``MySQLdb.Error`` is
		reported through ``ExceptionHelper.print_exec`` and the loop starts
		over.  The cursor and connection are closed on every attempt.
		"""
		while True:
			try:
				conn = DB.pool().getConnection()
				try:
					cursor = conn.cursor()
					try:
						cursor.execute("insert into person_update_ext(id,u_citation_gen,pubcount) values(%s,%s,%s)",
									 (person_id, update_generation, 0))
						# Read rowcount while the cursor is still open.
						rowcount = cursor.rowcount
					finally:
						cursor.close()
				finally:
					conn.close()
				return rowcount
			except MySQLdb.Error as e: #@UndefinedVariable
				ExceptionHelper.print_exec(e)
示例#9
0
 def getMinGenerationInDB(self):
     """Return ``min(u_citation_gen)`` over the ``publication`` table.

     The value is the first column of the single result row, exactly as the
     driver returns it.  The query is retried until it succeeds: each
     ``MySQLdb.Error`` is reported through ``ExceptionHelper.print_exec`` and
     the loop starts over.  The cursor and connection are closed on every
     attempt, so a failed attempt no longer leaves them open.
     """
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute("select min(u_citation_gen) from publication")
                     row = cursor.fetchone()
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return row[0]
         except MySQLdb.Error as e:
             ExceptionHelper.print_exec(e)
示例#10
0
 def getTotalCount(self):
     """Return the number of rows in ``publication``.

     Retries forever on ``MySQLdb.Error`` (each error is reported through
     ``ExceptionHelper.print_exec``).  Cursor and connection are released in
     ``finally`` blocks, so failed attempts do not keep them open.
     """
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute("select count(*) from publication")
                     row = cursor.fetchone()
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return row[0]
         except MySQLdb.Error as e:
             ExceptionHelper.print_exec(e)
示例#11
0
 def getMinGenerationInDB(self):
     """Return ``min(u_citation_gen)`` over the ``publication`` table.

     Retries forever on ``MySQLdb.Error`` (each error is reported through
     ``ExceptionHelper.print_exec``).  Cursor and connection are released in
     ``finally`` blocks, so failed attempts do not keep them open.
     """
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute("select min(u_citation_gen) from publication")
                     row = cursor.fetchone()
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return row[0]
         except MySQLdb.Error as e:
             ExceptionHelper.print_exec(e)
示例#12
0
 def getPersonTotalCount(self):
     """Return the number of rows in ``na_person``.

     Retries forever on ``MySQLdb.Error`` (each error is reported through
     ``ExceptionHelper.print_exec``).  Cursor and connection are released in
     ``finally`` blocks, so failed attempts do not keep them open.
     """
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute("select count(*) from na_person")
                     row = cursor.fetchone()
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return row[0]
         except MySQLdb.Error as e:  #@UndefinedVariable
             ExceptionHelper.print_exec(e)
	def setGoogleValue(self, id, google_value, links=''):
		"""Store the google value and result links for one ``GoogleResult_mark`` row.

		``links`` is joined with newlines into the ``url`` column; ``None`` is
		stored as an empty string.  The cursor and connection are closed when
		the update is done (they were previously left open).  Any exception
		raised by the database work is reported through
		``ExceptionHelper.print_exec`` and swallowed.
		"""
		sql = 'update GoogleResult_mark set google = %s, url = %s where id = %s limit 1'
		if links is None:
			linkstr = ''
		else:
			linkstr = '\n'.join(links)

		try:
			conn = DB.pool().getConnection()
			try:
				cursor = conn.cursor()
				try:
					cursor.execute(sql, (google_value, linkstr, id,))
				finally:
					cursor.close()
			finally:
				conn.close()
		except Exception as e:
			ExceptionHelper.print_exec(e)
示例#14
0
 def resetPersonPublicationUpdateGen(self, personId):
     """Set ``u_citation_gen`` to 0 on every publication of one person.

     Publications are matched through ``na_author2pub`` (``a2p.aid`` equal to
     ``personId``).  Returns the cursor's ``rowcount`` for the update.

     The update is retried until it succeeds: each ``MySQLdb.Error`` is
     reported through ``ExceptionHelper.print_exec`` and the loop starts over.
     The cursor and connection are closed on every attempt.
     """
     sql = '''update publication p left join na_author2pub a2p on p.id = a2p.pid set p.u_citation_gen=0 where a2p.aid=%s '''
     # ``(personId)`` is not a tuple.  DB-API parameters must be a sequence,
     # so wrap a bare value; an already-wrapped sequence passes through.
     if isinstance(personId, (tuple, list)):
         params = personId
     else:
         params = (personId,)
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute(sql, params)
                     # Read rowcount while the cursor is still open.
                     rowcount = cursor.rowcount
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return rowcount
         except MySQLdb.Error as e:  #@UndefinedVariable
             ExceptionHelper.print_exec(e)
示例#15
0
 def resetPersonPublicationUpdateGen(self, personId):
     """Set ``u_citation_gen`` to 0 on every publication of one person.

     Publications are matched through ``na_author2pub`` (``a2p.aid`` equal to
     ``personId``).  Returns the cursor's ``rowcount`` for the update.

     Retries forever on ``MySQLdb.Error`` (each error is reported through
     ``ExceptionHelper.print_exec``); cursor and connection are released in
     ``finally`` blocks on every attempt.
     """
     sql = """update publication p left join na_author2pub a2p on p.id = a2p.pid set p.u_citation_gen=0 where a2p.aid=%s """
     # ``(personId)`` is not a tuple.  DB-API parameters must be a sequence,
     # so wrap a bare value; an already-wrapped sequence passes through.
     if isinstance(personId, (tuple, list)):
         params = personId
     else:
         params = (personId,)
     while True:
         try:
             conn = DB.pool().getConnection()
             try:
                 cursor = conn.cursor()
                 try:
                     cursor.execute(sql, params)
                     # Read rowcount while the cursor is still open.
                     rowcount = cursor.rowcount
                 finally:
                     cursor.close()
             finally:
                 conn.close()
             return rowcount
         except MySQLdb.Error as e:  # @UndefinedVariable
             ExceptionHelper.print_exec(e)
示例#16
0
    def getLeftCount(self, generation):
        while True:
            try:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                cursor.execute(
                    "select count(*) from publication p \
					where p.u_citation_gen is null or p.u_citation_gen < %s", generation)
                row = cursor.fetchone()
                cursor.close()
                conn.close()
                return row[0]
            except MySQLdb.Error, e:
                ExceptionHelper.print_exec(e)
示例#17
0
 def markPersonUpdateCitationFinished(self, personId, gen):
     """Record generation ``gen`` as done for one person in ``person_update_ext``.

     Returns the cursor's ``rowcount`` for the update.  On any exception the
     error is reported through ``ExceptionHelper.print_exec`` and the value
     ``execute()`` produced is returned instead (``None`` when the statement
     never completed).  There is a single attempt, no retry.  The cursor and
     connection are closed whether or not the update succeeds.
     """
     data = None
     try:
         conn = DB.pool().getConnection()
         try:
             cursor = conn.cursor()
             try:
                 data = cursor.execute("update person_update_ext set u_citation_gen=%s where id=%s;", (gen, personId))
                 # Read rowcount while the cursor is still open.
                 rowcount = cursor.rowcount
             finally:
                 cursor.close()
         finally:
             conn.close()
         return rowcount
     except Exception as e:
         ExceptionHelper.print_exec(e)
     return data
示例#18
0
 def savePerson(self, personId, personName):
     """Insert one ``(id, fullname)`` row into ``person``.

     Returns the cursor's ``rowcount`` for the insert.  On ``MySQLdb.Error``
     the error is reported through ``ExceptionHelper.print_exec`` and the
     value ``execute()`` produced is returned instead -- ``None`` when the
     statement never completed (this used to raise ``UnboundLocalError``).
     There is a single attempt, no retry.  The cursor and connection are
     closed whether or not the insert succeeds.
     """
     data = None
     try:
         conn = DB.pool().getConnection()
         try:
             cursor = conn.cursor()
             try:
                 data = cursor.execute("insert into person (id, fullname) values (%s,%s)", (personId, personName))
                 # Read rowcount while the cursor is still open.
                 rowcount = cursor.rowcount
             finally:
                 cursor.close()
         finally:
             conn.close()
         return rowcount
     except MySQLdb.Error as e:
         ExceptionHelper.print_exec(e)
     return data
示例#19
0
    def getPersonLeftCount(self, generation):
        while True:
            try:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                cursor.execute(
                    "select count(*) from na_person p left join person_update_ext pe on p.id=pe.id \
				 where pe.u_citation_gen is null or pe.u_citation_gen < %s", generation)
                row = cursor.fetchone()
                cursor.close()
                conn.close()
                return row[0]
            except MySQLdb.Error, e:
                ExceptionHelper.print_exec(e)
	def popTitlesFromDB(self):
		"""Take the next unprocessed title pair from ``GoogleResult_mark``.

		Selects the lowest-id row whose ``google`` and ``updatetime`` are both
		NULL, sets its ``updatetime`` to ``CURRENT_TIMESTAMP`` and returns
		``(id, titleA, titleB)``.  Returns ``None`` when no such row exists or
		when an exception occurred (reported through
		``ExceptionHelper.print_exec``).  The cursor and connection are closed
		on every path; they were previously left open.
		"""
		sql = 'select id,titleA,titleB from GoogleResult_mark where google is null and updatetime is null  order by id limit 1'
		try:
			conn = DB.pool().getConnection()
			try:
				cursor = conn.cursor()
				try:
					cursor.execute(sql)
					data = cursor.fetchall()

					if data and len(data) == 1:
						(row_id, title1, title2) = data[0]
						# Stamp the row so the next select skips it.
						updateSQL = 'update GoogleResult_mark set updatetime = CURRENT_TIMESTAMP where id = %s'
						cursor.execute(updateSQL, (row_id,))
						return row_id, title1, title2
				finally:
					cursor.close()
			finally:
				conn.close()
		except Exception as e:
			ExceptionHelper.print_exec(e)
示例#21
0
    def save(self, person):
        """Insert ``(person.id, person.name)`` as one row into ``person``.

        Returns the cursor's ``rowcount`` for the insert.  On ``MySQLdb.Error``
        the error is reported through ``ExceptionHelper.print_exec`` and the
        value ``execute()`` produced is returned instead -- ``None`` when the
        statement never completed (this used to raise ``UnboundLocalError``).
        There is a single attempt, no retry.  The cursor and connection are
        closed whether or not the insert succeeds.
        """
        data = None
        try:
            conn = DB.pool().getConnection()
            try:
                cursor = conn.cursor()
                try:
                    data = cursor.execute(
                        "insert into person (id, fullname) values (%s,%s)",
                        (person.id, person.name))
                    # Read rowcount while the cursor is still open.
                    rowcount = cursor.rowcount
                finally:
                    cursor.close()
            finally:
                conn.close()
            return rowcount
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
        return data
示例#22
0
    def markPersonUpdateCitationFinished(self, personId, gen):
        """Record generation ``gen`` as done for one person in ``person_update_ext``.

        Returns the cursor's ``rowcount`` for the update.  On any exception
        the error is reported through ``ExceptionHelper.print_exec`` and the
        value ``execute()`` produced is returned instead (``None`` when the
        statement never completed).  There is a single attempt, no retry.  The
        cursor and connection are closed whether or not the update succeeds.
        """
        data = None
        try:
            conn = DB.pool().getConnection()
            try:
                cursor = conn.cursor()
                try:
                    data = cursor.execute(
                        "update person_update_ext set u_citation_gen=%s where id=%s;",
                        (gen, personId))
                    # Read rowcount while the cursor is still open.
                    rowcount = cursor.rowcount
                finally:
                    cursor.close()
            finally:
                conn.close()
            return rowcount
        except Exception as e:
            ExceptionHelper.print_exec(e)
        return data
示例#23
0
    def getLeftCount(self, generation):
        while True:
            try:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                cursor.execute(
                    "select count(*) from publication p \
					where p.u_citation_gen is null or p.u_citation_gen < %s",
                    generation,
                )
                row = cursor.fetchone()
                cursor.close()
                conn.close()
                return row[0]
            except MySQLdb.Error, e:
                ExceptionHelper.print_exec(e)
示例#24
0
    def getPersonLeftCount(self, generation):
        while True:
            try:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                cursor.execute(
                    "select count(*) from na_person p left join person_update_ext pe on p.id=pe.id \
				 where pe.u_citation_gen is null or pe.u_citation_gen < %s",
                    generation,
                )
                row = cursor.fetchone()
                cursor.close()
                conn.close()
                return row[0]
            except MySQLdb.Error, e:
                ExceptionHelper.print_exec(e)
示例#25
0
 def save(self, aid, pid, position):
     """Insert one ``(aid, pid, position)`` row into ``author2pub``.

     Returns the cursor's ``rowcount`` for the insert.  On ``MySQLdb.Error``
     the error is reported through ``ExceptionHelper.print_exec`` and the
     value ``execute()`` produced is returned instead -- ``None`` when the
     statement never completed (this used to raise ``UnboundLocalError``).
     There is a single attempt, no retry.  The cursor and connection are
     closed whether or not the insert succeeds.
     """
     data = None
     try:
         conn = DB.pool().getConnection()
         try:
             cursor = conn.cursor()
             try:
                 data = cursor.execute(
                     "insert into author2pub (aid,pid,position) values(%s,%s,%s)", (aid, pid, position)
                 )
                 # Read rowcount while the cursor is still open.
                 rowcount = cursor.rowcount
             finally:
                 cursor.close()
         finally:
             conn.close()
         return rowcount
     except MySQLdb.Error as e:
         ExceptionHelper.print_exec(e)
     return data
示例#26
0
	def putToPubCache(self, person, pub):
		"""Put ``pub`` into the cache so that it is searched once more.

		Stores ``pub`` in ``self.pubmap`` under ``pub.id`` and appends that id
		to the list kept for ``person.id`` in ``self.person_pub_map``.  Nothing
		happens when ``person`` or ``pub`` is ``None`` or when the pub is
		already in ``self.pubmap``.  Exceptions are reported through
		``ExceptionHelper.print_exec`` and swallowed.
		"""
		if person is None or pub is None:
			return

		try:
			with self.pub_lock:
				# Test membership while holding the lock, so two concurrent
				# callers cannot both register the same pub.
				if pub.id in self.pubmap:
					return
				self.pubmap[pub.id] = pub
				self.person_pub_map.setdefault(person.id, []).append(pub.id)
		except Exception as e:
			ExceptionHelper.print_exec(e)
示例#27
0
	def walk(self):
		''' Get by pids. '''
		while True:
			try:
				print "&[Walker]> walk through na_person, BY_ID_LIST: %s items" % len(self.pids)
				page = 0
				count = 300
				data = []
				conn = DB.pool().getConnection()
				while True:
					cursor = conn.cursor()
					print "Getting People of Page %s" % page
					# sql
					temp = []
					for item in self.pids[page*count:(page+1)*count]:
						temp.append(str(item))
					if page*count > len(self.pids):
						break
					if len(temp) == 0:
						break
					inwhere = "".join(["(", ",".join(temp) , ")"])

					self.sql = '''select p.id, p.names, pe.id, pe.pubcount 
						from na_person p left join person_update_ext pe on p.id=pe.id \
						where (pe.u_citation_gen is null or pe.u_citation_gen < %s) and p.id in %s ''' % (self.update_generation, inwhere)
					cursor.execute(self.sql)
					data.extend(cursor.fetchall())
					page += 1
					if cursor.rowcount == 0:
						break
					time.sleep(1)
						
				for pid, names, peid, pubcount in data:
					# fix 
					if self.fix_person_ext and peid is None:
						self.person_update_tool.insertPersonExt(pid, self.update_generation, pubcount)
					namelist = names.split(",")
					for name in namelist:
						name = name.strip()
					# Call callback
					self.processer(Person(pid, namelist, pubcount))
				cursor.close()
				conn.close()

			except MySQLdb.Error, e: #@UndefinedVariable
				ExceptionHelper.print_exec(e)
			return data
示例#28
0
	def flushDBCache(self):
		"""Write the crawled pubs buffered in ``self.pub_db_cache`` to the database in one batch.

		As many entries as the cache held on entry are popped; the truthy
		values are passed to ``self.pub_dao.batchUpdate`` together with
		``self.gen``.  If the cache runs empty early, the pubs collected so far
		are still written instead of being dropped.  Does nothing for an empty
		cache.  Exceptions are reported through ``ExceptionHelper.print_exec``
		and swallowed.
		"""
		try:
			pending = len(self.pub_db_cache)
			if pending == 0:
				return
			print('start flush to db.')
			pubs = []
			for _ in range(pending):
				try:
					_key, pub = self.pub_db_cache.popitem()
				except KeyError:
					# Cache emptied before `pending` pops; flush what we have.
					break
				if pub:
					pubs.append(pub)
					print("%s updated to db." % pub)
			if pubs:
				self.pub_dao.batchUpdate(self.gen, pubs)
			print('end flush to db.')
			# @todo: flush db cache
		except Exception as e:
			ExceptionHelper.print_exec(e)
示例#29
0
    def runOriginal(self):
        while self.extractor.running and not self.ask_to_stop:
            self.mark()
            self.extractor.wait_for_pause()  # wait if paused
            #			url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs

            query, used_pubs = self.store.getFromPubQueue()  # get url and pubs
            if used_pubs is None or len(used_pubs) == 0:
                print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (
                    self.name, query, used_pubs)
                time.sleep(10)
                continue
            self.extractor.wait_for_pause()  # wait again

            with self.extractor.busy_semaphore_lock:
                self.extractor.busy_semaphore += 1
                self.extractor.busy_pub_semaphore += 1

            pubs_found = None
            pubs_notfound = None
            try:
                all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
                if all_models is not None:
                    (pubs_found,
                     pubs_notfound) = PubMatcher.getInstance().matchPub(
                         used_pubs, all_models)
                    if pubs_found is None or pubs_notfound is None:
                        print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
                        return
                    print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (
                        len(pubs_found), len(pubs_notfound), query)
                else:
                    pubs_notfound = used_pubs
            except Exception, e:
                ExceptionHelper.print_exec(e)
                print '-------------------------------------------------------'
                print 'query:', query
                print 'all_models', all_models
                print 'used_pubs', used_pubs
                print '-------------------------------------------------------'
                return
            finally:
示例#30
0
    def batchUpdate(self, gen, pubs):
        """Batch-update ``ncitation`` and ``u_citation_gen`` of many publications.

        One ``(pub.ncitation, gen, pub.id)`` parameter tuple is built per pub
        and sent with ``executemany``; autocommit is switched off for the batch,
        the batch is committed, and autocommit is switched back on.

        Returns the cursor's ``rowcount`` of the update, read before the final
        ``Set AUTOCOMMIT`` statement is executed.  On ``MySQLdb.Error`` the
        error is reported through ``ExceptionHelper.print_exec`` and the value
        ``executemany()`` produced is returned instead -- ``None`` when the
        update never ran (this used to raise ``UnboundLocalError``).  There is
        a single attempt, no retry.
        """
        params = [(pub.ncitation, gen, pub.id) for pub in pubs]
        data = None
        try:
            conn = DB.pool().getConnection()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute("Set AUTOCOMMIT = 0")
                    try:
                        data = cursor.executemany("update publication set ncitation=%s, u_citation_gen=%s where id=%s", params)
                        # Capture now: the next execute() replaces rowcount.
                        rowcount = cursor.rowcount
                        conn.commit()
                    finally:
                        # Restore autocommit even when the batch failed, so the
                        # connection is not handed back with it switched off.
                        cursor.execute("Set AUTOCOMMIT = 1")
                finally:
                    cursor.close()
            finally:
                conn.close()
            return rowcount
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
        return data
示例#31
0
 def get_author(self, aid, generation):
     print aid, generation
     self.sql = """select p.id, p.names, pe.pubcount 
         from na_person p left join person_update_ext pe on p.id=pe.id
         where (pe.u_citation_gen is null or pe.u_citation_gen < %s) and p.id=%s
         """ % (generation, aid)
     try:
         conn = DB.pool().getConnection()
         cursor = conn.cursor()
         cursor.execute(self.sql)
         data = cursor.fetchall()
         if cursor.rowcount == 0:
             return
         for aid, names, pubcount in data:
             namelist = [name.strip() for name in names.split(",")]
             print 'get author: ', namelist
             return Person(aid, namelist, pubcount)
         cursor.close()
         conn.close()
     except MySQLdb.Error, e:
         ExceptionHelper.print_exec(e)
示例#32
0
 def get_author(self, aid, generation):
     print aid, generation
     self.sql = """select p.id, p.names, pe.pubcount 
         from na_person p left join person_update_ext pe on p.id=pe.id
         where (pe.u_citation_gen is null or pe.u_citation_gen < %s) and p.id=%s
         """ % (generation, aid)
     try:
         conn = DB.pool().getConnection()
         cursor = conn.cursor()
         cursor.execute(self.sql)
         data = cursor.fetchall()
         if cursor.rowcount == 0:
             return
         for aid, names, pubcount in data:
             namelist = [name.strip() for name in names.split(",")]
             print 'get author: ', namelist
             return Person(aid, namelist, pubcount)
         cursor.close()
         conn.close()
     except MySQLdb.Error, e:
         ExceptionHelper.print_exec(e)
示例#33
0
    def getPublicationByPerson(self, personId, gen):
        """Get all publications of a person"""
        while True:
            try:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                cursor.execute(
                    """select p.id, p.year, p.title, p.pubkey, p.jconf, p.authors, p.ncitation, p.u_citation_gen 
				from publication p left join na_author2pub a2p on p.id=a2p.pid 
				where (p.u_citation_gen < %s or p.u_citation_gen is null) and a2p.aid=%s""",
                    (gen, personId),
                )
                data = cursor.fetchall()
                pubs = []
                for id, year, title, pubkey, jconf, authors, ncitation, gen in data:
                    pub = Publication(id, year, title, pubkey, jconf, authors, ncitation)
                    pubs.append(pub)
                cursor.close()
                conn.close()
                return pubs
            except Exception, e:
                ExceptionHelper.print_exec(e)
示例#34
0
    def getPublicationByPerson(self, personId, gen):
        '''Get all publications of a person'''
        while True:
            try:
                conn = DB.pool().getConnection()
                cursor = conn.cursor()
                cursor.execute(
                    '''select p.id, p.year, p.title, p.pubkey, p.jconf, p.authors, p.ncitation, p.u_citation_gen 
				from publication p left join na_author2pub a2p on p.id=a2p.pid 
				where (p.u_citation_gen < %s or p.u_citation_gen is null) and a2p.aid=%s''',
                    (gen, personId))
                data = cursor.fetchall()
                pubs = []
                for id, year, title, pubkey, jconf, authors, ncitation, gen in data:
                    pub = Publication(id, year, title, pubkey, jconf, authors,
                                      ncitation)
                    pubs.append(pub)
                cursor.close()
                conn.close()
                return pubs
            except Exception, e:
                ExceptionHelper.print_exec(e)
示例#35
0
	def runOriginal(self):
		while self.extractor.running and not self.ask_to_stop:
			self.mark()
			self.extractor.wait_for_pause() # wait if paused
#			url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs
			
			query, used_pubs = self.store.getFromPubQueue() # get url and pubs
			if used_pubs is None or len(used_pubs) == 0:
				print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (self.name, query, used_pubs)
				time.sleep(10)
				continue
			self.extractor.wait_for_pause() # wait again

			with self.extractor.busy_semaphore_lock: 
				self.extractor.busy_semaphore += 1
				self.extractor.busy_pub_semaphore += 1

			pubs_found = None
			pubs_notfound = None
			try:
				all_models = Extractor.getInstance().getNodesByPubs(used_pubs)
				if all_models is not None:
					(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
					if pubs_found is None or pubs_notfound is None:
						print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return'
						return
					print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (len(pubs_found), len(pubs_notfound), query)
				else:
					pubs_notfound = used_pubs
			except Exception, e:
				ExceptionHelper.print_exec(e)
				print '-------------------------------------------------------'
				print 'query:', 	query
				print 'all_models', all_models
				print 'used_pubs', used_pubs
				print '-------------------------------------------------------'
				return
			finally:
示例#36
0
	def run(self):
		while self.extractor.running and not self.ask_to_stop:
			# acquire person
			self.mark()
			self.person = None
			self.extractor.wait_for_pause() # wait if paused
			self.person = self.store.getFromPersonQueue(timeout=self.extractor.mgr_interval)
			self.extractor.wait_for_pause() # wait again

			if self.person is not None:  # process person.
				with self.extractor.busy_semaphore_lock: 
					self.extractor.busy_semaphore += 1
					self.extractor.busy_person_semaphore += 1
				try:
					self.process_person()
				except Exception, e:
					ExceptionHelper.print_exec(e)
#					raise
					if self.person is not None:
						print "[ERROR] < pub back person %s:" % self.person
						self.store.addToPersonQueue(self.person)
				finally:
					with self.extractor.busy_semaphore_lock: 
示例#37
0
    def batchUpdate(self, gen, pubs):
        """Batch-update ``ncitation`` and ``u_citation_gen`` of many publications.

        Builds one ``(pub.ncitation, gen, pub.id)`` tuple per pub and runs the
        update with ``executemany`` between ``Set AUTOCOMMIT = 0`` and
        ``Set AUTOCOMMIT = 1``, committing in between.

        Returns the update's ``rowcount`` (captured before the closing SET
        statement runs).  When a ``MySQLdb.Error`` occurs it is reported
        through ``ExceptionHelper.print_exec`` and the ``executemany()``
        result is returned instead, or ``None`` if the update never ran
        (formerly an ``UnboundLocalError``).  Single attempt, no retry.
        """
        params = [(pub.ncitation, gen, pub.id) for pub in pubs]
        data = None
        try:
            conn = DB.pool().getConnection()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute("Set AUTOCOMMIT = 0")
                    try:
                        data = cursor.executemany(
                            "update publication set ncitation=%s, u_citation_gen=%s where id=%s",
                            params)
                        # Capture now: the next execute() replaces rowcount.
                        rowcount = cursor.rowcount
                        conn.commit()
                    finally:
                        # Restore autocommit even when the batch failed, so the
                        # connection is not handed back with it switched off.
                        cursor.execute("Set AUTOCOMMIT = 1")
                finally:
                    cursor.close()
            finally:
                conn.close()
            return rowcount
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
        return data
    def run(self):
        while self.extractor.running and not self.ask_to_stop:
            # acquire person
            self.mark()
            self.person = None
            self.extractor.wait_for_pause()  # wait if paused
            self.person = self.store.getFromPersonQueue(
                timeout=self.extractor.mgr_interval)
            self.extractor.wait_for_pause()  # wait again

            if self.person is not None:  # process person.
                with self.extractor.busy_semaphore_lock:
                    self.extractor.busy_semaphore += 1
                    self.extractor.busy_person_semaphore += 1
                try:
                    self.process_person()
                except Exception, e:
                    ExceptionHelper.print_exec(e)
                    #					raise
                    if self.person is not None:
                        print "[ERROR] < pub back person %s:" % self.person
                        self.store.addToPersonQueue(self.person)
                finally:
                    with self.extractor.busy_semaphore_lock:
示例#39
0
	def getFromPubQueue(self):
		"""Take the next pub combination to crawl from the loose pubs in the store.

		Sleeps in steps of ``self.mgr_interval`` while ``self.running`` is set
		and ``self.person_pub_map`` is empty.  Then, while holding
		``self.pub_lock``:

		  * scans ``self.person_pub_map`` for pubs still present (and truthy)
		    in ``self.pubmap``; once a candidate exists, the scan of each
		    person's id list stops after the first id examined;
		  * removes persons that have no ids or no valid pub, together with
		    their remaining entries in ``self.pubmap``;
		  * builds the query from the first candidate with
		    ``Extractor.pinMaxQuery``, removes the used pubs from
		    ``self.pubmap`` and passes all other candidates to
		    ``self.putToPubdbcache``.

		@return: ``(query, pubs[])``.  ``(None, None)`` is returned when there
			is no candidate and also when an exception was caught (it is
			reported through ``ExceptionHelper.print_exec``), so callers can
			always unpack two values.
		"""
		print_verbose = False
		try:
			while self.running and len(self.person_pub_map) == 0:
				time.sleep(self.mgr_interval)

			# Incremented before asking for the lock, decremented once it is held.
			self.blocked_pub_t += 1
			with self.pub_lock: # lock
				self.blocked_pub_t -= 1
				pub_candidates = [] 	# candidate pubs, in scan order
				person_invalid = []  	# persons to delete after the scan
				for personId, ids in self.person_pub_map.items():
					# if person with no ids, del this person.
					if ids is None or len(ids) == 0:
						person_invalid.append(personId)
						continue

					valid_ids = 0
					for pubId in ids:
						if print_verbose: print('\tcandidate pub %s' % pubId)

						if pubId in self.pubmap:
							_pub = self.pubmap[pubId]
							if _pub:
								pub_candidates.append(_pub)
								valid_ids += 1
								# Kept inside the guard: _pub.title needs a real pub.
								if print_verbose:
									print('\tcandidate pub %s of person %s.' % (_pub.title, personId))

						if len(pub_candidates) > 0:  # enough
							if print_verbose:
								print('\tcandidates enough, length %s ' % len(pub_candidates))
							break

					if valid_ids == 0:  # means all pub of this person is not valid. just delete this person.
						person_invalid.append(personId)

				for personId in person_invalid:
					for pubId in self.person_pub_map[personId]:
						if pubId in self.pubmap:
							del self.pubmap[pubId]
					del self.person_pub_map[personId]

				# return None if not available
				if not pub_candidates:
					print('\t[store] Cannot be here. empty candidates. return null.')
					return None, None

				# gen query
				query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
				for pub in used_pubs:
					del self.pubmap[pub.id] # delete pub.

				# Save nouse_pubs to dbcache, waiting to write to db.
				nouse_pubs += pub_candidates[1:]
				if nouse_pubs:
					for pub in nouse_pubs:
						self.putToPubdbcache(pub)

				return query, used_pubs

		except Exception as e:
			ExceptionHelper.print_exec(e)
			print ('Exception occurred: %s. ' % e)
			# Same shape as the "no candidates" result, so unpacking callers survive.
			return None, None
示例#40
0
	def getFromPubQueueBack(self):
		"""Take the next pub combination to crawl from the loose pubs in the store.

		Sleeps in steps of ``self.mgr_interval`` while ``self.running`` is set
		and ``self.person_pub_map`` is empty; the time spent there is added to
		``self.ppt_wait`` and the time spent acquiring ``self.pub_lock`` to
		``self.ppt_getlock``.  While holding the lock it:

		  * scans ``self.person_pub_map`` for pubs still present (and not
		    ``None``) in ``self.pubmap``; once a candidate exists, the scan of
		    each person's id list stops after the first id examined;
		  * removes persons that have no ids or no valid pub from
		    ``self.person_pub_map`` (their ``self.pubmap`` entries are kept);
		  * builds the query from the first candidate with
		    ``Extractor.pinMaxQuery``, removes the used pubs from
		    ``self.pubmap`` and passes all other candidates to
		    ``self.putToPubdbcache``.

		@return: ``(query, pubs[])``.  ``(None, None)`` is returned when there
			is no candidate and also when an exception was caught (it is
			reported through ``ExceptionHelper.print_exec``), so callers can
			always unpack two values.
		"""
		print_verbose = True
		try:
			# block if no pub items.
			start = time.time()
			while self.running and len(self.person_pub_map) == 0:
				time.sleep(self.mgr_interval)
			dur = (time.time() - start)
			if print_verbose: print('TimeUsed:%.4s ms, ' % dur)

			start = time.time()
			# Incremented before asking for the lock, decremented once it is held.
			self.blocked_pub_t += 1
			with self.pub_lock: # lock
				self.blocked_pub_t -= 1
				# count
				self.ppt_wait += dur
				self.ppt_getlock += (time.time() - start)

				# select candidates
				pub_candidates = [] 	# candidate pubs, in scan order
				person_invalid = []  	# persons to delete after the scan
				for personId, ids in self.person_pub_map.items():
					# if person with no ids, del this person.
					if ids is None or len(ids) == 0:
						person_invalid.append(personId)
						continue

					valid_ids = 0
					for pubId in ids:
						if print_verbose: print('\tcandidate pub %s' % pubId)
						if pubId in self.pubmap:
							_pub = self.pubmap[pubId]
							if _pub is not None:
								pub_candidates.append(_pub)
								valid_ids = valid_ids + 1
								if print_verbose:
									print('\tcandidate pub %s of person %s.' % (_pub.title, personId))

						if len(pub_candidates) > 0:  # enough
							if print_verbose: print('\tcandidates enough, length %s ' % len(pub_candidates))
							break

					if valid_ids == 0:  # means all pub of this person is not valid. just delete this person.
						person_invalid.append(personId)

				for personId in person_invalid:
					del self.person_pub_map[personId]

				# return None if not available
				if not pub_candidates:
					print('\t[ERR] Cannot be here. empty candidates. return null.')
					return None, None

				# gen query
				query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
				for pub in used_pubs:
					del self.pubmap[pub.id] # delete pub.

				# Save nouse_pubs to dbcache, waiting to write to db.
				nouse_pubs += pub_candidates[1:]
				if nouse_pubs:
					for pub in nouse_pubs:
						self.putToPubdbcache(pub)

				return query, used_pubs

		except Exception as e:
			ExceptionHelper.print_exec(e)
			print ('Exception occurred: %s. ' % e)
			# Same shape as the "no candidates" result, so unpacking callers survive.
			return None, None