Python PublicationDao示例，com.lish.ajia.googlescholar.daos.PublicationDao Python示例

示例#1

0

显示文件

    def test_matchPub(self):
        self.extractor = Extractor().getInstance()
        pubdao = PublicationDao()
        person_id = 13419
        person_name = 'jie tang'
        # Read sources from files
        all_models = {}
        for page in range(0, 3):
            filename = "".join((person_name, '_page_', str(page), '.html'))
            f = file(os.path.join(self.settings.source_dir, filename), 'r')
            html = f.read()
            models = self.extractor.extract_from_source(html)
            if models is not None:
                self.extractor._Extractor__merge_into_extractedmap(
                    all_models, models)
        print 'Total found DEBUG  %s items.' % len(all_models)

        # part 2
        pubs = pubdao.getPublicationByPerson(person_id,
                                             self.settings.generation)

        printout = False
        if printout:
            for key, models in all_models.items():
                print key, " --> ", models
            print '==================='
            for pub in pubs:
                print pub

        (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
        print '- test done -', len(pubs_matched), len(pubs_not_matched)
        return pubs_not_matched

示例#2

0

显示文件

文件： pubmatcher_v1_a_little_strict.py 项目： AlexLyj/aminer-spider

	def test_matchPub(self):
		self.extractor = Extractor().getInstance()
		pubdao = PublicationDao()
		person_id = 13419
		person_name = 'jie tang'
		# Read sources from files
		all_models = {}
		for page in range(0, 3):
			filename = "".join((person_name, '_page_', str(page), '.html'))
			f = file(os.path.join(self.settings.source_dir, filename), 'r')
			html = f.read()
			models = self.extractor.extract_from_source(html)
			if models is not None:
				self.extractor._Extractor__merge_into_extractedmap(all_models, models)
		print 'Total found DEBUG  %s items.' % len(all_models)

		# part 2
		pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation)

		printout = False
		if printout:
			for key, models in all_models.items():
				print key, " --> ", models
			print '==================='
			for pub in pubs:
				print pub

		(pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
		print '- test done -', len(pubs_matched), len(pubs_not_matched)
		return pubs_not_matched

示例#3

0

显示文件

文件： test_dao.py 项目： AlexLyj/aminer-spider

	def test_getpublications(self):
		'''Test get all publications from database.'''
		print '-TEST-:', TestCase.test_getpublications.__doc__
		pubdao = PublicationDao()
		pubs = pubdao.getPublicationByPerson(13423, self.settings.generation)  # id for jie tang, current generation
		for pub in pubs:
			print pub
		print '-END TEST-'

示例#4

0

显示文件

	def __init__(self, generation, mgr_interval=5):
		"""Create the in-memory queue, maps, locks and DAOs held by this object.

		Args:
			generation: stored unchanged as ``self.gen``.
			mgr_interval: stored unchanged as ``self.mgr_interval`` (default 5).
		"""
		self.settings = Settings.getInstance()
		self.debug = self.settings.debug

		self.gen = generation
		self.mgr_interval = mgr_interval

		# Bounded queue; capacity comes from settings.person_cache_size.
		self.person_queue 		 = Queue.Queue(maxsize=self.settings.person_cache_size)
		self.person_id_set	 	 = set([])	# sync with queue, quick contains using id.

		self.pubmap		 		 = {}		# {id -> pub}
		self.person_pub_map		 = {}		# {person_id->[pub_id_list]} - person to pub_ids
		self.pub_db_cache 		 = {}

		# NOTE(review): presumably pub_lock guards pubmap/person_pub_map and
		# pub_dbcache_lock guards pub_db_cache -- confirm against the methods using them.
		self.pub_lock			 = threading.Lock()
		self.pub_dbcache_lock 	 = threading.RLock()

		self.running = True # synced with main running flag in mgr_interval_thread
		self.blocked_pub_t 		 = 0

		# time sum
		self.ppt_wait = 0
		self.ppt_getlock = 0
		self.ppt_get = 0

		self.person_dao = PersonDao()
		self.pub_dao = PublicationDao()

示例#5

0

显示文件

    def __init__(self):
        """Initialise configuration values, flags and DAOs, then call determineGereration()."""
        print "Task: extract paper's citation from schooler.google.com.\n"
        self.settings = Settings.getInstance()
        self.debug = self.settings.debug

        # Configs
        self.mgr_interval = 10  # seconds
        self.max_person_thread = 2  # max threads used to extract person,
        self.max_pub_thread = 2  # these 2 values can modified on the fly. diff in day or night

        # Threads and configurations
        self.t_mgr = None  # MgrThread(self)	# management thread, create
        self.t_provider = None
        self.person_thread_pool = [
        ]  #= Queue.Queue(maxsize=self.max_person_thread)
        self.pub_thread_pool = []  #= Queue.Queue(maxsize=self.max_pub_thread)

        self.busy_semaphore = 0  # used to monitor whether all threads are idle
        self.busy_semaphore_lock = threading.Lock()  # used to monitor whether all threads are idle

        # utils
        self.store = None

        # switchers & flags
        self.running = True  # If False, threads will stop after current task.
        self.stopped = False  # If MGRThread can stop.
        self.pause = False  # All works paused.
        self.waiting_to_finish = False  # No additional data. all added to queue.
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of the previous interval

        self.restart_all_thread = False
        # Right after leaving pause mode many tasks fail, which would
        # immediately trigger waiting again.
        self.detect_exit_wait = 0

        self.generation = 0

        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # pdfcache only exists when settings.save_pdflink is truthy.
        if self.settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # start
        self.determineGereration()

示例#6

0

显示文件

文件： t_person_processer.py 项目： yinonbaron/aminer-spider

    def __init__(self, extractorInstance):
        """Create a worker thread bound to the given extractor.

        Args:
            extractorInstance: object kept as ``self.extractor``; its
                ``store`` attribute is also referenced as ``self.store``.
        """
        threading.Thread.__init__(self)

        self.extractor = extractorInstance
        self.store = self.extractor.store
        self.pubdao = PublicationDao()

        self.person = None  # set this and start.
        self.ask_to_stop = False
        # Timestamp taken at construction time.
        self.last_action = datetime.datetime.now()

示例#7

0

显示文件

文件： start_cw.py 项目： Rygbee/aminer-spider

    def __init__(self):
        """Initialise configuration values, flags and DAOs, then call determineGereration()."""
        print "Task: extract paper's citation from schooler.google.com.\n"
        self.settings = Settings.getInstance()
        self.debug = self.settings.debug

        # Configs
        self.mgr_interval = 10  # seconds
        self.max_person_thread = 2  # max threads used to extract person,
        self.max_pub_thread = 2  # these 2 values can modified on the fly. diff in day or night

        # Threads and configurations
        self.t_mgr = None  # MgrThread(self)	# management thread, create
        self.t_provider = None
        self.person_thread_pool = []  # = Queue.Queue(maxsize=self.max_person_thread)
        self.pub_thread_pool = []  # = Queue.Queue(maxsize=self.max_pub_thread)

        self.busy_semaphore = 0  # used to monitor whether all threads are idle
        self.busy_semaphore_lock = threading.Lock()  # used to monitor whether all threads are idle

        # utils
        self.store = None

        # switchers & flags
        self.running = True  # If False, threads will stop after current task.
        self.stopped = False  # If MGRThread can stop.
        self.pause = False  # All works paused.
        self.waiting_to_finish = False  # No additional data. all added to queue.
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of the previous interval

        self.restart_all_thread = False
        # Right after leaving pause mode many tasks fail, which would
        # immediately trigger waiting again.
        self.detect_exit_wait = 0

        self.generation = 0

        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # pdfcache only exists when settings.save_pdflink is truthy.
        if self.settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # start (runs unconditionally; it is not part of the if above)
        self.determineGereration()

示例#8

0

显示文件

文件： debug.py 项目： AlexLyj/aminer-spider

class DebugSuit():

	def __init__(self):
		self.extractor = Extractor.getInstance()
		self.matcher = PubMatcher.getInstance()
		self.pubdao = PublicationDao()

	def debug_person(self, person_id, person_name, generation):
		'''Test method extract_from_source.'''
		print '- DEBUG Person "%s" -:' % person_name

		pubs = self.pubdao.getPublicationByPerson(person_id, generation)
		all_models = self.extractor.getNodesByPersonName(person_name)
#		if True:#print all all_models
#			print '-' * 100, 'This is all_models'
#			for key, models in all_models.items():
#				print key, ':'
#				for model in models:
#					print '\t', model.readable_title, '(', model, ')'
#			print '=' * 100 , 'all_models print done'
		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub

		print '|||||||||||||||||||||||||||| get by pubs '
		# todo here should be a while
		query, used_pubs = Extractor.pinMaxQuery(pubs_notfound)
		print '%s pub, query: %s' % (len(used_pubs), query)
		all_models = self.extractor.getNodesByPubs(used_pubs)
		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub

		print '- END DEBUG -'

	def debug_pubs(self):
		'''Debug get by pub'''
		print '-TEST-:', self.debug_pubs.__doc__.strip()
		#----------------------------------------------------
		pub_candidates = []
		
		# group 1
#		pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
#		pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
#		pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
#		pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))
		
		# group 2
#		pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
#		pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))
		
		# group 3
#		pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))

		# group 4
		pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5))
		
		extractor = Extractor.getInstance()
		query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
		print '%s pub, query: %s' % (len(used_pubs), query)

		#
		# Get WEB PAGE
		#
		use_web = True # ***************
		if use_web:
			all_models = extractor.getNodesByPubs(used_pubs)
		else:
			f = file('debug_pubs.txt', 'r')
			html = f.read()
			models = self.extractor.extract_from_source(html)
			all_models = self.extractor._Extractor__merge_into_extractedmap(None, models)

		print '\n- all_models ----------------------'
		if all_models is not None:
			for key, models in all_models.items():
				print key
				for model in models:
					print "\t", model
		else:
			print 'all_models is None'
		print '- all_models end ----------------------\n'

		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub
		print '- test done -'

示例#9

0

显示文件

文件： debug.py 项目： AlexLyj/aminer-spider

	def __init__(self):
		# Keep the extractor and matcher obtained from their getInstance()
		# accessors, plus a DAO for loading publications.
		self.extractor = Extractor.getInstance()
		self.matcher = PubMatcher.getInstance()
		self.pubdao = PublicationDao()

示例#10

0

显示文件

文件： start.py 项目： AlexLyj/aminer-spider

class GoogleScholarExtractor:
	'''Author gb<elivoa[AT]gmail.com> v0.4.0

	Owns the management thread, the provider thread and two pools of worker
	threads (person and publication). The management thread keeps the pools
	populated and prints a status report once per ``mgr_interval`` seconds.

	All output uses single-argument ``print(...)`` calls, which behave the
	same under the Python 2 print statement and the Python 3 print function.
	'''

	def __init__(self):
		'''Initialise counters, flags and DAOs, then resolve the generation to process.'''
		print("Task: extract paper's citation from scholar.google.com.\n")
		self.settings = Settings.getInstance()
		self.debug = self.settings.debug
		self.threadChildren = 0  # incremented every time a worker thread is started
		self.PersonThreadActive = 0
		self.PubThreadActive = 0

		self.mgr_interval = 8  # seconds
		self.t_mgr = None  # management thread, created in start()
		self.t_provider = None
		self.person_thread_pool = []
		self.pub_thread_pool = []

		self.busy_semaphore = 0  # used to monitor whether all threads are idle
		self.busy_semaphore_lock = threading.Lock()

		self.busy_person_semaphore = 0
		self.busy_pub_semaphore = 0

		self.store = None

		self.running = True  # If False, threads will stop after current task.
		self.stopped = False  # If MGRThread can stop.
		self.pause = False  # All works paused.
		self.waiting_to_finish = False  # No additional data. all added to queue.
		self.num_report = 0
		self.last_report_time = datetime.datetime.now()  # time of the previous interval

		self.restart_all_thread = False
		# Right after leaving pause mode many tasks fail, which would
		# immediately trigger waiting again.
		self.detect_exit_wait = 0

		self.generation = 0

		self.dao = dbs()
		self.personDao = PersonDao()
		self.pubDao = PublicationDao()

		# pdfcache only exists when settings.save_pdflink is truthy.
		if self.settings.save_pdflink:
			self.pdfcache = PDFLinkSaver.getInstance()

		self.determineGereration()

	# determine if the program could run or wait, or load continue status.
	def determineGereration(self):
		'''Pick the generation to work on, persist it if it changed, and print a summary.

		Three cases, compared against the min/max generation found in the DB:
		continue the current generation, start a new one (generation + 1), or
		fall back to the max generation in the DB.
		'''
		self.generation = self.dao.getGeneration()
		currentMinGen = self.dao.getMinGenerationInDB()
		currentMaxGen = self.dao.getMaxGenerationInDB()

		print('=====================================================================')
		print(" * Required update_generation is: [ %s ]." % (self.generation))
		print(" * Current min update_generation is: [ %s ]." % (currentMinGen))

		# process generation
		if currentMinGen < self.generation == currentMaxGen or self.generation > currentMaxGen:
			print(" * Not finished task, continue to finish current generation.")
		elif self.generation == currentMinGen == currentMaxGen:
			print(" * Just start new generation")
			self.generation = self.generation + 1
			self.dao.setGeneration(self.generation)
		else:
			print("=== Error: generation(%s) bigger than currentMinGen(%s)" % (self.generation, currentMinGen))
			self.generation = currentMaxGen
			self.dao.setGeneration(self.generation)

		# count task progress
		print(" * Process NA Persons : %s." % self.reportPersonProgress(self.generation))
		print(" * Process Publication: %s." % self.reportPublicationProgress(self.generation))
		print('=====================================================================')

	def _formatProgress(self, total, left):
		'''Format "[pct%] done/total"; reports 0% for an empty total instead of dividing by zero.'''
		done = total - left
		progress = float(done) / total * 100.0 if total else 0.0
		return "[%6.2f%%] %s/%s" % (progress, done, total)

	def reportPersonProgress(self, udpate_generation):
		''' Return String that report progress of person.
		'''
		total = self.personDao.getPersonTotalCount()
		left = self.personDao.getPersonLeftCount(udpate_generation)
		return self._formatProgress(total, left)

	def reportPublicationProgress(self, udpate_generation):
		''' Return String that report progress of publication.
		'''
		total = self.pubDao.getTotalCount()
		left = self.pubDao.getLeftCount(udpate_generation)
		return self._formatProgress(total, left)

	def start(self):
		'''Extract Citation Multithread
		- Start main threads...
		- Manager Threads
		- Person Provider Thread
		- Publication Download Thread
		- ...
		'''
		self.store = Store(self.generation, self.mgr_interval)

		self.t_mgr = threading.Thread(target=self.mgrThreadBody, args=(), name='thread-mgr')  # use method mgr.
		self.t_mgr.start()

		self.t_provider = ProviderThread(self, None)
		self.t_provider.start()

		# waiting to finish
		self.t_mgr.join()
		print("============ ALL END ============")

	def wait_for_pause(self):
		'''Block, sleeping mgr_interval seconds at a time, for as long as self.pause is set.'''
		while self.pause:
			time.sleep(self.mgr_interval)

	#
	# Management Thread
	#
	def mgrThreadBody(self):
		'''Management Thread

		Loops until ``self.running`` is False and ``self.stopped`` is True.
		Each round it counts active workers, flushes the pdf-link cache,
		maintains the thread pools, checks the finish condition, prints a
		report, flushes the store's DB cache and sleeps ``mgr_interval``
		seconds. Every step is best-effort: a failure is printed and the
		loop carries on.
		'''
		print("#init:> start mgr & provider.")
		getter = HtmlRetriever.getInstance(self.settings.use_proxy)

		# Last known worker counts. Bound before the loop so a failing
		# _maintainThreadPool() call cannot leave them undefined for the
		# stop check and the report further down.
		num_persont_alive = 0
		num_pubt_alive = 0

		while self.running or not self.stopped:
			# interval seconds passed (never 0: it is used as a divisor below).
			interval_seconds = (datetime.datetime.now() - self.last_report_time).seconds
			if interval_seconds == 0:
				interval_seconds = 1
			self.last_report_time = datetime.datetime.now()

			try:
				self.PersonThreadActive = 0
				self.PubThreadActive = 0
				for x in self.person_thread_pool:
					if x.check_idle():
						self.PersonThreadActive += 1
				for y in self.pub_thread_pool:
					if y.check_idle():
						self.PubThreadActive += 1
			except Exception as e:
				print("ERROR:count errer")
				print(repr(e))  # show the caught error, not the Exception class

			try:
				# save pdf link
				if self.settings.save_pdflink:
					self.pdfcache.flush()
			except Exception as e:
				print("ERROR: pdf link")
				print(repr(e))

			message = None

			# When to restart all threads & processes.
			# NOTE: the restart itself is disabled (False is passed to
			# _maintainThreadPool below); only the message is emitted.
			if self.num_report % 1000 == 0:
				message = "Kill & Restart All Thread."

			pool_checked = False
			try:
				# Maintain Threads and get worker threads status.
				(num_persont_alive, num_pubt_alive) = self._maintainThreadPool(reload_all_thread=False)
				pool_checked = True
			except Exception as e:
				print("ERROR: maintain threads and worker")
				print(repr(e))

			try:
				# Finish Condition.
				if self._checkFinishCondition():
					self.running = False  # -> tell all threads finish.
					message = "MESSAGE! Send terminal signal to all worker thread."
			except Exception as e:
				print("ERROR: condition check")
				print(repr(e))

			# if all worker threads stopped, mgrThread can stop. Only trust
			# the counts when this round's pool maintenance succeeded.
			if pool_checked and num_persont_alive == 0 and num_pubt_alive == 0:
				self.stopped = True
				message = "Send terminal signal to mgr_thread."

			# check network and count
			period_success_connection = getter.success_connection_count - getter.last_success_connection_count
			period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count
			total_connections = period_success_connection + period_bad_connection
			getter.last_success_connection_count = getter.success_connection_count
			getter.last_bad_connection_count = getter.bad_connection_count

			average_success_persecond = period_success_connection / float(interval_seconds)
			average_bad_persecond = period_bad_connection / float(interval_seconds)

			if False:  # Block mode (pause the whole program); currently disabled.
				if getter.detect_mode:
					if getter.detect_success_count > 3:
						getter.leave_detect_mode()
						self.detect_exit_wait = 1  # just left: skip re-entering block mode for the next rounds.
				else:
					if total_connections * 0.9 < period_bad_connection:
						if self.detect_exit_wait > 0:
							print("---- waiting %s rounds ----" % self.detect_exit_wait)
							self.detect_exit_wait -= 1
						else:
							getter.enter_detect_mode()

			################ print interval string ################
			try:
				# print report
				if not getter.detect_mode:
					if not self.pause:
						self.num_report += 1
						str_report = self.num_report
					else:
						str_report = "paused"

					report_strs = []
					report_strs.append("-" * 100)
					report_strs.append("\n")
					report_strs.append("$&mgr:%s(%s):> " % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str_report))
					report_strs.append("Person(%sT on %s), " % (num_persont_alive, self.store.person_queue.qsize()))
					report_strs.append("Pub(%sT on %s, %s items), " % (num_pubt_alive, len(self.store.pubmap), len(self.store.person_pub_map)))
					report_strs.append("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache))
					report_strs.append("T(busy/idle)(%s/%s), " % (self.busy_semaphore, self.settings.max_person_thread + self.settings.max_pub_thread - self.busy_semaphore))
					# append, not "+=": "+=" extends the list with the string's characters.
					report_strs.append("\n")
					report_strs.append("Person(busy/idle)(%s/%s), Pub(busy/idle)(%s/%s)" % (self.busy_person_semaphore, self.settings.max_person_thread - self.busy_person_semaphore, self.busy_pub_semaphore, self.settings.max_pub_thread - self.busy_pub_semaphore))
					g = getter.success_connection_count
					b = getter.bad_connection_count
					t = g + b
					rate = g / float(t) if t > 0 else 0
					report_strs.append("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b, t, rate))
					report_strs.append("interval-network(g+b=t)=(%s+%s=%s), " % (period_success_connection, period_bad_connection, total_connections))
					report_strs.append("avg:(g%.1f b%.1f in %s seconds.), " % (average_success_persecond, average_bad_persecond, interval_seconds))
					report_strs.append("\n")
					report_strs.append("now have %s child threads, " % self.threadChildren)
					report_strs.append("active threads (%s person, %s pub) , " % (self.PersonThreadActive, self.PubThreadActive))
					report_strs.append("\n")
					report_strs.append("time:(wait=%.2f, getlock=%.2f, get=%.2f)" % (self.store.ppt_wait, self.store.ppt_getlock, self.store.ppt_get))
					if message is not None:
						report_strs.append("\n")
						report_strs.append(message)
					report_strs.append("\n")

					report_strs.append(" * Process NA Persons : %s.\n" % self.reportPersonProgress(self.generation))
					report_strs.append(" * Process Publication: %s.\n" % self.reportPublicationProgress(self.generation))
					report_strs.append("-" * 100)
					report_strs.append("\n")

					print("".join(report_strs))
					# Every 100th report is also handed to the mail reporter.
					if self.num_report % 100 == 0:
						mr = MailReporter()
						mr.report(report_strs)
			except Exception as e:
				print("ERROR: report error")
				print(repr(e))

			try:
				self.store.flushDBCache()  # last flush cache to db.
				self.store.running = self.running  # pass main running thread to Store object.
			except Exception as e:
				print("ERROR: flush db cache")
				print(repr(e))

			time.sleep(self.mgr_interval)  # interval

		print("$mgr:> exit.")

	def _checkFinishCondition(self):
		'''@return: true if all can stop.'''
		# Provider must have reported finish, and we must not be paused.
		if not self.waiting_to_finish or self.pause:
			return False
		# all threads' status must be idle.
		if self.busy_semaphore != 0:
			return False
		# task queue must be empty
		if not self.store.person_queue.empty() \
				or len(self.store.pubmap) != 0 \
				or len(self.store.pub_db_cache) != 0:
			return False
		# really finished only when the DB reports nothing left either.
		left = self.pubDao.getLeftCount(self.generation)
		if left == 0:
			return True
		return False

	def _adjustThreadNum(self):
		'''Set the pool size limits in settings according to the current hour of day.'''
		# strength by period of day.
		hour = datetime.datetime.now().hour
		if hour <= 9:
			self.settings.max_person_thread = 25
			self.settings.max_pub_thread = 75
		elif hour >= 22:
			self.settings.max_person_thread = 16
			self.settings.max_pub_thread = 40
		else:
			self.settings.max_person_thread = 22
			self.settings.max_pub_thread = 60

	def _maintainPool(self, pool, size, thread_factory, name_prefix):
		'''Keep the first `size` slots of `pool` filled with live workers; return how many are alive.

		A slot is (re)started when it is empty, or when its thread is not
		alive or fails check_idle(). Nothing is started once self.running
		is False. `thread_factory` is a zero-argument callable returning a
		new, unstarted worker thread.
		'''
		alive = 0

		# Pad so that every index below `size` exists.
		while len(pool) < size:
			pool.append(None)

		for idx in range(0, size):
			t = pool[idx]
			if t is not None and t.is_alive() and t.check_idle():
				alive += 1
				continue
			if not self.running:
				continue
			if t is not None:
				# Existing thread is dead or failed check_idle(): stop it first.
				killedname = t.name
				t.stop()
				print("$mgr/thread:> kill thread %s" % killedname)
			t = thread_factory()
			t.name = name_prefix + str(idx)
			pool[idx] = t
			t.start()
			with self.busy_semaphore_lock:
				self.threadChildren += 1
				alive += 1
		return alive

	def _maintainThreadPool(self, reload_all_thread):
		'''
		Maintain ThreadPool, detect and restart, and set running threads on the fly.

		`reload_all_thread` is currently ignored: the reload branch below is
		switched off. Returns (num_persont_alive, num_pubt_alive).
		'''
#		if reload_all_thread: # kill all thread first.
		if False:  # we don't want the reload function until the bug is fixed: pub thread idle
			for idx_pub_t in range(0, self.settings.max_pub_thread):
				t = None
				if len(self.pub_thread_pool) <= idx_pub_t:
					self.pub_thread_pool.append(t)
				else:
					t = self.pub_thread_pool[idx_pub_t]
				if t is not None:
					t.ask_to_stop = True
			self.pub_thread_pool = []

		# check and start all unstarted threads. The lambdas defer building
		# a worker until a slot really needs one.
		num_persont_alive = self._maintainPool(
			self.person_thread_pool, self.settings.max_person_thread,
			lambda: PersonProcessThread(self), 'person-thread-')
		num_pubt_alive = self._maintainPool(
			self.pub_thread_pool, self.settings.max_pub_thread,
			lambda: PubProcessThread(self), 'pub-thread-')

		return (num_persont_alive, num_pubt_alive)

示例#11

0

显示文件

class GoogleScholarExtractor:
    '''Multi-threaded driver that extracts paper citations from Google Scholar.

    Author gb<elivoa[AT]gmail.com> v0.4.0

    A management thread (mgrThreadBody) keeps two pools of worker threads
    alive (PersonProcessThread / PubProcessThread), prints a status report
    every mgr_interval seconds and decides when the run is finished.
    '''

    def __init__(self):
        '''Load settings, create the DAOs and determine the generation to run.'''
        print("Task: extract paper's citation from schooler.google.com.\n")
        self.settings = Settings.getInstance()
        self.debug = self.settings.debug

        # Configs
        self.mgr_interval = 10  # seconds between two management rounds
        # Pool sizes; re-applied by the management thread on every round.
        self.max_person_thread = 2
        self.max_pub_thread = 2

        # Threads and configurations
        self.t_mgr = None  # management thread, created in start()
        self.t_provider = None
        self.person_thread_pool = []
        self.pub_thread_pool = []

        # Used to monitor whether all worker threads are idle.
        self.busy_semaphore = 0
        self.busy_semaphore_lock = threading.Lock()

        # utils
        self.store = None

        # switchers & flags
        self.running = True  # If False, threads will stop after current task.
        self.stopped = False  # If MGRThread can stop.
        self.pause = False  # All works paused.
        self.waiting_to_finish = False  # No additional data. all added to queue.
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of the previous interval

        self.restart_all_thread = False
        # Right after leaving pause mode there are many failed tasks, which
        # would immediately trigger waiting again; this counter delays that.
        self.detect_exit_wait = 0

        self.generation = 0

        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        if self.settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # start
        self.determineGereration()

    def determineGereration(self):
        '''Decide whether to continue the current generation or start a new one.

        Compares the required generation (dao.getGeneration()) with the
        minimum and maximum generation found in the database, updates
        self.generation (persisting it through dao.setGeneration() when it
        changes) and prints the current person / publication progress.
        '''
        separator = '====================================================================='
        self.generation = self.dao.getGeneration()
        currentMinGen = self.dao.getMinGenerationInDB()
        currentMaxGen = self.dao.getMaxGenerationInDB()

        print(separator)
        print(" * Required update_generation is: [ %s ]." % (self.generation,))
        print(" * Current min update_generation is: [ %s ]." % (currentMinGen,))

        # process generation
        if currentMinGen < self.generation == currentMaxGen or self.generation > currentMaxGen:
            print(" * Not finished task, continue to finish current generation.")
        elif self.generation == currentMinGen == currentMaxGen:
            print(" * Just start new generation")
            self.generation = self.generation + 1
            self.dao.setGeneration(self.generation)
        else:
            print("=== Error: generation(%s) bigger than currentMinGen(%s)" % (
                self.generation, currentMinGen))
            self.generation = currentMaxGen
            self.dao.setGeneration(self.generation)

        # count task progress
        print(" * Process NA Persons : %s." % self.reportPersonProgress(
            self.generation))
        print(" * Process Publication: %s." % self.reportPublicationProgress(
            self.generation))
        print(separator)

    @staticmethod
    def _formatProgress(total, left):
        '''Format "[ pct%] done/total" for `total` items of which `left` remain.'''
        done = total - left
        # An empty table must not abort the report with ZeroDivisionError.
        if total:
            progress = float(done) / total * 100.0
        else:
            progress = 0.0
        return "[%6.2f%%] %s/%s" % (progress, done, total)

    def reportPersonProgress(self, udpate_generation):
        '''Return a string that reports the progress of persons.

        udpate_generation -- generation passed to personDao.getPersonLeftCount().
        '''
        total = self.personDao.getPersonTotalCount()
        left = self.personDao.getPersonLeftCount(udpate_generation)
        return self._formatProgress(total, left)

    def reportPublicationProgress(self, udpate_generation):
        '''Return a string that reports the progress of publications.

        udpate_generation -- generation passed to pubDao.getLeftCount().
        '''
        total = self.pubDao.getTotalCount()
        left = self.pubDao.getLeftCount(udpate_generation)
        return self._formatProgress(total, left)

    def start(self):
        '''Extract citations with multiple threads.

        Creates the Store, starts the management thread and the provider
        thread, then blocks until the management thread has finished.
        '''
        self.store = Store(self.generation, self.mgr_interval)

        self.t_mgr = threading.Thread(target=self.mgrThreadBody,
                                      args=(),
                                      name='thread-mgr')  # use method mgr.
        self.t_mgr.start()

        self.t_provider = ProviderThread(self, None)
        self.t_provider.start()

        # waiting to finish
        self.t_mgr.join()
        print("============ ALL END ============")

    def wait_for_pause(self):
        '''Block, polling every mgr_interval seconds, while self.pause is set.'''
        while self.pause:
            time.sleep(self.mgr_interval)

    #
    # Management Thread
    #
    def mgrThreadBody(self):
        '''Body of the management thread.

        Every mgr_interval seconds: applies the thread limits, flushes the pdf
        link cache, maintains the worker pools, checks the finish condition,
        prints a report and flushes the DB cache. Each step is guarded so that
        a failure is printed and the loop keeps running.
        '''
        print("$init:> start mgr & provider.")
        getter = HtmlRetriever.getInstance(self.settings.use_proxy)

        # Last known worker counts. Initialised here so that a failing pool
        # check cannot leave them unbound for the code further down.
        num_persont_alive = 0
        num_pubt_alive = 0

        while self.running or not self.stopped:

            # interval seconds passed.
            now = datetime.datetime.now()
            interval_seconds = (now - self.last_report_time).seconds
            if interval_seconds == 0:
                interval_seconds = 1
            self.last_report_time = now

            self._applyThreadLimits(now.hour)

            try:
                # save pdf link
                if self.settings.save_pdflink:
                    self.pdfcache.flush()
            except Exception as e:
                print("ERROR: pdf link")
                print(e)

            # message
            message = None

            # When to restart all threads & processes.
            reload_all_thread = False
            if self.num_report % 1000 == 0:
                reload_all_thread = True
                message = "Kill & Restart All Thread."

            pool_checked = False
            try:
                # Maintain Threads and get worker threads status.
                (num_persont_alive,
                 num_pubt_alive) = self._maintainThreadPool(reload_all_thread)
                pool_checked = True
            except Exception as e:
                print("ERROR: maintain threads and worker")
                print(e)

            try:
                # Finish Condition.
                if self._checkFinishCondition():
                    self.running = False  # -> tell all threads finish.
                    message = "MESSAGE! Send terminal signal to all worker thread."
            except Exception as e:
                print("ERROR: condition check")
                print(e)

            # If all worker threads stopped, mgrThread can stop. Only trust
            # the counts of a round in which the pool check succeeded.
            if pool_checked and num_persont_alive == 0 and num_pubt_alive == 0:
                self.stopped = True
                message = "Send terminal signal to mgr_thread."

            # check network and count
            period_success_connection = getter.success_connection_count - getter.last_success_connection_count
            period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count
            total_connections = period_success_connection + period_bad_connection
            getter.last_success_connection_count = getter.success_connection_count
            getter.last_bad_connection_count = getter.bad_connection_count

            if False:  # Block mode (pause the whole program); currently disabled.
                if getter.detect_mode:
                    if getter.detect_success_count > 3:
                        getter.leave_detect_mode()
                        # Just left detect mode: do not re-enter block mode
                        # during the next rounds.
                        self.detect_exit_wait = 1
                else:
                    if total_connections * 0.9 < period_bad_connection:
                        if self.detect_exit_wait > 0:
                            print("---- waiting %s rounds ----" % self.detect_exit_wait)
                            self.detect_exit_wait -= 1
                        else:
                            getter.enter_detect_mode()

            try:
                # print report
                if not getter.detect_mode:
                    self._printReport(getter, num_persont_alive,
                                      num_pubt_alive, interval_seconds,
                                      period_success_connection,
                                      period_bad_connection, message)
            except Exception as e:
                print("ERROR: report error")
                print(e)

            try:
                # flush db cache
                self.store.flushDBCache()  # last flush cache to db.
                self.store.running = self.running  # pass main running thread to Store object.
            except Exception as e:
                print("ERROR: flush db cache")
                print(e)

            time.sleep(self.mgr_interval)  # interval

        print("$mgr:> exit.")

    def _applyThreadLimits(self, hour):
        '''Set max_person_thread / max_pub_thread for the given hour of day.'''
        if hour <= 9:  # 0h-9h
            self.max_person_thread = 25
            self.max_pub_thread = 75
        elif 22 <= hour:  # 22h-24h
            self.max_person_thread = 16
            self.max_pub_thread = 40
        else:  # 10h-21h
            self.max_person_thread = 22
            self.max_pub_thread = 60

        # NOTE(review): the schedule above is unconditionally overridden
        # here, so both pools always run with 2 threads. Kept as found;
        # confirm whether this is a leftover debugging limit.
        self.max_person_thread = 2
        self.max_pub_thread = 2

    def _printReport(self, getter, num_persont_alive, num_pubt_alive,
                     interval_seconds, period_success_connection,
                     period_bad_connection, message):
        '''Print the periodic status report; counts the round unless paused.'''
        if not self.pause:
            self.num_report += 1
            str_report = self.num_report
        else:
            str_report = "paused"

        total_connections = period_success_connection + period_bad_connection
        average_success_persecond = period_success_connection / float(
            interval_seconds)
        average_bad_persecond = period_bad_connection / float(
            interval_seconds)

        report_strs = []
        add = report_strs.append
        add("-" * 100)
        add("\n")
        add("$&mgr:%s(%s):> " %
            (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             str_report))
        add("Person(%sT on %s), " %
            (num_persont_alive, self.store.person_queue.qsize()))
        add("Pub(%sT on %s), " % (num_pubt_alive, len(self.store.pubmap)))
        add("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache))
        add("T(busy/idle)(%s/%s), " %
            (self.busy_semaphore, self.max_person_thread +
             self.max_pub_thread - self.busy_semaphore))
        add("\n")

        # Overall network statistics since start.
        g = getter.success_connection_count
        b = getter.bad_connection_count
        t = g + b
        rate = 0
        if t > 0:
            rate = g / float(t)
        add("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b, t, rate))
        add("interval-network(g+b=t)=(%s+%s=%s), " %
            (period_success_connection, period_bad_connection,
             total_connections))
        add("avg:(g%.1f b%.1f in %s seconds.), " %
            (average_success_persecond, average_bad_persecond,
             interval_seconds))
        add("\n")
        add("time:(wait=%.2f, getlock=%.2f, get=%.2f)" %
            (self.store.ppt_wait, self.store.ppt_getlock,
             self.store.ppt_get))
        if message is not None:
            add("\n")
            add(message)
        add("\n")
        add(" * Process NA Persons : %s.\n" %
            self.reportPersonProgress(self.generation))
        add(" * Process Publication: %s.\n" %
            self.reportPublicationProgress(self.generation))
        add("-" * 100)
        add("\n")

        print("".join(report_strs))

    def _checkFinishCondition(self):
        '''@return: true if all can stop.'''
        # Provider must have reported finish and the run must not be paused.
        if not self.waiting_to_finish or self.pause:
            return False
        # All threads' status must be idle.
        if self.busy_semaphore != 0:
            return False
        # Task queues and the DB cache must be empty.
        if not self.store.person_queue.empty():
            return False
        if len(self.store.pubmap) != 0 or len(self.store.pub_db_cache) != 0:
            return False
        # Really finished only when the database has nothing left as well.
        return self.pubDao.getLeftCount(self.generation) == 0

    def _maintainThreadPool(self, reload_all_thread):
        '''Maintain both worker pools and report how many workers are alive.

        Dead or missing workers are replaced while self.running is set, and
        workers beyond the configured maximum are stopped and removed.

        reload_all_thread -- when true, every publication worker is asked to
            stop (ask_to_stop) and the publication pool is rebuilt. The
            person pool is not recycled by this flag.

        @return: (num_persont_alive, num_pubt_alive)
        '''
        if reload_all_thread:
            # Ask every pooled pub worker to stop, including those beyond the
            # current maximum, before the pool is discarded.
            for t in self.pub_thread_pool:
                if t is not None:
                    t.ask_to_stop = True
            self.pub_thread_pool = []

        # The factories are lambdas so the thread classes are only looked up
        # when a new worker really has to be created.
        num_persont_alive = self._maintainPool(
            self.person_thread_pool, self.max_person_thread,
            lambda: PersonProcessThread(self), 'person-thread-')
        num_pubt_alive = self._maintainPool(
            self.pub_thread_pool, self.max_pub_thread,
            lambda: PubProcessThread(self), 'pub-thread-')

        return (num_persont_alive, num_pubt_alive)

    def _maintainPool(self, pool, max_threads, new_thread, name_prefix):
        '''Bring one pool (modified in place) to max_threads; return alive count.'''
        num_alive = 0

        # check and start all unstarted threads.
        for idx in range(0, max_threads):
            if len(pool) <= idx:
                pool.append(None)  # grow with a placeholder up to max size.
            t = pool[idx]

            if t is None or not t.is_alive():  # new slot or dead worker.
                if self.running:
                    t = new_thread()
                    t.name = name_prefix + str(idx)
                    pool[idx] = t
                    t.start()
                    num_alive += 1
            else:
                num_alive += 1

        # Kill surplus threads: drop the slots beyond max_threads (from the
        # end), never the workers that were just counted as alive.
        while len(pool) > max(max_threads, 0):
            t = pool.pop()
            if t is not None:  # placeholder slots have nothing to stop.
                t.stop()
                print("$mgr/thread:> kill thread %s" % t.name)

        return num_alive

示例#12

0

显示文件

文件： debug.py 项目： yinonbaron/aminer-spider

class DebugSuit():
    def __init__(self):
        self.extractor = Extractor.getInstance()
        self.matcher = PubMatcher.getInstance()
        self.pubdao = PublicationDao()

    def debug_person(self, person_id, person_name, generation):
        '''Test method extract_from_source.'''
        print '- DEBUG Person "%s" -:' % person_name

        pubs = self.pubdao.getPublicationByPerson(person_id, generation)
        all_models = self.extractor.getNodesByPersonName(person_name)
        #		if True:#print all all_models
        #			print '-' * 100, 'This is all_models'
        #			for key, models in all_models.items():
        #				print key, ':'
        #				for model in models:
        #					print '\t', model.readable_title, '(', model, ')'
        #			print '=' * 100 , 'all_models print done'
        (pubs_found,
         pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub

        print '|||||||||||||||||||||||||||| get by pubs '
        # todo here should be a while
        query, used_pubs = Extractor.pinMaxQuery(pubs_notfound)
        print '%s pub, query: %s' % (len(used_pubs), query)
        all_models = self.extractor.getNodesByPubs(used_pubs)
        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
            used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub

        print '- END DEBUG -'

    def debug_pubs(self):
        '''Debug get by pub'''
        print '-TEST-:', self.debug_pubs.__doc__.strip()
        #----------------------------------------------------
        pub_candidates = []

        # group 1
        #		pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))

        # group 2
        #		pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
        #		pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))

        # group 3
        #		pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))

        # group 4
        pub_candidates.append(
            Publication(
                -1, 2000,
                'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices',
                "pubkey", -1, "Dusan Guller", -5))

        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print '%s pub, query: %s' % (len(used_pubs), query)

        #
        # Get WEB PAGE
        #
        use_web = True  # ***************
        if use_web:
            all_models = extractor.getNodesByPubs(used_pubs)
        else:
            f = file('debug_pubs.txt', 'r')
            html = f.read()
            models = self.extractor.extract_from_source(html)
            all_models = self.extractor._Extractor__merge_into_extractedmap(
                None, models)

        print '\n- all_models ----------------------'
        if all_models is not None:
            for key, models in all_models.items():
                print key
                for model in models:
                    print "\t", model
        else:
            print 'all_models is None'
        print '- all_models end ----------------------\n'

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
            used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub
        print '- test done -'

示例#13

0

显示文件

文件： debug.py 项目： yinonbaron/aminer-spider

 def __init__(self):
     '''Set up the extractor, matcher and publication DAO attributes.'''
     # Extractor and matcher come from their getInstance() accessors,
     # evaluated in the same left-to-right order as before.
     self.extractor, self.matcher = (Extractor.getInstance(),
                                     PubMatcher.getInstance())
     # The DAO is constructed directly.
     self.pubdao = PublicationDao()

示例#14

0

显示文件

文件： update_author.py 项目： yinonbaron/aminer-spider

 def __init__(self, aid, generation):
     '''Store the author id and generation, then load related helpers.

     aid -- author id, kept as self.aid and passed to self.get_author().
     generation -- kept as self.generation and passed to self.get_author().
     '''
     self.aid = aid
     self.generation = generation
     # get_author() receives the arguments directly, not the attributes.
     self.person = self.get_author(aid, generation)
     self.pubdao = PublicationDao()