Python PDFLinkSaver示例，com.lish.ajia.googlescholar.pdfsaver.PDFLinkSaver Python示例

示例#1

0

显示文件

文件： extractor.py 项目： AlexLyj/aminer-spider

	def __init__(self):
		"""Load settings, obtain the helper singletons and compile the regexes.

		The shared ``Settings`` instance is stored and its ``debug`` flag is
		copied onto the object. ``HtmlRetriever`` is obtained with the
		``use_proxy`` setting. ``pdfcache`` is assigned only when the
		``save_pdflink`` setting is truthy.
		"""
		cfg = Settings.getInstance()
		self.settings = cfg
		self.debug = cfg.debug
		self.htmlRetriever = HtmlRetriever.getInstance(cfg.use_proxy)

		# NOTE: the attribute is left undefined when PDF-link saving is off.
		if cfg.save_pdflink:
			self.pdfcache = PDFLinkSaver.getInstance()

		# All patterns are matched case-insensitively.
		ignore_case = re.I

		# Captures the text of a "gs_a" div up to the first " - " separator.
		self.author = re.compile(
			'<div class="?gs_a"?>([^\\x00]+?) - ', ignore_case)

		# Captures the href of the anchor that wraps a "[PDF]" marker span.
		self.pdf_block = re.compile(
			'<div class="?gs_ggs gs_fl"?><a href="?([^\s"]+)?"?[^>]+?><span class="?gs_ctg2"?>\[PDF\]</span>',
			ignore_case)

		# Matches a "gs_fl" div through its first closing </div>.
		self.citation_block = re.compile(
			'<div class="?gs_fl"?>.*?</div>', ignore_case)

示例#2

0

显示文件

文件： extractor.py 项目： yinonbaron/aminer-spider

 def __init__(self):
     """Prepare the extractor: settings, HTML retriever and regex patterns.

     Stores the shared ``Settings`` instance and its ``debug`` flag, gets
     the ``HtmlRetriever`` using the ``use_proxy`` setting, and gets the
     ``PDFLinkSaver`` only when the ``save_pdflink`` setting is truthy.
     """
     shared_settings = Settings.getInstance()
     self.settings = shared_settings
     self.debug = shared_settings.debug
     self.htmlRetriever = HtmlRetriever.getInstance(
         shared_settings.use_proxy)

     # NOTE: ``pdfcache`` stays undefined when PDF-link saving is disabled.
     if shared_settings.save_pdflink:
         self.pdfcache = PDFLinkSaver.getInstance()

     # Pattern sources, kept verbatim; each is compiled case-insensitively.
     author_src = '<div class="?gs_a"?>([^\\x00]+?) - '
     pdf_src = '<div class="?gs_ggs gs_fl"?><a href="?([^\s"]+)?"?[^>]+?><span class="?gs_ctg2"?>\[PDF\]</span>'
     citation_src = '<div class="?gs_fl"?>.*?</div>'

     # Text of a "gs_a" div up to the first " - " separator.
     self.author = re.compile(author_src, re.I)
     # href of the anchor that wraps a "[PDF]" marker span.
     self.pdf_block = re.compile(pdf_src, re.I)
     # A "gs_fl" div through its first closing </div>.
     self.citation_block = re.compile(citation_src, re.I)

示例#3

0

显示文件

    def __init__(self):
        """Initialise all state for the citation-extraction task.

        Prints the task banner, reads the shared ``Settings``, sets default
        thread limits, flags and counters, creates the DAO objects,
        obtains the ``PDFLinkSaver`` when the ``save_pdflink`` setting is
        truthy, and finally calls ``determineGereration()``.
        """
        # Single parenthesised argument: prints identically as a Python 2
        # print statement and as a print() call.
        print("Task: extract paper's citation from schooler.google.com.\n")

        cfg = Settings.getInstance()
        self.settings = cfg
        self.debug = cfg.debug

        # --- Tunables -----------------------------------------------------
        self.mgr_interval = 10  # seconds
        # Maximum threads for person extraction and for publications. Per
        # the original note, both may be modified on the fly (day vs. night).
        self.max_person_thread = 2
        self.max_pub_thread = 2

        # --- Thread handles -----------------------------------------------
        self.t_mgr = None  # management thread; not created here
        self.t_provider = None
        # Plain lists; the original notes a Queue.Queue(maxsize=...) variant.
        self.person_thread_pool = []
        self.pub_thread_pool = []

        # Counter and its lock, used to monitor whether all threads are idle.
        self.busy_semaphore = 0
        self.busy_semaphore_lock = threading.Lock()

        # --- Utilities ----------------------------------------------------
        self.store = None

        # --- Switches and flags -------------------------------------------
        self.running = True  # if False, threads stop after the current task
        self.stopped = False  # whether the manager thread can stop
        self.pause = False  # all work paused
        self.waiting_to_finish = False  # no more data; everything is queued
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of last interval

        self.restart_all_thread = False
        # Right after leaving pause mode there are many failed tasks, which
        # would otherwise trigger an immediate wait again.
        self.detect_exit_wait = 0

        self.generation = 0

        # --- Data access --------------------------------------------------
        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # NOTE: ``pdfcache`` is left undefined when PDF-link saving is off.
        if cfg.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # Start.
        self.determineGereration()

示例#4

0

显示文件

文件： start_cw.py 项目： Rygbee/aminer-spider

    def __init__(self):
        """Set up the default state of the citation-extraction task.

        Emits the task banner, loads the shared ``Settings``, fills in the
        thread limits, run flags and counters, builds the DAO objects,
        obtains the ``PDFLinkSaver`` when the ``save_pdflink`` setting is
        truthy, and then calls ``determineGereration()``.
        """
        # A single parenthesised argument prints the same text whether this
        # is parsed as a Python 2 print statement or as a print() call.
        print("Task: extract paper's citation from schooler.google.com.\n")

        shared_settings = Settings.getInstance()
        self.settings = shared_settings
        self.debug = shared_settings.debug

        # Configuration values.
        self.mgr_interval = 10  # seconds
        # Thread caps for person extraction and for publications; per the
        # original note, both can be modified on the fly (day vs. night).
        self.max_person_thread = 2
        self.max_pub_thread = 2

        # Thread handles; the management thread is not created here.
        self.t_mgr = None
        self.t_provider = None
        # Plain lists; the original notes a Queue.Queue(maxsize=...) variant.
        self.person_thread_pool = []
        self.pub_thread_pool = []

        # Used to monitor whether every thread is in the idle state.
        self.busy_semaphore = 0
        self.busy_semaphore_lock = threading.Lock()

        # Utilities.
        self.store = None

        # Switches and flags.
        self.running = True  # when False, threads stop after the current task
        self.stopped = False  # whether the manager thread can stop
        self.pause = False  # all work paused
        self.waiting_to_finish = False  # no additional data; all is queued
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of last interval

        self.restart_all_thread = False
        # Just after exiting pause mode many tasks have failed, which would
        # otherwise cause an immediate wait again.
        self.detect_exit_wait = 0

        self.generation = 0

        # Data-access objects.
        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # NOTE: ``pdfcache`` is only defined when PDF-link saving is enabled.
        if shared_settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # Start.
        self.determineGereration()

示例#5

0

显示文件

文件： debug.py 项目： AlexLyj/aminer-spider

			for key, models in all_models.items():
				print key
				for model in models:
					print "\t", model
		else:
			print 'all_models is None'
		print '- all_models end ----------------------\n'

		(pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
		for pub in pubs_found:
			print 'pubs found' , pub
		print '-' * 100
		for pub in pubs_notfound:
			print 'not found' , pub
		print '- test done -'

if __name__ == '__main__':
	# Debug entry point: top-publication person check; the data is in the
	# local database.
	debug = DebugSuit()
	try:
		# Alternative check, kept for reference:
		#   debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4)
		debug.debug_pubs()
	finally:
		# Flush the PDF link saver (when enabled) even if the debug run
		# raised; previously any exception skipped the flush entirely.
		if Settings.getInstance().save_pdflink:
			PDFLinkSaver.getInstance().flush()

示例#6

0

显示文件

文件： debug.py 项目： yinonbaron/aminer-spider

        print '\n- all_models ----------------------'
        if all_models is not None:
            for key, models in all_models.items():
                print key
                for model in models:
                    print "\t", model
        else:
            print 'all_models is None'
        print '- all_models end ----------------------\n'

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(
            used_pubs, all_models)
        for pub in pubs_found:
            print 'pubs found', pub
        print '-' * 100
        for pub in pubs_notfound:
            print 'not found', pub
        print '- test done -'


if __name__ == '__main__':
    # Debug entry point: top-publication person check; the data is in the
    # local database.
    debug = DebugSuit()
    try:
        # Alternative check, kept for reference:
        #   debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4)
        debug.debug_pubs()
    finally:
        # Flush the PDF link saver (when enabled) even if the debug run
        # raised; previously any exception skipped the flush entirely.
        if Settings.getInstance().save_pdflink:
            PDFLinkSaver.getInstance().flush()