示例#1
0
    def test_matchPub(self):
        """Run ``matchPub`` for one hard-coded person against saved result pages.

        Reads three saved HTML pages (``<person_name>_page_<n>.html`` for
        n = 0..2) from ``self.settings.source_dir``, extracts models from each
        page, merges them into a single map, fetches the person's publications
        through ``PublicationDao.getPublicationByPerson`` and hands both to
        ``self.matchPub``.

        Returns:
            The second element of the pair returned by ``self.matchPub``
            (bound here to ``pubs_not_matched``).
        """
        # NOTE(review): this constructs an Extractor and then calls
        # getInstance() on it; if getInstance is a class-level accessor the
        # construction may be redundant -- confirm before changing.
        self.extractor = Extractor().getInstance()
        pubdao = PublicationDao()
        person_id = 13419
        person_name = 'jie tang'

        # Read sources from files
        all_models = {}
        for page in range(3):
            filename = '%s_page_%d.html' % (person_name, page)
            path = os.path.join(self.settings.source_dir, filename)
            # Context manager closes the handle even if read() raises; the
            # previous file(...) call left every page file open.
            with open(path, 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            if models is not None:
                # Name-mangled access to Extractor's private merge helper.
                self.extractor._Extractor__merge_into_extractedmap(
                    all_models, models)
        print('Total found DEBUG  %s items.' % len(all_models))

        # part 2
        pubs = pubdao.getPublicationByPerson(person_id,
                                             self.settings.generation)

        # Flip to True locally to dump the extracted models and publications.
        printout = False
        if printout:
            for key, models in all_models.items():
                print('%s  -->  %s' % (key, models))
            print('===================')
            for pub in pubs:
                print(pub)

        (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
        print('- test done - %s %s' % (len(pubs_matched),
                                       len(pubs_not_matched)))
        return pubs_not_matched
示例#2
0
	def __init__(self, generation, mgr_interval=5):
		"""Create the empty person queue, publication maps, locks, counters and DAOs.

		Args:
			generation: stored unchanged as ``self.gen``.
			mgr_interval: stored unchanged as ``self.mgr_interval`` (default 5).
		"""
		settings = Settings.getInstance()
		self.settings = settings
		self.debug = settings.debug

		self.gen = generation
		self.mgr_interval = mgr_interval

		# Bounded queue of persons; its capacity is read from the settings.
		self.person_queue = Queue.Queue(maxsize=settings.person_cache_size)
		# Meant to be kept in sync with the queue for quick membership
		# checks by id.
		self.person_id_set = set()

		# Publication bookkeeping.
		self.pubmap = {}  # pub id -> pub
		self.person_pub_map = {}  # person id -> list of pub ids
		self.pub_db_cache = {}

		self.pub_lock = threading.Lock()
		self.pub_dbcache_lock = threading.RLock()

		# Per the original note, this mirrors the main running flag used in
		# mgr_interval_thread.
		self.running = True
		self.blocked_pub_t = 0

		# Running sums (the original labels this group "time sum").
		self.ppt_wait = self.ppt_getlock = self.ppt_get = 0

		self.person_dao = PersonDao()
		self.pub_dao = PublicationDao()
    def __init__(self, extractorInstance):
        """Initialise the thread and bind it to an extractor.

        Args:
            extractorInstance: object kept as ``self.extractor``; its
                ``store`` attribute is also kept as ``self.store``.
        """
        threading.Thread.__init__(self)

        # Nothing assigned yet: per the original note, the caller sets
        # ``person`` and then starts the thread.
        self.person = None
        self.ask_to_stop = False

        self.extractor = extractorInstance
        self.store = extractorInstance.store
        self.pubdao = PublicationDao()

        # Timestamp taken at construction time.
        self.last_action = datetime.datetime.now()
示例#4
0
    def __init__(self):
        """Set up configuration, thread bookkeeping, flags and DAOs.

        Prints a task banner, reads the shared settings, initialises every
        attribute to its idle default and finally calls
        ``self.determineGereration()``.
        """
        # NOTE(review): "schooler" in the banner is probably a typo for
        # "scholar"; left untouched because it is runtime output.
        print("Task: extract paper's citation from schooler.google.com.\n")
        settings = Settings.getInstance()
        self.settings = settings
        self.debug = settings.debug

        # Tunables. Per the original note, the two thread limits may be
        # modified on the fly (e.g. different values by day and by night).
        self.mgr_interval = 10  # seconds
        self.max_person_thread = self.max_pub_thread = 2

        # Thread handles; nothing is created here.
        self.t_mgr = self.t_provider = None
        self.person_thread_pool = []
        self.pub_thread_pool = []

        # Counter and lock used to monitor whether all threads are idle.
        self.busy_semaphore = 0
        self.busy_semaphore_lock = threading.Lock()

        # Utilities
        self.store = None

        # Switches and flags
        self.running = True  # when False, threads stop after the current task
        self.stopped = False  # whether the manager thread may stop
        self.pause = False  # all work paused
        self.waiting_to_finish = False  # no more data; everything is queued
        self.restart_all_thread = False

        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of last interval

        # Right after leaving pause mode many tasks will have failed, which
        # would otherwise trigger another wait immediately.
        self.detect_exit_wait = 0

        self.generation = 0

        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()

        # NOTE(review): ``pdfcache`` only exists when save_pdflink is truthy.
        if settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()

        # Start
        self.determineGereration()
示例#5
0
 def __init__(self):
     """Acquire the collaborators this object works with.

     Stores the objects returned by ``Extractor.getInstance()`` and
     ``PubMatcher.getInstance()`` plus a newly constructed
     ``PublicationDao``.
     """
     self.extractor = Extractor.getInstance()
     self.matcher = PubMatcher.getInstance()
     self.pubdao = PublicationDao()
示例#6
0
 def __init__(self, aid, generation):
     """Remember the author id and generation and look up the author.

     Args:
         aid: stored as ``self.aid`` and passed to ``self.get_author``.
         generation: stored as ``self.generation`` and passed to
             ``self.get_author``.
     """
     self.aid, self.generation = aid, generation
     # The result of the lookup is kept as-is on ``self.person``.
     self.person = self.get_author(aid, generation)
     self.pubdao = PublicationDao()