def test_matchPub(self): self.extractor = Extractor().getInstance() pubdao = PublicationDao() person_id = 13419 person_name = 'jie tang' # Read sources from files all_models = {} for page in range(0, 3): filename = "".join((person_name, '_page_', str(page), '.html')) f = file(os.path.join(self.settings.source_dir, filename), 'r') html = f.read() models = self.extractor.extract_from_source(html) if models is not None: self.extractor._Extractor__merge_into_extractedmap( all_models, models) print 'Total found DEBUG %s items.' % len(all_models) # part 2 pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation) printout = False if printout: for key, models in all_models.items(): print key, " --> ", models print '===================' for pub in pubs: print pub (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models) print '- test done -', len(pubs_matched), len(pubs_not_matched) return pubs_not_matched
def __init__(self, generation, mgr_interval=5):
    """Initialise the person queue, publication caches, locks and DAOs.

    Args:
        generation: stored unchanged as ``self.gen``.
        mgr_interval: stored unchanged as ``self.mgr_interval``
            (defaults to 5).
    """
    settings = Settings.getInstance()
    self.settings = settings
    self.debug = settings.debug
    self.gen, self.mgr_interval = generation, mgr_interval

    # Person side: a bounded queue plus an id set mirroring its content.
    self.person_queue = Queue.Queue(maxsize=settings.person_cache_size)
    self.person_id_set = set()  # sync with queue, quick contains using id.

    # Publication side.
    self.pubmap = dict()  # {id -> pub}
    self.person_pub_map = dict()  # {person_id -> [pub_id_list]} - person to pub_ids
    self.pub_db_cache = dict()
    self.pub_lock = threading.Lock()
    self.pub_dbcache_lock = threading.RLock()

    # Synced with the main running flag in mgr_interval_thread.
    self.running = True

    # Counters, all starting at zero.
    self.blocked_pub_t = 0  # time sum
    self.ppt_wait = 0
    self.ppt_getlock = 0
    self.ppt_get = 0

    # Data-access objects.
    self.person_dao = PersonDao()
    self.pub_dao = PublicationDao()
def __init__(self, extractorInstance):
    """Initialise the thread and bind it to an extractor.

    Args:
        extractorInstance: the extractor this thread works for; its
            ``store`` attribute is also kept as ``self.store``.
    """
    threading.Thread.__init__(self)

    # Collaborators.
    self.extractor = extractorInstance
    self.store = extractorInstance.store
    self.pubdao = PublicationDao()

    # Per-thread state.
    self.ask_to_stop = False
    self.person = None  # set this and start.
    self.last_action = datetime.datetime.now()
def __init__(self): print "Task: extract paper's citation from schooler.google.com.\n" self.settings = Settings.getInstance() self.debug = self.settings.debug # Configs self.mgr_interval = 10 # seconds self.max_person_thread = 2 # max threads used to extract person, self.max_pub_thread = 2 # these 2 values can modified on the fly. diff in day or night # Threads and configurations self.t_mgr = None # MgrThread(self) # management thread, create self.t_provider = None self.person_thread_pool = [ ] #= Queue.Queue(maxsize=self.max_person_thread) self.pub_thread_pool = [] #= Queue.Queue(maxsize=self.max_pub_thread) self.busy_semaphore = 0 # 用来监视是否所有的线程都处于Idle状态 self.busy_semaphore_lock = threading.Lock() # 用来监视是否所有的线程都处于Idle状态 # utils self.store = None # switchers & flags self.running = True # If False, threads will stop after current task. self.stopped = False # If MGRThread can stop. self.pause = False # All works paused. self.waiting_to_finish = False # No additional data. all added to queue. self.num_report = 0 self.last_report_time = datetime.datetime.now() # 上次Interval的时间 self.restart_all_thread = False self.detect_exit_wait = 0 # 当刚刚从pause模式退出来时,会有大量failed的任务,会导致立刻再次等待 self.generation = 0 self.dao = dbs() self.personDao = PersonDao() self.pubDao = PublicationDao() if self.settings.save_pdflink: self.pdfcache = PDFLinkSaver.getInstance() # start self.determineGereration()
def __init__(self):
    """Attach the extractor and matcher instances and a publication DAO."""
    extractor = Extractor.getInstance()
    matcher = PubMatcher.getInstance()
    dao = PublicationDao()

    self.extractor = extractor
    self.matcher = matcher
    self.pubdao = dao
def __init__(self, aid, generation):
    """Store the author id and generation, then load the author.

    Args:
        aid: stored as ``self.aid`` and passed to ``self.get_author``.
        generation: stored as ``self.generation`` and passed to
            ``self.get_author``.
    """
    self.aid, self.generation = aid, generation
    # The author is looked up before the publication DAO is created.
    author = self.get_author(aid, generation)
    self.person = author
    self.pubdao = PublicationDao()