def load(self): """ Threaded loading of elements. """ settings.from_json(self.settings) sql.init_from_settings() self._session = sql.session() self.progress.set_scanning(True) retry_failed = settings.get('processing.retry_failed') # Query for all unhandled URLs, and submit them before scanning for new Posts. unfinished = self._session\ .query(sql.URL)\ .filter((sql.URL.processed == False) | (retry_failed and sql.URL.failed == True))\ .all() self._push_url_list(unfinished) self._scan_sources() self.progress.set_scanning(False) # Wait for any remaining ACKS to come in, before closing the writing pipe. # ...Until the Downloaders have confirmed completion of everything, more album URLS may come in. while len(self._open_ack) > 0 and not self._stop_event.is_set(): self._handle_acks(timeout=0.5) print("Finished loading.") sql.close()
def load(self): """ Threaded loading of elements. """ settings.from_json(self.settings) sql.init_from_settings() self._session = sql.session() t_start = datetime.now() #vy print("Started loading.") #vy self.progress.set_scanning(True) retry_failed = settings.get('processing.retry_failed') # Query for all unhandled URLs, and submit them before scanning for new Posts. unfinished = self._session\ .query(sql.URL)\ .filter((sql.URL.processed == False) | \ (retry_failed and sql.URL.failed and \ sql.not_(sql.URL.failure_reason.contains('404'))))\ .all() print("Loading %s unfinished urls" % len(unfinished)) self._push_url_list(unfinished) self._scan_sources() self.progress.set_scanning(False) # Wait for any remaining ACKS to come in, before closing the writing pipe. # ...Until the Downloaders have confirmed completion of everything, more album URLS may come in. while len(self._open_ack) > 0 and not self._stop_event.is_set(): self._handle_acks(timeout=1.0, clear=True) print("Finished loading.") #vy print("Elapsed time: %s" % str(datetime.now() - t_start)) #vy sql.close()
def run(self): """ Threaded loading of elements. """ settings.from_json(self._settings) sql.init_from_settings() self._session = sql.session() self.progress.clear(status="Starting up...") self.progress.set_running(True) while not self._stop_event.is_set(): self._dedupe() self.progress.set_status("Waiting for new files...") self._stop_event.wait(2) self._dedupe() # Run one final pass after downloading stops. self.progress.set_running(False) sql.close() self.progress.clear("Finished.")
def __init__(self, source_patterns=None):
    super().__init__()
    sql.init_from_settings()  # Make sure the database is built & migrated before starting threads.
    sql.close()
    self.daemon = False
    self.sources = source_patterns
    self.sources = self.load_sources()
    self.db_lock = RLock()
    # Initialize the Loader, Deduplicator, and Downloader processes.
    self.loader = RedditLoader(sources=self.sources, settings_json=settings.to_json(), db_lock=self.db_lock)
    self.deduplicator = Deduplicator(
        settings_json=settings.to_json(),
        stop_event=self.loader.get_stop_event(),
        db_lock=self.db_lock
    )
    self._downloaders = self._create_downloaders()
    self._all_processes = [self.loader, *self._downloaders]
    if settings.get('processing.deduplicate_files'):
        self._all_processes.append(self.deduplicator)
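# A minimal sketch (not from the original source) of how the processes wired up in the
# __init__ above might be started and awaited. It assumes each entry in self._all_processes
# is a multiprocessing.Process subclass (each defines run() in the snippets here); the
# method name start_all() is hypothetical.
def start_all(self):
    for proc in self._all_processes:
        proc.start()  # Loader, Downloaders, and (optionally) the Deduplicator each run in their own process.
    for proc in self._all_processes:
        proc.join()   # Block until every worker has shut down.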
def processLgJob():
    page = 1
    config_file = "../../conf/sys.conf"
    dictConfig = sql.readConfig(config_file)
    db = sql.connSql(dictConfig)
    max_date = get_last_date(db)
    url = getUrl("深圳")
    is_update = True
    while 1:
        if is_update != True:
            break
        if page == 1:
            post_data = {'first': 'true', 'pn': '1'}
        else:
            post_data = {'first': 'false', 'pn': page}
        page = page + 1
        job_list = getJobList(url, post_data)
        time.sleep(5)
        # print job_list
        # job_list = readJobs()
        if job_list:
            job_data = processJobList(job_list)
            for index in xrange(len(job_data)):
                if job_data[index]["create_date"] < max_date:
                    is_update = False
                    break
                job_sql = getSql(job_data[index])
                # print job_sql
                sql.insert(job_sql, db)
        else:
            break
    sql.close(db)
def get_proxies():
    curTime = TIME(UnixTime=TIME().NowUnix)  # Get the current time.
    sql.start()
    # The sentinel row '000.000.0.000' stores the timestamp of the last crawl
    # (its addr field, '爬取时间', means "crawl time").
    if sql.insert(ip='000.000.0.000', port='8888', addr='爬取时间', time=curTime.normal()):
        fill_proxypool()  # First crawl of the proxy sites.
    else:
        time_info = sql.get(ip='000.000.0.000')[6]
        lastTime = TIME(NormalTime=time_info)
        diffTime = (curTime.unix() - lastTime.unix()) / 60  # Minutes since the last crawl.
        # Refresh the proxy pool if more than 30 minutes have passed since the last crawl.
        if diffTime > 30:
            print('Proxy pool needs refreshing, please wait...')
            fill_proxypool()
            sql.update('000.000.0.000', 'TIME', curTime.normal())
        else:
            print('Proxy pool is up to date')
    proxy = select_proxy()
    sql.close()
    return proxy
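# Hypothetical usage sketch (not part of the original): handing the selected proxy to
# requests. It assumes get_proxies() returns an "ip:port" string; adjust if the pool
# stores proxies in another shape.
import requests

proxy = get_proxies()
response = requests.get('https://httpbin.org/ip',
                        proxies={'http': 'http://' + proxy, 'https': 'http://' + proxy},
                        timeout=10)
print(response.text)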
def run(self): """ Threaded loading of elements. """ settings.from_json(self._settings) sql.init_from_settings() try: self._session = sql.session() self.progress.clear(status="Starting up...") self.progress.set_running(True) while not self._stop_event.is_set(): self._dedupe() self.progress.set_status("Ready for new files...") self._stop_event.wait(2) self._dedupe() # Run one final pass after downloading stops. self.progress.clear(status="Finished.", running=False) except Exception as ex: print('Deduplication Process Error:', ex) self.progress.set_error(ex) self.progress.set_running(False) traceback.print_exc() finally: sql.close()
def run(self): """ Threaded loading of elements. """ settings.from_json(self._settings) sql.init_from_settings() print("Starting up...", debug=True) try: self._session = sql.session() self.progress.clear(status="Starting up...") self.progress.set_running(True) self.dedup_ignore_ids = set() self.prune_counter = 0 self.special_hashes = self._session.query(Hash).filter( Hash.id < 0).all() while not self._stop_event.is_set(): #print("_stop_event is %s"%self._stop_event.is_set(), debug=True) completed = self._dedupe() if completed: self.progress.set_status( "Completed %s files. Ready for new files..." % completed) self._stop_event.wait(1) else: self._stop_event.wait(10) print("_stop_event is %s" % self._stop_event.is_set(), debug=True) self._dedupe() # Run one final pass after downloading stops. self.progress.clear(status="Finished.", running=False) except Exception as ex: print('Deduplication Process Error:', ex) self.progress.set_error(ex) self.progress.set_running(False) traceback.print_exc() finally: print("Finished process, _stop_event is %s" % self._stop_event.is_set(), debug=True) sql.close()
def tearDownClass(cls):
    if sql._Session:
        sql.close()
    if cls.dir:
        rmtree(cls.dir)
        sql.check_site(config.ID_SITE)
        print "[OK]"
    except:
        print "[FAILED], site with idsite = %s wasn't found" % config.ID_SITE
else:
    config.read_config(options.config_file)
    if not (options.start_date or config.CONFIG_START):
        print "Start date parameter is required. For more info type ./google2piwik.py -h"
        exit()
    start_date = read_date(options.start_date or config.CONFIG_START)
    end_date = None if not (options.end_date or config.CONFIG_END) else read_date(options.end_date or config.CONFIG_END)
    sql.initialize(config.MYSQL_CREDENTIALS)
    CURRENT_VERSION = sql.get_version(config.MYSQL_CREDENTIALS["table_prefix"])
    if StrictVersion(CURRENT_VERSION) < StrictVersion('1.9'):
        CURRENT_VERSION = 1.8
    else:
        CURRENT_VERSION = 1.9
    if options.update_visit_actions:
        sql.update_total_visit_actions()
        exit()
    sql.update_site_ts_created(config.ID_SITE, start_date)
    export_period(start_date, end_date)
    sql.clear_archives()
    sql.close()
    print "Please go to your Piwik installation folder and run misc/cron/archive.sh script."
            elif key == 'Title':
                book_info['name'] = ':'.join(ar[1:]).strip()
                book_info['enter_path'] = book_info['name'] + '.txt'
            elif key == 'Release Date':
                book_info['date'] = ':'.join(ar[1:]).strip()
    except Exception:
        pass
    sql.add_book(book_info['publisher_id'], book_info['name'], book_info['main_path'],
                 book_info['enter_path'], book_info['date'], book_info['wiki_link'], book_info['description'])
    update_book_table()
    shutil.copyfile(file_name, build_path(book_info))
    fill_book_form(book_info)


def build_path(book_info: dict) -> str:
    '''
    Build the path to a book file.
    :param book_info: provider of book path info
    :return: book file path
    '''
    return '\\'.join([script_path, book_info['main_path'], book_info['enter_path']])


if __name__ == '__main__':
    sql.connect()
    showMain()
    sql.close()
def tearDown(self):
    sql.close()
def tearDown(self):
    sql.close()
    importlib.reload(settings)
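# A companion setUp sketch (an assumption, not taken from the project's tests): it mirrors
# the tearDown methods above by opening the database before each test so sql.close() has a
# session to dispose of. `sql` and `settings` are the same project modules used throughout
# these snippets; the class name SqlTestCase is hypothetical.
import importlib
import unittest

class SqlTestCase(unittest.TestCase):
    def setUp(self):
        importlib.reload(settings)  # Start every test from default settings.
        sql.init_from_settings()    # Build/connect the test database.

    def tearDown(self):
        sql.close()
        importlib.reload(settings)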
def run(self): """ Threaded loading of elements. """ settings.from_json(self._settings) sql.init_from_settings() self._session = sql.session() self.progress.clear(status="Starting up...", running=True) failed = False for nxt_id in self._reader: try: url = self._session.query( sql.URL).filter(sql.URL.id == nxt_id).first() if not url: raise Exception("Unknown URL ID provided: (%s}" % nxt_id) file = url.file path = SanitizedRelFile(base=settings.get("output.base_dir"), file_path=str(file.path)) self.progress.set_file(path.relative()) self.progress.set_status("Attempting to Handle URL...") self.progress.set_running(True) task = handlers.HandlerTask(url=url.address, file_obj=path) resp = handlers.handle(task, self.progress) is_album_parent = False with self._db_lock: if resp.album_urls: if url.album_id: resp.album_urls = [ ] # Ignore nested Albums to avoid recursion. else: url.album_id = str(uuid.uuid4()) is_album_parent = True else: resp.album_urls = [] url.failed = not resp.success url.failure_reason = resp.failure_reason url.last_handler = resp.handler url.album_is_parent = is_album_parent if resp.rel_file: file.downloaded = True file.path = resp.rel_file.relative() file.hash = None utime(resp.rel_file.absolute(), times=(time(), time())) self._session.commit() # Once *all* processing is completed on this URL, the Downloader needs to ACK it. # If any additional Album URLS were located, they should be sent before the ACK. self._ack_queue.put( AckPacket(url_id=nxt_id, extra_urls=resp.album_urls)) self.progress.clear(status="Waiting for URL...") except Exception as ex: failed = str(ex) self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=[])) print(ex) traceback.print_exc() self.progress.set_error("Exited with error: {%s}" % failed) break sql.close() self.progress.clear( "Finished." if not failed else "Exited with error: %s" % failed, running=False)