Example #1
    def load(self):
        """ Threaded loading of elements. """
        settings.from_json(self.settings)
        sql.init_from_settings()
        self._session = sql.session()

        self.progress.set_scanning(True)

        retry_failed = settings.get('processing.retry_failed')

        # Query for all unhandled URLs, and submit them before scanning for new Posts.
        # `retry_failed` is a plain Python bool, so it is resolved here in Python;
        # only the column comparisons are compiled into SQL.
        unfinished = self._session \
            .query(sql.URL) \
            .filter((sql.URL.processed == False) | (retry_failed and sql.URL.failed == True)) \
            .all()
        self._push_url_list(unfinished)

        self._scan_sources()

        self.progress.set_scanning(False)
        # Wait for any remaining ACKs to come in before closing the writing pipe.
        # ...Until the Downloaders have confirmed completion of everything, more album URLs may come in.
        while len(self._open_ack) > 0 and not self._stop_event.is_set():
            self._handle_acks(timeout=0.5)
        print("Finished loading.")
        sql.close()
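
The `filter()` above leans on a Python-level `and`: because `retry_failed` is a plain bool, short-circuiting resolves it before SQLAlchemy ever sees it. A safer, equivalent shape builds the clause conditionally. Below is a minimal self-contained sketch with plain SQLAlchemy; the `sql` wrapper here is project-specific, so the model and bootstrap are assumptions for illustration only.

from sqlalchemy import Boolean, Column, Integer, create_engine, or_
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class URL(Base):
    __tablename__ = 'urls'
    id = Column(Integer, primary_key=True)
    processed = Column(Boolean, default=False)
    failed = Column(Boolean, default=False)

def find_unfinished(session, retry_failed):
    # Build the retry clause in Python so only column comparisons reach SQL.
    cond = URL.processed == False  # noqa: E712 -- SQLAlchemy needs ==, not `is`
    if retry_failed:
        cond = or_(cond, URL.failed == True)  # noqa: E712
    return session.query(URL).filter(cond).all()

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add_all([URL(), URL(processed=True, failed=True)])
    session.commit()
    print(len(find_unfinished(session, retry_failed=True)))  # -> 2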
Example #2
    def load(self):
        """ Threaded loading of elements. """
        settings.from_json(self.settings)
        sql.init_from_settings()
        self._session = sql.session()
        t_start = datetime.now()
        print("Started loading.")
        self.progress.set_scanning(True)

        retry_failed = settings.get('processing.retry_failed')

        # Query for all unhandled URLs, and submit them before scanning for new Posts.
        # Use SQLAlchemy's `&` here: chaining column expressions with Python's
        # `and` would try to coerce them to bool and raise a TypeError.
        unfinished = self._session \
            .query(sql.URL) \
            .filter((sql.URL.processed == False) |
                    (retry_failed and
                     ((sql.URL.failed == True) &
                      sql.not_(sql.URL.failure_reason.contains('404'))))) \
            .all()
        print("Loading %s unfinished URLs" % len(unfinished))
        self._push_url_list(unfinished)

        self._scan_sources()

        self.progress.set_scanning(False)
        # Wait for any remaining ACKs to come in before closing the writing pipe.
        # ...Until the Downloaders have confirmed completion of everything, more album URLs may come in.
        while len(self._open_ack) > 0 and not self._stop_event.is_set():
            self._handle_acks(timeout=1.0, clear=True)
        print("Finished loading.")  #vy
        print("Elapsed time: %s" % str(datetime.now() - t_start))  #vy
        sql.close()
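
Example #2's only twist on the Example #1 query is excluding failures that recorded a 404, since those will never succeed on retry. The same clause, sketched with plain SQLAlchemy operators (the model is passed in as a parameter and assumed, as above):

from sqlalchemy import not_, or_

def unfinished_clause(URL, retry_failed):
    cond = URL.processed == False  # noqa: E712
    if retry_failed:
        # Retry failed URLs, but skip ones whose recorded failure was a 404.
        cond = or_(cond,
                   (URL.failed == True) &  # noqa: E712
                   not_(URL.failure_reason.contains('404')))
    return cond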
Example #3
    def run(self):
        """ Threaded loading of elements. """
        settings.from_json(self._settings)
        sql.init_from_settings()
        self._session = sql.session()
        self.progress.clear(status="Starting up...")
        self.progress.set_running(True)

        while not self._stop_event.is_set():
            self._dedupe()
            self.progress.set_status("Waiting for new files...")
            self._stop_event.wait(2)
        self._dedupe()  # Run one final pass after downloading stops.

        self.progress.set_running(False)
        sql.close()
        self.progress.clear("Finished.")
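
Examples #3, #7, and #8 all share one loop shape: do a pass, wait on the stop event (so shutdown wakes the sleep early), and run one final pass after the event is set so nothing that arrived during the last wait is missed. A minimal sketch of that pattern:

import threading

def poll_until_stopped(stop_event, work, interval=2):
    while not stop_event.is_set():
        work()
        stop_event.wait(interval)  # sleeps, but returns early once stop is set
    work()  # one final pass after the producers stop

stop = threading.Event()
t = threading.Thread(target=poll_until_stopped, args=(stop, lambda: None))
t.start()
stop.set()  # in the examples, the loader sets this when it finishes
t.join()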
Example #4
	def __init__(self, source_patterns=None):
		super().__init__()
		sql.init_from_settings()  # Make sure the database is built & migrated before starting threads.
		sql.close()
		self.daemon = False
		self.sources = source_patterns
		self.sources = self.load_sources()  # load_sources() expands the patterns set above.
		self.db_lock = RLock()
		# initialize Loader
		self.loader = RedditLoader(sources=self.sources, settings_json=settings.to_json(), db_lock=self.db_lock)
		self.deduplicator = Deduplicator(
			settings_json=settings.to_json(),
			stop_event=self.loader.get_stop_event(),
			db_lock=self.db_lock
		)
		self._downloaders = self._create_downloaders()
		self._all_processes = [self.loader, *self._downloaders]
		if settings.get('processing.deduplicate_files'):
			self._all_processes.append(self.deduplicator)
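
Example #4 is the wiring: every worker gets the same stop event and the same database lock, so a single `set()` shuts everything down and writes are serialized. A sketch of that sharing with bare multiprocessing primitives; the worker body is an assumed stand-in, not the RedditLoader's logic.

import multiprocessing

class Worker(multiprocessing.Process):
    def __init__(self, stop_event, db_lock):
        super().__init__(daemon=False)
        self._stop_event = stop_event
        self._db_lock = db_lock

    def run(self):
        while not self._stop_event.is_set():
            with self._db_lock:  # serialize DB writes across processes
                pass             # ...one unit of work...
            self._stop_event.wait(0.5)

if __name__ == '__main__':
    stop = multiprocessing.Event()
    lock = multiprocessing.RLock()
    workers = [Worker(stop, lock) for _ in range(2)]
    for w in workers:
        w.start()
    stop.set()  # one set() stops every worker
    for w in workers:
        w.join()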
Example #5
def processLgJob():

    page = 1
    config_file = "../../conf/sys.conf"
    dictConfig = sql.readConfig(config_file)
    db = sql.connSql(dictConfig)
    max_date = get_last_date(db)

    url = getUrl("深圳")  # Query jobs for Shenzhen ("深圳").

    is_update = True
    while is_update:
        # The first page is flagged specially in the POST data.
        if page == 1:
            post_data = {'first': 'true', 'pn': '1'}
        else:
            post_data = {'first': 'false', 'pn': page}

        page += 1
        job_list = getJobList(url, post_data)
        time.sleep(5)

        if job_list:
            job_data = processJobList(job_list)
            for index in xrange(len(job_data)):
                # Stop once records predate the newest date already stored.
                if job_data[index]["create_date"] < max_date:
                    is_update = False
                    break

                job_sql = getSql(job_data[index])
                sql.insert(job_sql, db)
        else:
            break
    sql.close(db)
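
The shape of Example #5 is an incremental scrape: page through results newest-first and stop at the first record older than the last stored date. A condensed sketch of that cutoff logic; the fetch/insert callables are stand-ins, not the example's helpers.

import time

def scrape_new(fetch_page, insert, max_date, delay=5):
    page = 1
    while True:
        records = fetch_page(page)
        if not records:
            break
        for rec in records:
            if rec["create_date"] < max_date:
                return  # everything from here on is already stored
            insert(rec)
        page += 1
        time.sleep(delay)  # throttle requests to the remote site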
Example #6
def get_proxies():
    curTime = TIME(UnixTime=TIME().NowUnix)  # Get the current time.
    sql.start()
    # Try to insert the sentinel row ('000.000.0.000') that records the last
    # crawl time; success means the pool has never been filled.
    if sql.insert(ip='000.000.0.000',
                  port='8888',
                  addr='爬取时间',  # addr holds the "crawl time" label here.
                  time=curTime.normal()):
        fill_proxypool()  # First crawl of the proxy sites.
    else:
        time_info = sql.get(ip='000.000.0.000')[6]
        lastTime = TIME(NormalTime=time_info)
        diffTime = (curTime.unix() - lastTime.unix()) / 60  # Minutes elapsed.
        # Refresh the pool if the last crawl was more than 30 minutes ago.
        if diffTime > 30:
            print('Proxy pool needs refreshing, please wait...')
            fill_proxypool()
            sql.update('000.000.0.000', 'TIME', curTime.normal())
        else:
            print('Proxy pool is up to date.')
    proxy = select_proxy()
    sql.close()
    return proxy
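
Example #6 persists its last-crawl time in that sentinel database row and refreshes the pool only when the timestamp is over 30 minutes old. The same check, sketched with an in-memory timestamp instead of the row; the pool functions are stand-ins.

import time

REFRESH_MINUTES = 30
_last_crawl = None  # the example keeps this in a sentinel DB row instead

def get_proxy(fill_proxypool, select_proxy):
    global _last_crawl
    now = time.time()
    if _last_crawl is None or (now - _last_crawl) / 60 > REFRESH_MINUTES:
        fill_proxypool()  # (re)crawl the proxy sites
        _last_crawl = now
    return select_proxy()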
Example #7
    def run(self):
        """ Threaded loading of elements. """
        settings.from_json(self._settings)
        sql.init_from_settings()
        try:
            self._session = sql.session()
            self.progress.clear(status="Starting up...")
            self.progress.set_running(True)

            while not self._stop_event.is_set():
                self._dedupe()
                self.progress.set_status("Ready for new files...")
                self._stop_event.wait(2)
            self._dedupe()  # Run one final pass after downloading stops.
            self.progress.clear(status="Finished.", running=False)
        except Exception as ex:
            print('Deduplication Process Error:', ex)
            self.progress.set_error(ex)
            self.progress.set_running(False)
            traceback.print_exc()
        finally:
            sql.close()
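
Example #7 improves on #3 by putting `sql.close()` in a `finally`, so the connection is released even when deduplication raises. That guarantee can also be packaged as a context manager; a sketch, where `sql.session`/`sql.close` are assumed to follow the examples' wrapper module.

from contextlib import contextmanager

@contextmanager
def session_scope(open_session, close):
    session = open_session()
    try:
        yield session
    finally:
        close()  # always runs, mirroring the try/finally above

# Hypothetical usage with the examples' wrapper:
#   with session_scope(sql.session, sql.close) as session:
#       session.query(...)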
Example #8
    def run(self):
        """ Threaded loading of elements. """
        settings.from_json(self._settings)
        sql.init_from_settings()
        print("Starting up...", debug=True)
        try:
            self._session = sql.session()
            self.progress.clear(status="Starting up...")
            self.progress.set_running(True)
            self.dedup_ignore_ids = set()
            self.prune_counter = 0
            self.special_hashes = self._session.query(Hash).filter(
                Hash.id < 0).all()

            while not self._stop_event.is_set():
                #print("_stop_event is %s"%self._stop_event.is_set(), debug=True)
                completed = self._dedupe()
                if completed:
                    self.progress.set_status(
                        "Completed %s files. Ready for new files..." %
                        completed)
                    self._stop_event.wait(1)
                else:
                    self._stop_event.wait(10)
            print("_stop_event is %s" % self._stop_event.is_set(), debug=True)
            self._dedupe()  # Run one final pass after downloading stops.
            self.progress.clear(status="Finished.", running=False)
        except Exception as ex:
            print('Deduplication Process Error:', ex)
            self.progress.set_error(ex)
            self.progress.set_running(False)
            traceback.print_exc()
        finally:
            print("Finished process, _stop_event is %s" %
                  self._stop_event.is_set(),
                  debug=True)
            sql.close()
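
Example #8 adds a simple backoff on top of #7: poll again after one second while files are still completing, but wait ten once a pass finds nothing. That choice, sketched in isolation (names assumed):

import threading

def wait_for_more(stop_event: threading.Event, completed: int) -> None:
    # Short wait while work is flowing; a longer one once the queue runs dry.
    stop_event.wait(1 if completed else 10)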
Example #9
    def tearDownClass(cls):
        if sql._Session:
            sql.close()
        if cls.dir:
            rmtree(cls.dir)
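
Examples #9, #12, and #13 use the same teardown idea: close the shared connection first, then clean up scratch state (a temp directory, or mutated module globals). A self-contained sketch of the class-level variant; the `sql.close()` call from the examples is replaced by a comment since the wrapper module is project-specific.

import tempfile
import unittest
from shutil import rmtree

class DatabaseTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.dir = tempfile.mkdtemp()  # scratch dir the tests write into

    @classmethod
    def tearDownClass(cls):
        # Close shared resources first (sql.close() in the examples),
        # then remove the scratch directory.
        if cls.dir:
            rmtree(cls.dir)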
Example #10
            sql.check_site(config.ID_SITE)
            print "[OK]"
        except:
            print "[FAILED], site with idsite = %s wasn't found" % config.ID_SITE
    else:
        config.read_config(options.config_file)
        if not (options.start_date or config.CONFIG_START):
            print "Start date parameter is required. For more info type ./google2piwik.py -h"
            exit()
        start_date = read_date(options.start_date or config.CONFIG_START)
        end_date = None if not (options.end_date or config.CONFIG_END) else read_date(options.end_date or config.CONFIG_END)
        sql.initialize(config.MYSQL_CREDENTIALS)

        CURRENT_VERSION = sql.get_version(config.MYSQL_CREDENTIALS["table_prefix"])
        if StrictVersion(CURRENT_VERSION) < StrictVersion('1.9'):
            CURRENT_VERSION = 1.8
        else:
            CURRENT_VERSION = 1.9

        if options.update_visit_actions:
            sql.update_total_visit_actions()
            exit()

        sql.update_site_ts_created(config.ID_SITE, start_date)

        export_period(start_date, end_date)

        sql.clear_archives()
        sql.close()
        print "Please go to your Piwik installation folder and run misc/cron/archive.sh script."
Example #11
                        elif key == 'Title':
                            book_info['name'] = ':'.join(ar[1:]).strip()
                            book_info[
                                'enter_path'] = book_info['name'] + '.txt'
                        elif key == 'Release Date':
                            book_info['date'] = ':'.join(ar[1:]).strip()
            except Exception:
                pass
        sql.add_book(book_info['publisher_id'], book_info['name'], book_info['main_path'], \
            book_info['enter_path'], book_info['date'], book_info['wiki_link'], book_info['description'])
        update_book_table()
        shutil.copyfile(file_name, build_path(book_info))
        fill_book_form(book_info)


def build_path(book_info: dict) -> str:
    '''
    Build the path to a book file.

    :param book_info: provider of book path info
    :return: book file path
    '''
    return '\\'.join(
        [script_path, book_info['main_path'], book_info['enter_path']])


if __name__ == '__main__':
    sql.connect()
    showMain()
    sql.close()
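
`build_path` hard-codes the Windows separator; `os.path.join` makes the same join portable. A sketch, taking `script_path` as a parameter rather than the example's module global:

import os

def build_path(script_path: str, book_info: dict) -> str:
    # os.path.join inserts the platform's separator for us.
    return os.path.join(script_path, book_info['main_path'],
                        book_info['enter_path'])

print(build_path('/library', {'main_path': 'sci-fi', 'enter_path': 'a.txt'}))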
Example #12
	def tearDown(self):
		sql.close()
Example #13
    def tearDown(self):
        sql.close()
        importlib.reload(settings)
Example #14
    def run(self):
        """ Threaded loading of elements. """
        settings.from_json(self._settings)
        sql.init_from_settings()
        self._session = sql.session()
        self.progress.clear(status="Starting up...", running=True)
        failed = False

        for nxt_id in self._reader:
            try:
                url = self._session.query(
                    sql.URL).filter(sql.URL.id == nxt_id).first()
                if not url:
                    raise Exception("Unknown URL ID provided: (%s)" % nxt_id)

                file = url.file
                path = SanitizedRelFile(base=settings.get("output.base_dir"),
                                        file_path=str(file.path))

                self.progress.set_file(path.relative())
                self.progress.set_status("Attempting to Handle URL...")
                self.progress.set_running(True)

                task = handlers.HandlerTask(url=url.address, file_obj=path)
                resp = handlers.handle(task, self.progress)

                is_album_parent = False

                with self._db_lock:
                    if resp.album_urls:
                        if url.album_id:
                            # Ignore nested Albums to avoid recursion.
                            resp.album_urls = []
                        else:
                            url.album_id = str(uuid.uuid4())
                            is_album_parent = True
                    else:
                        resp.album_urls = []

                    url.failed = not resp.success
                    url.failure_reason = resp.failure_reason
                    url.last_handler = resp.handler
                    url.album_is_parent = is_album_parent

                    if resp.rel_file:
                        file.downloaded = True
                        file.path = resp.rel_file.relative()
                        file.hash = None
                        utime(resp.rel_file.absolute(), times=(time(), time()))

                    self._session.commit()

                # Once *all* processing is completed on this URL, the Downloader needs to ACK it.
                # If any additional Album URLS were located, they should be sent before the ACK.
                self._ack_queue.put(
                    AckPacket(url_id=nxt_id, extra_urls=resp.album_urls))
                self.progress.clear(status="Waiting for URL...")
            except Exception as ex:
                failed = str(ex)
                self._ack_queue.put(AckPacket(url_id=nxt_id, extra_urls=[]))
                print(ex)
                traceback.print_exc()
                self.progress.set_error("Exited with error: {%s}" % failed)
                break

        sql.close()
        self.progress.clear(
            "Finished." if not failed else "Exited with error: %s" % failed,
            running=False)
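
Example #14 closes the loop with Example #1: each finished URL is ACKed back to the loader, and any album URLs discovered mid-download are sent before the ACK, so the loader's count of open ACKs never hits zero early. A sketch of the loader-side drain; the packet fields follow the example, everything else is assumed.

import queue
from dataclasses import dataclass, field

@dataclass
class AckPacket:
    url_id: int
    extra_urls: list = field(default_factory=list)

def drain_acks(ack_queue, open_ack, push_url, stop_event, timeout=0.5):
    # Keep draining while ACKs are outstanding; extra album URLs create
    # new work (and new expected ACKs) before their parent counts as done.
    while open_ack and not stop_event.is_set():
        try:
            ack = ack_queue.get(timeout=timeout)
        except queue.Empty:
            continue
        for extra in ack.extra_urls:
            push_url(extra)
        open_ack.discard(ack.url_id)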