def downloadAllPagesVideos(url): global proxy, host, thread_count print url content = getContent(url, None, proxy) all_page_content = '' matched_groups = re.findall('''<a href="(.*?)" title='第\d+页' charset=".*?">\d+</a>''', content) for matched in matched_groups: page_url = 'http://so.youku.com'+matched.strip() all_page_content += getContent(page_url, None, proxy) pool = ThreadPool(thread_count) video_url_set = set() matched_groups = re.findall('''<a href="(http\://v\.youku\.com/v_show/id_.*?=\.html)"''', all_page_content) for matched in matched_groups: #print matched.strip() video_url = matched.strip() video_url_set.add(video_url) for video_url in video_url_set: print video_url log(video_url) pool.queueTask(downloadVideo, (video_url)) pool.joinAll()
def convertFlv2Mp4underDir(path): if not os.path.isdir(path): if os.path.exists(path): print " Path:["+ path+ "] is not a directory, exit!\n" return else: os.makedirs(path) pool = ThreadPool(6) MP4_CMD = '''D:\\Program\\tools\\ffmpeg.exe -i "%s" -vcodec mpeg4 -b 1200kb -mbd 2 -aic 2 -cmp 2 -subcmp 2 -acodec libfaac -ac 2 -ab 128000 -y "%s"''' MP3_CMD = '''D:\\Program\\tools\\ffmpeg.exe -i "%s" -vn -ar 44100 -ac 2 -f mp3 "%s"''' for file_name in os.listdir(path): flv_path = path+'\\'+file_name if os.path.isfile(flv_path): mp4_file_name = file_name[:file_name.rfind('.')]+'.mp4' mp4_save_path = path+'\\mp4\\'+mp4_file_name if os.path.exists(mp4_save_path): print " File:[" + mp4_save_path+ "] already exists, pass.\n" else: cmd = MP4_CMD%(flv_path, mp4_save_path) #print cmd #pool.queueTask(run_cmd, (cmd)) mp3_file_name = file_name[:file_name.rfind('.')]+'.mp3' mp3_save_path = path+'\\mp3\\'+mp3_file_name if os.path.exists(mp3_save_path): print " File:[" + mp3_save_path+ "] already exists, pass.\n" else: cmd = MP3_CMD%(flv_path, mp3_save_path) print cmd pool.queueTask(run_cmd, (cmd)) pool.joinAll()
def convertWMA2MP3underDir(path): if not os.path.isdir(path): if existFile(path): print " Path:["+ path+ "] is not a directory, exit!\n" return else: os.makedirs(path) pool = ThreadPool(6) MP3_CMD = '''ffmpeg.exe -i "%s" -f mp3 "%s"''' DEL_CMD = '''del %s''' for file_name in os.listdir(path): wma_path = path+'\\'+file_name if os.path.isfile(wma_path) and wma_path.lower().endswith('.wma'): mp3_file_name = file_name[:file_name.rfind('.')]+'.mp3' mp3_save_path = path+'\\'+mp3_file_name if os.path.exists(mp3_save_path): print " File:[" + mp3_save_path+ "] already exists, pass.\n" else: cmd1 = MP3_CMD%(wma_path, mp3_save_path) #cmd2 = DEL_CMD%(wma_path) print cmd1 pool.queueTask(run_cmd, (cmd1)) pool.joinAll()
def getSongsFromHTML(htmlcontent, save_path): global thread_count pool = ThreadPool(thread_count) matched_groups = re.findall("""W[LS]\("(\d+)",\s*"(\d+)",\s*"(.*?)\s+",""", htmlcontent) for matched in matched_groups: print "-" * 2, matched order = matched[0].strip() song_id = matched[1].strip() song_name = matched[2].strip() # getSong(song_id, order, save_path) pool.queueTask(getSongThread, (song_id, order, save_path)) pool.joinAll()
def aggregate_all(client, iterator, connection_factory):
    """
    Aggregate all feeds returned by the generator.

    The generator should contain pairs of two elements
    (feed_url, categories)
    """
    # NOTE(review): ``client`` is never referenced in this body — confirm
    # whether callers still need to pass it.

    def init_worker(thread):
        # Give every pool thread its own HBase connection.
        thread.hbase = connection_factory()
        return thread

    def run_task(worker, pair):
        # Runs on a pool thread; uses that thread's private connection.
        return aggregate(worker.hbase, *pair)

    pool = ThreadPool(10, thread_init=init_worker)
    for feed, categs in iterator:
        pool.queueTask(run_task, (feed, categs))
    pool.joinAll()
def downloadFirstVideo(url): global proxy, host, thread_count print url htmlcontent = getContent(url, None, proxy) pool = ThreadPool(thread_count) matched_groups = re.findall('''class=list>(.*?)</a> <a title=".*?" href="http://www.cctv.com/video/(.*?).shtml" target="_blank">''', htmlcontent) for matched in matched_groups: #print matched.strip() video_title = matched[0].strip() video_url = matched[1].strip() video_url = 'http://v.cctv.com/flash/'+video_url+'.flv' print video_title, '-', video_url log(video_url) pool.queueTask(downloadVideoThread, (video_url, video_title)) break pool.joinAll()
def downloadSpaceVideos(url): global proxy, host, thread_count print url htmlcontent = getContent(url, None, proxy) pool = ThreadPool(thread_count) #video_url_set = set() matched_groups = re.findall('''src="(.*?)" alt=".*?" title="(.*?)"/>''', htmlcontent) for matched in matched_groups: #print matched.strip() video_title = matched[1].strip() video_url = matched[0].strip() video_url = video_url.replace('image', 'flash').replace('jpg', 'flv') #video_url_set.add((video_url) print video_title, '-', video_url log(video_url) pool.queueTask(downloadVideoThread, (video_url, video_title)) pool.joinAll()
def downloadAllVideos(url): global proxy, host, thread_count, pool print url htmlcontent = getContent(url, None, proxy) pool = ThreadPool(thread_count) video_url_set = set() matched_groups = re.findall('''<a href="(http\://v\.youku\.com/v_show/id_.*?=\.html)"''', htmlcontent) for matched in matched_groups: #print matched.strip() video_url = matched.strip() video_url_set.add(video_url) for video_url in video_url_set: print video_url log(video_url) pool.queueTask(downloadVideo, (video_url)) pool.joinAll()
class FilesystemMonitor(object):
    """
    FileMonitor Class keeps track of all files down a tree starting at the root
    """

    def __init__(self, searcher):
        # ``searcher`` supplies configuration and receives the indexed
        # files (add_file / remove_file / clear_database).
        self.searcher = searcher
        self._thread_pool = ThreadPool(THREAD_POOL_WORKS)

        # Add a watch to the root of the dir; ThreadedNotifier delivers
        # inotify events on its own background thread.
        self.watch_manager = WatchManager()
        self.notifier = ThreadedNotifier(self.watch_manager, FileProcessEvent(self))
        self.notifier.start()

        self._build_exclude_list()

    def _build_exclude_list(self):
        """Compile the configured EXCLUDE_LIST glob patterns into anchored regexes."""
        log.info("[FileMonitor] Set Regexs for Ignore List")
        self._exclude_regexs = []
        # Compile the ignore list into regexes: escape '.', turn '*' into
        # '.*', and anchor each pattern so it must match the whole name.
        for ignore in self.searcher.configuration.get_value("EXCLUDE_LIST"):
            ignore = ignore.strip()
            ignore = ignore.replace(".", "\.")
            ignore = ignore.replace("*", ".*")
            ignore = "^"+ignore+"$"
            log.debug("[FileMonitor] Ignore Regex = %s" % ignore)
            self._exclude_regexs.append(re.compile(ignore))

    def change_root(self, previous_root):
        """Drop all state for ``previous_root`` and re-index the searcher's
        current root from scratch."""
        self._thread_pool.clearTasks()
        wd = self.watch_manager.get_wd(previous_root)
        if wd:
            self.watch_manager.rm_watch(wd, rec=True)
        self.searcher.clear_database()
        self.add_directory(self.searcher.current_root)

    def add_directory(self, path):
        """
        Starts a WalkDirectoryThread to add the directory
        """
        basename = os.path.basename(path)
        if self.validate(basename):
            self.watch_manager.add_watch(path, EVENT_MASK)
            # Walk the new directory on a pool thread so the caller
            # (often an inotify event handler) is not blocked.
            self._thread_pool.queueTask(self.walk_directory, path)

    def add_file(self, path, name):
        """
        Add a single file to the database
        """
        if self.validate(name):
            self.searcher.add_file(path, name)

    def remove_file(self, path, name):
        self.searcher.remove_file(path, name)

    def remove_directory(self, path):
        self.searcher.remove_directory(path)

    def walk_directory(self, root):
        """
        Recursively register every subdirectory and regular file below
        ``root``. Symbolic links are skipped; subdirectories are queued
        via add_directory rather than walked inline.
        """
        if os.path.isdir(root):
            names = os.listdir(root)
            for name in names:
                try:
                    file_stat = os.lstat(os.path.join(root, name))
                except os.error:
                    # Entry vanished or is unreadable — skip it.
                    continue
                if stat.S_ISDIR(file_stat.st_mode):
                    self.add_directory(os.path.join(root, name))
                else:
                    if not stat.S_ISLNK(file_stat.st_mode):
                        self.add_file(root, name)

    def finish(self):
        """Remove the root watch and shut down the notifier and worker pool."""
        wd = self.watch_manager.get_wd(self.searcher.current_root)
        self.watch_manager.rm_watch(wd, rec=True)
        self.notifier.stop()
        # Don't wait for queued walks — we are tearing down.
        self._thread_pool.joinAll(waitForTasks=False)

    def validate(self, name):
        """Return False when ``name`` matches any exclude regex, else True."""
        # Check to make sure the file not in the ignore list
        for ignore_re in self._exclude_regexs:
            if ignore_re.match(name):
                log.debug("[WalkDirectoryThread] ##### Ignored %s #####", name)
                return False
        log.debug("[WalkDirectoryThread] # Passed %s", name)
        return True
# Worker pool sized by the thread option.
t_pool = ThreadPool(opts.thread)
''' Read Sample Files (Concurrent by sample)'''
# Each positional argument is one sample: a comma-separated list of
# replicate files. Abort if any replicate file is missing.
for m, sample_arg in enumerate(args):
    replist = sample_arg.split(',')
    for n, replicate_file in enumerate(replist):
        if not os.path.exists(replicate_file):
            sys.stderr.write('%d th replicate file of %d th sample (%s) doesn\'t exist' % (n+1,m+1,replicate_file))
            sys.exit(1)
    sample_list.append(SampleData(replist))
# Preprocess every sample on the pool, then wait for all tasks to finish.
for m, sample_data in enumerate(sample_list):
    t_pool.queueTask(preprocess_samples,sample_data,None)
t_pool.joinAll()
if DEBUG:
    print "print chromosome order"
    # NOTE(review): ``sample_data`` here is the leftover loop variable,
    # i.e. the LAST sample only — confirm this is intended.
    for chrom_name in sample_data.chrom_order:
        print chrom_name
''' debug purpose '''
if DEBUG:
    for sample_data in sample_list:
        sample_data.output_debug_info()
''' Smoothing'''
if opts.smooth:
    # smooth_data takes a single (sample, window) tuple argument.
    for sample_data in sample_list:
        smooth_data((sample_data,opts.smooth_window))
class FilesystemMonitor(object):
    """
    FileMonitor Class keeps track of all files down a tree starting at the root
    """

    def __init__(self, searcher):
        # ``searcher`` supplies configuration and receives the indexed
        # files (add_file / remove_file / clear_database).
        self.searcher = searcher
        self._thread_pool = ThreadPool(THREAD_POOL_WORKS)

        # Add a watch to the root of the dir; ThreadedNotifier delivers
        # inotify events on its own background thread.
        self.watch_manager = WatchManager()
        self.notifier = ThreadedNotifier(self.watch_manager, FileProcessEvent(self))
        self.notifier.start()

        self._build_exclude_list()

    def _build_exclude_list(self):
        """Compile the configured exclude_list glob patterns into anchored regexes."""
        log.info("[FileMonitor] Set Regexs for Ignore List")
        self._exclude_regexs = []
        # Compile the ignore list into regexes: escape '.', turn '*' into
        # '.*', and anchor each pattern so it must match the whole name.
        for ignore in self.searcher.configuration.exclude_list:
            ignore = ignore.strip()
            ignore = ignore.replace(".", "\.")
            ignore = ignore.replace("*", ".*")
            ignore = "^" + ignore + "$"
            log.debug("[FileMonitor] Ignore Regex = %s" % ignore)
            self._exclude_regexs.append(re.compile(ignore))

    def change_root(self, previous_root):
        """Drop all state for ``previous_root`` and re-index the searcher's
        current root from scratch."""
        self._thread_pool.clearTasks()
        wd = self.watch_manager.get_wd(previous_root)
        if wd:
            self.watch_manager.rm_watch(wd, rec=True)
        self.searcher.clear_database()
        self.add_directory(self.searcher.current_root)

    def add_directory(self, path):
        """
        Starts a WalkDirectoryThread to add the directory
        """
        basename = os.path.basename(path)
        if self.validate(basename):
            self.watch_manager.add_watch(path, EVENT_MASK)
            # Walk the new directory on a pool thread so the caller
            # (often an inotify event handler) is not blocked.
            self._thread_pool.queueTask(self.walk_directory, path)

    def add_file(self, path, name):
        """
        Add a single file to the database
        """
        if self.validate(name):
            self.searcher.add_file(path, name)

    def remove_file(self, path, name):
        self.searcher.remove_file(path, name)

    def remove_directory(self, path):
        self.searcher.remove_directory(path)

    def walk_directory(self, root):
        """
        Recursively register every subdirectory and regular file below
        ``root``. Symbolic links are skipped; subdirectories are queued
        via add_directory rather than walked inline.
        """
        if os.path.isdir(root):
            names = os.listdir(root)
            for name in names:
                try:
                    file_stat = os.lstat(os.path.join(root, name))
                except os.error:
                    # Entry vanished or is unreadable — skip it.
                    continue
                if stat.S_ISDIR(file_stat.st_mode):
                    self.add_directory(os.path.join(root, name))
                else:
                    if not stat.S_ISLNK(file_stat.st_mode):
                        self.add_file(root, name)

    def finish(self):
        """Remove the root watch and shut down the notifier and worker pool."""
        wd = self.watch_manager.get_wd(self.searcher.current_root)
        self.watch_manager.rm_watch(wd, rec=True)
        self.notifier.stop()
        # Don't wait for queued walks — we are tearing down.
        self._thread_pool.joinAll(waitForTasks=False)

    def validate(self, name):
        """Return False when ``name`` matches any exclude regex, else True."""
        # Check to make sure the file not in the ignore list
        for ignore_re in self._exclude_regexs:
            if ignore_re.match(name):
                log.debug("[WalkDirectoryThread] ##### Ignored %s #####", name)
                return False
        log.debug("[WalkDirectoryThread] # Passed %s", name)
        return True