# Assumed imports; RUNDAY, queue_players and get_stats_helper are defined
# elsewhere in the original module. This snippet is Python 2 (print
# statement, csv file opened in 'wb' mode).
import csv
from multiprocessing import Process, Queue


def get_stats():
    print 'Fetching NBA player stats...'
    stats_outfile = RUNDAY + '_nba_stats.csv'
    csvout = open(stats_outfile, 'wb')

    NUM_THREADS = 8
    in_queue = Queue()
    out_queue = Queue()
    queue_players(in_queue)

    # Create the writer once; the original recreated it for every row.
    fieldnames = [
        'TIME', 'NAME', 'JERSEY', 'SPORT', 'TEAM', 'POSITION',
        'PTS', 'REB', 'AST', 'URL'
    ]
    csvwriter = csv.DictWriter(csvout, delimiter='|', fieldnames=fieldnames)

    while not in_queue.empty():
        jobs = []
        for i in range(NUM_THREADS):
            if not in_queue.empty():
                thread = Process(target=get_stats_helper,
                                 args=(in_queue, out_queue))
                jobs.append(thread)
                thread.start()
        for thread in jobs:
            thread.join()

        while not out_queue.empty():
            player = out_queue.get()
            del player['SUCCESS']
            try:
                name = player['NAME']
            except KeyError:
                continue
            player['TIME'] = RUNDAY
            csvwriter.writerow(player)

    csvout.close()

    print 'Finished fetching NBA player stats.'
    print 'Output saved in %s' % stats_outfile
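# get_stats() above relies on a get_stats_helper worker that is not shown.
# A minimal sketch of what such a worker could look like, assuming each
# queued item is a player URL and that fetch_player_stats(url) is a
# hypothetical scraping helper returning a stats dict with a 'SUCCESS' flag:
def get_stats_helper(in_queue, out_queue):
    while not in_queue.empty():
        try:
            url = in_queue.get_nowait()
        except Exception:
            break  # queue drained by a sibling worker
        player = fetch_player_stats(url)  # hypothetical helper
        player['SUCCESS'] = True
        player['URL'] = url
        out_queue.put(player)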
# Assumed imports; word_size, word2vec, most_similar and is_good are defined
# elsewhere in the original module.
import numpy as np
from queue import Queue  # Python 3; on Python 2: from Queue import Queue


def find_words(start_words, center_words=None, neg_words=None,
               min_sim=0.6, max_sim=1., alpha=0.25):
    if center_words is None and neg_words is None:
        min_sim = max(min_sim, 0.6)
    center_vec, neg_vec = np.zeros([word_size]), np.zeros([word_size])
    if center_words:
        # The center vector is the mean of all center seed word vectors.
        found = 0
        for w in center_words:
            if w in word2vec.wv.vocab:
                center_vec += word2vec[w]
                found += 1
        if found > 0:
            center_vec /= found
    if neg_words:
        # The negative vector is the mean of all negative seed word vectors
        # (unused here).
        found = 0
        for w in neg_words:
            if w in word2vec.wv.vocab:
                neg_vec += word2vec[w]
                found += 1
        if found > 0:
            neg_vec /= found
    queue_count = 1
    task_count = 0
    cluster = []
    queue = Queue()  # build the expansion queue
    for w in start_words:
        queue.put((0, w))
        if w not in cluster:
            cluster.append(w)
    while not queue.empty():
        idx, word = queue.get()
        queue_count -= 1
        task_count += 1
        sims = most_similar(word, center_vec, neg_vec)
        min_sim_ = min_sim + (max_sim - min_sim) * (1 - np.exp(-alpha * idx))
        if task_count % 10 == 0:
            log = '%s in cluster, %s in queue, %s tasks done, %s min_sim' % (
                len(cluster), queue_count, task_count, min_sim_)
            print(log)
        for i, j in sims:
            if j >= min_sim_:
                # is_good is a hand-written filter rule.
                if i not in cluster and is_good(i):
                    queue.put((idx + 1, i))
                    cluster.append(i)
                    queue_count += 1
    return cluster
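# most_similar above is an external helper. A plausible sketch, assuming the
# gensim KeyedVectors API, where the query vector is shifted toward the
# center vector and away from the negative vector (the exact weighting is an
# assumption, not taken from the source):
def most_similar(word, center_vec, neg_vec, topn=20):
    vec = word2vec[word] + center_vec - neg_vec
    return word2vec.wv.similar_by_vector(vec, topn=topn)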
# Assumed imports; SIFTS_TARBALL, S3_BUCKET_URL, SIFTS_FILE, BASEDIR and the
# download_ftp worker are defined elsewhere. The TODO below suggests Process
# comes from multiprocessing.dummy (threads).
import ftplib
import logging
import os
import tarfile
import time
from multiprocessing.dummy import Process, Queue
from urllib.request import urlretrieve


def download_sifts():
    """Download all SIFTS files."""
    # First try to download a tarball containing all the SIFTS XMLs.
    if not os.path.exists(SIFTS_TARBALL):
        try:
            urlretrieve(S3_BUCKET_URL.format('sifts.tar'), SIFTS_TARBALL)
        except RuntimeError:
            logging.warning('failed downloading sifts tarball')
        else:
            tf = tarfile.TarFile(SIFTS_TARBALL)
            # The tarball contains the directory data/sifts.
            tf.extractall(BASEDIR)
    try:
        os.mkdir(SIFTS_FILE.format(''))
    except FileExistsError:
        pass
    ftp = ftplib.FTP("ftp.ebi.ac.uk")
    ftp.login()
    ftp.cwd('/pub/databases/msd/sifts/xml')
    filenames = ftp.nlst()  # get filenames within the directory
    ftp.quit()  # the "polite" way to close a connection
    filename_queue = Queue()
    for filename in filenames:
        filename_queue.put(filename)
    ftp_processes = [
        Process(target=download_ftp, args=(filename_queue, ))
        for _ in range(10)
    ]
    for process in ftp_processes:
        process.start()
    try:
        while not filename_queue.empty():
            print('{}/{} sifts downloaded'.format(
                len(filenames) - filename_queue.qsize(), len(filenames)),
                end='\r')
            time.sleep(1)
        for process in ftp_processes:
            process.join()
    except KeyboardInterrupt:
        # TODO kill not supported by multiprocessing.dummy..
        for process in ftp_processes:
            process.kill()
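# download_ftp is the worker consumed by the processes above but is not
# shown. A minimal sketch, assuming each worker opens its own FTP connection
# and that SIFTS_FILE.format(filename) yields the local target path:
def download_ftp(filename_queue):
    ftp = ftplib.FTP("ftp.ebi.ac.uk")
    ftp.login()
    ftp.cwd('/pub/databases/msd/sifts/xml')
    while not filename_queue.empty():
        try:
            filename = filename_queue.get_nowait()
        except Exception:
            break  # queue drained by a sibling worker
        with open(SIFTS_FILE.format(filename), 'wb') as handle:
            ftp.retrbinary('RETR ' + filename, handle.write)
    ftp.quit()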
# Assumed imports; ignore_paths, P_NUM and stat_file are defined elsewhere.
# This snippet is Python 2 (xrange, iteritems, print statement).
import os
from collections import defaultdict
from multiprocessing import Process, Queue
from os.path import join


def stat_files():
    all_files = []
    for root, dirs, files in os.walk(
            '/home/gzguoyubo/mf/tw2/res/entities/custom_type'):
        ignore = False
        for ig_path in ignore_paths:
            if ig_path in root:
                ignore = True
        if ignore:
            continue
        for fname in files:
            if not fname.endswith('.py'):
                continue
            abs_file_path = join(root, fname)
            all_files.append(abs_file_path)

    file_sections = []
    file_total_nums = len(all_files)
    for i in xrange(P_NUM):
        start = i * file_total_nums / P_NUM
        stop = start + file_total_nums / P_NUM
        if i == P_NUM - 1:
            # The last section takes all remaining files; slicing to -1, as
            # the original did, would silently drop the final file.
            stop = file_total_nums
        file_sections.append(all_files[start:stop])

    res_queue = Queue()
    processes = []
    for section in file_sections:
        p = Process(target=stat_file, args=(section, res_queue))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    total_stats = defaultdict(int)
    while not res_queue.empty():
        stat = res_queue.get()
        for author, cnt in stat.iteritems():
            total_stats[author] += cnt
    print total_stats
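# stat_file is the per-process worker; its real counting logic is not shown.
# A stub illustrating only the contract: it must put one {author: count}
# dict per section onto res_queue. Here we assume, purely hypothetically,
# that each file declares an __author__ line and we count files per author:
def stat_file(section, res_queue):
    stats = defaultdict(int)
    for path in section:
        author = 'unknown'
        with open(path) as f:
            for line in f:
                if line.startswith('__author__'):
                    author = line.split('=')[-1].strip()
                    break
        stats[author] += 1
    res_queue.put(dict(stats))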
# Pool and Queue are expected to come from multiprocessing.dummy (threads)
# in the original module: a plain queue object is shared with the workers.
def proxy_thr(fun):
    """Fetch proxy addresses with a pool of threads.

    :param fun: URL-handling function that extracts proxies from a page
    :return: a de-duplicated list of proxies
    """
    q = Queue()
    pool = Pool(40)
    proxys = []
    for u in proxy_url_list():
        pool.apply_async(fun, (q, u))
    pool.close()
    pool.join()
    while not q.empty():
        s = q.get().split("\n")
        proxys += s
    return list(set(proxys))
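# proxy_thr expects fun(q, u) to fetch the page at u and put a
# newline-separated block of proxies onto q. A minimal sketch using requests
# (assumed; the real parser depends on each proxy site's layout):
import requests


def plain_text_proxy_parser(q, u):
    resp = requests.get(u, timeout=10)
    if resp.ok:
        q.put(resp.text.strip())

# Usage: proxies = proxy_thr(plain_text_proxy_parser)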
# Assumed imports; this crawler shares in-memory queues between workers, so
# Pool/Queue/Lock are taken from multiprocessing.dummy (thread-based). The
# original printed messages through a .decode('utf-8').encode(type) dance for
# a non-UTF-8 console; with the messages translated to English below, plain
# print calls suffice.
import os
import sqlite3
import time

import requests
from fake_useragent import UserAgent
from multiprocessing.dummy import Lock, Pool, Queue
from pyquery import PyQuery as pq


class mmonly:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {}
        self.q1 = Queue(300)
        self.q2 = Queue(1000)
        self.lock = Lock()
        # self.path = 'D:/IMG/'
        self.main_page_urls = []
        self.subpageurls = []
        conn = sqlite3.connect('mmonly.db')
        conn.isolation_level = None
        try:
            conn.execute(
                '''create table subpageurl(url text primary key not null)''')
            conn.execute(
                '''create table imgurl(url text primary key not null)''')
        except Exception as e:
            print('create table: {}'.format(e))
        finally:
            conn.close()
        self.rootpath = os.getcwd().replace('\\', '/')
        self.path = os.path.join(self.rootpath, 'imges/')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def get_mainpage_urls(self, inurl):  # collect all main page urls
        self.headers['User-Agent'] = self.ua.random
        try:
            req = requests.get(inurl, headers=self.headers, timeout=10)
            req.encoding = 'gbk'
            content = pq(req.text)
            elem = list(content('div #pageNum').children('a').items())
            for ele in elem:
                # '末页' is the site's "last page" link text.
                if ele.text() == '末页':
                    pgnum = int(ele.attr('href').split('_')[-1].split('.')[0])
                    spurl = inurl.split('_')
                    for i in range(1, pgnum + 1):
                        self.main_page_urls.append('{}_{}_{}.html'.format(
                            spurl[0], spurl[1], str(i)))
                    print('main pages enumerated!!')
        except Exception as e:
            with self.lock:
                print('main page read error: {}'.format(e))
        return

    def get_subpage_urls(self, inurl):  # collect all subpage urls
        self.headers['User-Agent'] = self.ua.random
        try:
            req = requests.get(inurl, headers=self.headers, timeout=10)
            req.encoding = 'gbk'
            content = pq(req.text)
            elems = list(content('div .ABox').children('a').items())
            for ele in elems:
                url = ele.attr('href')
                self.q1.put(url)
                print('got subpage url: {}'.format(url))
        except Exception as e:
            with self.lock:
                print('main page traversal error: {}'.format(e))
        return

    def savesuburl(self):  # store subpage urls into the subpageurl table
        while 1:
            try:
                suburl = self.q1.get(timeout=20)
                self.subpageurls.append(suburl)
                print('queued subpage: {}'.format(suburl))
            except Exception as e:
                print('subpage url read error: {}'.format(e))
                time.sleep(2)
                if self.q1.empty():
                    time.sleep(2)
                    if self.q1.empty():
                        break
        conn = sqlite3.connect('mmonly.db')
        cur = conn.cursor()
        time.sleep(4)
        print('writing subpage urls to the database')
        for date in self.subpageurls:
            try:
                cur.execute('insert into subpageurl values(?)', (date, ))
                print('wrote: {}'.format(date))
            except Exception as er:
                print('database write error: {}'.format(er))
        conn.commit()
        conn.close()
        print('write finished!!')

    def get_img_url(self, inurl):  # collect image addresses
        self.headers['User-Agent'] = self.ua.random
        try:
            req = requests.get(inurl, headers=self.headers, timeout=10)
            time.sleep(0.2)
            req.encoding = 'gbk'
            content = pq(req.text)
            imgnum = int(content('.totalpage').text())
            urlsp = '.'.join(inurl.split('.')[:-1])
            for n in range(1, imgnum + 1):
                imgpage = '{}_{}.html'.format(urlsp, n)
                self.headers['User-Agent'] = self.ua.random
                try:
                    req = requests.get(imgpage, headers=self.headers,
                                       timeout=10)
                    time.sleep(0.3)
                    req.encoding = 'gbk'
                    content = pq(req.text)
                    imgurl = content('.down-btn').attr('href')
                    self.q2.put(imgurl)
                    print('got image url: {}'.format(imgurl))
                except Exception as ee:
                    print('image url error: {}'.format(ee))
        except Exception as e:
            print('image page error: {}'.format(e))
        return

    def download(self, inurl):  # download one image
        # inurl = q.get(timeout=10)
        na = inurl.split('/')
        imgname = '{}{}'.format(na[-2], na[-1])
        imgpath = '{}{}'.format(self.path, imgname)
        if not os.path.exists(imgpath):
            self.headers['User-Agent'] = self.ua.random
            try:
                req = requests.get(inurl, headers=self.headers,
                                   timeout=8).content
                with open(imgpath, 'wb') as f:
                    f.write(req)
                with self.lock:
                    print('downloaded: {}'.format(imgname))
            except Exception as e:
                with self.lock:
                    print('download error: {}'.format(e))
        else:
            with self.lock:
                print('duplicate image: {}'.format(imgname))

    def run(self, inurl):
        ch = int(input('enter 1 to crawl pages\n'
                       'enter 2 to download images\n'
                       'enter 3 to quit\n'
                       'input: '))
        if ch == 1:
            self.get_mainpage_urls(inurl)
            time.sleep(4)
            pool1 = Pool(20)
            for mainurl in self.main_page_urls:
                pool1.apply_async(self.get_subpage_urls, (mainurl, ))
            time.sleep(1)
            self.savesuburl()
            pool1.close()
            pool1.join()
            print('subpage crawl finished!!!')
            self.run('http://www.mmonly.cc/mmtp/list_9_2.html')
        elif ch == 2:
            conn = sqlite3.connect('mmonly.db')
            cur = conn.cursor()
            pool2 = Pool(10)
            pool3 = Pool(30)
            cur.execute('select * from subpageurl')
            suburls = cur.fetchall()
            while 1:
                for nn in range(200):
                    try:
                        for i in suburls:
                            pool2.apply_async(self.get_img_url, i)
                            cur.execute('delete from subpageurl where url=?',
                                        i)
                        while 1:
                            img = self.q2.get(timeout=20)
                            pool3.apply_async(self.download, (img, ))
                    except Exception as e:
                        print('db subpage url read error: {}'.format(e))
                        time.sleep(2)
                        if self.q2.empty():
                            time.sleep(2)
                            if self.q2.empty():
                                break
                    conn.commit()
                    conn.close()
                    conn = sqlite3.connect('mmonly.db')
                    cur = conn.cursor()
                    cur.execute('select * from subpageurl')
                    suburls = cur.fetchall()
                time.sleep(2)
                if self.q2.empty():
                    time.sleep(2)
                    if self.q2.empty():
                        break
            pool3.close()
            pool2.close()
            pool3.join()
            pool2.join()
        else:
            print('exiting!')
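# A minimal entry point for the crawler above, using the start URL that
# run() itself hard-codes when it recurses:
if __name__ == '__main__':
    spider = mmonly()
    spider.run('http://www.mmonly.cc/mmtp/list_9_2.html')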
# Helpers such as TizEnumeration, VideoInfo, to_ascii, print_nfo/print_wrn,
# generate_search_query, get_tracks_from_json, obtain_stream and
# WORKER_PROCESSES come from the surrounding module, as do pafy, logging,
# random and multiprocessing's Process/Queue.
class tizyoutubeproxy(object):
    """A class that accesses YouTube, retrieves stream URLs and creates and
    manages a playback queue.
    """

    def __init__(self):
        self.queue = list()
        self.queue_index = -1
        self.play_queue_order = list()
        self.play_modes = TizEnumeration(["NORMAL", "SHUFFLE"])
        self.current_play_mode = self.play_modes.NORMAL
        self.now_playing_stream = None
        # Create multiprocess queues
        self.task_queue = Queue()
        self.done_queue = Queue()
        # Workers
        self.workers = list()

    def set_play_mode(self, mode):
        """Set the playback mode.

        :param mode: current valid values are "NORMAL" and "SHUFFLE"
        """
        self.current_play_mode = getattr(self.play_modes, mode)
        self.__update_play_queue_order()

    def enqueue_audio_stream(self, arg):
        """Add the audio stream of a YouTube video to the playback queue.

        :param arg: a search string
        """
        logging.info('arg : %s', arg)
        try:
            yt_video = pafy.new(arg)
            yt_audio = yt_video.getbestaudio(preftype="webm")
            if not yt_audio:
                raise ValueError(str("No WebM audio stream for : %s" % arg))
            yt_info = VideoInfo(ytid=arg, title=yt_audio.title)
            self.add_to_playback_queue(audio=yt_audio, video=yt_video,
                                       info=yt_info)
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Video not found : %s" % arg))

    def enqueue_audio_playlist(self, arg):
        """Add all audio streams in a YouTube playlist to the playback queue.

        :param arg: a YouTube playlist id
        """
        logging.info('arg : %s', arg)
        try:
            count = len(self.queue)
            playlist = pafy.get_playlist2(arg)
            if len(playlist) > 0:
                for yt_video in playlist:
                    self.add_to_playback_queue(
                        video=yt_video,
                        info=VideoInfo(ytid=yt_video.videoid,
                                       title=yt_video.title))
            if count == len(self.queue):
                raise ValueError
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Playlist not found : %s" % arg))

    def enqueue_audio_search(self, arg):
        """Search YouTube and add the audio streams to the playback queue.

        :param arg: a search string
        """
        logging.info('arg : %s', arg)
        try:
            query = generate_search_query(arg)
            wdata = pafy.call_gdata('search', query)
            wdata2 = wdata
            count = 0
            while True:
                for track_info in get_tracks_from_json(wdata2):
                    self.add_to_playback_queue(info=track_info)
                    count += 1
                if count > 100:
                    break
                if not wdata2.get('nextPageToken'):
                    break
                query['pageToken'] = wdata2['nextPageToken']
                wdata2 = pafy.call_gdata('search', query)
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Could not find any mixes : %s" % arg))

    def enqueue_audio_mix(self, arg, feelinglucky=True):
        """Obtain a YouTube mix associated to a given video id or url and add
        all audio streams in the mix playlist to the playback queue.

        :param arg: a YouTube video id

        :param feelinglucky: If True, it will perform another YouTube search
        to find alternatives if the original mix cannot be found.
        """
        logging.info('arg : %s', arg)
        yt_video = None
        try:
            count = len(self.queue)
            yt_video = pafy.new(arg)
            playlist = yt_video.mix
            if len(playlist) > 0:
                for yt_video in playlist:
                    video_id = yt_video.videoid
                    video_title = yt_video.title
                    yt_info = VideoInfo(ytid=video_id, title=video_title)
                    self.add_to_playback_queue(video=yt_video, info=yt_info)
            if count == len(self.queue):
                raise ValueError
            self.__update_play_queue_order()
        except IndexError:
            if not feelinglucky:
                raise ValueError
            else:
                print_wrn("[YouTube] Could not find a mix for '{0}'. "
                          "Searching YouTube instead. Feeling lucky?."
                          .format(arg.encode('utf-8')))
                if yt_video.title:
                    self.enqueue_audio_search(yt_video.title)
                else:
                    self.enqueue_audio_stream(arg)

    def enqueue_audio_mix_search(self, arg):
        """Obtain a YouTube mix associated to a given textual search and add
        all the audio streams in the mix playlist to the playback queue.

        :param arg: a search string
        """
        logging.info('arg : %s', arg)
        try:
            query = generate_search_query(arg)
            wdata = pafy.call_gdata('search', query)
            wdata2 = wdata
            count = len(self.queue)
            for track_info in get_tracks_from_json(wdata2):
                if track_info and track_info.ytid:
                    try:
                        self.enqueue_audio_mix(track_info.ytid,
                                               feelinglucky=False)
                        break
                    except ValueError:
                        logging.info(
                            'Could not find a mix. Trying another video')
            if count == len(self.queue):
                raise ValueError
        except ValueError:
            raise ValueError(str("Could not find any mixes : %s" % arg))

    def current_audio_stream_title(self):
        """Retrieve the current stream's title."""
        stream = self.now_playing_stream
        title = ''
        if stream:
            title = to_ascii(stream['a'].title).encode("utf-8")
        return title

    def current_audio_stream_author(self):
        """Retrieve the current stream's author."""
        stream = self.now_playing_stream
        author = ''
        if stream:
            author = to_ascii(stream['v'].author).encode("utf-8")
        return author

    def current_audio_stream_file_size(self):
        """Retrieve the current stream's file size."""
        stream = self.now_playing_stream
        size = 0
        if stream:
            size = stream['a'].get_filesize()
        return size

    def current_audio_stream_duration(self):
        """Retrieve the current stream's duration."""
        stream = self.now_playing_stream
        duration = ''
        if stream:
            duration = to_ascii(stream['v'].duration).encode("utf-8")
        return duration

    def current_audio_stream_bitrate(self):
        """Retrieve the current stream's bitrate."""
        stream = self.now_playing_stream
        bitrate = ''
        if stream:
            bitrate = stream['a'].bitrate
        return bitrate

    def current_audio_stream_view_count(self):
        """Retrieve the current stream's view count."""
        stream = self.now_playing_stream
        viewcount = 0
        if stream:
            viewcount = stream['v'].viewcount
        return viewcount

    def current_audio_stream_description(self):
        """Retrieve the current stream's description."""
        stream = self.now_playing_stream
        description = ''
        if stream:
            description = to_ascii(stream['v'].description).encode("utf-8")
        return description

    def current_audio_stream_file_extension(self):
        """Retrieve the current stream's file extension."""
        stream = self.now_playing_stream
        file_extension = ''
        if stream:
            file_extension = to_ascii(stream['a'].extension).encode("utf-8")
        return file_extension

    def current_audio_stream_video_id(self):
        """Retrieve the current stream's video id."""
        stream = self.now_playing_stream
        video_id = ''
        if stream:
            video_id = to_ascii(stream['i'].ytid).encode("utf-8")
        return video_id

    def current_audio_stream_published(self):
        """Retrieve the current stream's upload date and time."""
        stream = self.now_playing_stream
        published = ''
        if stream:
            published = to_ascii(stream['v'].published).encode("utf-8")
        return published

    def current_audio_stream_queue_index_and_queue_length(self):
        """Retrieve the index in the queue (starting from 1) of the current
        stream and the length of the playback queue.
        """
        return self.queue_index + 1, len(self.queue)

    def clear_queue(self):
        """Clear the playback queue."""
        self.queue = list()
        self.queue_index = -1

    def remove_current_url(self):
        """Remove the currently active url from the playback queue."""
        logging.info("")
        # Compare against 0 explicitly; testing the bare index would skip
        # removal of the first entry in the queue.
        if len(self.queue) and self.queue_index >= 0:
            stream = self.queue[self.queue_index]
            print_nfo("[YouTube] [Stream] '{0}' removed."
                      .format(to_ascii(stream['i'].title).encode("utf-8")))
            del self.queue[self.queue_index]
            self.queue_index -= 1
            if self.queue_index < 0:
                self.queue_index = 0
            self.__update_play_queue_order()

    def next_url(self):
        """Retrieve the url of the next stream in the playback queue."""
        logging.info("")
        try:
            if len(self.queue):
                self.queue_index += 1
                if (self.queue_index < len(self.queue)) \
                        and (self.queue_index >= 0):
                    next_stream = self.queue[
                        self.play_queue_order[self.queue_index]]
                    return self.__retrieve_stream_url(
                        next_stream, self.queue_index).rstrip()
                else:
                    self.queue_index = -1
                    return self.next_url()
            else:
                return ''
        except (KeyError, AttributeError):
            # TODO: We don't remove this for now
            # del self.queue[self.queue_index]
            logging.info("exception")
            return self.next_url()

    def prev_url(self):
        """Retrieve the url of the previous stream in the playback queue."""
        logging.info("")
        try:
            if len(self.queue):
                self.queue_index -= 1
                if (self.queue_index < len(self.queue)) \
                        and (self.queue_index >= 0):
                    prev_stream = self.queue[
                        self.play_queue_order[self.queue_index]]
                    return self.__retrieve_stream_url(
                        prev_stream, self.queue_index).rstrip()
                else:
                    self.queue_index = len(self.queue)
                    return self.prev_url()
            else:
                return ''
        except (KeyError, AttributeError):
            # TODO: We don't remove this for now
            # del self.queue[self.queue_index]
            logging.info("exception")
            return self.prev_url()

    def __update_play_queue_order(self):
        """Update the queue playback order.

        A sequential order is applied if the current play mode is "NORMAL" or
        a random order if the current play mode is "SHUFFLE".
        """
        total_streams = len(self.queue)
        if total_streams:
            if not len(self.play_queue_order):
                # Create a sequential play order, if empty
                self.play_queue_order = range(total_streams)
            if self.current_play_mode == self.play_modes.SHUFFLE:
                random.shuffle(self.play_queue_order)
            print_nfo("[YouTube] [Streams in queue] '{0}'."
                      .format(total_streams))

    def __retrieve_stream_url(self, stream, queue_index):
        """Retrieve a stream url."""
        try:
            if not len(self.workers):
                for _ in range(WORKER_PROCESSES):
                    # Start each worker and keep its handle; chaining
                    # Process(...).start() would store None.
                    proc = Process(target=obtain_stream,
                                   args=(self.task_queue, self.done_queue))
                    proc.start()
                    self.workers.append(proc)

            while not self.done_queue.empty():
                stream = self.done_queue.get()
                self.queue[stream['q']] = stream

            stream = self.queue[queue_index]
            if not stream.get('v') or not stream.get('a'):
                logging.info("ytid : %s", stream['i'].ytid)
                video = stream.get('v')
                if not video:
                    video = pafy.new(stream['i'].ytid)
                audio = video.getbestaudio(preftype="webm")
                if not audio:
                    logging.info("no suitable audio found")
                    raise AttributeError()
                stream.update({'a': audio, 'v': video})

            # streams = stream.get('v').audiostreams[::-1]
            # pprint.pprint(streams)
            # dump_stream_info(streams)

            self.now_playing_stream = stream
            return stream['a'].url.encode("utf-8")
        except AttributeError:
            logging.info("Could not retrieve the stream url!")
            raise

    def add_to_playback_queue(self, audio=None, video=None, info=None):
        """Add to the playback queue."""
        if audio:
            print_nfo("[YouTube] [Stream] '{0}' [{1}]."
                      .format(to_ascii(audio.title).encode("utf-8"),
                              to_ascii(audio.extension)))
        if info:
            print_nfo("[YouTube] [Stream] '{0}'."
                      .format(to_ascii(info.title).encode("utf-8")))
        queue_index = len(self.queue)
        self.task_queue.put(dict(a=audio, v=video, i=info, q=queue_index))
        self.queue.append(dict(a=audio, v=video, i=info, q=queue_index))
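# obtain_stream is the worker launched in __retrieve_stream_url but is not
# shown. A minimal sketch, assuming each task dict carries the VideoInfo
# under 'i' and that pafy resolves the rest:
def obtain_stream(task_queue, done_queue):
    while True:
        stream = task_queue.get()
        try:
            video = stream.get('v') or pafy.new(stream['i'].ytid)
            audio = video.getbestaudio(preftype="webm")
            if audio:
                stream.update({'a': audio, 'v': video})
        except (IOError, ValueError):
            pass  # leave the entry unresolved; the caller re-checks it
        done_queue.put(stream)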
# This revision additionally relies on MEMORY (a result cache), the
# run_youtube_* helper functions, fuzzywuzzy's fuzz/process,
# print_msg/print_adv and API_KEY, all from the surrounding module.
class tizyoutubeproxy(object):
    """A class that accesses YouTube, retrieves stream URLs and creates and
    manages a playback queue.
    """

    def __init__(self, api_key=API_KEY):
        self.queue = list()
        self.queue_index = -1
        self.play_queue_order = list()
        self.play_modes = TizEnumeration(["NORMAL", "SHUFFLE"])
        self.current_play_mode = self.play_modes.NORMAL
        self.now_playing_stream = None
        # Create multiprocess queues
        self.task_queue = Queue()
        self.done_queue = Queue()
        # Workers
        self.workers = list()
        self.api_key = api_key if api_key != "" else API_KEY
        pafy.set_api_key(self.api_key)

    def set_play_mode(self, mode):
        """Set the playback mode.

        :param mode: current valid values are "NORMAL" and "SHUFFLE"
        """
        self.current_play_mode = getattr(self.play_modes, mode)
        self._update_play_queue_order()

    def enqueue_audio_stream(self, arg):
        """Add the audio stream of a YouTube video to the playback queue.

        :param arg: a search string
        """
        logging.info("arg : %s", arg)
        try:
            print_msg("[YouTube] [Audio stream] : '{0}'. ".format(arg))
            yt_search = MEMORY.cache(run_youtube_search)
            yt_video = yt_search(arg)
            yt_audio = yt_video.getbestaudio(preftype="webm")
            if not yt_audio:
                raise ValueError(str("No WebM audio stream for : %s" % arg))
            yt_info = VideoInfo(ytid=arg, title=yt_audio.title)
            self._add_to_playback_queue(audio=yt_audio, video=yt_video,
                                        info=yt_info)
            self._update_play_queue_order()
        except ValueError:
            raise ValueError(str("Video not found : %s" % arg))

    def enqueue_audio_playlist(self, arg):
        """Add all audio streams in a YouTube playlist to the playback queue.

        :param arg: a YouTube playlist id
        """
        logging.info("arg : %s", arg)
        try:
            print_msg("[YouTube] [Audio playlist] : '{0}'. ".format(arg))
            count = len(self.queue)
            yt_pl_search = MEMORY.cache(run_youtube_playlist_search)
            playlist = yt_pl_search(arg)
            if len(playlist) > 0:
                for yt_video in playlist:
                    self._add_to_playback_queue(
                        video=yt_video,
                        info=VideoInfo(ytid=yt_video.videoid,
                                       title=yt_video.title),
                    )
            if count == len(self.queue):
                raise ValueError
            self._update_play_queue_order()
        except ValueError:
            raise ValueError(str("Playlist not found : %s" % arg))

    def enqueue_audio_search(self, arg):
        """Search YouTube and add the audio streams to the playback queue.

        :param arg: a search string
        """
        logging.info("arg : %s", arg)
        try:
            print_msg("[YouTube] [Audio search] : '{0}'. ".format(arg))
            yt_dt_search = MEMORY.cache(run_youtube_data_search)
            query = generate_search_query(arg, self.api_key)
            wdata = yt_dt_search("search", query)
            wdata2 = wdata
            count = 0
            while True:
                for track_info in get_tracks_from_json(wdata2):
                    self._add_to_playback_queue(info=track_info)
                    count += 1
                if count > 100:
                    break
                if not wdata2.get("nextPageToken"):
                    break
                query["pageToken"] = wdata2["nextPageToken"]
                wdata2 = yt_dt_search("search", query)
            self._update_play_queue_order()
        except ValueError:
            raise ValueError(str("Could not find any mixes : %s" % arg))

    def enqueue_audio_mix(self, arg, feelinglucky=True):
        """Obtain a YouTube mix associated to a given video id or url and add
        all audio streams in the mix playlist to the playback queue.

        :param arg: a YouTube video id

        :param feelinglucky: If True, it will perform another YouTube search
        to find alternatives if the original mix cannot be found.
        """
        logging.info("arg : %s", arg)
        yt_video = None
        try:
            print_msg("[YouTube] [Audio mix] : '{0}'. ".format(arg))
            count = len(self.queue)
            yt_search = MEMORY.cache(run_youtube_search)
            yt_video = yt_search(arg)
            playlist = yt_video.mix
            if len(playlist) > 0:
                for yt_video in playlist:
                    video_id = yt_video.videoid
                    video_title = yt_video.title
                    yt_info = VideoInfo(ytid=video_id, title=video_title)
                    self._add_to_playback_queue(video=yt_video, info=yt_info)
            if count == len(self.queue):
                raise ValueError
            self._update_play_queue_order()
        except IndexError:
            if not feelinglucky:
                raise ValueError
            else:
                print_adv("[YouTube] Could not find a mix for '{0}'. "
                          "Searching YouTube instead. Feeling lucky?."
                          .format(arg))
                if yt_video.title:
                    self.enqueue_audio_search(yt_video.title)
                else:
                    self.enqueue_audio_stream(arg)

    def enqueue_audio_mix_search(self, arg):
        """Obtain a YouTube mix associated to a given textual search and add
        all the audio streams in the mix playlist to the playback queue.

        :param arg: a search string
        """
        logging.info("arg : %s", arg)
        try:
            print_msg("[YouTube] [Audio mix search] : '{0}'. ".format(arg))
            yt_dt_search = MEMORY.cache(run_youtube_data_search)
            wdata = yt_dt_search("search",
                                 generate_search_query(arg, self.api_key))
            wdata2 = wdata
            count = len(self.queue)
            for track_info in get_tracks_from_json(wdata2):
                if track_info and track_info.ytid:
                    try:
                        self.enqueue_audio_mix(track_info.ytid,
                                               feelinglucky=False)
                        break
                    except ValueError:
                        logging.info(
                            "Could not find a mix. Trying another video")
            if count == len(self.queue):
                raise ValueError
        except ValueError:
            raise ValueError(str("Could not find any mixes : %s" % arg))

    def enqueue_audio_channel_uploads(self, arg):
        """Add all audio streams in a YouTube channel to the playback queue.

        :param arg: a YouTube channel url
        """
        logging.info("arg : %s", arg)
        try:
            print_msg(
                "[YouTube] [Audio channel uploads] : '{0}'. ".format(arg))
            count = len(self.queue)
            yt_ch_search = MEMORY.cache(run_youtube_channel_search)
            channel = yt_ch_search(arg)
            if channel:
                for yt_video in channel.uploads:
                    self._add_to_playback_queue(
                        video=yt_video,
                        info=VideoInfo(ytid=yt_video.videoid,
                                       title=yt_video.title),
                    )
            if count == len(self.queue):
                raise ValueError
            self._update_play_queue_order()
        except ValueError:
            raise ValueError(str("Channel not found : %s" % arg))

    def enqueue_audio_channel_playlist(self, channel_name, playlist_name):
        """Search a playlist within a channel and, if found, add all its
        audio streams to the playback queue.

        :param channel_name: a YouTube channel name
        :param playlist_name: a playlist name to search for in the channel
        """
        logging.info("args : %s - %s", channel_name, playlist_name)
        try:
            print_msg(
                "[YouTube] [Audio channel playlist] : '{0} - {1}'. ".format(
                    channel_name, playlist_name))
            count = len(self.queue)
            yt_ch_search = MEMORY.cache(run_youtube_channel_search)
            channel = yt_ch_search(channel_name)
            if channel:
                pl_dict = dict()
                pl_titles = list()
                pl_name = ""
                playlist = None
                for pl in channel.playlists:
                    print_nfo("[YouTube] [Playlist] '{0}'.".format(
                        to_ascii(pl.title)))
                    if fuzz.partial_ratio(playlist_name, pl.title) > 50:
                        pl_dict[pl.title] = pl
                        pl_titles.append(pl.title)
                if len(pl_titles) > 1:
                    pl_name = process.extractOne(playlist_name, pl_titles)[0]
                    playlist = pl_dict[pl_name]
                elif len(pl_titles) == 1:
                    pl_name = pl_titles[0]
                    playlist = pl_dict[pl_name]
                if pl_name:
                    if pl_name.lower() != playlist_name.lower():
                        print_adv("[YouTube] Playlist '{0}' not found. "
                                  "Playing '{1}' instead.".format(
                                      to_ascii(playlist_name),
                                      to_ascii(pl_name)))
                    for yt_video in playlist:
                        self._add_to_playback_queue(
                            video=yt_video,
                            info=VideoInfo(ytid=yt_video.videoid,
                                           title=yt_video.title),
                        )
            if count == len(self.queue):
                raise ValueError
            self._update_play_queue_order()
        except ValueError:
            raise ValueError(str("Channel not found : %s" % channel_name))

    def current_audio_stream_title(self):
        """Retrieve the current stream's title."""
        stream = self.now_playing_stream
        title = ""
        if stream:
            title = to_ascii(stream["a"].title)
        return title

    def current_audio_stream_author(self):
        """Retrieve the current stream's author."""
        stream = self.now_playing_stream
        author = ""
        if stream:
            author = to_ascii(stream["v"].author)
        return author

    def current_audio_stream_file_size(self):
        """Retrieve the current stream's file size."""
        stream = self.now_playing_stream
        size = 0
        if stream:
            size = stream["a"].get_filesize()
        return size

    def current_audio_stream_duration(self):
        """Retrieve the current stream's duration."""
        stream = self.now_playing_stream
        duration = ""
        if stream:
            duration = to_ascii(stream["v"].duration)
        return duration

    def current_audio_stream_bitrate(self):
        """Retrieve the current stream's bitrate."""
        stream = self.now_playing_stream
        bitrate = ""
        if stream:
            bitrate = stream["a"].bitrate
        return bitrate

    def current_audio_stream_view_count(self):
        """Retrieve the current stream's view count."""
        stream = self.now_playing_stream
        viewcount = 0
        if stream:
            viewcount = stream["v"].viewcount
        return viewcount

    def current_audio_stream_description(self):
        """Retrieve the current stream's description."""
        stream = self.now_playing_stream
        description = ""
        if stream:
            description = to_ascii(stream["v"].description)
        return description

    def current_audio_stream_file_extension(self):
        """Retrieve the current stream's file extension."""
        stream = self.now_playing_stream
        file_extension = ""
        if stream:
            file_extension = to_ascii(stream["a"].extension)
        return file_extension

    def current_audio_stream_video_id(self):
        """Retrieve the current stream's video id."""
        stream = self.now_playing_stream
        video_id = ""
        if stream:
            video_id = to_ascii(stream["i"].ytid)
        return video_id

    def current_audio_stream_published(self):
        """Retrieve the current stream's upload date and time."""
        stream = self.now_playing_stream
        published = ""
        if stream:
            published = to_ascii(stream["v"].published)
        return published

    def current_audio_stream_queue_index_and_queue_length(self):
        """Retrieve the index in the queue (starting from 1) of the current
        stream and the length of the playback queue.
        """
        return self.play_queue_order[self.queue_index] + 1, len(self.queue)

    def clear_queue(self):
        """Clear the playback queue."""
        self.queue = list()
        self.queue_index = -1

    def remove_current_url(self):
        """Remove the currently active url from the playback queue."""
        logging.info("")
        # Compare against 0 explicitly; testing the bare index would skip
        # removal of the first entry in the queue.
        if len(self.queue) and self.queue_index >= 0:
            stream = self.queue[self.queue_index]
            print_nfo("[YouTube] [Stream] '{0}' removed.".format(
                to_ascii(stream["i"].title)))
            del self.queue[self.queue_index]
            self.queue_index -= 1
            if self.queue_index < 0:
                self.queue_index = 0
            self._update_play_queue_order()

    def next_url(self):
        """Retrieve the url of the next stream in the playback queue."""
        logging.info("")
        try:
            if len(self.queue):
                self.queue_index += 1
                if (self.queue_index < len(self.queue)) and (self.queue_index
                                                             >= 0):
                    next_stream = self.queue[self.play_queue_order[
                        self.queue_index]]
                    return self._retrieve_stream_url(
                        next_stream,
                        self.play_queue_order[self.queue_index]).rstrip()
                else:
                    self.queue_index = -1
                    return self.next_url()
            else:
                return ""
        except (KeyError, AttributeError):
            # TODO: We don't remove this for now
            # del self.queue[self.queue_index]
            logging.info("KeyError, or AttributeError exception")
            return self.next_url()
        except IOError:
            # Remove this video
            del self.queue[self.queue_index]
            logging.info("IOError exception")
            return self.next_url()

    def prev_url(self):
        """Retrieve the url of the previous stream in the playback queue."""
        logging.info("")
        try:
            if len(self.queue):
                self.queue_index -= 1
                if (self.queue_index < len(self.queue)) and (self.queue_index
                                                             >= 0):
                    prev_stream = self.queue[self.play_queue_order[
                        self.queue_index]]
                    return self._retrieve_stream_url(
                        prev_stream,
                        self.play_queue_order[self.queue_index]).rstrip()
                else:
                    self.queue_index = len(self.queue)
                    return self.prev_url()
            else:
                return ""
        except (KeyError, AttributeError):
            # TODO: We don't remove this for now
            # del self.queue[self.queue_index]
            logging.info("exception")
            return self.prev_url()
        except IOError:
            # Remove this video and keep moving backwards
            del self.queue[self.queue_index]
            logging.info("IOError exception")
            return self.prev_url()

    def _update_play_queue_order(self):
        """Update the queue playback order.

        A sequential order is applied if the current play mode is "NORMAL" or
        a random order if the current play mode is "SHUFFLE".
        """
        total_streams = len(self.queue)
        if total_streams:
            if not len(self.play_queue_order):
                # Create a sequential play order, if empty
                self.play_queue_order = list(range(total_streams))
            if self.current_play_mode == self.play_modes.SHUFFLE:
                random.shuffle(self.play_queue_order)
            print_nfo(
                "[YouTube] [Streams in queue] '{0}'.".format(total_streams))

    def _retrieve_stream_url(self, stream, queue_index):
        """Retrieve a stream url."""
        try:
            if not len(self.workers):
                for _ in range(WORKER_PROCESSES):
                    # Start each worker and keep its handle; chaining
                    # Process(...).start() would store None.
                    proc = Process(target=obtain_stream,
                                   args=(self.task_queue, self.done_queue))
                    proc.start()
                    self.workers.append(proc)

            while not self.done_queue.empty():
                stream = self.done_queue.get()
                self.queue[stream["q"]] = stream

            stream = self.queue[queue_index]
            if not stream.get("v") or not stream.get("a"):
                logging.info("ytid : %s", stream["i"].ytid)
                video = stream.get("v")
                if not video:
                    yt_search = MEMORY.cache(run_youtube_search)
                    video = yt_search(stream["i"].ytid)
                audio = video.getbestaudio(preftype="webm")
                if not audio:
                    logging.info("no suitable audio found")
                    raise AttributeError()
                stream.update({"a": audio, "v": video})

            # streams = stream.get('v').audiostreams[::-1]
            # pprint.pprint(streams)
            # dump_stream_info(streams)

            self.now_playing_stream = stream
            return stream["a"].url
        except AttributeError:
            logging.info("Could not retrieve the stream url!")
            raise

    def _add_to_playback_queue(self, audio=None, video=None, info=None):
        """Add to the playback queue."""
        if audio:
            print_nfo("[YouTube] [Stream] '{0}' [{1}].".format(
                to_ascii(audio.title), to_ascii(audio.extension)))
        if info:
            print_nfo("[YouTube] [Stream] '{0}'.".format(to_ascii(info.title)))
        queue_index = len(self.queue)
        self.task_queue.put(dict(a=audio, v=video, i=info, q=queue_index))
        self.queue.append(dict(a=audio, v=video, i=info, q=queue_index))
# A third variant of the class: the Python 2 version extended with channel
# support (pafy.get_channel, fuzzywuzzy's fuzz/process) and IOError
# handling in next_url/prev_url.
class tizyoutubeproxy(object):
    """A class that accesses YouTube, retrieves stream URLs and creates and
    manages a playback queue.
    """

    def __init__(self):
        self.queue = list()
        self.queue_index = -1
        self.play_queue_order = list()
        self.play_modes = TizEnumeration(["NORMAL", "SHUFFLE"])
        self.current_play_mode = self.play_modes.NORMAL
        self.now_playing_stream = None
        # Create multiprocess queues
        self.task_queue = Queue()
        self.done_queue = Queue()
        # Workers
        self.workers = list()

    def set_play_mode(self, mode):
        """Set the playback mode.

        :param mode: current valid values are "NORMAL" and "SHUFFLE"
        """
        self.current_play_mode = getattr(self.play_modes, mode)
        self.__update_play_queue_order()

    def enqueue_audio_stream(self, arg):
        """Add the audio stream of a YouTube video to the playback queue.

        :param arg: a search string
        """
        logging.info('arg : %s', arg)
        try:
            yt_video = pafy.new(arg)
            yt_audio = yt_video.getbestaudio(preftype="webm")
            if not yt_audio:
                raise ValueError(str("No WebM audio stream for : %s" % arg))
            yt_info = VideoInfo(ytid=arg, title=yt_audio.title)
            self.add_to_playback_queue(audio=yt_audio, video=yt_video,
                                       info=yt_info)
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Video not found : %s" % arg))

    def enqueue_audio_playlist(self, arg):
        """Add all audio streams in a YouTube playlist to the playback queue.

        :param arg: a YouTube playlist id
        """
        logging.info('arg : %s', arg)
        try:
            count = len(self.queue)
            playlist = pafy.get_playlist2(arg)
            if len(playlist) > 0:
                for yt_video in playlist:
                    self.add_to_playback_queue(
                        video=yt_video,
                        info=VideoInfo(ytid=yt_video.videoid,
                                       title=yt_video.title))
            if count == len(self.queue):
                raise ValueError
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Playlist not found : %s" % arg))

    def enqueue_audio_search(self, arg):
        """Search YouTube and add the audio streams to the playback queue.

        :param arg: a search string
        """
        logging.info('arg : %s', arg)
        try:
            query = generate_search_query(arg)
            wdata = pafy.call_gdata('search', query)
            wdata2 = wdata
            count = 0
            while True:
                for track_info in get_tracks_from_json(wdata2):
                    self.add_to_playback_queue(info=track_info)
                    count += 1
                if count > 100:
                    break
                if not wdata2.get('nextPageToken'):
                    break
                query['pageToken'] = wdata2['nextPageToken']
                wdata2 = pafy.call_gdata('search', query)
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Could not find any mixes : %s" % arg))

    def enqueue_audio_mix(self, arg, feelinglucky=True):
        """Obtain a YouTube mix associated to a given video id or url and add
        all audio streams in the mix playlist to the playback queue.

        :param arg: a YouTube video id

        :param feelinglucky: If True, it will perform another YouTube search
        to find alternatives if the original mix cannot be found.
        """
        logging.info('arg : %s', arg)
        yt_video = None
        try:
            count = len(self.queue)
            yt_video = pafy.new(arg)
            playlist = yt_video.mix
            if len(playlist) > 0:
                for yt_video in playlist:
                    video_id = yt_video.videoid
                    video_title = yt_video.title
                    yt_info = VideoInfo(ytid=video_id, title=video_title)
                    self.add_to_playback_queue(video=yt_video, info=yt_info)
            if count == len(self.queue):
                raise ValueError
            self.__update_play_queue_order()
        except IndexError:
            if not feelinglucky:
                raise ValueError
            else:
                print_wrn("[YouTube] Could not find a mix for '{0}'. "
                          "Searching YouTube instead. Feeling lucky?."
                          .format(arg.encode('utf-8')))
                if yt_video.title:
                    self.enqueue_audio_search(yt_video.title)
                else:
                    self.enqueue_audio_stream(arg)

    def enqueue_audio_mix_search(self, arg):
        """Obtain a YouTube mix associated to a given textual search and add
        all the audio streams in the mix playlist to the playback queue.

        :param arg: a search string
        """
        logging.info('arg : %s', arg)
        try:
            query = generate_search_query(arg)
            wdata = pafy.call_gdata('search', query)
            wdata2 = wdata
            count = len(self.queue)
            for track_info in get_tracks_from_json(wdata2):
                if track_info and track_info.ytid:
                    try:
                        self.enqueue_audio_mix(track_info.ytid,
                                               feelinglucky=False)
                        break
                    except ValueError:
                        logging.info(
                            'Could not find a mix. Trying another video')
            if count == len(self.queue):
                raise ValueError
        except ValueError:
            raise ValueError(str("Could not find any mixes : %s" % arg))

    def enqueue_audio_channel_uploads(self, arg):
        """Add all audio streams in a YouTube channel to the playback queue.

        :param arg: a YouTube channel url
        """
        logging.info('arg : %s', arg)
        try:
            count = len(self.queue)
            channel = pafy.get_channel(arg)
            if channel:
                for yt_video in channel.uploads:
                    self.add_to_playback_queue(
                        video=yt_video,
                        info=VideoInfo(ytid=yt_video.videoid,
                                       title=yt_video.title))
            if count == len(self.queue):
                raise ValueError
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Channel not found : %s" % arg))

    def enqueue_audio_channel_playlist(self, channel_name, playlist_name):
        """Search a playlist within a channel and, if found, add all its
        audio streams to the playback queue.

        :param channel_name: a YouTube channel name
        :param playlist_name: a playlist name to search for in the channel
        """
        logging.info('args : %s - %s', channel_name, playlist_name)
        try:
            count = len(self.queue)
            channel = pafy.get_channel(channel_name)
            if channel:
                pl_dict = dict()
                pl_titles = list()
                pl_name = ''
                playlist = None
                for pl in channel.playlists:
                    print_nfo("[YouTube] [Playlist] '{0}'."
                              .format(to_ascii(pl.title)))
                    if fuzz.partial_ratio(playlist_name, pl.title) > 50:
                        pl_dict[pl.title] = pl
                        pl_titles.append(pl.title)
                if len(pl_titles) > 1:
                    pl_name = process.extractOne(playlist_name, pl_titles)[0]
                    playlist = pl_dict[pl_name]
                elif len(pl_titles) == 1:
                    pl_name = pl_titles[0]
                    playlist = pl_dict[pl_name]
                if pl_name:
                    if pl_name.lower() != playlist_name.lower():
                        print_wrn("[YouTube] Playlist '{0}' not found. "
                                  "Playing '{1}' instead."
                                  .format(to_ascii(playlist_name),
                                          to_ascii(pl_name)))
                    for yt_video in playlist:
                        self.add_to_playback_queue(
                            video=yt_video,
                            info=VideoInfo(ytid=yt_video.videoid,
                                           title=yt_video.title))
            if count == len(self.queue):
                raise ValueError
            self.__update_play_queue_order()
        except ValueError:
            raise ValueError(str("Channel not found : %s" % channel_name))

    def current_audio_stream_title(self):
        """Retrieve the current stream's title."""
        stream = self.now_playing_stream
        title = ''
        if stream:
            title = to_ascii(stream['a'].title).encode("utf-8")
        return title

    def current_audio_stream_author(self):
        """Retrieve the current stream's author."""
        stream = self.now_playing_stream
        author = ''
        if stream:
            author = to_ascii(stream['v'].author).encode("utf-8")
        return author

    def current_audio_stream_file_size(self):
        """Retrieve the current stream's file size."""
        stream = self.now_playing_stream
        size = 0
        if stream:
            size = stream['a'].get_filesize()
        return size

    def current_audio_stream_duration(self):
        """Retrieve the current stream's duration."""
        stream = self.now_playing_stream
        duration = ''
        if stream:
            duration = to_ascii(stream['v'].duration).encode("utf-8")
        return duration

    def current_audio_stream_bitrate(self):
        """Retrieve the current stream's bitrate."""
        stream = self.now_playing_stream
        bitrate = ''
        if stream:
            bitrate = stream['a'].bitrate
        return bitrate

    def current_audio_stream_view_count(self):
        """Retrieve the current stream's view count."""
        stream = self.now_playing_stream
        viewcount = 0
        if stream:
            viewcount = stream['v'].viewcount
        return viewcount

    def current_audio_stream_description(self):
        """Retrieve the current stream's description."""
        stream = self.now_playing_stream
        description = ''
        if stream:
            description = to_ascii(stream['v'].description).encode("utf-8")
        return description

    def current_audio_stream_file_extension(self):
        """Retrieve the current stream's file extension."""
        stream = self.now_playing_stream
        file_extension = ''
        if stream:
            file_extension = to_ascii(stream['a'].extension).encode("utf-8")
        return file_extension

    def current_audio_stream_video_id(self):
        """Retrieve the current stream's video id."""
        stream = self.now_playing_stream
        video_id = ''
        if stream:
            video_id = to_ascii(stream['i'].ytid).encode("utf-8")
        return video_id

    def current_audio_stream_published(self):
        """Retrieve the current stream's upload date and time."""
        stream = self.now_playing_stream
        published = ''
        if stream:
            published = to_ascii(stream['v'].published).encode("utf-8")
        return published

    def current_audio_stream_queue_index_and_queue_length(self):
        """Retrieve the index in the queue (starting from 1) of the current
        stream and the length of the playback queue.
        """
        return self.queue_index + 1, len(self.queue)

    def clear_queue(self):
        """Clear the playback queue."""
        self.queue = list()
        self.queue_index = -1

    def remove_current_url(self):
        """Remove the currently active url from the playback queue."""
        logging.info("")
        # Compare against 0 explicitly; testing the bare index would skip
        # removal of the first entry in the queue.
        if len(self.queue) and self.queue_index >= 0:
            stream = self.queue[self.queue_index]
            print_nfo("[YouTube] [Stream] '{0}' removed."
                      .format(to_ascii(stream['i'].title).encode("utf-8")))
            del self.queue[self.queue_index]
            self.queue_index -= 1
            if self.queue_index < 0:
                self.queue_index = 0
            self.__update_play_queue_order()

    def next_url(self):
        """Retrieve the url of the next stream in the playback queue."""
        logging.info("")
        try:
            if len(self.queue):
                self.queue_index += 1
                if (self.queue_index < len(self.queue)) \
                        and (self.queue_index >= 0):
                    next_stream = self.queue[
                        self.play_queue_order[self.queue_index]]
                    return self.__retrieve_stream_url(
                        next_stream, self.queue_index).rstrip()
                else:
                    self.queue_index = -1
                    return self.next_url()
            else:
                return ''
        except (KeyError, AttributeError):
            # TODO: We don't remove this for now
            # del self.queue[self.queue_index]
            logging.info("KeyError, or AttributeError exception")
            return self.next_url()
        except IOError:
            # Remove this video
            del self.queue[self.queue_index]
            logging.info("IOError exception")
            return self.next_url()

    def prev_url(self):
        """Retrieve the url of the previous stream in the playback queue."""
        logging.info("")
        try:
            if len(self.queue):
                self.queue_index -= 1
                if (self.queue_index < len(self.queue)) \
                        and (self.queue_index >= 0):
                    prev_stream = self.queue[
                        self.play_queue_order[self.queue_index]]
                    return self.__retrieve_stream_url(
                        prev_stream, self.queue_index).rstrip()
                else:
                    self.queue_index = len(self.queue)
                    return self.prev_url()
            else:
                return ''
        except (KeyError, AttributeError):
            # TODO: We don't remove this for now
            # del self.queue[self.queue_index]
            logging.info("exception")
            return self.prev_url()
        except IOError:
            # Remove this video and keep moving backwards
            del self.queue[self.queue_index]
            logging.info("IOError exception")
            return self.prev_url()

    def __update_play_queue_order(self):
        """Update the queue playback order.

        A sequential order is applied if the current play mode is "NORMAL" or
        a random order if the current play mode is "SHUFFLE".
        """
        total_streams = len(self.queue)
        if total_streams:
            if not len(self.play_queue_order):
                # Create a sequential play order, if empty
                self.play_queue_order = range(total_streams)
            if self.current_play_mode == self.play_modes.SHUFFLE:
                random.shuffle(self.play_queue_order)
            print_nfo("[YouTube] [Streams in queue] '{0}'."
                      .format(total_streams))

    def __retrieve_stream_url(self, stream, queue_index):
        """Retrieve a stream url."""
        try:
            if not len(self.workers):
                for _ in range(WORKER_PROCESSES):
                    # Start each worker and keep its handle; chaining
                    # Process(...).start() would store None.
                    proc = Process(target=obtain_stream,
                                   args=(self.task_queue, self.done_queue))
                    proc.start()
                    self.workers.append(proc)

            while not self.done_queue.empty():
                stream = self.done_queue.get()
                self.queue[stream['q']] = stream

            stream = self.queue[queue_index]
            if not stream.get('v') or not stream.get('a'):
                logging.info("ytid : %s", stream['i'].ytid)
                video = stream.get('v')
                if not video:
                    video = pafy.new(stream['i'].ytid)
                audio = video.getbestaudio(preftype="webm")
                if not audio:
                    logging.info("no suitable audio found")
                    raise AttributeError()
                stream.update({'a': audio, 'v': video})

            # streams = stream.get('v').audiostreams[::-1]
            # pprint.pprint(streams)
            # dump_stream_info(streams)

            self.now_playing_stream = stream
            return stream['a'].url.encode("utf-8")
        except AttributeError:
            logging.info("Could not retrieve the stream url!")
            raise

    def add_to_playback_queue(self, audio=None, video=None, info=None):
        """Add to the playback queue."""
        if audio:
            print_nfo("[YouTube] [Stream] '{0}' [{1}]."
                      .format(to_ascii(audio.title).encode("utf-8"),
                              to_ascii(audio.extension)))
        if info:
            print_nfo("[YouTube] [Stream] '{0}'."
                      .format(to_ascii(info.title).encode("utf-8")))
        queue_index = len(self.queue)
        self.task_queue.put(dict(a=audio, v=video, i=info, q=queue_index))
        self.queue.append(dict(a=audio, v=video, i=info, q=queue_index))
# A queue of tasks consumed by a small pool of threads.
import threading
import time
from queue import Queue  # assumed; a multiprocessing queue would also work
from multiprocessing.dummy import Pool  # thread pool, matching the code below

tasks = Queue()
for i in range(9):
    tasks.put(str(i) * 2)


def main(name):
    while True:
        time.sleep(1)
        if name.empty():
            print("name is over.")
            break
        else:
            thread = threading.current_thread().getName()
            print("thread: %s prints: %s" % (thread, name.get()))


# Start four threads.
pool = Pool(4, main, (tasks,))
while True:
    time.sleep(5)
    if tasks.empty():
        print("tasks is over.")
        # Terminate the thread pool.
        pool.terminate()
        break
print("main is over.")
# URL_Fetcher, logOut/logDbg, extract_data_xpath, secHTTP_WAIT_TIMEOUT and
# the Pool/Queue classes come from the project's own helper modules.
class ParallelDownloader(URL_Fetcher):
    """Parallel threaded web page downloader."""

    def __init__(self, db_name, proc_count, site_base_url, fUseCache=True,
                 fCacheSearchPages=True, fUseCookies=False,
                 timeout=secHTTP_WAIT_TIMEOUT, search_proc_count=2,
                 proxies=None):
        self.proxies = proxies
        self.queue = Queue()
        self.fSaveSearchPages = fCacheSearchPages
        self.site_base_url = site_base_url
        self.pool = Pool(processes=proc_count)
        self.search_queue = Queue()
        self.url_extract_pool = Pool(processes=search_proc_count)
        URL_Fetcher.__init__(self, db_name, fUseCache, fUseCookies,
                             timeout=timeout, proxies=proxies)

    def process_urls_from_search_queue(self):
        while not self.search_queue.empty():
            search_page_url = self.search_queue.get()
            # logOut('search pages queue size: %d' % self.search_queue.qsize())
            logDbg('search page: %s' % search_page_url)
            search_page = self.get_page(search_page_url,
                                        fUseCache=self.fSaveSearchPages)
            rel_urls = extract_data_xpath(search_page, self.url_extract_xpath)
            # rel_urls = self.extract_page_xpath(self.url_extract_xpath, search_page_url)
            # logOut('URLs from %s extracted' % search_page_url)
            logOut('%d urls extracted from [%s]. Queuing...'
                   % (len(rel_urls), search_page_url))
            logDbg('Extracted urls: %s. Queuing to download...' % rel_urls)
            list(map(self.queue.put, self.prefix_site_base_url(rel_urls)))
            self.queue.put(None)
            self.postprocess_search_page_list(rel_urls, search_page)

    def queue_pages(self, url_list):
        list(map(self.queue.put, url_list))
        # Sentinel marking the end of the task queue.
        self.queue.put(None)

    def postprocess_search_page_list(self, url, page):
        pass

    def prefix_site_base_url(self, rel_urls):
        return [self.site_base_url + url for url in rel_urls]

    def process_pages(self, page_processor, *add_processor_args):
        self.page_processor = page_processor
        self.add_pprocessor_args = add_processor_args
        self.pool.apply(self.process_page)

    def process_page(self):
        while True:
            url = self.queue.get()
            logDbg('Url got from queue: %s' % url)
            if not url:
                break
            page = self.get_page(url)  # , proxies=self.proxies
            # logOut('pp_arg_list: [%s]' % pp_arg_list)
            if page:
                self.page_processor(url, page, *self.add_pprocessor_args)
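# A hypothetical subclass sketching the intended call sequence: set the
# extraction XPath, queue the search pages, expand them into item URLs,
# then hand each downloaded page to a processor. Every name introduced here
# (ExampleDownloader, save_page, the selector and URLs) is illustrative only.
class ExampleDownloader(ParallelDownloader):
    url_extract_xpath = '//a[@class="item"]/@href'  # assumed selector


def save_page(url, page, out_dir):
    logOut('processed %s (%d bytes) -> %s' % (url, len(page), out_dir))

# dl = ExampleDownloader('cache.db', proc_count=4,
#                        site_base_url='https://example.com')
# dl.search_queue.put('https://example.com/search?p=1')
# dl.process_urls_from_search_queue()
# dl.process_pages(save_page, '/tmp/pages')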