def _waitwait( ):
    parser = ArgumentParser( )
    parser.add_argument(
        '--dirname', dest='dirname', type=str, action='store',
        default=_default_inputdir,
        help='Name of the directory to store the file. Default is %s.' % _default_inputdir)
    parser.add_argument(
        '--date', dest='date', type=str, action='store',
        default=get_datestring(_get_last_saturday(datetime.datetime.now())),
        help='The date, in the form of "January 1, 2014." The default is last Saturday, %s.' %
        get_datestring(_get_last_saturday(datetime.datetime.now())))
    parser.add_argument(
        '--dump', dest='do_dump', action='store_true', default=False,
        help='If chosen, download the NPR XML data sheet for this Wait Wait episode.')
    parser.add_argument(
        '--level', dest='level', action='store', type=str, default='NONE',
        choices=sorted(logging_dict),
        help='choose the debug level for downloading NPR Wait Wait episodes or their XML representation of episode info. Can be one of %s. Default is NONE.' % sorted(logging_dict))
    parser.add_argument(
        '--justfix', dest='do_justfix', action='store_true', default=False,
        help="If chosen, just fix the title of an existing NPR Wait Wait episode's file.")
    args = parser.parse_args( )
    logger.setLevel(logging_dict[args.level])
    fname = get_waitwait(
        args.dirname, get_time_from_datestring(args.date),
        dump=args.do_dump, justFix=args.do_justfix)
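# Example invocation, assuming this function is exposed as a console script
# (the script name `waitwait` is an assumption, not confirmed by this file):
#
#   waitwait --dirname ~/waitwait --date "July 19, 2014" --level INFO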
def _process_freshairs_by_year_tuple(input_tuple):
    outputdir, totnum, verbose, datetimes_order_tuples = input_tuple
    driver = npr_utils.get_chrome_driver()
    for date_s, order in datetimes_order_tuples:
        time0 = time.time()
        try:
            fname = get_freshair(outputdir, date_s, order_totnum=(order, totnum), driver=driver)
            logging.info('processed %s in %0.3f seconds.' %
                         (os.path.basename(fname), time.time() - time0))
        except Exception as e:
            logging.error(str(e))
            logging.error(
                'Could not create Fresh Air episode for date %s for some reason' %
                npr_utils.get_datestring(date_s))
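# A minimal sketch of the packed argument this worker expects (the output
# directory and dates here are arbitrary, chosen for illustration; both dates
# are weekdays, as get_freshair requires):
#
#   import datetime
#   dates = [datetime.date(2020, 7, 30), datetime.date(2020, 7, 31)]
#   datetimes_order_tuples = list(zip(dates, range(1, len(dates) + 1)))
#   _process_freshairs_by_year_tuple(
#       ('/tmp/freshair', len(dates), True, datetimes_order_tuples))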
def rm_get_main_url(date_s):
    """
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a Saturday.
    :returns: the full RealMedia_ URL for this older `NPR Wait Wait <waitwait_>`_ episode.
    :rtype: str

    .. _RealMedia: https://en.wikipedia.org/wiki/RealMedia
    """
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, this date given by '%s' is not a Saturday." %
                         npr_utils.get_datestring(date_s))
    mon_lower = date_s.strftime('%b').lower()
    year = date_s.year
    dsub = date_s.strftime('%y%m%d')
    full_rm_url = 'http://www.npr.org/programs/waitwait/archrndwn/%04d/%s/%s.waitwait.html' % (
        year, mon_lower, dsub)
    return full_rm_url
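# Example usage (a sketch; January 4, 2014 is an arbitrary Saturday):
#
#   import datetime
#   url = rm_get_main_url(datetime.date(2014, 1, 4))
#   # -> 'http://www.npr.org/programs/waitwait/archrndwn/2014/jan/140104.waitwait.html'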
def _process_waitwaits_by_year_tuple(input_tuple):
    outputdir, totnum, verbose, datetimes_order_tuples = input_tuple
    driver = npr_utils.get_chrome_driver()
    for date_s, order in datetimes_order_tuples:
        time0 = time.time()
        try:
            fname = get_waitwait(outputdir, date_s, order_totnum=(order, totnum), driver=driver)
            if verbose:
                print('Processed %s in %0.3f seconds.' % (fname, time.time() - time0))
        except Exception as e:
            print('Could not create Wait Wait episode for date %s for some reason.' %
                  npr_utils.get_datestring(date_s))
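# A sketch of how workers like this are typically dispatched; the pool-based
# dispatch is an assumption about the callers (which are not in this section),
# and the dates are arbitrary Saturdays:
#
#   import multiprocessing, datetime
#   dates = [datetime.date(2014, 1, 4), datetime.date(2014, 1, 11)]
#   tuples = list(zip(dates, range(1, len(dates) + 1)))
#   with multiprocessing.Pool(processes=2) as pool:
#       pool.map(_process_waitwaits_by_year_tuple,
#                [('/tmp/waitwait', len(dates), True, tuples[:1]),
#                 ('/tmp/waitwait', len(dates), True, tuples[1:])])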
def rm_download_file(date_s, outdir=os.getcwd()):
    """
    downloads the RealMedia_ `NPR Wait Wait <waitwait_>`_ episode into a specified directory.

    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a Saturday.
    :param str outdir: the directory into which one downloads the `NPR Wait Wait <waitwait_>`_ episode.
    :returns: the name of the RealMedia_ output file.
    :rtype: str
    """
    decdate = npr_utils.get_decdate(date_s)
    outfile = os.path.join(outdir, 'NPR.WaitWait.%s.rm' % decdate)
    try:
        dsub = date_s.strftime('%Y%m%d')
        rm_url = 'http://download.npr.org/real.npr.na-central/waitwait/%s_waitwait.rm' % dsub
        req = urlopen(rm_url)
        # RealMedia content is binary, so write in 'wb' mode.
        with open(outfile, 'wb') as openfile:
            openfile.write(req.read())
        return outfile
    except Exception as e:
        if os.path.isfile(outfile):
            os.remove(outfile)
        raise ValueError(
            "Error, could not download Wait Wait RM file for '%s' into %s." %
            (npr_utils.get_datestring(date_s), outdir))
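# Example usage (a sketch; assumes the archival download URL is still live,
# and July 2, 2005 is an arbitrary Saturday from the RealMedia era):
#
#   import datetime
#   rmfile = rm_download_file(datetime.date(2005, 7, 2), outdir='/tmp')
#   # rmfile is named 'NPR.WaitWait.<decdate>.rm' inside /tmp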
def _freshair():
    parser = ArgumentParser()
    parser.add_argument(
        '--dirname', dest='dirname', type=str, action='store',
        default=_default_inputdir,
        help='Name of the directory to store the file. Default is %s.' % _default_inputdir)
    parser.add_argument(
        '-d', '--date', dest='date', type=str, action='store',
        default=npr_utils.get_datestring(datetime.datetime.now()),
        help='The date, in the form of "January 1, 2014." The default is today\'s date, %s.' %
        npr_utils.get_datestring(datetime.datetime.now()))
    parser.add_argument(
        '--mp3exist', dest='mp3_exist', action='store_true', default=False,
        help=' '.join([
            'If chosen, then do not download the transitional mp3 files.',
            'Use the ones that already exist.']))
    parser.add_argument(
        '-D', '--debug', dest='debug', action='store_true', default=False,
        help='If chosen, dump out NPR Freshair webpage as XML.')
    parser.add_argument(
        '-L', '--level', dest='level', action='store', type=str, default='NONE',
        choices=sorted(logging_dict),
        help='choose the debug level for downloading NPR Fresh Air episodes or their XML representation of episode info. Can be one of %s. Default is NONE.' % sorted(logging_dict))
    parser.add_argument(
        '-r', '--relax', dest='relax_date_check', action='store_true', default=False,
        help='If chosen, then do NOT do a date check validation of NPR URL articles.')
    args = parser.parse_args()
    dirname = os.path.expanduser(args.dirname)
    logger.setLevel(logging_dict[args.level])
    fname = freshair.get_freshair(
        dirname, npr_utils.get_time_from_datestring(args.date),
        debug=args.debug, mp3_exist=args.mp3_exist,
        relax_date_check=args.relax_date_check)
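# Example invocation, assuming this function is exposed as a console script
# (the script name `freshair` is an assumption, not confirmed by this file):
#
#   freshair --dirname ~/freshair -d "July 31, 2020" -L INFO --mp3exist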
def get_title_mp3_urls_attic(outputdir, date_s, debug=False, to_file_debug=True):
    """
    older functionality that uses the `old NPR API` to get an ordered :py:class:`list` of :py:class:`tuple` of stories for an `NPR Fresh Air`_ episode. Here is an example operation,

    .. code-block:: python

       >> date_s = datetime.datetime.strptime('July 31, 2020', '%B %d, %Y' ).date( )
       >> title_mp3_urls = get_title_mp3_urls_attic( '.', date_s )
       >> title_mp3_urls
       >> [('Remembering Regis Philbin, Prolific Talk and Game Show Personality',
            'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_01.mp3'),
           ("With 'Folklore,' Taylor Swift Marks Off Her Past and Enters a New Phase",
            'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_02.mp3'),
           ('Remembering Jazz Singer Annie Ross',
            'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_03.mp3'),
           ("'Muppets Now' Proves It's Not Easy to Capture the Old Muppet Magic",
            'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2020/07/20200731_fa_04.mp3')]

    .. note:: I was able to get this to work by replacing the ``https://`` in the API URL query with ``http://``.

    :param str outputdir: the directory into which one downloads the `NPR Fresh Air`_ episodes.
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a weekday.
    :param bool debug: optional argument, if ``True`` returns the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for the `NPR Fresh Air`_ episode, or its file representation. Default is ``False``.
    :param bool to_file_debug: optional argument, if ``True`` dumps out the file representation of the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for the `NPR Fresh Air`_ episode. Default is ``True``.

    :returns: the :py:class:`list` of stories, in order, for the `NPR Fresh Air`_ episode. The first element of each :py:class:`tuple` is the story title, and the second is the MP3_ URL for the story. *However*, if ``debug`` is ``True`` and ``to_file_debug`` is ``True``, returns the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for this `NPR Fresh Air`_ episode.

    .. seealso::

       * :py:meth:`get_freshair <nprstuff.core.freshair.get_freshair>`.
       * :py:meth:`get_title_mp3_urls_working <nprstuff.core.freshair.get_title_mp3_urls_working>`.
""" # ## download this data into a BeautifulSoup object resp = requests.get('http://api.npr.org/query', params={ 'id': _npr_FreshAir_progid, 'date': date_s.strftime('%Y-%m-%d'), 'dateType': 'story', 'output': 'NPRML', 'apiKey': npr_utils.get_api_key() }) params = { 'id': _npr_FreshAir_progid, 'date': date_s.strftime('%Y-%m-%d'), 'dateType': 'story', 'output': 'NPRML', 'apiKey': npr_utils.get_api_key() } full_URL = 'http://api.npr.org/query?%s' % ('&'.join( map(lambda tup: '%s=%s' % (tup[0], tup[1]), params.items()))) print(full_URL) if not resp.ok: logging.info('ERROR GETTING FRESH AIR STORY FOR %s' % date_s.strftime('%d %B %Y')) return None html = BeautifulSoup(resp.content, 'lxml') # if debug: # print 'URL = %s' % nprURL if to_file_debug: decdate = date_s.strftime('%d.%m.%Y') with open( os.path.join(outputdir, 'NPR.FreshAir.tree.%s.xml' % decdate), 'w') as openfile: openfile.write('%s\n' % html.prettify()) return html # ## check for unavailable tag if len(html.find_all('unavailable', {'value': 'true'})) != 0: unavailable_elem = html.find_all('unavailable', {'value': 'true'})[0] if unavailable_elem.text is None: print( 'Could not create Fresh Air episode for date %s because unavailable without a specific reason' % npr_utils.get_datestring(date_s)) else: print( 'Could not create Fresh Air episode for date %s because unavailable for this reason: %s' % (npr_utils.get_datestring(date_s), unavailable_elem.text.strip())) return None # ## now get tuple of title to mp3 file title_mp3_urls = _process_freshair_titlemp3_tuples(html) if title_mp3_urls is None or len(title_mp3_urls) == 0: print('Error, could not find any Fresh Air episodes for date %s.' % npr_utils.get_datestring(date_s)) return None return title_mp3_urls
def get_waitwait(outputdir, date_s, order_totnum=None, dump=False, driver=None, justFix=False):
    """
    The main driver method that downloads `NPR Wait Wait <waitwait_>`_ episodes for a given date into a specified output directory.

    :param str outputdir: the directory into which one downloads the `NPR Wait Wait <waitwait_>`_ episodes.
    :param date_s: the :py:class:`date <datetime.date>` for this episode, which must be a Saturday.
    :param tuple order_totnum: optional argument, the :py:class:`tuple` of track number and total number of tracks of `NPR Wait Wait <waitwait_>`_ episodes for that year. If ``None``, then this information is gathered from :py:meth:`get_order_number_saturday_in_year <nprstuff.core.npr_utils.get_order_number_saturday_in_year>`.
    :param bool dump: optional argument, if ``True`` returns the :py:class:`BeautifulSoup <bs4.BeautifulSoup>` XML tree for the `NPR Wait Wait <waitwait_>`_ episode (and downloads the XML tree into a file). Default is ``False``.
    :param driver: optional argument, the :py:class:`WebDriver <selenium.webdriver.remote.webdriver.WebDriver>` used for webscraping and querying (instead of using a functional API) for `NPR Wait Wait <waitwait_>`_ episodes. If ``None``, then a new :py:class:`WebDriver <selenium.webdriver.remote.webdriver.WebDriver>` will be defined and used within this method's scope.
    :param bool justFix: optional argument, if ``True`` and if the `NPR Wait Wait <waitwait_>`_ file exists, then just change the title of the M4A_ file. Default is ``False``.

    :returns: the name of the `NPR Wait Wait <waitwait_>`_ episode file.
    :rtype: str
    """
    # check that outputdir is a directory
    if not os.path.isdir(outputdir):
        raise ValueError("Error, %s is not a directory." % outputdir)
    # check that the date actually is a Saturday
    if not npr_utils.is_saturday(date_s):
        raise ValueError("Error, date = %s not a Saturday." %
                         npr_utils.get_datestring(date_s))
    #
    ## if driver is None, create one for this method's scope
    if driver is None:
        driver = npr_utils.get_chrome_driver()
    exec_dict = npr_utils.find_necessary_executables()
    assert (exec_dict is not None)
    avconv_exec = exec_dict['avconv']
    logging.debug('avconv exec = %s.' % avconv_exec)
    if order_totnum is None:
        order_totnum = npr_utils.get_order_number_saturday_in_year(date_s)
    order_in_year, tot_in_year = order_totnum
    file_data = get_waitwait_image()
    year = date_s.year
    decdate = npr_utils.get_decdate(date_s)
    m4afile = os.path.join(outputdir, 'NPR.WaitWait.%s.m4a' % decdate)
    logging.debug(
        'inputs to get_title_mp3_urls_working: m4afile = %s, date_s = %s, driver = %s, dump = %s.'
        % (m4afile, date_s, driver, dump))
    if year >= 2006:
        data = get_title_mp3_urls_working('.', date_s, driver, dump=dump)
        if dump: return data
        title_mp3_urls = data
        if title_mp3_urls is None or len(title_mp3_urls) == 0:
            return None
        titles, songurls = list(zip(*title_mp3_urls))
        title = date_s.strftime('%B %d, %Y')
        title = '%s: %s.' % (title, '; '.join(
            ['%d) %s' % (num + 1, titl) for (num, titl) in enumerate(titles)]))
        if justFix:
            if not os.path.isfile(m4afile):
                print("Error, %s does not exist." % os.path.basename(m4afile))
                return
            mp4tags = mutagen.mp4.MP4(m4afile)
            mp4tags.tags['\xa9nam'] = [title, ]
            mp4tags.save()
            logging.info('fixed title for %s.' % m4afile)
            return m4afile
        logging.info('now processing NPR Wait Wait episode %s, title = %s.' %
                     (date_s, title))
        # temporary directory
        tmpdir = tempfile.mkdtemp()
        m4afile_temp = os.path.join(tmpdir, 'NPR.WaitWait.%s.m4a' % decdate)
        outfiles = [
            os.path.join(tmpdir, 'waitwait.%s.%d.mp3' % (decdate, num + 1))
            for (num, mp3url) in enumerate(songurls)
        ]
        # download those files
        with multiprocessing.Pool(processes=min(multiprocessing.cpu_count(),
                                                len(songurls))) as pool:
            outfiles = sorted(
                filter(None, pool.map(_download_file, zip(songurls, outfiles))))
        # now convert to m4a file
        fnames = list(
            map(lambda filename: filename.replace(' ', r'\ '), outfiles))
        avconv_concat_cmd = 'concat:%s' % '|'.join(fnames)
        split_cmd = [
            avconv_exec, '-y', '-i', avconv_concat_cmd, '-ar', '44100', '-ac',
            '2', '-threads', '%d' % multiprocessing.cpu_count(), '-strict',
            'experimental', '-acodec', 'aac', m4afile_temp
        ]
        logging.info("here is the split command: %s." % split_cmd)
        proc = subprocess.Popen(split_cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        logging.debug("stdout_val: %s." % stdout_val)
        logging.debug("stderr_val: %s." % stderr_val)
        #
        ## remove mp3 files
        for filename in outfiles:
            os.remove(filename)
    else:
        tmpdir = tempfile.mkdtemp()
        title = waitwait_realmedia.rm_get_title_from_url(date_s)
        rmfile = waitwait_realmedia.rm_download_file(date_s, outdir=tmpdir)
        wavfile = waitwait_realmedia.rm_create_wav_file(date_s, rmfile,
                                                        outdir=tmpdir)
        os.remove(rmfile)
        #
        ## now convert to m4a file; the temporary m4a lives inside tmpdir
        ## (writing it directly into outputdir would collide with the final
        ## copy below)
        m4afile_temp = os.path.join(tmpdir, 'NPR.WaitWait.%s.m4a' % decdate)
        split_cmd = [
            avconv_exec, '-y', '-i', wavfile, '-ar', '44100', '-ac', '2',
            '-threads', '%d' % multiprocessing.cpu_count(), '-strict',
            'experimental', '-acodec', 'aac', m4afile_temp
        ]
        proc = subprocess.Popen(split_cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_val, stderr_val = proc.communicate()
        #
        ## remove wav file
        os.remove(wavfile)
    #
    ## now put in metadata
    mp4tags = mutagen.mp4.MP4(m4afile_temp)
    mp4tags.tags['\xa9nam'] = [title, ]
    mp4tags.tags['\xa9alb'] = ["Wait Wait...Don't Tell Me: %d" % year, ]
    mp4tags.tags['\xa9ART'] = ['Peter Sagal', ]
    mp4tags.tags['\xa9day'] = ['%d' % year, ]
    mp4tags.tags['\xa9cmt'] = ["more info at : NPR Web site", ]
    mp4tags.tags['trkn'] = [(order_in_year, tot_in_year), ]
    mp4tags.tags['covr'] = [
        mutagen.mp4.MP4Cover(file_data, mutagen.mp4.MP4Cover.FORMAT_PNG),
    ]
    mp4tags.tags['\xa9gen'] = ['Podcast', ]
    mp4tags.save()
    os.chmod(m4afile_temp, 0o644)
    #
    ## now copy to actual location and remove temp directory
    shutil.copy(m4afile_temp, m4afile)
    shutil.rmtree(tmpdir)
    return m4afile
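# Example usage (a sketch; assumes a working chromedriver and the avconv/ffmpeg
# executable that npr_utils.find_necessary_executables() locates; July 25, 2020
# is an arbitrary Saturday):
#
#   import datetime
#   fname = get_waitwait('/tmp/waitwait', datetime.date(2020, 7, 25))
#   # -> '/tmp/waitwait/NPR.WaitWait.<decdate>.m4a', tagged with the episode
#   #    title, album "Wait Wait...Don't Tell Me: 2020", and track numbering.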