def get_args():
    """
    Scan sys.argv and build the date-range dict used by the caller.

    Recognized arguments (matched by prefix, in command-line order):
      -h...      calls show_help_n_exit()
      --all...   returns at once with the range given by
                 find_ini_fim_full_date_range_in_data_n_confim()
      --ini=VAL  stores the converted VAL under 'dateini'
      --fim=VAL  stores the converted VAL under 'datefim'

    VAL is converted with dtfs.get_refdate_from_strdate_or_none, so an
    unparseable value ends up stored as None.

    :return: dict with the keys 'dateini' and 'datefim'
    """
    parsed = {'dateini': None, 'datefim': None}
    date_options = (('--ini=', 'dateini'), ('--fim=', 'datefim'))
    for cli_arg in sys.argv:
        if cli_arg.startswith('-h'):
            show_help_n_exit()
            continue
        if cli_arg.startswith('--all'):
            first_date, last_date = find_ini_fim_full_date_range_in_data_n_confim()
            return {'dateini': first_date, 'datefim': last_date}
        for prefix, key in date_options:
            if not cli_arg.startswith(prefix):
                continue
            try:
                value = cli_arg[len(prefix):]
                parsed[key] = dtfs.get_refdate_from_strdate_or_none(value)
            except IndexError:
                # best effort: keep whatever was stored before for this key
                pass
            break
    return parsed
def find_dateini_n_datefin_thru_yyyymmdd_level3_folders(level0abspath=None):
    """
    Return the (dateini, datefin) pair derived from the oldest and the
    newest yyyymmdd level-3 folder names under the level-0 folder.

    :param level0abspath: passed through find_level0folderabspath() before use
    :return: tuple (dateini, datefin); each item is whatever
        dtfs.get_refdate_from_strdate_or_none yields for the folder name
    """
    level0abspath = find_level0folderabspath(level0abspath)
    oldest_foldername = find_oldest_yyyymmdd_level3_foldername(level0abspath)
    newest_foldername = find_newest_yyyymmdd_level3_foldername(level0abspath)
    to_refdate = dtfs.get_refdate_from_strdate_or_none
    return to_refdate(oldest_foldername), to_refdate(newest_foldername)
def init_dates(self, dateini=None, datefim=None):
    '''
    Set self.dateini, self.datefim and self.datepointer.

    When both parameters are None, both bounds are taken from
    dtfs.return_refdate_as_datetimedate_or_today(). When only one of them
    is given, the missing bound becomes a copy of the given one.

    Conditions:
      1) if ini is greater than fim, an exception (error) will be raised;
      2) if fim is greater than today, an exception (error) will also be raised;
      3) if a given value cannot be converted by
         dtfs.get_refdate_from_strdate_or_none (it returns None),
         an exception (error) will be raised.

    :param dateini: lower bound of the range, or None
    :param datefim: upper bound of the range, or None
    :return: None
    :raises ValueError: on any of the conditions above
    '''
    if dateini is None and datefim is None:
        self.dateini = dtfs.return_refdate_as_datetimedate_or_today()
        self.datefim = dtfs.return_refdate_as_datetimedate_or_today()
    else:
        rdateini = None
        rdatefim = None
        if dateini is not None:
            rdateini = dtfs.get_refdate_from_strdate_or_none(dateini)
            if rdateini is None:
                # report the rejected input itself, not the (always None) conversion result
                error_msg = 'parameter dateini (%s) is an invalid date. Please, retry with a valid date.' % str(dateini)
                raise ValueError(error_msg)
        if datefim is not None:
            rdatefim = dtfs.get_refdate_from_strdate_or_none(datefim)
            if rdatefim is None:
                error_msg = 'parameter datefim (%s) is an invalid date. Please, retry with a valid date.' % str(datefim)
                raise ValueError(error_msg)
        # a missing bound mirrors the one that was supplied
        self.dateini = rdateini if rdateini is not None else copy.copy(rdatefim)
        self.datefim = rdatefim if rdatefim is not None else copy.copy(rdateini)
    today = datetime.date.today()
    if self.datefim > today:
        error_msg = 'Error: datafim is greater than today: self.datefim (%s) > today (%s). Please, correct datafim and retry.' % (
            self.datefim, today)
        raise ValueError(error_msg)
    if self.dateini > self.datefim:
        error_msg = 'Error: dataini is greater than datafim: self.dateini (%s) > self.datefim (%s). Please, invert them and retry.' % (
            self.dateini, self.datefim)
        raise ValueError(error_msg)
    # iteration starts at the lower bound
    self.datepointer = copy.copy(self.dateini)
def find_yyyymmdd_level3_foldernames():
    """
    Collect, across every path returned by
    find_2ndlevel_yyyymm_dir_abspaths(), the directory entries whose name
    dtfs.get_refdate_from_strdate_or_none converts to a truthy value.

    :return: flat list of the matching entry names, in listing order
    """
    dated_foldernames = []
    for level2_abspath in find_2ndlevel_yyyymm_dir_abspaths():
        dated_foldernames.extend(
            [
                entryname
                for entryname in os.listdir(level2_abspath)
                if dtfs.get_refdate_from_strdate_or_none(entryname)
            ]
        )
    return dated_foldernames
def set_refdate(self):
    """
    Derive self._refdate from the first 10 characters of self.filename.

    :return: None
    :raises ValueError: when the filename has fewer than 11 characters, or
        when dtfs.get_refdate_from_strdate_or_none returns None for its
        10-character prefix
    """
    fname = self.filename
    if len(fname) < 11:
        raise ValueError(
            'Error: len(self.filename) < 11 (%s) when trying to derive strdate.' % str(fname)
        )
    date_prefix = fname[:10]
    parsed_date = dtfs.get_refdate_from_strdate_or_none(date_prefix)
    if parsed_date is None:
        raise ValueError(
            'Error: refdate %s has not been found; filename %s.' % (str(date_prefix), fname)
        )
    self._refdate = parsed_date
def verify_videopagefiles_w_no_corresponding_dbsubs():
    """
    Walk every video page file and, for each one that has no
    YTDailySubscribersSA row for its (ytchannelid, date), scrape the
    subscriber number from the page text and commit a new row.

    A page is skipped when:
      - a row already exists for that channel and date;
      - the date in the file path differs from the file's modification date;
      - the scraper returns None for the page text.

    Historical note:
    About 800 pages were committed with this function, ie they received
    missing subscriber numbers. There is one 'subscriber_number' per day
    per channel and some were missing within the last 3 months.
    However, after about 800 recups, there are still 22 missing, with the
    scrape result returning None; ie, there are yet 22 pages that maybe
    demand the old scraping routine for fetching n_of_subscribers; it's
    probably possible to treat them, picking up the 'museum' code.
    This above is a TO-DO (Monday, 24 August 2020 01:43), ie try to
    rescrape these with the old routine.

    :return: None
    """
    count = 0
    n_commits = 0
    session = con.Session()
    try:
        for abspath in autof.generate_all_ytvideopages_abspath_asc_date():
            strdate, sname, ytchannelid = regexp.find_triple_date_sname_n_ytchid_in_filepath(
                abspath)
            subs = session.query(sam.YTDailySubscribersSA).\
                filter(sam.YTDailySubscribersSA.ytchannelid == ytchannelid).\
                filter(sam.YTDailySubscribersSA.infodate == strdate).\
                first()
            if subs:
                continue
            count += 1
            print(count, strdate, sname, ytchannelid, abspath)
            # the file's modification time supplies the info time of the new row
            timestamp = os.stat(abspath).st_mtime
            dt = datetime.datetime.fromtimestamp(timestamp)
            filedate = dtfs.convert_datetime_to_date(dt)
            pdate = dtfs.get_refdate_from_strdate_or_none(strdate)
            if pdate != filedate:
                print('strdate', strdate, 'pdate', pdate,
                      'filedate', filedate, 'dt', dt)
                continue
            filetime = dtfs.extract_time_from_datetime(dt)
            # context manager guarantees the page file is closed after reading
            with open(abspath, encoding='utf8') as pagefile:
                text = pagefile.read()
            n_of_subscribers = scrape_n_return_number_of_subscribers_from_channels_pagetext(
                text)
            if n_of_subscribers is None:
                continue
            subs = sam.YTDailySubscribersSA()
            subs.ytchannelid = ytchannelid
            subs.infodate = pdate
            subs.infotime = filetime
            subs.subscribers = n_of_subscribers
            session.add(subs)
            n_commits += 1
            print('n_commits', n_commits, 'committing', subs)
            session.commit()
        print('n_commits', n_commits, 'missing', count)
    finally:
        # release the session even when a page fails midway
        session.close()
def endswith_htmls_n_startswith_date(filename):
    """
    Tell whether filename has an extension listed in config.HTML_EXTLIST
    and a name whose first 10 characters convert to a date.

    :param filename: file name to check; None yields False
    :return: True when both conditions hold, False otherwise
    """
    if filename is None:
        return False
    name, ext = os.path.splitext(filename)
    # splitext always returns strings, so an absent extension is just ''
    if not ext:
        return False
    if ext not in config.HTML_EXTLIST:
        return False
    # slicing never raises IndexError; a short name simply yields a short prefix
    pdate = dtfs.get_refdate_from_strdate_or_none(name[:10])
    return pdate is not None
def is_htmldatedfilename_under_convention(filename):
    """
    Convention is "(10-char-strdate) ([publisher-nname]) (title-or-slug)(.ext)"

    The filename passes when:
      - it has at least 11 characters;
      - its first 10 characters convert to a date via
        dtfs.get_refdate_from_strdate_or_none;
      - it contains both '[' and ']', with the first '[' not after the
        first ']';
      - its extension is listed in config.HTML_EXTLIST.

    :param filename: file name to check; None yields False
    :return: True when the filename follows the convention, False otherwise
    """
    if filename is None or len(filename) < 11:
        return False
    if dtfs.get_refdate_from_strdate_or_none(filename[:10]) is None:
        return False
    pos_open_squarebracket = filename.find('[')
    pos_close_squarebracket = filename.find(']')
    # str.find returns -1 when absent: the convention requires both brackets
    if pos_open_squarebracket < 0 or pos_close_squarebracket < 0:
        return False
    if pos_open_squarebracket > pos_close_squarebracket:
        return False
    _, ext = os.path.splitext(filename)
    return ext in config.HTML_EXTLIST