def get_args():
    """
    Parse the date-range options given on the command line (sys.argv).

    Recognized arguments (matched by prefix, in the order they appear):
      -h...        calls show_help_n_exit()
      --all        returns at once with the (dateini, datefim) pair produced by
                   find_ini_fim_full_date_range_in_data_n_confim()
      --ini=<str>  sets 'dateini' to dtfs.get_refdate_from_strdate_or_none(<str>)
      --fim=<str>  sets 'datefim' to dtfs.get_refdate_from_strdate_or_none(<str>)

    Any other argument is ignored. The program name (sys.argv[0]) is not
    treated as an option.

    :return: dict with keys 'dateini' and 'datefim'; a value stays None when its
      option is absent or when the dtfs helper returns None for the given text
    """
    outdict = {'dateini': None, 'datefim': None}
    # sys.argv[0] is the program name, not a user-supplied option: skip it.
    for arg in sys.argv[1:]:
        if arg.startswith('-h'):
            show_help_n_exit()
        elif arg.startswith('--all'):
            dtini, dtfim = find_ini_fim_full_date_range_in_data_n_confim()
            return {'dateini': dtini, 'datefim': dtfim}
        elif arg.startswith('--ini='):
            # Slicing a str past its end yields '' and never raises IndexError,
            # so no exception guard is needed around it.
            param = arg[len('--ini='):]
            outdict['dateini'] = dtfs.get_refdate_from_strdate_or_none(param)
        elif arg.startswith('--fim='):
            param = arg[len('--fim='):]
            outdict['datefim'] = dtfs.get_refdate_from_strdate_or_none(param)
    return outdict
def find_dateini_n_datefin_thru_yyyymmdd_level3_folders(level0abspath=None):
  """
  Derive the (dateini, datefin) pair from the level-3 folder names.

  The base path is first resolved through find_level0folderabspath(); the names
  returned by find_oldest_yyyymmdd_level3_foldername() and
  find_newest_yyyymmdd_level3_foldername() are then each converted with
  dtfs.get_refdate_from_strdate_or_none().

  :param level0abspath: base folder path handed to find_level0folderabspath()
    (None by default)
  :return: 2-tuple (dateini, datefin); an element is None when the dtfs helper
    returns None for the corresponding folder name
  """
  basepath = find_level0folderabspath(level0abspath)
  oldest_name = find_oldest_yyyymmdd_level3_foldername(basepath)
  newest_name = find_newest_yyyymmdd_level3_foldername(basepath)
  return tuple(
    dtfs.get_refdate_from_strdate_or_none(foldername)
    for foldername in (oldest_name, newest_name)
  )
    def init_dates(self, dateini=None, datefim=None):
        """
        Set self.dateini, self.datefim and self.datepointer from a date range.

        Rules:
          * both parameters None: both dates come from
            dtfs.return_refdate_as_datetimedate_or_today();
          * only one parameter given: it is used for both ends of the range;
          * both given: each one is resolved on its own.

        A given parameter is resolved with dtfs.get_refdate_from_strdate_or_none();
        self.datepointer ends up as a copy of self.dateini.

        :param dateini: start of the range ("ini"), or None
        :param datefim: end of the range ("fim"), or None
        :return: None
        :raises ValueError: when
          1) a given parameter cannot be resolved to a date (the dtfs helper
             returns None for it);
          2) datefim is greater than today;
          3) dateini is greater than datefim.
        """

        def resolve_or_raise(paramname, value):
            # Report the value the caller actually passed: the resolved result
            # is always None on this error path and would be useless in the message.
            refdate = dtfs.get_refdate_from_strdate_or_none(value)
            if refdate is None:
                error_msg = 'parameter %s (%s) is an invalid date. Please, retry with a valid date.' % (
                    paramname, value)
                raise ValueError(error_msg)
            return refdate

        if dateini is None and datefim is None:
            self.dateini = dtfs.return_refdate_as_datetimedate_or_today()
            self.datefim = dtfs.return_refdate_as_datetimedate_or_today()
        elif dateini is None:
            self.datefim = resolve_or_raise('datefim', datefim)
            self.dateini = copy.copy(self.datefim)
        elif datefim is None:
            self.dateini = resolve_or_raise('dateini', dateini)
            self.datefim = copy.copy(self.dateini)
        else:
            self.dateini = resolve_or_raise('dateini', dateini)
            self.datefim = resolve_or_raise('datefim', datefim)

        today = datetime.date.today()
        if self.datefim > today:
            # The values shown must match the labels in the message text.
            error_msg = 'Error: datafim is greater than today: self.datefim (%s) > today (%s). Please, correct datafim and retry.' % (
                self.datefim, today)
            raise ValueError(error_msg)
        if self.dateini > self.datefim:
            error_msg = 'Error: dataini is greater than datafim: self.dateini (%s) > self.datefim (%s). Please, invert them and retry.' % (
                self.dateini, self.datefim)
            raise ValueError(error_msg)
        self.datepointer = copy.copy(self.dateini)
def find_yyyymmdd_level3_foldernames():
  """
  Gather the date-named entries found under every level-2 path.

  Each path yielded by find_2ndlevel_yyyymm_dir_abspaths() is listed with
  os.listdir(); an entry name is kept when
  dtfs.get_refdate_from_strdate_or_none() gives a truthy result for it.

  :return: flat list of the kept entry names (names only, not absolute paths)
  """
  dated_names = []
  for level2_abspath in find_2ndlevel_yyyymm_dir_abspaths():
    dated_names.extend(
      name for name in os.listdir(level2_abspath)
      if dtfs.get_refdate_from_strdate_or_none(name)
    )
  return dated_names
# Example #5
# 0
 def set_refdate(self):
     """
     Derive self._refdate from the first 10 characters of self.filename.

     The 10-character prefix is resolved with
     dtfs.get_refdate_from_strdate_or_none().

     :return: None
     :raises ValueError: when self.filename has fewer than 11 characters, or
       when the dtfs helper returns None for the prefix
     """
     fname = self.filename
     if len(fname) < 11:
         raise ValueError(
             'Error: len(self.filename) < 11 (%s) when trying to derive strdate.' % str(fname)
         )
     date_prefix = fname[:10]
     resolved = dtfs.get_refdate_from_strdate_or_none(date_prefix)
     if resolved is not None:
         self._refdate = resolved
         return
     raise ValueError(
         'Error: refdate %s has not been found; filename %s.' % (str(date_prefix), fname)
     )
# Example #6
# 0
def verify_videopagefiles_w_no_corresponding_dbsubs():
    """
    Insert missing daily-subscriber rows by rescraping stored video page files.

    For every path yielded by autof.generate_all_ytvideopages_abspath_asc_date()
    the (strdate, sname, ytchannelid) triple is extracted from the path. When no
    sam.YTDailySubscribersSA row exists for that channel id and date, the page
    file is read and scraped for its number of subscribers, and a new row is
    added and committed (one commit per row). A file is skipped when:
      * a row already exists for it;
      * the date taken from its path differs from the date of its modification
        time (the mismatch is printed);
      * the scraper returns None.
    Progress and final totals are printed to stdout.

    Historical notes kept from the original author:
      About 800 pages were committed with this function, ie they received missing
      subscriber numbers. There is one 'subscriber_number' per day per channel and
      some were missing within the last 3 months. However, after about 800 recups,
      there are still 22 missing, with the scrape result returning None; ie, there
      are yet 22 pages that maybe demand the old scraping routine for fetching
      n_of_subscribers; it's probably possible to treat them, picking up the
      'museum' code.
      TO-DO (Monday, 24 August 2020 01:43): try to rescrape these with the old
      routine.

    :return: None
    """
    count = 0
    n_commits = 0
    session = con.Session()
    # try/finally so the session is released even if a file or DB error escapes.
    try:
        for abspath in autof.generate_all_ytvideopages_abspath_asc_date():
            strdate, sname, ytchannelid = regexp.find_triple_date_sname_n_ytchid_in_filepath(
                abspath)
            subs = session.query(sam.YTDailySubscribersSA).\
                filter(sam.YTDailySubscribersSA.ytchannelid == ytchannelid).\
                filter(sam.YTDailySubscribersSA.infodate == strdate).\
                first()
            if subs:
                continue
            count += 1
            print(count, strdate, sname, ytchannelid, abspath)
            # The file's modification time supplies both the date to cross-check
            # and the time stored with the new row.
            timestamp = os.stat(abspath).st_mtime
            dt = datetime.datetime.fromtimestamp(timestamp)
            filedate = dtfs.convert_datetime_to_date(dt)
            pdate = dtfs.get_refdate_from_strdate_or_none(strdate)
            if pdate != filedate:
                print('strdate', strdate, 'pdate', pdate, 'filedate', filedate,
                      'dt', dt)
                continue
            filetime = dtfs.extract_time_from_datetime(dt)
            # Context manager closes the file handle right after reading.
            with open(abspath, encoding='utf8') as pagefile:
                text = pagefile.read()
            n_of_subscribers = scrape_n_return_number_of_subscribers_from_channels_pagetext(
                text)
            if n_of_subscribers is None:
                continue
            subs = sam.YTDailySubscribersSA()
            subs.ytchannelid = ytchannelid
            subs.infodate = pdate
            subs.infotime = filetime
            subs.subscribers = n_of_subscribers
            session.add(subs)
            n_commits += 1
            print('n_commits', n_commits, 'committing', subs)
            session.commit()
        print('n_commits', n_commits, 'missing', count)
    finally:
        session.close()
def endswith_htmls_n_startswith_date(filename):
  """
  Tell whether filename has an html extension and starts with a date.

  The extension (as split by os.path.splitext) must be non-empty and listed in
  config.HTML_EXTLIST, and the first 10 characters of the remaining name must
  be resolved by dtfs.get_refdate_from_strdate_or_none() to something other
  than None.

  :param filename: file name to check; None is accepted and gives False
  :return: True when both conditions hold, False otherwise
  """
  if filename is None:
    return False
  stem, extension = os.path.splitext(filename)
  if not extension or extension not in config.HTML_EXTLIST:
    return False
  return dtfs.get_refdate_from_strdate_or_none(stem[:10]) is not None
# Example #8
# 0
def is_htmldatedfilename_under_convention(filename):
    """
    Tell whether filename follows the dated-html naming convention.

    Convention is "(10-char-strdate) ([publisher-nname]) (title-or-slug)(.ext)",
    ie the name must:
      * be at least 11 characters long;
      * start with 10 characters that dtfs.get_refdate_from_strdate_or_none()
        resolves to something other than None;
      * contain both '[' and ']', with the first '[' not after the first ']';
      * have an extension listed in config.HTML_EXTLIST.

    :param filename: file name to check; None is accepted and gives False
    :return: True when the name satisfies every rule above, False otherwise
    """
    if filename is None or len(filename) < 11:
        return False
    if dtfs.get_refdate_from_strdate_or_none(filename[:10]) is None:
        return False
    pos_open_squarebracket = filename.find('[')
    pos_close_squarebracket = filename.find(']')
    # str.find() returns -1 when the character is absent; the convention
    # requires the bracketed publisher name, so both brackets must be present.
    if pos_open_squarebracket < 0 or pos_close_squarebracket < 0:
        return False
    if pos_open_squarebracket > pos_close_squarebracket:
        return False
    _, ext = os.path.splitext(filename)
    return ext in config.HTML_EXTLIST