示例#1
0
def loop_vitems_to_correct_publishdatetime():
  """
  Recalculate the publish datetime of every YTVideoItemInfoSA record and
    update the ones whose recalculated date differs from the stored publishdate.

  Each divergent record is reported both to stdout and to the log file
    'z_pubdate_analysis.log' (written in the current working directory),
    and is committed right after its publishdatetime is replaced.
  A summary line with the number of corrected records and the total number
    of records closes the report.

  The log file and the db-session are released even if an exception is raised midway.
  """
  filename = 'z_pubdate_analysis.log'
  with open(filename, 'w', encoding='utf8') as fp:
    session = Session()
    try:
      vitems = session.query(YTVideoItemInfoSA).all()
      n_need_change = 0
      for i, vi in enumerate(vitems):
        recalc_publishdtime = vi.recalc_n_return_publishdtime_from_infodtime_n_calendarstr()
        recalc_publishdt = dtfs.convert_datetime_to_date(recalc_publishdtime)
        if recalc_publishdt == vi.publishdate:
          # stored date already agrees with the recalculated one
          continue
        n_need_change += 1
        line = str(n_need_change) + '/' + str(i+1) + ' infdt ' + str(vi.infodate) + ' pubdt ' \
               + str(vi.publishdate) + ' calend ' + str(vi.published_time_ago) + '\n'
        line += ' recalc ' + str(recalc_publishdt) + ' [ needs updating ]' + '\n'
        fp.write(line)
        print(line)
        vi.publishdatetime = recalc_publishdtime
        # committed record by record, so an interruption keeps the corrections made so far
        session.commit()
      line = 'Number of corrected records = ' + str(n_need_change) + ' // Total video items records = '\
             + str(len(vitems)) + '\n'
      fp.write(line)
      print(line)
    finally:
      session.close()
  print('Written', filename)
 def test_convert_datetime_to_date(self):
   """
   Checks dtfs.convert_datetime_to_date() against three kinds of input, all
     expected to result in today's date:
     1) a datetime built from today's year, month and day;
     2) the datetime returned by datetime.datetime.now();
     3) an EmptyMock object carrying only the year, month and day attributes.
   """
   today = datetime.date.today()
   # case 1: datetime built from today's date fields
   todays_start = datetime.datetime(year=today.year, month=today.month, day=today.day)
   self.assertEqual(today, dtfs.convert_datetime_to_date(todays_start))
   # case 2: the current datetime
   right_now = datetime.datetime.now()
   self.assertEqual(today, dtfs.convert_datetime_to_date(right_now))
   # case 3: a non-datetime object exposing only year, month and day
   datelike = EmptyMock()
   for fieldname in ('year', 'month', 'day'):
     setattr(datelike, fieldname, getattr(today, fieldname))
   self.assertEqual(today, dtfs.convert_datetime_to_date(datelike))
示例#3
0
def update_subscribers(ytchannel, refdate, n_of_subs, dt, session):
    """
    Insert or update the daily-subscribers record of a channel for a date.

    The date part of dt must equal refdate. If a record already exists for
    (ytchannel.ytchannelid, date), its subscribers and infotime fields are
    updated only when they differ from the incoming values; otherwise a new
    record is created. The module-level counter gcount is incremented for
    every write (update or insert) that is committed.

    :param ytchannel: object exposing the attribute ytchannelid
    :param refdate: the date the subscribers number refers to
    :param n_of_subs: the number of subscribers to store
    :param dt: datetime from which the record's date and time are derived
    :param session: the db-session used for querying and committing
    :raises ValueError: if the date part of dt differs from refdate
    """
    global gcount
    pdate = dtfs.convert_datetime_to_date(dt)
    if pdate != refdate:
        error_msg = 'Error: pdate (%s) != refdate (%s) dt=(%s)' % (pdate,
                                                                   refdate, dt)
        raise ValueError(error_msg)
    ptime = dtfs.extract_time_from_datetime(dt)
    subs = session.query(sam.YTDailySubscribersSA).\
      filter(sam.YTDailySubscribersSA.ytchannelid == ytchannel.ytchannelid).\
      filter(sam.YTDailySubscribersSA.infodate == pdate).\
      first()
    if subs:
        was_changed = False
        if subs.subscribers != n_of_subs:
            subs.subscribers = n_of_subs
            was_changed = True
        # compare time against time (it was formerly compared against n_of_subs,
        # which made every call look like a change)
        if subs.infotime != ptime:
            subs.infotime = ptime
            was_changed = True
        if was_changed:
            gcount += 1
            print(gcount, 'db-update', subs)
            session.commit()
        return
    subs = sam.YTDailySubscribersSA()
    subs.ytchannelid = ytchannel.ytchannelid
    subs.subscribers = n_of_subs
    subs.infodate = pdate
    subs.infotime = ptime
    # a freshly constructed object must be attached to the session, otherwise commit() won't insert it
    session.add(subs)
    gcount += 1
    print(gcount, 'db-insert', subs)
    session.commit()
示例#4
0
def transport_osdatetime_to_null_infotime_values_in_subscribers():
    """
    Fill in the infotime of daily-subscribers records that have it null, taking
    the time from the modification timestamp of each record's dated page file.

    Records are visited in infodate order. When the file's modification date
    differs from the record's infodate, the record is left untouched and the
    module-level counter g_nsubs_is_none is incremented; otherwise infotime is
    set and the module-level counter gcount is incremented. All changes are
    committed once, at the end; the session is closed even if an error
    (e.g. from os.stat()) interrupts the loop.
    """
    global g_nsubs_is_none, gcount
    session = con.Session()
    try:
        subs = session.query(sam.YTDailySubscribersSA).\
          filter(sam.YTDailySubscribersSA.infotime.is_(None)).\
          order_by(sam.YTDailySubscribersSA.infodate).\
          all()
        for i, sub in enumerate(subs):
            sname = sub.ytchannel.sname
            filepath = autof.form_datedpage_filepath_with_triple(
                sub.infodate, sname, sub.ytchannelid)
            print(sname, sub, 'infotime', sub.infotime)
            mtime = os.stat(filepath).st_mtime
            dt = datetime.datetime.fromtimestamp(mtime)
            print(i + 1, dt, filepath)
            pdate = dtfs.convert_datetime_to_date(dt)
            ptime = dtfs.extract_time_from_datetime(dt)
            if sub.infodate != pdate:
                # the file was modified on another day: its time cannot be trusted for this record
                g_nsubs_is_none += 1
                line = '============= sub.infodate %s != pdate %s =============' % (
                    sub.infodate, pdate)
                print(line)
                continue
            gcount += 1
            sub.infotime = ptime

        print('g_nsubs_is_none', g_nsubs_is_none)
        print('gcount', gcount)
        session.commit()
    finally:
        session.close()
示例#5
0
def compare_prefixdate_with_osstatdate():
    """
    Compare the date prefix of each ytvideopage filename (its first 10 chars)
    with the date of the file's modification timestamp, and print the result.

    Two tables are printed: the first with the files whose prefix date equals
    the file's date (together with the hour of the day, rounded up when past
    the 30th minute and before 23h), the second with the files whose dates
    differ. A final line shows the totals.

    The timestamp is read from st_mtime; index 7 of the os.stat() tuple, used
    formerly, is st_atime (last access), which changes whenever the file is read.
    """
    seq = 0
    count_equal_dates = 0
    field_names = ['seq', 'strdate', 'dayhour', 'dt']
    ptab = remake_ptab(field_names)
    ptab_out = remake_ptab(field_names)
    for seq, ytvideopage_abspath in enumerate(
            autof.generate_all_ytvideopages_abspath_asc_date(), start=1):
        ytvideopage = os.path.basename(ytvideopage_abspath)
        strdate = ytvideopage[:10]
        dt = datetime.datetime.fromtimestamp(os.stat(ytvideopage_abspath).st_mtime)
        filesdate = dtfs.convert_datetime_to_date(dt)
        if strdate != str(filesdate):
            ptab_out.add_row([seq, strdate, '', dt])
            continue
        count_equal_dates += 1
        dayhour = dt.hour
        # round to the next hour, except for hour 23 (which would overflow the day)
        if dt.minute > 30 and dt.hour < 23:
            dayhour += 1
        ptab.add_row([seq, strdate, dayhour, dt])
    print(ptab)
    print(ptab_out)
    print('seq', seq, 'count_equal_dates', count_equal_dates)
示例#6
0
def verify_videopagefiles_w_no_corresponding_dbsubs():
    """
    Look for video page files that have no corresponding daily-subscribers
    record in db and create the missing records by scraping the files.

    For every page file without a record for its (ytchannelid, date):
      - the file is skipped if its modification date differs from the date in its filepath;
      - the file is skipped if the scraper returns None for the number of subscribers;
      - otherwise a record is added (infotime taken from the file's modification time)
        and committed at once.

    Historical notes kept from the original author:
      About 800 pages were committed with this function, ie they received
        missing subscriber numbers. There is one 'subscriber_number' per day
        per channel and some were missing within the last 3 months.
      However, after about 800 recups, there were still 22 missing, with the
        scrape result returning None; ie, there were yet 22 pages that maybe
        demand the old scraping routine for fetching n_of_subscribers; it's
        probably possible to treat them, picking up the 'museum' code.
      The above is a TO-DO (Monday, 24 August 2020 01:43), ie try to rescrape
        these with the old routine.

    :return: None
    """
    count = 0
    n_commits = 0
    session = con.Session()
    try:
        for abspath in autof.generate_all_ytvideopages_abspath_asc_date():
            strdate, sname, ytchannelid = regexp.find_triple_date_sname_n_ytchid_in_filepath(
                abspath)
            subs = session.query(sam.YTDailySubscribersSA).\
              filter(sam.YTDailySubscribersSA.ytchannelid == ytchannelid).\
              filter(sam.YTDailySubscribersSA.infodate == strdate).\
              first()
            if subs:
                continue
            count += 1
            print(count, strdate, sname, ytchannelid, abspath)
            timestamp = os.stat(abspath).st_mtime
            dt = datetime.datetime.fromtimestamp(timestamp)
            filedate = dtfs.convert_datetime_to_date(dt)
            pdate = dtfs.get_refdate_from_strdate_or_none(strdate)
            if pdate != filedate:
                print('strdate', strdate, 'pdate', pdate, 'filedate', filedate,
                      'dt', dt)
                continue
            filetime = dtfs.extract_time_from_datetime(dt)
            # context manager: the file handle is released as soon as its text is read
            with open(abspath, encoding='utf8') as pagefile:
                text = pagefile.read()
            n_of_subscribers = scrape_n_return_number_of_subscribers_from_channels_pagetext(
                text)
            if n_of_subscribers is None:
                continue
            subs = sam.YTDailySubscribersSA()
            subs.ytchannelid = ytchannelid
            subs.infodate = pdate
            subs.infotime = filetime
            subs.subscribers = n_of_subscribers
            session.add(subs)
            n_commits += 1
            print('n_commits', n_commits, 'committing', subs)
            session.commit()
        print('n_commits', n_commits, 'missing', count)
    finally:
        session.close()
示例#7
0
def update_infotime_for_all_ytvideopages():
    """
    List the ytvideopage files whose filename date prefix (first 10 chars)
    equals the date of the file's modification timestamp.

    For each such file a line is printed with a running sequence number, the
    filename, the raw timestamp and the corresponding datetime. Nothing is
    written to db by this function.

    The timestamp is read from st_mtime; index 7 of the os.stat() tuple, used
    formerly, is st_atime (last access), not the modification time.
    """
    seq = 0
    for ytvideopage_abspath in autof.generate_all_ytvideopages_abspath_asc_date():
        filename = os.path.basename(ytvideopage_abspath)
        filesdatetime = os.stat(ytvideopage_abspath).st_mtime
        pdatetime = datetime.datetime.fromtimestamp(filesdatetime)
        pdate = dtfs.convert_datetime_to_date(pdatetime)
        strdate = filename[:10]
        if strdate != str(pdate):
            continue
        seq += 1
        print(seq, filename, 'filesdatetime', filesdatetime, pdatetime)
示例#8
0
def update_subscribers_scrape_for_date(refdate):
    """
    Scrape the number of subscribers from the dated page files of refdate and
    store them via update_subscribers().

    For each (ytchannelid, file-modified-timestamp) pair found for refdate:
      - pairs whose channel is not found in db are skipped;
      - pairs whose scrape returns None are skipped and counted in the
        module-level counter g_nsubs_is_none;
      - the others are passed on to update_subscribers().

    The session is closed even when update_subscribers() raises (it raises
    ValueError when the timestamp's date differs from refdate).

    :param refdate: the date whose page files are to be processed
    """
    global g_nsubs_is_none
    sess = con.Session()
    try:
        ytchannelids_n_cdatetimes = autof.find_ytchannelid_n_videopagefilemodifiedtimestamp_tuplelist_for_date(
            refdate)
        for ytchannelid, cdatetime in ytchannelids_n_cdatetimes:
            ytchannel = fetcher.fetch_ytchannel_with_ytchannelid(ytchannelid, sess)
            if ytchannel is None:
                continue
            dt = datetime.datetime.fromtimestamp(cdatetime)
            pdate = dtfs.convert_datetime_to_date(dt)
            n_of_subs = scrape_channel_from_its_datedfile_n_return_number_of_subscribers(
                ytchannel, pdate)
            if n_of_subs is None:
                g_nsubs_is_none += 1
                print(g_nsubs_is_none, 'n_of_subs is None')
                continue
            print(ytchannel.nname, refdate, n_of_subs, dt)
            update_subscribers(ytchannel, refdate, n_of_subs, dt, sess)
    finally:
        sess.close()
def look_up_modifiedtime_of_htmlvideopagefiles(refdate, session):
  """
  Walk the html video page files of refdate and, for each file whose
    modification date equals refdate, call ajust_vviews() with the file's
    modification time and the ytchannelid extracted from its filename.

  Files modified on a different day are reported and skipped.
  The session is committed at the end if any file produced a positive count.

  The counts returned by ajust_vviews() are accumulated over all files;
    formerly only the count of the last file survived, so changes made for
    earlier files could be left uncommitted.
    NOTE(review): this assumes ajust_vviews() returns a per-call count - confirm.

  :param refdate: the date whose html files are looked up
  :param session: the db-session passed on to ajust_vviews() and committed here
  """
  global gcounter
  htmlfilepaths = autof.find_htmlfilepaths_from_date(refdate)
  # counter_vitems stays 0 while the ajust_vitems() call below is disabled
  counter_vitems = 0
  counter_vviews = 0
  for filepath in htmlfilepaths:
    filename = os.path.basename(filepath)
    modified_dt = datetime.datetime.fromtimestamp(os.stat(filepath).st_mtime)
    ptime = dtfs.extract_time_from_datetime(modified_dt)
    mdate = dtfs.convert_datetime_to_date(modified_dt)
    if mdate != refdate:
      line = 'cdate (%s) != refdate (%s)' % (mdate, refdate)
      print(line)
      continue
    ytchannelid = pathfs.extract_ytchid_from_filename(filename)
    # counter_vitems = ajust_vitems(refdate, ytchannelid, ptime, session)
    counter_vviews += ajust_vviews(refdate, ytchannelid, ptime, session)

  if counter_vviews > 0 or counter_vitems > 0:
    print('Committing counter with', counter_vviews, counter_vitems, gcounter)
    session.commit()
 def publishdate(self):
     """
     Returns the result of dtfs.convert_datetime_to_date() applied to self.publishdatetime.
     """
     publish_dtime = self.publishdatetime
     return dtfs.convert_datetime_to_date(publish_dtime)