def loop_vitems_to_correct_publishdatetime():
    """Recalculate publishdatetime for every video item and persist corrections.

    Walks all YTVideoItemInfoSA records, recomputes the publish datetime from
    the info datetime plus the calendar string, and, whenever the derived
    date differs from the stored publishdate, updates the record and commits.
    Each correction and a final summary are both printed and written to a
    side log file ('z_pubdate_analysis.log').
    """
    filename = 'z_pubdate_analysis.log'
    session = Session()
    # FIX: the original opened the log with a bare open()/close() pair, so an
    # exception anywhere in the loop leaked the file handle; 'with' guarantees
    # closure on every exit path.
    with open(filename, 'w', encoding='utf8') as fp:
        vitems = session.query(YTVideoItemInfoSA).all()
        n_need_change = 0
        for i, vi in enumerate(vitems):
            recalc_publishdtime = vi.recalc_n_return_publishdtime_from_infodtime_n_calendarstr()
            recalc_publishdt = dtfs.convert_datetime_to_date(recalc_publishdtime)
            if recalc_publishdt != vi.publishdate:
                n_need_change += 1
                line = str(n_need_change) + '/' + str(i+1) + ' infdt ' + str(vi.infodate) + ' pubdt ' \
                    + str(vi.publishdate) + ' calend ' + str(vi.published_time_ago) + '\n'
                line += ' recalc ' + str(recalc_publishdt) + ' [ needs updating ]' + '\n'
                fp.write(line)
                print(line)
                vi.publishdatetime = recalc_publishdtime
                # Commit per corrected record so progress survives a crash.
                session.commit()
        line = 'Number of corrected records = ' + str(n_need_change) + ' // Total video items records = '\
            + str(len(vitems)) + '\n'
        fp.write(line)
        print(line)
    session.close()
    print('Written', filename)
def test_convert_datetime_to_date(self):
    """convert_datetime_to_date() should yield the calendar date of any
    object exposing year/month/day attributes."""
    today = datetime.date.today()
    expected_date = today
    # Midnight today converts to today's date.
    input_datetime = datetime.datetime(year=today.year, month=today.month, day=today.day)
    returned_date = dtfs.convert_datetime_to_date(input_datetime)
    self.assertEqual(expected_date, returned_date)
    # Any moment during today still converts to today's date.
    returned_date = dtfs.convert_datetime_to_date(datetime.datetime.now())
    self.assertEqual(expected_date, returned_date)
    # Duck typing: a plain mock carrying year/month/day attributes works too.
    date_mock_obj = EmptyMock()
    date_mock_obj.year = today.year
    date_mock_obj.month = today.month
    date_mock_obj.day = today.day
    self.assertEqual(today, dtfs.convert_datetime_to_date(date_mock_obj))
def update_subscribers(ytchannel, refdate, n_of_subs, dt, session):
    """Insert or update the daily-subscribers row for (channel, refdate).

    :param ytchannel: channel ORM object providing ytchannelid
    :param refdate: expected calendar date; must match the date part of dt
    :param n_of_subs: scraped subscriber count to store
    :param dt: datetime supplying both the date (sanity-checked) and infotime
    :param session: active SQLAlchemy session (committed on change/insert)
    :raises ValueError: when dt's date part does not equal refdate
    """
    global gcount
    pdate = dtfs.convert_datetime_to_date(dt)
    if pdate != refdate:
        error_msg = 'Error: pdate (%s) != refdate (%s) dt=(%s)' % (pdate, refdate, dt)
        raise ValueError(error_msg)
    ptime = dtfs.extract_time_from_datetime(dt)
    subs = session.query(sam.YTDailySubscribersSA).\
        filter(sam.YTDailySubscribersSA.ytchannelid == ytchannel.ytchannelid).\
        filter(sam.YTDailySubscribersSA.infodate == pdate).\
        first()
    if subs:
        was_changed = False
        if subs.subscribers != n_of_subs:
            subs.subscribers = n_of_subs
            was_changed = True
        # BUGFIX: the original compared infotime against n_of_subs (a
        # copy-paste slip from the branch above); infotime is a time-of-day
        # and must be compared against ptime.
        if subs.infotime != ptime:
            subs.infotime = ptime
            was_changed = True
        if was_changed:
            gcount += 1
            print(gcount, 'db-update', subs)
            session.commit()
        return
    # No row for this (channel, date) yet: insert a fresh one.
    subs = sam.YTDailySubscribersSA()
    subs.ytchannelid = ytchannel.ytchannelid
    subs.subscribers = n_of_subs
    subs.infodate = pdate
    subs.infotime = ptime
    gcount += 1
    print(gcount, 'db-insert', subs)
    session.commit()
def transport_osdatetime_to_null_infotime_values_in_subscribers():
    """Backfill NULL infotime columns from the OS mtime of each dated page file.

    For every subscribers row lacking infotime, locates the corresponding
    dated videopage file, takes its filesystem modification time, and — only
    when the file's date agrees with the row's infodate — stores the derived
    time-of-day. Mismatching rows are counted and reported, not changed.
    A single commit persists all updates at the end.
    """
    global g_nsubs_is_none, gcount
    session = con.Session()
    # .is_(None) is the documented SQLAlchemy idiom for IS NULL
    # (same SQL as '== None', but lint-clean).
    subs = session.query(sam.YTDailySubscribersSA).\
        filter(sam.YTDailySubscribersSA.infotime.is_(None)).\
        order_by(sam.YTDailySubscribersSA.infodate).\
        all()
    for i, sub in enumerate(subs):
        sname = sub.ytchannel.sname
        filename = autof.form_datedpage_filename_with_triple(
            sub.infodate, sname, sub.ytchannelid)
        filepath = autof.form_datedpage_filepath_with_triple(
            sub.infodate, sname, sub.ytchannelid)
        print(sname, sub, 'infotime', sub.infotime)
        osstat_nt = os.stat(filepath)
        mtime = osstat_nt.st_mtime
        dt = datetime.datetime.fromtimestamp(mtime)
        print(i + 1, dt, filepath)
        pdate = dtfs.convert_datetime_to_date(dt)
        ptime = dtfs.extract_time_from_datetime(dt)
        if sub.infodate != pdate:
            # File mtime disagrees with the DB date: report and skip.
            g_nsubs_is_none += 1
            line = '============= sub.infodate %s != pdate %s =============' % (
                sub.infodate, pdate)
            print(line)
            continue
        gcount += 1
        sub.infotime = ptime
    print('g_nsubs_is_none', g_nsubs_is_none)
    print('gcount', gcount)
    session.commit()
    session.close()
def compare_prefixdate_with_osstatdate():
    """Compare each videopage filename's date prefix with the file's mtime date.

    Builds two pretty-tables: one for files whose 10-char filename date prefix
    matches the date of the file's modification time (with an hour rounded up
    past the half-hour mark), and one for mismatches. Prints both tables and
    a match count.
    """
    seq = 0
    count_equal_dates = 0
    field_names = ['seq', 'strdate', 'dayhour', 'dt']
    ptab = remake_ptab(field_names)
    ptab_out = remake_ptab(field_names)
    for ytvideopage_abspath in autof.generate_all_ytvideopages_abspath_asc_date():
        seq += 1
        _, ytvideopage = os.path.split(ytvideopage_abspath)
        strdate = ytvideopage[:10]
        # BUGFIX: the original read os.stat(...)[7], which is st_atime
        # (last ACCESS time); every other routine in this module uses the
        # modification time, so use st_mtime here for consistency.
        filesdatetimest = os.stat(ytvideopage_abspath).st_mtime
        dt = datetime.datetime.fromtimestamp(filesdatetimest)
        dayhour = dt.hour
        filesdate = dtfs.convert_datetime_to_date(dt)
        if strdate == str(filesdate):
            count_equal_dates += 1
            # Round up to the next hour past half-past, capped before midnight.
            if dt.minute > 30 and dt.hour < 23:
                dayhour += 1
            ptab.add_row([seq, strdate, dayhour, dt])
        else:
            ptab_out.add_row([seq, strdate, '', dt])
    print(ptab)
    print(ptab_out)
    print('seq', seq, 'count_equal_dates', count_equal_dates)
def verify_videopagefiles_w_no_corresponding_dbsubs():
    """
    About 800 pages were committed with the function below, ie they received
    missing subscriber numbers. There is one 'subscriber_number' per day per
    channel and some were missing within the last 3 months.
    However, after about 800 recups, there are still 22 missing, with
    scraperesult returning None; ie, there are yet 22 pages that maybe demand
    the old scraping routine for fetching n_of_subscribers; it's probably
    possible to treat them, picking up the 'museum' code.
    This above is a TO-DO (segunda, 24 de agosto de 2020 01:43), ie try to
    rescrape these with the old routine.
    :return:
    """
    count = 0
    n_commits = 0
    session = con.Session()
    for abspath in autof.generate_all_ytvideopages_abspath_asc_date():
        strdate, sname, ytchannelid = regexp.find_triple_date_sname_n_ytchid_in_filepath(
            abspath)
        subs = session.query(sam.YTDailySubscribersSA).\
            filter(sam.YTDailySubscribersSA.ytchannelid == ytchannelid).\
            filter(sam.YTDailySubscribersSA.infodate == strdate).\
            first()
        if subs:
            # Row already exists for this channel/date: nothing to recover.
            continue
        count += 1
        print(count, strdate, sname, ytchannelid, abspath)
        t_osstat = os.stat(abspath)
        timestamp = t_osstat.st_mtime
        dt = datetime.datetime.fromtimestamp(timestamp)
        filedate = dtfs.convert_datetime_to_date(dt)
        pdate = dtfs.get_refdate_from_strdate_or_none(strdate)
        if pdate != filedate:
            # Filename date and file mtime disagree: skip rather than guess.
            print('strdate', strdate, 'pdate', pdate, 'filedate', filedate, 'dt', dt)
            continue
        filetime = dtfs.extract_time_from_datetime(dt)
        # FIX: the original left the page file handle open
        # (open(...).read()); 'with' guarantees closure.
        with open(abspath, encoding='utf8') as pagefile:
            text = pagefile.read()
        n_of_subscribers = scrape_n_return_number_of_subscribers_from_channels_pagetext(
            text)
        # print('n_of_subscribers', n_of_subscribers)
        if n_of_subscribers is None:
            continue
        subs = sam.YTDailySubscribersSA()
        subs.ytchannelid = ytchannelid
        subs.infodate = pdate
        subs.infotime = filetime
        subs.subscribers = n_of_subscribers
        session.add(subs)
        n_commits += 1
        print('n_commits', n_commits, 'committing', subs)
        session.commit()
    print('n_commits', n_commits, 'missing', count)
    session.close()
def update_infotime_for_all_ytvideopages():
    """Print each videopage whose filename date prefix matches its mtime date.

    NOTE(review): despite the name, this routine only prints — it performs no
    DB update in the visible code.
    """
    seq = 0
    for ytvideopage_abspath in autof.generate_all_ytvideopages_abspath_asc_date():
        t = os.stat(ytvideopage_abspath)
        _, filename = os.path.split(ytvideopage_abspath)
        # BUGFIX: t[7] is st_atime (last ACCESS time); the rest of this
        # module consistently works with modification time, so use st_mtime.
        filesdatetime = t.st_mtime
        pdatetime = datetime.datetime.fromtimestamp(filesdatetime)
        pdate = dtfs.convert_datetime_to_date(pdatetime)
        strdate = filename[:10]
        if strdate != str(pdate):
            continue
        seq += 1
        print(seq, filename, 'filesdatetime', filesdatetime, pdatetime)
def update_subscribers_scrape_for_date(refdate):
    """Scrape subscriber counts from refdate's dated pages and upsert them.

    For every (channel id, file mtime) pair found for refdate, fetches the
    channel, scrapes its subscriber count from the dated page file, and hands
    the result to update_subscribers(). Channels that cannot be fetched or
    whose scrape yields None are skipped (the latter are counted globally).
    """
    global g_nsubs_is_none
    sess = con.Session()
    pairs = autof.find_ytchannelid_n_videopagefilemodifiedtimestamp_tuplelist_for_date(
        refdate)
    for ytchannelid, cdatetime in pairs:
        ytchannel = fetcher.fetch_ytchannel_with_ytchannelid(ytchannelid, sess)
        if ytchannel is None:
            continue
        dt = datetime.datetime.fromtimestamp(cdatetime)
        pdate = dtfs.convert_datetime_to_date(dt)
        n_of_subs = scrape_channel_from_its_datedfile_n_return_number_of_subscribers(
            ytchannel, pdate)
        if n_of_subs is None:
            g_nsubs_is_none += 1
            print(g_nsubs_is_none, 'n_of_subs is None')
            continue
        print(ytchannel.nname, refdate, n_of_subs, dt)
        update_subscribers(ytchannel, refdate, n_of_subs, dt, sess)
    sess.close()
def look_up_modifiedtime_of_htmlvideopagefiles(refdate, session):
    """Adjust DB records for refdate using the mtime of each html videopage file.

    For each html file found for refdate, derives the file's modification
    date/time; files whose mtime date differs from refdate are reported and
    skipped. For matching files, ajust_vviews() is called with the derived
    time-of-day; commits when a counter is positive.

    NOTE(review): counter_vviews is overwritten (not accumulated) on each
    iteration, and the vitems adjustment is commented out so counter_vitems
    stays 0 — confirm whether per-file overwrite is the intended semantics.
    """
    global gcounter
    htmlfilepaths = autof.find_htmlfilepaths_from_date(refdate)
    counter_vitems = 0
    counter_vviews = 0
    for filepath in htmlfilepaths:
        _, filename = os.path.split(filepath)
        t = os.stat(filepath)
        modified_dt = datetime.datetime.fromtimestamp(t.st_mtime)
        ptime = dtfs.extract_time_from_datetime(modified_dt)
        mdate = dtfs.convert_datetime_to_date(modified_dt)
        if mdate != refdate:
            # File was modified on a different date than expected: skip it.
            line = 'cdate (%s) != refdate (%s)' % (mdate, refdate)
            print(line)
            continue
        ytchannelid = pathfs.extract_ytchid_from_filename(filename)
        # counter_vitems = ajust_vitems(refdate, ytchannelid, ptime, session)
        counter_vviews = ajust_vviews(refdate, ytchannelid, ptime, session)
        if counter_vviews > 0 or counter_vitems > 0:
            print('Committing counter with', counter_vviews, counter_vitems, gcounter)
            session.commit()
def publishdate(self):
    """Return the calendar-date portion of this item's publishdatetime."""
    pdatetime = self.publishdatetime
    return dtfs.convert_datetime_to_date(pdatetime)