def initiate_collections(cls, since: datetime) -> None:
    """ Initiate the visa status storage with the file based data.

        Drops the current collections, then replays every on-disk data file
        from `since` up to today into `visa_status` (one document per
        visa-type/embassy/day) and `overview` (per-day earliest/latest dates).
        NOTE(review): dates here are stored as-is, without the UTC/embassy-tz
        handling done by `initiate_collections_tz` — presumably the legacy path.
    """
    # Day-by-day date range from `since`'s midnight through today's midnight (inclusive).
    since_midnight = since.replace(hour=0, minute=0, second=0, microsecond=0)
    today_midnight = datetime.combine(datetime.now().date(), datetime.min.time())
    date_range = [since_midnight + timedelta(days=d) for d in range((today_midnight - since_midnight).days + 1)]
    embassy_lst = USEmbassy.get_embassy_lst()
    cls.drop()
    cls.visa_status.create_index([('write_date', pymongo.ASCENDING)])
    for vt in VISA_TYPES:
        for emb in embassy_lst:
            print()  # newline between targets; inner loop prints with end='\r'
            accumulated_inserted = 0
            for date in date_range:
                file_path = util.construct_data_file_path(vt, emb.location, date.strftime('%Y/%m/%d'))
                if not os.path.exists(file_path):
                    continue  # no crawl data for this day
                with open(file_path) as f:
                    # each line parses into a (write_time-of-day, available_date) pair
                    fetched_result_lst = [util.file_line_to_dt(ln) for ln in f.readlines()]
                    available_dates_arr = [
                        {'write_time': datetime.combine(date.date(), wt), 'available_date': avai_dt}
                        for wt, avai_dt in fetched_result_lst
                    ]
                # one document per (visa_type, embassy, date) holding all fetches of that day
                cls.visa_status.insert_one(
                    {
                        'visa_type': vt,
                        'embassy_code': emb.code,
                        'write_date': date,
                        'available_dates': available_dates_arr
                    }
                )
                if len(available_dates_arr) > 0:
                    earliest_dt = min([d['available_date'] for d in available_dates_arr])
                    latest_dt = max([d['available_date'] for d in available_dates_arr])
                    # append this day's min/max to the per-target overview array
                    cls.overview.update_one(
                        {'visa_type': vt, 'embassy_code': emb.code},
                        {
                            '$push': {
                                'overview': {
                                    'write_date': date,
                                    'earliest_date': earliest_dt,
                                    'latest_date': latest_dt,
                                }
                            }
                        },
                        upsert=True,
                    )
                accumulated_inserted += len(available_dates_arr)
                print(
                    f'Inserted: {vt}-{emb.location}-{date.year}/{date.month}/{date.day}\
\t\t{len(available_dates_arr)}\trecords |\t{accumulated_inserted} in total',
                    end='\r'
                )
def restore_overview(cls) -> None:
    """ This method should only be used when `mongorestore` is executed
        and the `tuixue.visa_status` collection is restored.

        Rebuilds the `overview` collection from `visa_status`: flattens every
        fetch record, buckets it by the embassy-local calendar day, and pushes
        one (write_date, earliest, latest) entry per day per target.
    """
    cls.drop('overview')
    embassy_lst = USEmbassy.get_embassy_lst()
    for visa_type in VISA_TYPES:
        for emb in embassy_lst:
            print()  # newline between targets; inner prints use end='\r'
            avai_dt_cache = defaultdict(list)
            # flatten every stored fetch of this (visa_type, embassy) pair
            all_avai_dt = cls.visa_status.aggregate([
                {'$match': {'visa_type': visa_type, 'embassy_code': emb.code}},
                {'$unwind': '$available_dates'},
                {
                    '$project': {
                        '_id': False,
                        'write_time': '$available_dates.write_time',
                        'available_date': '$available_dates.available_date'
                    }
                },
            ])
            for adt in all_avai_dt:
                write_time_utc = adt['write_time']
                available_date = adt['available_date']
                # overview buckets by the *embassy local* day, tz info dropped
                write_time_emb = write_time_utc.astimezone(emb.timezone)
                write_date_emb = write_time_emb.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
                avai_dt_cache[write_date_emb].append(available_date)
            for write_date, avai_dt_arr in avai_dt_cache.items():
                if len(avai_dt_arr) > 0:
                    earliest_dt, latest_dt = min(avai_dt_arr), max(avai_dt_arr)
                    cls.overview.update_one(
                        {'visa_type': visa_type, 'embassy_code': emb.code},
                        {
                            '$push': {
                                'overview': {
                                    'write_date': write_date,
                                    'earliest_date': earliest_dt,
                                    'latest_date': latest_dt,
                                }
                            }
                        },
                        upsert=True,
                    )
                    print(
                        'Update tuixue.overview: {}\t{}\t\t\t{}'.format(
                            visa_type, emb.location, write_date.strftime('%Y/%m/%d')
                        ),
                        end='\r'
                    )
def save_fetched_visa_status(
        cls,
        visa_type: VisaType,
        embassy_code: EmbassyCode,
        write_time: datetime,
        available_date: Optional[datetime],
) -> None:
    """ The method called when a new fetched result is obtained
        from crawler backend. The `'latest_written'` collection will
        always be modified, whereas the `'available_dates'` collection
        will only be modified when available date is not None
    """
    embassy = USEmbassy.get_embassy_by_code(embassy_code)
    # visa_status/latest_written are keyed by the UTC day; overview is keyed
    # by the embassy-local day (naive, tzinfo stripped).
    write_time_utc = write_time.astimezone(tz=None).astimezone(tz=timezone.utc)
    write_date_utc = write_time_utc.replace(hour=0, minute=0, second=0, microsecond=0)
    write_date_emb = write_time_utc.astimezone(embassy.timezone)\
        .replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
    query = {'visa_type': visa_type, 'embassy_code': embassy_code}
    visa_status_query = {**query, 'write_date': write_date_utc}
    overview_query = {**query, 'overview.write_date': write_date_emb}
    new_fetch = {'write_time': write_time_utc, 'available_date': available_date}
    # update document if exists, otherwise insert a new document
    cls.latest_written.update_one(query, {'$set': new_fetch}, upsert=True)
    if available_date is not None:
        cls.visa_status.update_one(visa_status_query, {'$push': {'available_dates': new_fetch}}, upsert=True)
        if cls.overview.find_one(overview_query) is None:
            # $(update) of array can't work with upsert, so push the first
            # entry of this embassy-local day explicitly
            cls.overview.update_one(
                query,
                {
                    '$push': {
                        'overview': {
                            'write_date': write_date_emb,
                            'earliest_date': available_date,
                            'latest_date': available_date
                        }
                    }
                },
                upsert=True
            )
        else:
            # widen the existing day's [earliest, latest] range in place
            cls.overview.update_one(
                overview_query,
                {
                    '$min': {'overview.$.earliest_date': available_date},
                    '$max': {'overview.$.latest_date': available_date},
                }
            )
def find_visa_status_past24h_turning_point(
        cls,
        visa_type: VisaType,
        embassy_code: EmbassyCode,
        timestamp: datetime,
):
    """ Fill in the missing minute and return the visa status detail with
        consecutive duplicate removed.

        Returns None when there is no data in the past-24h window; otherwise a
        dict like `find_visa_status_past24h`'s result but with `available_dates`
        reduced to turning points: consecutive fetches with the same
        available_date are collapsed, and gaps longer than the expected fetch
        interval are marked with an `available_date: None` sentinel.
        All times are epoch-milliseconds after `convert`.
    """
    visa_status = cls.find_visa_status_past24h(visa_type, embassy_code, timestamp)
    if visa_status is None or len(visa_status['available_dates']) == 0:
        return
    embassy = USEmbassy.get_embassy_by_code(embassy_code)
    # expected crawl interval for this system/visa type, in ms
    interval = CGI_FETCH_TIME_INTERVAL[visa_type] if embassy.sys == 'cgi' else AIS_FETCH_TIME_INTERVAL[visa_type]
    interval = (interval + 60) * 1000  # add 1min tolerance

    def convert(dt: datetime):
        return dt_to_utc(dt, remove_second=True)

    available_dates = [{
        'write_time': convert(i['write_time']),
        'available_date': i['available_date'],
    } for i in visa_status['available_dates']]
    ts_start, ts_end = list(map(convert, visa_status['time_range']))
    purified_available_dates = []
    first_dp = available_dates[0]
    if first_dp['write_time'] - ts_start > 1:
        # gap before the first fetch: mark the window start as "no data"
        purified_available_dates = [{'write_time': ts_start, 'available_date': None}]
    # BUG FIX: always keep the first data point here. The previous code only
    # appended it inside the pairwise loop (`if i == 0`), so a window holding
    # exactly one fetch lost its only data point.
    purified_available_dates.append(first_dp)
    for prev_dp, next_dp in zip(available_dates[:-1], available_dates[1:]):
        if next_dp['write_time'] - prev_dp['write_time'] <= interval:
            if prev_dp['available_date'] == next_dp['available_date']:
                continue  # consecutive duplicate - not a turning point
            else:
                purified_available_dates.append(next_dp)
        else:
            # fetch gap longer than expected: insert a None marker one minute
            # after the last good point, then the next real point
            purified_available_dates.append({'write_time': prev_dp['write_time'] + 60000, 'available_date': None})
            purified_available_dates.append(next_dp)
    last_dp = available_dates[-1]
    if ts_end - last_dp['write_time'] > interval:
        # trailing gap up to the window end: mark as "no data"
        purified_available_dates.append({'write_time': last_dp['write_time'] + 60000, 'available_date': None})
    return {
        **visa_status,
        'time_range': [ts_start, ts_end],
        'available_dates': purified_available_dates,
    }
def initiate_latest_written_parallel(cls, sys: str) -> None:
    """ write an empty latest_written record for every embassy and visa type.
        this method pick the latest `write_date` for a `(visa_type, embassy_code)` pair,
        then get the last written record from `available_dates` array of it. And
        overwrite the whole `last_written` collection.

        All targets are resolved in a single aggregation using one `$facet`
        sub-pipeline per (visa_type, embassy_code) pair.
    """
    # embassies served by the requested crawler system ('cgi'/'ais')
    embassy_code_lst = [emb.code for emb in USEmbassy.get_embassy_lst() if emb.sys == sys]
    # one {visa_type, embassy_code, write_date(max)} triple per stored target
    query_param = list(cls.visa_status.aggregate([
        {
            '$group': {
                '_id': {'visa_type': '$visa_type', 'embassy_code': '$embassy_code'},
                'write_date': {'$max': '$write_date'},
            },
        },
        {'$replaceRoot': {'newRoot': {'$mergeObjects': ['$_id', {'write_date': '$write_date'}]}}},
    ]))
    query_param = [query for query in query_param if query['embassy_code'] in embassy_code_lst]
    # each facet fetches the last element of `available_dates` of the matched
    # (latest) document; the facets are then merged back into one result stream
    last_effective_write = cls.visa_status.aggregate([
        {'$facet': {'{}{}'.format(q['visa_type'], q['embassy_code']): [
            {'$match': q},
            {
                '$project': {
                    '_id': False,
                    'visa_type': True,
                    'embassy_code': True,
                    'available_date': {'$slice': ['$available_dates.available_date', -1]},
                },
            },
            {'$unwind': '$available_date'},
        ] for q in query_param}},
        {
            '$project': {
                'facet_result': {
                    '$setUnion': ['${}{}'.format(q['visa_type'], q['embassy_code']) for q in query_param],
                },
            },
        },
        {'$unwind': '$facet_result'},
        {'$replaceRoot': {'newRoot': '$facet_result'}},
        # stamp every record with the current UTC time
        {'$set': {'write_time': datetime.now(timezone.utc)}},
    ], allowDiskUse=True)
    # full overwrite of the collection
    cls.latest_written.drop()
    cls.latest_written.insert_many(list(last_effective_write))
def send_subscription_confirmation(cls, email: str, subs_lst: List[DB.EmailSubscription]):
    """ Send the email for confirmation of email subscription.

        Builds a confirmation URL carrying every (visa_type, code, till)
        tuple as repeated query parameters, renders the subscriptions as an
        HTML list, and retries sending up to 10 times.
        Returns True when the email was sent, False otherwise.
    """
    confirmation_url = URL(
        f'https://{FRONTEND_BASE_URI}/visa/email/subscription')
    confirmation_url.query_param.set('email', email)
    for visa_type, code, till in subs_lst:
        # repeated params: one triple per subscription
        confirmation_url.query_param.append('visa_type', visa_type.value)
        confirmation_url.query_param.append('code', code.value)
        confirmation_url.query_param.append('till', till)
    subscription_str = '<ul>\n{}\n</ul>'.format('\n'.join([
        '<li>{} Visa at {} till {}.</li>'.format(
            VISA_TYPE_DETAILS[vt],
            next((e.name_en for e in USEmbassy.get_embassy_lst() if e.code == ec), 'None'),
            tl.strftime('%Y/%m/%d') if tl != datetime.max else 'FOREVER',
        ) for vt, ec, tl in subs_lst
    ]))
    content = SUBSCRIPTION_CONFIRMATION_CONTENT.format(
        user=email.split('@')[0],
        email=email,
        subscription_str=subscription_str,
        confirmation_url=confirmation_url,
    )
    for _ in range(10):  # for robust
        sent = cls.send_email(
            title=SUBSCRIPTION_CONFIRMATION_TITLE.format(email=email),
            content=content,
            receivers=[email])
        if sent:
            break
    else:
        sent = False  # all 10 attempts failed
    return sent
def initiate_latest_written_sequential(cls, sys: str, backtrack_hr: int = 12) -> None:
    """ Initiate latest_written in sequential order.

        Like `initiate_latest_written_parallel`, but only considers documents
        whose `write_date` falls within roughly the last `backtrack_hr` hours
        (rounded to whole days), and updates `latest_written` one target at a
        time instead of in a single `$facet` aggregation.
    """
    embassy_code_lst = [emb.code for emb in USEmbassy.get_embassy_lst() if emb.sys == sys]
    # candidate write_date values: midnights from `backtrack_hr` ago to today
    now = datetime.now()
    start = datetime.combine((now - timedelta(hours=backtrack_hr)).date(), datetime.min.time())
    end = datetime.combine(now.date(), datetime.min.time())
    dates = [start + timedelta(days=d) for d in range((end - start).days + 1)]
    # latest write_date per (visa_type, embassy_code) within the window
    query_param = cls.visa_status.aggregate([
        {'$match': {'write_date': {'$in': dates}}},
        {
            '$group': {
                '_id': {'visa_type': '$visa_type', 'embassy_code': '$embassy_code'},
                'write_date': {'$max': '$write_date'},
            },
        },
        {'$replaceRoot': {'newRoot': {'$mergeObjects': ['$_id', {'write_date': '$write_date'}]}}},
    ], allowDiskUse=True)
    for query in query_param:
        if query['embassy_code'] not in embassy_code_lst:
            continue  # belongs to the other crawler system
        # last recorded available_date of that latest document; write_time is a
        # client-side literal stamped into every projected record
        cursor = cls.visa_status.aggregate([
            {'$match': query},
            {
                '$project': {
                    '_id': False,
                    'write_time': datetime.now(timezone.utc),
                    'available_date': {'$slice': ['$available_dates.available_date', -1]},
                },
            },
            {'$unwind': '$available_date'},
        ], allowDiskUse=True)
        # latest_written is keyed by (visa_type, embassy_code) only
        query.pop('write_date')
        for last_effective_fetch in cursor:
            cls.latest_written.update_one(query, {'$set': last_effective_fetch}, upsert=True)
def send_subscription_confirmation(cls, email: str, subs_lst: List[DB.EmailSubscription]):
    """ Send the email for confirmation of email subscription.

        Encodes every (visa_type, code, till) tuple into the confirmation
        url query string, renders the subscriptions as an HTML list, and
        retries sending the email up to 10 times. Returns the send result.
    """
    # Transpose the subscription tuples into parallel query-parameter lists.
    keys = ('visa_type', 'code', 'till')
    query_dct = {key: [] for key in keys}
    for sub in subs_lst:
        for key, value in zip(keys, sub):
            query_dct[key].append(value)
    # Construct the redirect frontend url
    query_str = urlencode(query_dct, doseq=True, quote_via=quote)
    confirmation_url = urlunsplit(('https', 'tuixue.online', '/subscription/email', query_str, ''))
    # Human-readable HTML list of the subscriptions for the email body.
    items = []
    for vt, ec, tl in subs_lst:
        embassy_name = next((e.name_en for e in USEmbassy.get_embassy_lst() if e.code == ec), 'None')
        till_str = tl.strftime('%Y/%m/%d') if tl != datetime.max else 'FOREVER'
        items.append('<li>{} Visa at {} till {}.</li>'.format(vt, embassy_name, till_str))
    subscription_str = '<ul>\n{}\n</ul>'.format('\n'.join(items))
    content = SUBSCRIPTION_CONFIRMATION_CONTENT.format(
        email=email,
        subscription_str=subscription_str,
        confirmation_url=confirmation_url,
    )
    sent = False
    for _ in range(10):  # for robust
        sent = cls.send_email(
            title=SUBSCRIPTION_CONFIRMATION_TITLE.format(email=email),
            content=content,
            receivers=[email])
        if sent:
            break
    return sent
def send_unsubscription_confirmation(cls, email: str):
    """ Send the email for confirmation of email unsubscription.

        When the address has no subscriptions, a dedicated "nothing to
        unsubscribe" email is sent instead. Otherwise the email contains one
        unsubscribe link per subscription plus an unsubscribe-all link.
        Returns True when an email was sent, False otherwise.
    """
    subs_lst_by_email = DB.Subscription.get_subscriptions_by_email(email)
    if len(subs_lst_by_email) == 0:  # If the user has no subscription/email doesn't exist
        for _ in range(10):
            sent = cls.send_email(
                title=UNSUBSCRIPTION_EMPTY_SUBS_TITLE.format(email=email),
                content=UNSUBSCRIPTION_EMPTY_SUBS_CONTENT.format(
                    user=email.split('@')[0],
                    email=email,
                    base_uri=FRONTEND_BASE_URI),
                receivers=[email],
            )
            if sent:
                break
        else:
            sent = False  # all 10 attempts failed
        return sent
    unsubs_url = URL(
        f'https://{FRONTEND_BASE_URI}/visa/email/unsubscription'
    )  # Unsubscription confirmation url
    unsubs_url.query_param.set('email', email)
    unsubs_all_url = unsubs_url.copy()
    unsubs_info = []
    for subs in subs_lst_by_email:
        # per-subscription unsubscribe link
        url = unsubs_url.copy()
        url.query_param.set('visa_type', subs['visa_type'])
        url.query_param.set('code', subs['embassy_code'])
        url.query_param.set('till', subs['till'])
        unsubs_info.append((subs['visa_type'], subs['embassy_code'], subs['till'], subs['expired'], url))
        # accumulate every subscription into the unsubscribe-all link
        unsubs_all_url.query_param.append('visa_type', subs['visa_type'])
        unsubs_all_url.query_param.append('code', subs['embassy_code'])
        unsubs_all_url.query_param.append('till', subs['till'])
    unsubscription_str = '{}'.format('\n'.join([
        '<li>{} Visa at {} {} on {}: click <a href="{}">this link</a> to unsubscribe.</li>'
        .format(
            VISA_TYPE_DETAILS[vt],
            next((e.name_en for e in USEmbassy.get_embassy_lst() if e.code == ec), 'None'),
            'expired' if exp else 'expiring',
            tl.strftime('%Y/%m/%d') if tl.year < 9999 else 'FOREVER',
            url,
        ) for vt, ec, tl, exp, url in unsubs_info
    ]))
    content = UNSUBSCRIPTION_CONFIRMATION_CONTENT.format(
        user=email.split('@')[0],
        email=email,
        unsubscription_str=unsubscription_str,
        unsubscribe_all_url=unsubs_all_url,
    )
    for _ in range(10):
        sent = cls.send_email(title=UNSUBSCRIPTION_CONFIRMATION_TITLE,
                              content=content,
                              receivers=[email])
        if sent:
            break
    else:
        sent = False  # all 10 attempts failed
    return sent
def find_visa_status_overview_embtz(
        cls,
        visa_type: Union[VisaType, List[VisaType]],
        embassy_code: Union[EmbassyCode, List[EmbassyCode]],
        since_utc: datetime,
        to_utc: datetime,
):
    """ This method fix the problem of `cls.find_visa_status_overview` as the
        previous method doesn't convert the querying date into the embassy timezone.

        Queries the `overview` collection (keyed by embassy-local days) for
        every requested (visa_type, embassy_code) pair, maps each embassy-local
        day back onto the corresponding UTC day of the request, and returns the
        results grouped by date, newest first.
    """
    def dt_to_date(dt: datetime) -> datetime:
        # truncate to naive midnight
        return dt.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

    def utc_to_embtz(dt: datetime, embtz: timezone) -> datetime:
        # the embassy-local calendar day containing the given UTC moment
        return dt_to_date(dt.astimezone(embtz))

    def single_target_query(visa_type: str, embassy_code: str, date_range: List[datetime]) -> List[dict]:
        """ construct the sub-pipeline for mongodb aggregation `facet` stage."""
        return [
            {'$match': {'visa_type': visa_type, 'embassy_code': embassy_code}},
            {
                '$project': {
                    'visa_type': '$visa_type',
                    'embassy_code': '$embassy_code',
                    'overview': {
                        '$filter': {
                            'input': '$overview',
                            'as': 'ov',
                            'cond': {'$in': ['$$ov.write_date', date_range]}
                        }
                    }
                }
            },
            {'$unwind': '$overview'},
            {
                '$project': {
                    '_id': False,
                    'visa_type': '$visa_type',
                    'embassy_code': '$embassy_code',
                    'write_date': '$overview.write_date',
                    'earliest_date': '$overview.earliest_date',
                    'latest_date': '$overview.latest_date',
                }
            },
        ]

    # accept both a single value and a list for either filter
    if not isinstance(visa_type, list):
        visa_type = [visa_type]
    if not isinstance(embassy_code, list):
        embassy_code = [embassy_code]
    # one query target per (visa_type, embassy) pair, each with the requested
    # range expressed in that embassy's local days
    overview_target = [
        {
            'visa_type': vt,
            'embassy_code': emb.code,
            'date_range': [
                utc_to_embtz(since_utc, emb.timezone) + timedelta(days=d)
                for d in range(
                    (utc_to_embtz(to_utc, emb.timezone) - utc_to_embtz(since_utc, emb.timezone)).days + 1
                )
            ],
        }
        for vt in visa_type
        for emb in [USEmbassy.get_embassy_by_code(ec) for ec in embassy_code]
    ]
    utc_date_range = [
        dt_to_date(since_utc) + timedelta(days=d)
        for d in range((dt_to_date(to_utc) - dt_to_date(since_utc)).days + 1)
    ]
    # per embassy: embassy-local day -> the UTC day at the same offset in range
    embtz_utc_map = {tgt['embassy_code']: dict(zip(tgt['date_range'], utc_date_range)) for tgt in overview_target}
    # one $facet sub-pipeline per target, then merge the facets back into a
    # single stream of overview records
    query = [
        {
            '$facet': {
                '{}{}'.format(tgt['visa_type'], tgt['embassy_code']): single_target_query(**tgt)
                for tgt in overview_target
            },
        },
        {
            '$project': {
                'facet_result': {
                    '$setUnion': ['${}{}'.format(tgt['visa_type'], tgt['embassy_code']) for tgt in overview_target]
                }
            },
        },
        {'$unwind': '$facet_result'},
        {'$replaceRoot': {'newRoot': '$facet_result'}}
    ]
    overview_embtz = list(cls.overview.aggregate(query))
    # rewrite each record's write_date from embassy-local back to UTC
    overview_utc = [{
        **ov,
        'write_date': embtz_utc_map[ov['embassy_code']][ov['write_date']],
    } for ov in overview_embtz]
    ov_groupby_date = defaultdict(list)
    for overview in overview_utc:
        write_date = overview.pop('write_date')
        ov_groupby_date[write_date].append(overview)
    return sorted(
        [{'date': write_date, 'overview': overview} for write_date, overview in ov_groupby_date.items()],
        key=lambda ov: ov['date'],
        reverse=True,
    )
def initiate_collections_tz(cls, since: datetime) -> None:
    """ Initiate the database with following handling of datetime object regarding timezone.

        1. All of the `available_date` data are stored as is. (what we fetch is what we store)
        2. All of the `write_time` and `write_date` data in Mongo collections **`visa_status`**
            and **`latest_written`** are stored in UTC+0 standard time.
        3. **(Very important here)** All of the `write_time` and `write_date` data in
            Mongo collection **`overview`** are stored in the time in the local time zone
            of a given U.S. Embassy location. e.g. The overview data of U.S. Embassy in
            Phnom Pend on the date Oct 10th, 2020 stands for the time range
            `"2020-10-10T00:00+07:00"` to `"2020-10-10T23:59+07:00"`, **NOT**
            `"2020-10-10T00:00+00:00"` to `"2020-10-10T23:59+00:00"`.
        4. All time data in a HTTP request from frontend **must be** a UTC standard time.
            The `Date.toISOString` is the default way we construct the time related query
            in a request url in frontend. FastAPI backend should add a layer of logic that
            consolidate the received datetime object must have a `tzinfo` attribute
            otherwise should return a 422 status code.
    """
    # Day-by-day date range from `since`'s midnight through today's midnight (inclusive).
    since_midnight = since.replace(hour=0, minute=0, second=0, microsecond=0)
    today_midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    date_range = [since_midnight + timedelta(days=d) for d in range((today_midnight - since_midnight).days + 1)]
    embassy_lst = USEmbassy.get_embassy_lst()
    cls.drop()
    cls.visa_status.create_index([('write_date', pymongo.ASCENDING)])
    for vt in VISA_TYPES:
        for emb in embassy_lst:
            print()  # Go to a new line (inner loop using end='\r')
            avai_dt_cache_utc = defaultdict(list)  # UTC write_date -> fetch dicts
            avai_dt_cache_emb = defaultdict(list)  # embassy-local write_date -> available dates
            for date in date_range:
                file_path = util.construct_data_file_path(vt, emb.location, date.strftime('%Y/%m/%d'))
                if not os.path.exists(file_path):
                    continue  # no crawl data for this day
                with open(file_path) as f:
                    available_dates_arr = [
                        {'write_time': datetime.combine(date.date(), wt), 'available_date': avai_dt}
                        for wt, avai_dt in [util.file_line_to_dt(ln) for ln in f.readlines()]
                    ]
                for adt in available_dates_arr:
                    # normalize the naive local write_time to UTC, then bucket it
                    # by UTC day (for visa_status) and embassy-local day (for overview)
                    write_time_utc = adt['write_time'].astimezone(tz=None).astimezone(tz=timezone.utc)
                    write_date_utc = write_time_utc.replace(hour=0, minute=0, second=0, microsecond=0)
                    write_date_emb = write_time_utc\
                        .astimezone(emb.timezone)\
                        .replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
                    available_date = adt['available_date']
                    avai_dt_cache_utc[write_date_utc].append(
                        {'write_time': write_time_utc, 'available_date': available_date}
                    )
                    avai_dt_cache_emb[write_date_emb].append(available_date)
                    print(' ' * 150, end='\r')  # erase previous print
                    print('Reading: {}-{}-{}'.format(vt, emb.location, date.strftime('%Y/%m/%d')), end='\t\t')
                    print(
                        'UTC\t{}: {}'.format(
                            write_date_utc.strftime('%Y/%m/%d'), len(avai_dt_cache_utc[write_date_utc])
                        ),
                        end='\t'
                    )
                    print(
                        'EMB\t{}: {}'.format(
                            write_date_emb.strftime('%Y/%m/%d'), len(avai_dt_cache_emb[write_date_emb])
                        ),
                        end='\t'
                    )
                    print(
                        '|Total:\tUTC-{}\tEMB-{}'.format(
                            # BUG FIX: the UTC total previously summed
                            # `avai_dt_cache_emb`, so both totals always showed
                            # the embassy-tz count.
                            sum(len(fetches) for fetches in avai_dt_cache_utc.values()),
                            sum(len(dates) for dates in avai_dt_cache_emb.values()),
                        ),
                        end='\r'
                    )
            if len(avai_dt_cache_utc) > 0:
                cls.visa_status.insert_many([  # insert all visa status fetch result in one write
                    {
                        'visa_type': vt,
                        'embassy_code': emb.code,
                        'write_date': write_date,
                        'available_dates': avai_dt_arr,
                    } for write_date, avai_dt_arr in avai_dt_cache_utc.items()
                ])
            else:
                print('Skipping: {}-{} No records'.format(vt, emb.location), end='\r')
            # overview is keyed by the embassy-local day
            for write_date, avai_dt_arr in avai_dt_cache_emb.items():
                if len(avai_dt_arr) > 0:
                    earliest_dt, latest_dt = min(avai_dt_arr), max(avai_dt_arr)
                    cls.overview.update_one(
                        {'visa_type': vt, 'embassy_code': emb.code},
                        {
                            '$push': {
                                'overview': {
                                    'write_date': write_date,
                                    'earliest_date': earliest_dt,
                                    'latest_date': latest_dt,
                                }
                            }
                        },
                        upsert=True,
                    )