def initiate_collections(cls, since: datetime) -> None:
        """ Initiate the visa status storage with the file based data."""
        since_midnight = since.replace(hour=0, minute=0, second=0, microsecond=0)
        today_midnight = datetime.combine(datetime.now().date(), datetime.min.time())
        date_range = [since_midnight + timedelta(days=d) for d in range((today_midnight - since_midnight).days + 1)]

        embassy_lst = USEmbassy.get_embassy_lst()

        cls.drop()
        cls.visa_status.create_index([('write_date', pymongo.ASCENDING)])

        for vt in VISA_TYPES:
            for emb in embassy_lst:
                print()
                accumulated_inserted = 0
                for date in date_range:
                    file_path = util.construct_data_file_path(vt, emb.location, date.strftime('%Y/%m/%d'))
                    if not os.path.exists(file_path):
                        continue

                    with open(file_path) as f:
                        fetched_result_lst = [util.file_line_to_dt(ln) for ln in f.readlines()]
                        available_dates_arr = [
                            {'write_time': datetime.combine(date.date(), wt), 'available_date': avai_dt}
                            for wt, avai_dt in fetched_result_lst
                        ]

                    cls.visa_status.insert_one(
                        {
                            'visa_type': vt,
                            'embassy_code': emb.code,
                            'write_date': date,
                            'available_dates': available_dates_arr
                        }
                    )

                    if len(available_dates_arr) > 0:
                        earliest_dt = min([d['available_date'] for d in available_dates_arr])
                        latest_dt = max([d['available_date'] for d in available_dates_arr])
                        cls.overview.update_one(
                            {'visa_type': vt, 'embassy_code': emb.code},
                            {
                                '$push': {
                                    'overview': {
                                        'write_date': date,
                                        'earliest_date': earliest_dt,
                                        'latest_date': latest_dt,
                                    }
                                }
                            },
                            upsert=True,
                        )

                    accumulated_inserted += len(available_dates_arr)
                    print(
                        f'Inserted: {vt}-{emb.location}-{date.year}/{date.month}/{date.day}\
                            \t\t{len(available_dates_arr)}\trecords |\t{accumulated_inserted} in total',
                        end='\r'
                    )
Пример #2
0
def fetch_all(since: str):
    """ Fetch global USEbassy data."""
    since_date = datetime.strptime(since, '%Y/%m/%d')
    date_span = datetime.now() - since_date  # - timedelta(days=1)
    date_range = [
        since_date + timedelta(days=d) for d in range(date_span.days + 1)
    ]

    embassy_lst = G.USEmbassy.get_embassy_lst()

    for embassy in embassy_lst:
        if embassy.country != 'CHN':
            path = '/global/crawler'
        else:
            path = '/visa2'

        for visa_type in G.VISA_TYPES:
            for date in date_range:
                dt_str = date.strftime('%Y/%m/%d')
                endpoint = f'{path}/{visa_type}/{embassy.location}/{dt_str}'
                file_path = util.construct_data_file_path(
                    visa_type, embassy.location, dt_str)
                os.makedirs(os.path.join(*file_path.split('/')[:-1]),
                            exist_ok=True)
                if os.path.exists(file_path):
                    print(
                        f'Skipping fetching data: {visa_type}-{embassy.location}-{dt_str}'
                    )
                    continue  # skip if the file is already fetched.

                print(
                    f'Fetching down data: {visa_type}-{embassy.location}-{dt_str}',
                    end=' | ')
                res = requests.get(f'{BASE_URL}{endpoint}', timeout=60)

                with open(file_path, 'w') as f:
                    if res.status_code == 200:
                        f.write(res.text)
                    else:
                        f.write('')

                print('Written')
    def initiate_collections_tz(cls, since: datetime) -> None:
        """ Initiate the database with following handling of datetime object regarding timezone.

            1. All of the `available_date` data are stored as is. (what we fetch is what we store)
            2. All of the `write_time` and `write_date` data in Mongo collections **`visa_status`**
                and **`latest_written`** are stored in UTC+0 standard time.
            3. **(Very important here)** All of the `write_time` and `write_date` data in Mongo
                collection **`overview`** are stored in the time in the local time zone of a given
                U.S. Embassy location. e.g. The overview data of U.S. Embassy in Phnom Pend on the
                date Oct 10th, 2020 stands for the time range `"2020-10-10T00:00+07:00"` to
                `"2020-10-10T23:59+07:00"`, **NOT** `"2020-10-10T00:00+00:00"` to `"2020-10-10T23:59+00:00"`.
            4. All time data in a HTTP request from frontend **must be** a UTC standard time. The
                `Date.toISOString` is the default way we construct the time related query in a request
                url in frontend. FastAPI backend should add a layer of logic that consolidate the received
                datetime object must have a `tzinfo` attribute otherwise should return a 422 status code.
        """
        since_midnight = since.replace(hour=0, minute=0, second=0, microsecond=0)
        today_midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        date_range = [since_midnight + timedelta(days=d) for d in range((today_midnight - since_midnight).days + 1)]

        embassy_lst = USEmbassy.get_embassy_lst()

        cls.drop()
        cls.visa_status.create_index([('write_date', pymongo.ASCENDING)])

        for vt in VISA_TYPES:
            for emb in embassy_lst:
                print()  # Go to a new line (inner loop using end='\r')

                avai_dt_cache_utc = defaultdict(list)
                avai_dt_cache_emb = defaultdict(list)

                for date in date_range:
                    file_path = util.construct_data_file_path(vt, emb.location, date.strftime('%Y/%m/%d'))
                    if not os.path.exists(file_path):
                        continue

                    with open(file_path) as f:
                        available_dates_arr = [
                            {'write_time': datetime.combine(date.date(), wt), 'available_date': avai_dt}
                            for wt, avai_dt in [util.file_line_to_dt(ln) for ln in f.readlines()]
                        ]

                    for adt in available_dates_arr:
                        write_time_utc = adt['write_time'].astimezone(tz=None).astimezone(tz=timezone.utc)

                        write_date_utc = write_time_utc.replace(hour=0, minute=0, second=0, microsecond=0)
                        write_date_emb = write_time_utc\
                            .astimezone(emb.timezone)\
                            .replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

                        available_date = adt['available_date']
                        avai_dt_cache_utc[write_date_utc].append(
                            {'write_time': write_time_utc, 'available_date': available_date}
                        )
                        avai_dt_cache_emb[write_date_emb].append(available_date)

                        print(' ' * 150, end='\r')  # erase previous print
                        print('Reading: {}-{}-{}'.format(vt, emb.location, date.strftime('%Y/%m/%d')), end='\t\t')
                        print(
                            'UTC\t{}: {}'.format(
                                write_date_utc.strftime('%Y/%m/%d'),
                                len(avai_dt_cache_utc[write_date_utc])
                            ),
                            end='\t'
                        )
                        print(
                            'EMB\t{}: {}'.format(
                                write_date_emb.strftime('%Y/%m/%d'),
                                len(avai_dt_cache_emb[write_date_emb])
                            ),
                            end='\t'
                        )
                        print(
                            '|Total:\tUTC-{}\tEMB-{}'.format(
                                sum([len(cache_len) for cache_len in avai_dt_cache_emb.values()]),
                                sum([len(cache_len) for cache_len in avai_dt_cache_emb.values()]),
                            ),
                            end='\r'
                        )

                if len(avai_dt_cache_utc) > 0:
                    cls.visa_status.insert_many([  # insert all visa status fetch result in one write
                        {
                            'visa_type': vt,
                            'embassy_code': emb.code,
                            'write_date': write_date,
                            'available_dates': avai_dt_arr,
                        } for write_date, avai_dt_arr in avai_dt_cache_utc.items()
                    ])
                else:
                    print('Skipping: {}-{} No records'.format(vt, emb.location), end='\r')

                for write_date, avai_dt_arr in avai_dt_cache_emb.items():
                    if len(avai_dt_arr) > 0:
                        earliest_dt, latest_dt = min(avai_dt_arr), max(avai_dt_arr)
                        cls.overview.update_one(
                            {'visa_type': vt, 'embassy_code': emb.code},
                            {
                                '$push': {
                                    'overview': {
                                        'write_date': write_date,
                                        'earliest_date': earliest_dt,
                                        'latest_date': latest_dt,
                                    }
                                }
                            },
                            upsert=True,
                        )