def write_info_file(self, dataset_name, original_fname):
  """Writes the INFO metadata file next to the downloaded local file.

  Although the method is synchronized, there is still a risk two processes
  running at the same time overlap here. Risk accepted, since potentially
  lost data (`dataset_name`) is only for human consumption.

  Args:
    dataset_name: data used to dl the file.
    original_fname: name of file as downloaded.

  Raises:
    AssertionError: if an INFO file already exists with a conflicting
      `original_fname`.
  """
  stored = self._get_info() or {}
  # Merge previously recorded URLs/dataset names with the current ones, so
  # repeated downloads accumulate provenance rather than overwrite it.
  merged_urls = set(stored.get('urls', []))
  merged_urls.add(self.url)
  names = list(stored.get('dataset_names', []))
  if dataset_name:
    names.append(dataset_name)
  # The downloaded filename must never silently change between runs.
  if 'original_fname' in stored and stored['original_fname'] != original_fname:
    raise AssertionError(
        '`original_fname` "%s" stored in %s does NOT match "%s".' %
        (stored['original_fname'], self.info_path, original_fname))
  new_info = {
      'urls': list(merged_urls),
      'dataset_names': list(set(names)),
      'original_fname': original_fname,
  }
  # Atomic write avoids leaving a partially-written INFO file on disk.
  with py_utils.atomic_write(self.info_path, 'w') as info_f:
    json.dump(new_info, info_f, sort_keys=True)
  self._info = new_info
def write_info_file(
    resource: 'Resource',
    path: str,
    dataset_name: str,
    original_fname: str,
    url_info: checksums_lib.UrlInfo,
) -> None:
  """Write the INFO file next to local file.

  Although the method is synchronized, there is still a risk two processes
  running at the same time overlap here. Risk accepted, since potentially lost
  data (`dataset_name`) is only for human consumption.

  Args:
    resource: resource for which to write the INFO file.
    path: path of downloaded file.
    dataset_name: data used to dl the file.
    original_fname: name of file as downloaded.
    url_info: checksums/size info of the url.

  Raises:
    ValueError: if an existing INFO file disagrees on `original_fname` or on
      the stored url checksum/size info.
  """
  url_info_dict = url_info.asdict()
  info_path = _get_info_path(path)
  info = _read_info(info_path) or {}
  # Merge previously recorded URLs/dataset names with the current ones, so
  # repeated downloads accumulate provenance rather than overwrite it.
  urls = set(info.get('urls', []) + [resource.url])
  dataset_names = info.get('dataset_names', [])
  if dataset_name:
    dataset_names.append(dataset_name)
  # The downloaded filename must never silently change between runs.
  if info.get('original_fname', original_fname) != original_fname:
    raise ValueError(
        '`original_fname` "{}" stored in {} does NOT match "{}".'.format(
            info['original_fname'], info_path, original_fname))
  # A checksum mismatch means the file on disk and the INFO file disagree.
  # NOTE: fixed grammar in the user-facing message ("that" -> "than").
  if info.get('url_info', url_info_dict) != url_info_dict:
    raise ValueError(
        'File info {} contains a different checksum than the downloaded one: '
        'Stored: {}; Expected: {}'.format(info_path, info['url_info'],
                                          url_info_dict))
  info = dict(
      urls=list(urls),
      dataset_names=list(set(dataset_names)),
      original_fname=original_fname,
      url_info=url_info_dict,
  )
  # Atomic write avoids leaving a partially-written INFO file on disk.
  with py_utils.atomic_write(info_path, 'w') as info_f:
    json.dump(info, info_f, sort_keys=True)