def _get_data(self):
    """
    Verify the manually-downloaded data is present, erroring loudly if not.

    Creates ``self.data_path`` if needed; raises RuntimeError (framed by a
    rainbow separator) when ``train.csv`` is missing, since this dataset
    cannot be fetched automatically.
    """
    # ANSI escape codes (all bold) for the attention-grabbing separator
    RESET = '\033[0m'
    RED = '\033[1;91m'
    YELLOW = '\033[1;93m'
    GREEN = '\033[1;92m'
    BLUE = '\033[1;96m'
    CYAN = '\033[1;94m'
    MAGENTA = '\033[1;95m'

    # drop all coloring when stdout is not a terminal
    if not _sys.stdout.isatty():
        RESET = RED = YELLOW = GREEN = BLUE = CYAN = MAGENTA = ''

    # build a ~78-character rainbow line of stars
    palette = [RED, YELLOW, GREEN, CYAN, BLUE, MAGENTA]
    segment = 78 // len(palette)
    stars = ''.join(color + '*' * segment for color in palette) + RESET

    if not os.path.exists(self.data_path):
        PathManager.mkdirs(self.data_path)

    if not PathManager.exists(os.path.join(self.data_path, 'train.csv')):
        raise RuntimeError(
            f'\n\n{stars}\nThis data must be downloaded from {self.DATA_SOURCE}'
            '\nIt cannot be automatically downloaded, as one must agree to '
            'the competition rules outlined on the website before '
            'gaining access to the data.\n\n'
            'Once downloaded, please put the data in the following '
            f'directory: \n{self.data_path}\n{stars}')
def _check_parent_dir_exits(datapath): parent_dir = os.path.dirname(datapath) if not parent_dir or PathManager.exists(parent_dir): return logging.info( f'Parent directory ({parent_dir}) did not exist and was created.') PathManager.mkdirs(parent_dir)
def make_dir(path):
    """
    Make the directory and any nonexistent parent directories (`mkdir -p`).
    """
    if path == '':
        # the empty string means the current working directory, which exists
        return
    PathManager.mkdirs(path)
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    archive_path = os.path.join(path, fname)
    # open via PathManager so non-local filesystems are supported
    with zipfile.ZipFile(PathManager.open(archive_path, 'rb'), 'r') as zipf:
        for entry in zipf.namelist():
            dest = os.path.join(path, entry)
            if zipf.getinfo(entry).is_dir():
                # directory entries carry no data; just create them
                logging.debug(f"Making directory {dest}")
                PathManager.mkdirs(dest)
            else:
                logging.debug(f"Extracting to {dest}")
                with zipf.open(entry, 'r') as src:
                    with PathManager.open(dest, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
    if delete:
        # Windows may refuse to delete the just-read archive; that is
        # non-fatal, so log and continue.
        try:
            PathManager.rm(archive_path)
        except PermissionError:
            logging.error(
                f"Tried to delete {archive_path} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
def _setup_test_data(self, opt):
    """
    Build a tiny fake image dataset (10 solid-color images plus json episode
    files for each split) under ``opt['datapath']/ImageTeacher``.
    """
    teacher_dir = os.path.join(opt['datapath'], 'ImageTeacher')
    image_dir = os.path.join(teacher_dir, 'images')
    PathManager.mkdirs(image_dir)
    self.image_features_path = os.path.join(
        teacher_dir, f'{opt["image_mode"]}_image_features')

    # create one fake 16x16 image per image id
    image_ids = [f'img_{i}' for i in range(10)]
    for idx, image_id in enumerate(image_ids):
        fake_image = Image.new('RGB', (16, 16), color=idx)
        dest = os.path.join(image_dir, f'{image_id}.jpg')
        with PathManager.open(dest, 'wb') as fh:
            fake_image.save(fh, 'JPEG')

    # write identical fake episodes for every data split
    for split in ['train', 'valid', 'test']:
        random.seed(42)
        episodes = [
            {'image_id': image_id, 'text': string.ascii_uppercase[idx]}
            for idx, image_id in enumerate(image_ids)
        ]
        with PathManager.open(os.path.join(teacher_dir, f'{split}.json'), 'w') as fh:
            json.dump(episodes, fh)
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath, 'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        # Windows can deny deletion of a just-closed archive; treat that as
        # non-fatal (log and keep going) instead of aborting the build.
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
def setUp(self):
    """
    Create a dedicated datapath and clear any previously downloaded files, so
    each test run exercises the download logic again.
    """
    base_path = ParlaiParser().parse_args([])['datapath']
    self.datapath = os.path.join(base_path, 'build_data_pyt_data')
    PathManager.mkdirs(self.datapath)

    for fname in self.dest_filenames:
        # remove stale copies; otherwise the download would be skipped
        try:
            PathManager.rm(os.path.join(self.datapath, fname))
        except OSError:
            # file was not present; nothing to clean up
            pass
def get_image_features_path(self, task, image_model_name, dt):
    """
    Override so that subclasses can see same image features.
    """
    # self.data_path already includes the task name in the default
    # implementation, so `task` is intentionally unused here
    features_dir = os.path.join(self.data_path, 'image_features')
    if not os.path.isdir(features_dir):
        PathManager.mkdirs(features_dir)
    return os.path.join(features_dir, f'{image_model_name}_{dt}_features_dict')
def __init__(self, opt: Opt):
    """
    Open a tensorboardX SummaryWriter logging to ``<model_file>.tensorboard``.

    :param opt:
        options dict; must contain 'model_file'.

    :raises ImportError:
        if tensorboardX is not installed.
    """
    try:
        # tensorboard is a very expensive thing to import. Wait until the
        # last second to import it.
        from tensorboardX import SummaryWriter
    except ImportError:
        raise ImportError('Please run `pip install tensorboard tensorboardX`.')

    log_dir = opt['model_file'] + '.tensorboard'
    logging.debug(f'Saving tensorboard logs to: {log_dir}')
    if not PathManager.exists(log_dir):
        PathManager.mkdirs(log_dir)
    self.writer = SummaryWriter(log_dir, comment=json.dumps(opt))
def _untar(path, fname, delete=True, flatten=False):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.

    :param bool flatten:
        If true, discard subdirectory structure and place every file directly
        in ``path``.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    archive_path = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManger.open
    # instead, lest we are using fb internal file services
    with tarfile.open(fileobj=PathManager.open(archive_path, 'rb')) as archive:
        for entry in archive:
            entry_name = entry.name
            while entry_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                entry_name = entry_name[2:]
            # when flattening, keep only the basename of each member
            dest = os.path.join(
                path,
                os.path.split(entry_name)[-1] if flatten else entry_name,
            )
            logging.debug(f"Extracting to {dest}")
            if entry.isdir():
                PathManager.mkdirs(dest)
            elif entry.isfile():
                with PathManager.open(dest, 'wb') as wf, archive.extractfile(
                        entry.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")
    if delete:
        # deleting the just-read archive can fail on Windows; non-fatal
        try:
            PathManager.rm(archive_path)
        except PermissionError:
            logging.error(
                f"Tried to delete {archive_path} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
def finalize(self, frequencies: Dict[str, int], num_symbols: int = 30000,
             minfreq: int = 2) -> bool:
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, default 30000
        will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition. If <= 0
        will use subword-nmt default of 2.

    :return did_finalize:
        return whether codecs are finalized this call.
    """
    if hasattr(self, 'bpe'):
        # codecs were already finalized on a previous call
        return False

    logging.debug(f'Saving bpe codecs to {self.codecs}')

    # non-positive arguments fall back to the documented defaults
    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2

    # subword-nmt consumes "<token> <count>" lines in dict mode
    dict_stream = ("{} {}".format(tok, cnt) for tok, cnt in frequencies.items())

    codec_dir = os.path.split(self.codecs)[0]
    PathManager.mkdirs(codec_dir)
    with PathManager.open(self.codecs, 'w', encoding='utf-8') as outstream:
        learn_bpe.learn_bpe(
            dict_stream,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
def download_images(opt, task='personality_captions'):
    """
    Download the YFCC images referenced by the given task's json files.

    Requires interactive confirmation, since permission to use the YFCC100m
    dataset must be obtained separately, and the per-image download is slow.
    """
    dpath = os.path.join(opt['datapath'], task)
    image_path = os.path.join(opt['datapath'], 'yfcc_images')
    version = '1.0'

    # the user must explicitly confirm dataset permission...
    confirmed = input(
        'Please confirm that you have obtained permission '
        'to work with the YFCC100m dataset, as outlined by the steps '
        'listed at '
        'https://multimediacommons.wordpress.com/yfcc100m-core-dataset/ [Y/y]: '
    )
    if confirmed.lower() != 'y':
        raise RuntimeError(
            'In order to use the images from this dataset, '
            'you must obtain permission from the webpage above.')

    # ...and acknowledge that the download takes a very long time
    confirmed = input(
        'NOTE: This script will download each image individually from the '
        's3 server on which the images are hosted. This will take a *very '
        'long* time. Are you sure you would like to continue? [Y/y]: ')
    if confirmed.lower() != 'y':
        raise RuntimeError('If you have access to the images, please specify '
                           'the path to the folder via the `--yfcc-path` '
                           'command line argument.')

    image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'

    # collect every image hash referenced by any data split
    splits = ['train', 'val', 'test']
    if task == 'image_chat':
        splits[1] = 'valid'
    hashes = []
    for split in splits:
        with PathManager.open(os.path.join(dpath, '{}.json'.format(split))) as f:
            data = json.load(f)
        hashes += [d['image_hash'] for d in data]

    PathManager.mkdirs(image_path)
    print('[downloading images to {}]'.format(image_path))
    # images are sharded on s3 by the first six characters of their hash
    image_urls = [
        f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"
        for p_hash in hashes
    ]
    download_multiprocess(image_urls, image_path,
                          dest_filenames=[f"{h}.jpg" for h in hashes])
    build_data.mark_done(image_path, version)
def _untar(path, fname, delete=True):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManger.open
    # instead, lest we are using fb internal file services
    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(
                        item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")
    if delete:
        # Windows can deny deletion of a just-closed archive; treat that as
        # non-fatal (log and keep going) instead of aborting the extraction.
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
def _test_display_output(self, image_mode):
    """
    Test display data output with given image_mode.
    """
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir
        PathManager.mkdirs(os.path.join(data_path, 'ImageTeacher'))
        opt = {
            'task': 'integration_tests:ImageTeacher',
            'datapath': data_path,
            'image_mode': image_mode,
            'display_verbose': True,
        }
        output = testing_utils.display_data(opt)

        # train output tags labels as [labels]; valid/test use [eval_labels]
        patterns = (
            r"\[labels\].*\n",
            r"\[eval_labels\].*\n",
            r"\[eval_labels\].*\n",
        )
        for i, pattern in enumerate(patterns):
            lbls = re.findall(pattern, output[i])
            self.assertGreater(len(lbls), 0, 'DisplayData failed')
            # labels should all be distinct across the episodes
            self.assertEqual(len(lbls), len(set(lbls)), output[i])
def get_app_token(self):
    """
    Find and return an app access token.

    Reuses a token cached in ``~/.parlai/messenger_token`` unless
    'force_page_token' is set; otherwise prompts for one and caches it.
    """
    if not self.opt.get('force_page_token'):
        parlai_dir = os.path.expanduser('~/.parlai/')
        if not os.path.exists(parlai_dir):
            PathManager.mkdirs(parlai_dir)
        token_path = os.path.expanduser('~/.parlai/messenger_token')
        # reuse a previously saved token when one exists
        if os.path.exists(token_path):
            with open(token_path, 'r') as access_token_file:
                return access_token_file.read()

    token = input(
        'Enter your page\'s access token from the developer page at'
        'https://developers.facebook.com/apps/<YOUR APP ID>'
        '/messenger/settings/ to continue setup:')
    # cache the freshly entered token for future runs
    token_path = os.path.expanduser('~/.parlai/messenger_token')
    with open(token_path, 'w+') as access_token_file:
        access_token_file.write(token)
    return token
def get_app_token(self):
    """
    Find and return an app access token.

    Reuses a token cached in ``~/.parlai/telegram_token`` unless
    'force_telegram_bot_token' is set; otherwise prompts for one and caches it.
    """
    if not self.opt.get('force_telegram_bot_token'):
        parlai_dir = os.path.expanduser("~/.parlai/")
        if not os.path.exists(parlai_dir):
            PathManager.mkdirs(parlai_dir)
        token_path = os.path.expanduser('~/.parlai/telegram_token')
        # reuse a previously saved token when one exists
        if os.path.exists(token_path):
            print(f"Token was read from: {token_path}")
            with open(token_path, 'r') as access_token_file:
                return access_token_file.read()

    token = input(
        'Enter your bot\'s access token from the BotFather page at '
        'https://telegram.me/botfather/ to continue setup: '
    )
    # cache the freshly entered token for future runs
    token_path = os.path.expanduser('~/.parlai/telegram_token')
    with open(token_path, 'w') as access_token_file:
        access_token_file.write(token)
    return token
def _download_images(self, opt: Opt):
    """
    Download available IGC images.
    """
    urls = []
    ids = []
    # gather (url, id) pairs from the crowd csv file of each split
    for dt in ['test', 'val']:
        csv_path = os.path.join(self.get_data_path(opt), f'IGC_crowd_{dt}.csv')
        with PathManager.open(csv_path, newline='\n') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            fields = []
            for i, row in enumerate(reader):
                if i == 0:
                    # the header row defines the column names
                    fields = row
                else:
                    record = dict(zip(fields, row))
                    urls.append(record['url'])
                    ids.append(record['id'])

    image_dir = self.get_image_path(opt)
    PathManager.mkdirs(image_dir)

    # Make one blank image
    blank = Image.new('RGB', (100, 100), color=0)
    blank.save(os.path.join(image_dir, self.blank_image_id), 'JPEG')

    # Download the rest
    download_multiprocess(urls, image_dir, dest_filenames=ids)

    # Remove bad images (anything PIL cannot decode)
    for fp in os.listdir(image_dir):
        img_path = os.path.join(image_dir, fp)
        if PathManager.exists(img_path):
            try:
                Image.open(img_path).convert('RGB')
            except OSError:
                PathManager.rm(img_path)
def _check_data_downloaded(self, opt):
    """
    Check that the manually-downloaded yelp gender data is present, and build
    the train/valid/test files from it on first use.

    :raises RuntimeError:
        if the raw data has not been downloaded into ``self.data_path``.
    """
    # ANSI escape codes (all bold) for the attention-grabbing separator
    RESET = '\033[0m'
    RED = '\033[1;91m'
    YELLOW = '\033[1;93m'
    GREEN = '\033[1;92m'
    BLUE = '\033[1;96m'
    CYAN = '\033[1;94m'
    MAGENTA = '\033[1;95m'

    # only use colors if we're outputting to a terminal
    if not _sys.stdout.isatty():
        RESET = RED = YELLOW = GREEN = BLUE = CYAN = MAGENTA = ''

    # build a ~78-character rainbow line of stars
    palette = [RED, YELLOW, GREEN, CYAN, BLUE, MAGENTA]
    segment = 78 // len(palette)
    stars = ''.join(color + '*' * segment for color in palette) + RESET

    self.data_path = os.path.join(opt['datapath'], 'md_gender', 'yelp')
    if not os.path.exists(self.data_path):
        PathManager.mkdirs(self.data_path)

    if not PathManager.exists(
            os.path.join(self.data_path, 'valid.fader.with_cat.40000')):
        raise RuntimeError(
            f'\n\n{stars}\nThis data must be downloaded following instructions in '
            'the README here:'
            '<https://github.com/facebookresearch/MultipleAttributeTextRewriting/blob/master/data/README.md>. '
            '\nIt cannot be automatically downloaded, as one must agree to '
            'the terms outlined on the website before gaining access to the data.\n\n'
            'Once downloaded, please put the data in the following '
            f'directory: \n{self.data_path}\n{stars}')
    elif not PathManager.exists(
            os.path.join(self.data_path, 'classtrain.txt')):
        logging.info('[ Building data ... ]')

        # build train: merge the gender-tagged shards into one
        # "<gender>\t<text>" file
        with open(os.path.join(self.data_path, 'classtrain.txt'), 'w') as f:
            for fle_num in [4000, 6000, 8000]:
                train_fle = f'train.fader.with_cat.{fle_num}'
                with open(os.path.join(self.data_path, train_fle)) as g:
                    for line in g.readlines():
                        cols = line.split('\t')
                        text = cols[0]
                        gend = cols[1]
                        if gend == '0':
                            f.write(f'male\t{text}\n')
                        elif gend == '1':
                            f.write(f'female\t{text}\n')

        # build valid and test: split each shard into female-only and
        # male-only files per split
        for pair in [('dev', 'valid'), ('test', 'test')]:
            fem_path = os.path.join(self.data_path, f'female_only.{pair[0]}.en')
            masc_path = os.path.join(self.data_path, f'male_only.{pair[0]}.en')
            with open(fem_path, 'w') as fem_val:
                with open(masc_path, 'w') as masc_val:
                    for fle_num in [4000, 6000, 8000]:
                        valid_fle = f'{pair[1]}.fader.with_cat.{fle_num}'
                        with open(os.path.join(self.data_path, valid_fle),
                                  'r') as g:
                            for line in g.readlines():
                                cols = line.split('\t')
                                text = cols[0]
                                gend = cols[1]
                                if gend == '0':
                                    masc_val.write(f'{text}\n')
                                elif gend == '1':
                                    fem_val.write(f'{text}\n')