def download_data(self, data_path):
    if not is_done(Path(data_path)):
        download_decompress(
            url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
            download_path=data_path)
        mark_done(data_path)
def read(self, data_path: str, dataset_name: str = None, provide_pos: bool = False,
         provide_doc_ids: bool = False, iob: bool = False, docstart_token: str = None):
    self.provide_pos = provide_pos
    self.provide_doc_ids = provide_doc_ids
    self.iob = iob
    self.docstart_token = docstart_token
    self.num_docs = 0
    self.x_is_tuple = self.provide_pos or self.provide_doc_ids
    data_path = Path(data_path)
    files = list(data_path.glob('*.txt'))
    if 'train.txt' not in {file_path.name for file_path in files}:
        if dataset_name == 'conll2003':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'
        elif dataset_name == 'collection_rus':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/collection3_v2.tar.gz'
        elif dataset_name == 'ontonotes':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz'
        else:
            raise RuntimeError('train.txt not found in "{}"'.format(data_path))
        data_path.mkdir(exist_ok=True, parents=True)
        download_decompress(url, data_path)
        files = list(data_path.glob('*.txt'))

    dataset = {}
    for file_name in files:
        name = file_name.with_suffix('').name
        dataset[name] = self.parse_ner_file(file_name)
    return dataset
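# Hedged usage sketch (not part of the source): it assumes the read() above belongs to a
# DeepPavlov-style NER dataset reader, imported here as Conll2003DatasetReader; the import
# path and the local data directory are assumptions.
from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader

reader = Conll2003DatasetReader()
# The first call downloads conll2003_v2.tar.gz into ./conll2003; later calls reuse the .txt files.
dataset = reader.read('./conll2003', dataset_name='conll2003', provide_pos=False)
print({split: len(samples) for split, samples in dataset.items()})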
def __init__(self, path: str, model_name: str, tokenizer: AutoTokenizer = None,
             url: str = None, train: bool = True):
    self.url = url or URL_SBERSQUAD
    self.path = path
    dir_path = Path(path)
    required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]

    if not dir_path.exists():
        dir_path.mkdir()

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    if train:
        self.dataset = load_and_cache_examples(
            model_name_or_path=model_name, tokenizer=tokenizer)
    else:
        self.dataset = load_and_cache_examples(
            model_name_or_path=model_name, tokenizer=tokenizer, evaluate=True)
def read(self, dir_path: str, dataset='SQuAD'):
    if dataset == 'SQuAD':
        self.url = self.url_squad
    elif dataset == 'SberSQuAD':
        self.url = self.url_sber_squad
    else:
        raise RuntimeError('Dataset {} is unknown'.format(dataset))

    dir_path = Path(dir_path)
    required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
    if not dir_path.exists():
        dir_path.mkdir()

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    dataset = {}
    for f in required_files:
        data = json.load((dir_path / f).open('r'))
        if f == 'dev-v1.1.json':
            dataset['valid'] = data
        else:
            dataset['train'] = data

    return dataset
def load_model(self):
    # options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    # options_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    options_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/options.json"
    # weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    # custom weights
    # weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights.hdf5"
    # weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights_epoch_n_2.hdf5"
    weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights_epoch_n_3.hdf5"
    # weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights_epoch_n_4.hdf5"

    # allennlp realization with updating states:
    # self._elmobilm = _ElmoBiLm(options_file, weight_file)
    # realization without updating states:
    self._elmobilm = ELMOBiLM(options_file, weight_file)
    self._elmobilm.cuda()

    # TODO load head:
    # self._ff = torch.nn.Linear(1024, 1000000)
    self._ff = torch.nn.Linear(512, 1000000)
    self._ff.cuda()

    ##############################################################
    # TODO refactor
    # Load the TF checkpoint:
    # base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news"
    base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news/elmo_weights_ckpt3"
    # base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news/elmo_weights_ckpt3/elmo_ru_news_uptrained_ckpt"
    ckpt_prefixed_path = base_path + "/model.ckpt-0003"
    # metafile_path = base_path + "/model.ckpt-0003.meta"
    try:
        # matrix which holds the embedding-into-words projection
        emb2words_w_matrix = tf.train.load_variable(ckpt_prefixed_path, 'lm/softmax/W')
    except Exception:
        # download it, then read it again
        # from deeppavlov.core.data.utils import download
        from deeppavlov.core.data.utils import download_decompress

        # TODO download all 3 files of the checkpoint
        CKPTS_URL = "http://files.deeppavlov.ai/spelling_correctors/elmo_weights_ckpt3.tar.gz"
        download_decompress(CKPTS_URL, base_path)
        emb2words_w_matrix = tf.train.load_variable(ckpt_prefixed_path, 'lm/softmax/W')
    ##############################################################

    # torch_w = torch.from_numpy(np.concatenate((softmax_w, softmax_w), axis=1))
    torch_w = torch.from_numpy(emb2words_w_matrix)
    emb2words_bias = tf.train.load_variable(ckpt_prefixed_path, 'lm/softmax/b')
    self._ff.load_state_dict(
        {
            'weight': torch_w,
            'bias': torch.from_numpy(emb2words_bias)
        },
        strict=False)
    self._softmax_fn = torch.nn.Softmax(dim=3)
def read(self, data_path: str, language: str = 'en',
         *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
    """Reads the BoolQ dataset from files.

    Args:
        data_path: A path to a folder with dataset files.
        language: The dataset language ('ru' and 'en' are available).

    Returns:
        dataset: items of the dataset [(question, passage), label]
    """
    if language in self.urls:
        self.url = self.urls[language]
    else:
        raise RuntimeError(f'The dataset for {language} is unavailable')

    data_path = expand_path(data_path)
    if not data_path.exists():
        data_path.mkdir(parents=True)

    download_decompress(self.url, data_path)

    dataset = {}
    for filename in ['train.jsonl', 'valid.jsonl']:
        dataset[filename.split('.')[0]] = self._build_data(language, data_path / filename)

    return dataset
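# Hedged usage sketch (not part of the source): assumes the read() above is a method of a
# BoolQ reader class, here called BoolqReader, whose self.urls maps 'en'/'ru' to archive
# links and whose _build_data() parses a single .jsonl file. All names are illustrative.
reader = BoolqReader()
boolq = reader.read('./boolq_data', language='en')
# Per the docstring above, each item is ((question, passage), label).
(question, passage), label = boolq['train'][0]
print(label, question)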
def download_resources(args):
    if args.all:
        urls = ALL_URLS
    else:
        urls = REQ_URLS

    for url in urls:
        download_path = Path('../download')
        download_path.mkdir(exist_ok=True)
        dest_path = download_path
        embeddings_path = download_path.joinpath('embeddings')

        if url in EMBEDDING_URLS:
            embeddings_path.mkdir(exist_ok=True)
            dest_path = embeddings_path.joinpath(url.split("/")[-1])
            download(dest_path, url)
        elif url in DATA_URLS:
            dest_path = download_path.joinpath(url.split("/")[-1].split(".")[0])
            download_decompress(url, dest_path)
        else:
            download_decompress(url, dest_path)
def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
    """
    Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
    decompresses and saves files to ``data_path``.

    Parameters:
        data_path: path to save DSTC2 dataset
        dialogs: flag which indicates whether to output list of turns or list of dialogs

    Returns:
        dictionary that contains ``'train'`` field with dialogs from
        ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
        ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
        ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``.
    """
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info('[downloading data from {} to {}]'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
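# Hedged usage sketch (not part of the source): assumes the enclosing class is DeepPavlov's
# DSTC2DatasetReader and that self.url points at the dstc2_v2.tar.gz archive; the import
# path is an assumption.
from deeppavlov.dataset_readers.dstc2_reader import DSTC2DatasetReader

data = DSTC2DatasetReader().read('./dstc2_data', dialogs=False)
# With dialogs=False every split is a flat list of (x_i, y_i) turns.
print({split: len(turns) for split, turns in data.items()})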
def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
    """
    Downloads ``'kvret_public.tar.gz'``, decompresses it and saves files to ``data_path``.

    Parameters:
        data_path: path to save data
        dialogs: flag which indicates whether to output list of turns or list of dialogs

    Returns:
        dictionary with ``'train'`` containing dialogs from ``'kvret_train_public.json'``,
        ``'valid'`` containing dialogs from ``'kvret_dev_public.json'``,
        ``'test'`` containing dialogs from ``'kvret_test_public.json'``.
        Each field is a list of tuples ``(x_i, y_i)``.
    """
    required_files = (self._data_fname(dt) for dt in ('train', 'dev', 'test'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info('[downloading kvret data from {} to {}]'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('train')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('dev')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('test')), dialogs)
    }
    return data
def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
    """
    Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
    decompresses and saves files to ``data_path``.

    Parameters:
        data_path: path to save DSTC2 dataset
        dialogs: flag which indicates whether to output list of turns or list of dialogs

    Returns:
        dictionary that contains ``'train'`` field with dialogs from
        ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
        ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
        ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``.
    """
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info(f"[downloading data from {self.url} to {data_path}]")
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def download_resource(url, dest_paths):
    dest_paths = list(dest_paths)

    if url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
def read(self, data_path: str, queries_per_intent: Optional[int] = None,
         test_validate_split: float = 0.5, *args, **kwargs) -> Dict[str, List[Dict[str, Any]]]:
    """
    Each query in the output has the following form:

        {
            'intent': intent_name,
            'data': [{'text': text, ('entity': slot_name)?}]
        }

    Args:
        data_path: A path to a folder with dataset files.
        queries_per_intent: Number of queries to load for each intent. None to load all.
            If the requested number is greater than available in file, all queries are returned.
        test_validate_split: Proportion of `_validate` files to be used as test dataset
            (since Snips is split into training and validation sets without a separate test set).
    """
    data_path = Path(data_path)
    intents = [
        'AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
        'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'
    ]

    if not is_done(data_path):
        url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
        log.info('[downloading data from {} to {}]'.format(url, data_path))
        download_decompress(url, data_path)
        mark_done(data_path)

    use_full_file = queries_per_intent is None or queries_per_intent > 70
    training_data = []
    validation_data = []
    test_data = []

    for intent in intents:
        intent_path = data_path / intent
        train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
        validate_file_name = f"validate_{intent}.json"

        train_queries = self._load_file(intent_path / train_file_name, intent, queries_per_intent)
        validate_queries = self._load_file(intent_path / validate_file_name, intent, queries_per_intent)
        num_test_queries = round(len(validate_queries) * test_validate_split)

        training_data.extend(train_queries)
        validation_data.extend(validate_queries[num_test_queries:])
        test_data.extend(validate_queries[:num_test_queries])

    return {'train': training_data, 'valid': validation_data, 'test': test_data}
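# Hedged usage sketch (not part of the source): assumes the read() above lives in a Snips
# reader class, here called SnipsReader. With queries_per_intent=50 (i.e. <= 70) the
# non-'_full' training files are used, as the use_full_file flag above shows.
reader = SnipsReader()
snips = reader.read('./snips_data', queries_per_intent=50, test_validate_split=0.5)
sample = snips['train'][0]
print(sample['intent'], ''.join(chunk['text'] for chunk in sample['data']))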
def download_resource(url: str, dest_paths: Iterable[Path]) -> None:
    dest_paths = list(dest_paths)

    if check_md5(url, dest_paths):
        log.info(f'Skipped {url} download because of matching hashes')
    elif url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
def _download_data(self, data_path):
    """Download the archive with the InsuranceQA dataset files and decompress it
    if the dataset files are not already present in `data_path`.

    Args:
        data_path: A path to a folder where dataset files are stored.
    """
    if not is_done(Path(data_path)):
        download_decompress(
            url="http://files.deeppavlov.ai/datasets/insuranceQA-master.zip",
            download_path=data_path)
        mark_done(data_path)
def download_resource(url: str, dest_paths: Iterable[Path]) -> None:
    dest_paths = list(dest_paths)

    if check_md5(url, dest_paths):
        log.info(f'Skipped {url} download because of matching hashes')
    elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1].split('?')[0]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
def read(self, data_path: str):
    data_path = Path(data_path)
    files = list(data_path.glob('*.txt'))
    test_set_filename = "test_set_with_answers.txt"

    if test_set_filename not in {file_path.name for file_path in files}:
        url = 'http://files.deeppavlov.ai/kbqa/test_set_with_answers.zip'
        data_path.mkdir(exist_ok=True, parents=True)
        download_decompress(url, data_path)

    dataset = {}
    dataset["test"] = self.parse_ner_file(data_path / test_set_filename)
    dataset["train"] = []
    dataset["valid"] = []
    return dataset
def download_resource(url: str, dest_paths: Iterable[Union[Path, str]]) -> None:
    dest_paths = [Path(dest) for dest in dest_paths]
    download_path = dest_paths[0].parent
    download_path.mkdir(parents=True, exist_ok=True)
    file_name = urlparse(url).path.split('/')[-1]
    lockfile = download_path / f'.{file_name}.lock'

    with FileLock(lockfile).acquire(poll_intervall=10):
        if check_md5(url, dest_paths):
            log.info(f'Skipped {url} download because of matching hashes')
        elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):
            download_decompress(url, download_path, dest_paths)
        else:
            dest_files = [dest_path / file_name for dest_path in dest_paths]
            download(dest_files, url)
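# Hedged usage sketch (not part of the source): shows how a caller might fan a single
# resource out to several destination directories; the URL and directories are placeholders.
urls_to_dirs = {
    'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz':
        [Path('downloads/ner'), Path('models/ner')],
}
for resource_url, destinations in urls_to_dirs.items():
    # Archives go through download_decompress(), which receives the full destination list;
    # plain files are downloaded into each destination directory directly.
    download_resource(resource_url, destinations)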
def __init__(self, **kwargs):
    self.opt = deepcopy(kwargs)
    vocabs = self.opt.pop('vocabs')
    self.opt.update(vocabs)

    # Find all input parameters of the network init
    network_parameter_names = list(inspect.signature(NerNetwork.__init__).parameters)
    # Fill all provided parameters from opt
    network_parameters = {par: self.opt[par] for par in network_parameter_names if par in self.opt}

    # Initialize the network
    self.sess = tf.Session()
    network_parameters['sess'] = self.sess
    self._ner_network = NerNetwork(**network_parameters)

    download_best_model = self.opt.get('download_best_model', False)
    if download_best_model:
        model_path = str(self.load_path.parent.absolute())
        best_model_url = 'http://lnsigo.mipt.ru/export/models/ner/ner_dstc_model.tar.gz'
        download_decompress(best_model_url, model_path)

    # Training parameters
    # Find all parameters for network train
    train_parameters_names = list(inspect.signature(NerNetwork.train_on_batch).parameters)
    train_parameters = {par: self.opt[par] for par in train_parameters_names if par in self.opt}
    self.train_parameters = train_parameters

    super().__init__(**kwargs)

    # Check existence of the file with slots, slot values, and corrupted (misspelled) slot values
    slot_vals_filepath = Path(self.save_path.parent) / 'slot_vals.json'
    if not slot_vals_filepath.is_file():
        self._download_slot_vals()

    with open(slot_vals_filepath) as f:
        self._slot_vals = json.load(f)

    if self.load_path is not None:
        self.load()
def download_resource(resource, download_path):
    url = resource['url']
    sub_dirs = resource['subdir']

    dest_paths = []
    for sub_dir in sub_dirs:
        dest_path = download_path.joinpath(sub_dir)
        dest_paths.append(dest_path)

    if url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
def read(self, dir_path: str):
    dir_path = Path(dir_path)
    required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
    if not dir_path.exists():
        dir_path.mkdir()

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    dataset = {}
    for f in required_files:
        data = json.load((dir_path / f).open('r'))
        if f == 'dev-v1.1.json':
            dataset['valid'] = data
        else:
            dataset['train'] = data

    return dataset
def read(self, data_path, dialogs=False):
    # TODO: mkdir if it doesn't exist
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        print('Loading dstc2 from `{}` to `{}`'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def read(self, dir_path: str, dataset: Optional[str] = 'MultiSQuADRetr',
         url: Optional[str] = None, *args, **kwargs) -> Dict[str, Dict[str, Any]]:
    """
    Args:
        dir_path: path to save data
        dataset: default dataset names: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``
        url: link to archive with dataset, use url argument if non-default dataset is used

    Returns:
        dataset split on train/valid

    Raises:
        RuntimeError: if `dataset` is not one of these: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``.
    """
    if url is not None:
        self.url = url
    elif dataset == 'MultiSQuADRetr':
        self.url = self.url_multi_squad_retr
    elif dataset == 'MultiSQuADRuRetr':
        self.url = self.url_multi_squad_ru_retr
    else:
        raise RuntimeError('Dataset {} is unknown'.format(dataset))

    dir_path = Path(dir_path)
    required_files = ['{}.jsonl'.format(dt) for dt in ['train', 'dev']]
    if not dir_path.exists():
        dir_path.mkdir(parents=True)

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    dataset = {}
    for f in required_files:
        if 'dev' in f:
            dataset['valid'] = dir_path.joinpath(f)
        else:
            dataset['train'] = dir_path.joinpath(f)

    return dataset
def read(self, dir_path: str, dataset: Optional[str] = 'SQuAD',
         url: Optional[str] = None, *args, **kwargs) -> Dict[str, Dict[str, Any]]:
    """
    Args:
        dir_path: path to save data
        dataset: default dataset names: ``'SQuAD'``, ``'SberSQuAD'`` or ``'MultiSQuAD'``
        url: link to archive with dataset, use url argument if non-default dataset is used

    Returns:
        dataset split on train/valid

    Raises:
        RuntimeError: if `dataset` is not one of these: ``'SQuAD'``, ``'SberSQuAD'``, ``'MultiSQuAD'``.
    """
    if url is not None:
        self.url = url
    elif dataset == 'SQuAD':
        self.url = self.url_squad
    elif dataset == 'SberSQuAD':
        self.url = self.url_sber_squad
    elif dataset == 'MultiSQuAD':
        self.url = self.url_multi_squad
    else:
        raise RuntimeError('Dataset {} is unknown'.format(dataset))

    dir_path = Path(dir_path)
    required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
    if not dir_path.exists():
        dir_path.mkdir()

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    dataset = {}
    for f in required_files:
        with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
            data = json.load(fp)
        if f == 'dev-v1.1.json':
            dataset['valid'] = data
        else:
            dataset['train'] = data

    return dataset
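# Hedged usage sketch (not part of the source): assumes the enclosing class is a SQuAD
# dataset reader (e.g. DeepPavlov's SquadDatasetReader) exposing url_squad / url_sber_squad /
# url_multi_squad attributes; the class name is an assumption.
reader = SquadDatasetReader()
squad = reader.read('./squad_data', dataset='SQuAD')
# An explicit url overrides the dataset name, e.g. for a private mirror:
# squad = reader.read('./squad_data', url='http://example.com/squad-mirror.tar.gz')
print(len(squad['train']['data']), len(squad['valid']['data']))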
def read(self, data_path: str, dialogs: bool = False, encoding='utf-8') -> Dict[str, List]:
    """
    Downloads ``'simple_dstc2.tar.gz'`` archive from internet,
    decompresses and saves files to ``data_path``.

    Parameters:
        data_path: path to save DSTC2 dataset
        dialogs: flag which indicates whether to output list of turns or list of dialogs

    Returns:
        dictionary that contains ``'train'`` field with dialogs from
        ``'simple-dstc2-trn.json'``, ``'valid'`` field with dialogs from
        ``'simple-dstc2-val.json'`` and ``'test'`` field with dialogs from
        ``'simple-dstc2-tst.json'``. Each field is a list of tuples ``(user turn, system turn)``.
    """
    # Use a list (not a generator) so required_files can be logged after the all() check.
    required_files = [self._data_fname(dt) for dt in ('trn', 'val', 'tst')]
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info(f"{[Path(data_path, f) for f in required_files]}")
        log.info(f"[downloading data from {self.url} to {data_path}]")
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs, encoding),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs, encoding),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs, encoding)
    }
    log.info(f"There are {len(data['train'])} samples in train split.")
    log.info(f"There are {len(data['valid'])} samples in valid split.")
    log.info(f"There are {len(data['test'])} samples in test split.")
    return data
def read(self, data_path, dialogs=False):
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info('[downloading data from {} to {}]'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def read(self, dir_path: str, dataset_name=None, provide_pos=False):
    self.provide_pos = provide_pos
    dir_path = Path(dir_path)
    files = list(dir_path.glob('*.txt'))
    if 'train.txt' not in {file_path.name for file_path in files}:
        if dataset_name == 'conll2003':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'
        elif dataset_name == 'collection_rus':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/collection5.tar.gz'
        else:
            raise RuntimeError('train.txt not found in "{}"'.format(dir_path))
        dir_path.mkdir(exist_ok=True, parents=True)
        download_decompress(url, dir_path)
        files = list(dir_path.glob('*.txt'))

    dataset = {}
    for file_name in files:
        name = file_name.with_suffix('').name
        dataset[name] = self.parse_ner_file(file_name)
    return dataset
def read(self, dir_path: str, dataset: str = 'SQuAD', *args, **kwargs) -> Dict[str, Dict[str, Any]]:
    """
    Args:
        dir_path: path to save data
        dataset: dataset name: ``'SQuAD'``, ``'SberSQuAD'`` or ``'MultiSQuAD'``

    Returns:
        dataset split on train/valid

    Raises:
        RuntimeError: if `dataset` is not one of these: ``'SQuAD'``, ``'SberSQuAD'``, ``'MultiSQuAD'``.
    """
    if dataset == 'SQuAD':
        self.url = self.url_squad
    elif dataset == 'SberSQuAD':
        self.url = self.url_sber_squad
    elif dataset == 'MultiSQuAD':
        self.url = self.url_multi_squad
    else:
        raise RuntimeError('Dataset {} is unknown'.format(dataset))

    dir_path = Path(dir_path)
    required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
    if not dir_path.exists():
        dir_path.mkdir()

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    dataset = {}
    for f in required_files:
        with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
            data = json.load(fp)
        if f == 'dev-v1.1.json':
            dataset['valid'] = data
        else:
            dataset['train'] = data

    return dataset
def read(self, data_path: str, dataset_name=None, provide_pos=False):
    self.provide_pos = provide_pos
    data_path = Path(data_path)
    files = list(data_path.glob('*.txt'))
    if 'train.txt' not in {file_path.name for file_path in files}:
        if dataset_name == 'conll2003':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'
        elif dataset_name == 'collection_rus':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/collection5.tar.gz'
        elif dataset_name == 'ontonotes':
            url = 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz'
        else:
            raise RuntimeError('train.txt not found in "{}"'.format(data_path))
        data_path.mkdir(exist_ok=True, parents=True)
        download_decompress(url, data_path)
        files = list(data_path.glob('*.txt'))

    dataset = {}
    for file_name in files:
        name = file_name.with_suffix('').name
        dataset[name] = self.parse_ner_file(file_name)
    return dataset
def read(self, dir_path: str, dataset: str = 'SQuAD', *args, **kwargs) -> Dict[str, Dict[str, Any]]:
    """
    Args:
        dir_path: path to save data
        dataset: dataset name: ``'SQuAD'`` or ``'SberSQuAD'``

    Returns:
        dataset split on train/valid
    """
    if dataset == 'SQuAD':
        self.url = self.url_squad
    elif dataset == 'SberSQuAD':
        self.url = self.url_sber_squad
    else:
        raise RuntimeError('Dataset {} is unknown'.format(dataset))

    dir_path = Path(dir_path)
    required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
    if not dir_path.exists():
        dir_path.mkdir()

    if not all((dir_path / f).exists() for f in required_files):
        download_decompress(self.url, dir_path)

    dataset = {}
    for f in required_files:
        with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
            data = json.load(fp)
        if f == 'dev-v1.1.json':
            dataset['valid'] = data
        else:
            dataset['train'] = data

    return dataset
def read(self, data_path: str, url: Optional[str] = None,
         *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
    """
    Args:
        data_path: A path to a folder with dataset files.
        url: A url to the archive with the dataset to download if the data folder is empty.
    """
    data_path = Path(data_path)

    if url is None:
        url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    if not is_done(data_path):
        log.info('[downloading data from {} to {}]'.format(url, data_path))
        download_decompress(url, data_path)
        mark_done(data_path)

    alternative_data_path = data_path / "aclImdb"
    if alternative_data_path.exists():
        data_path = alternative_data_path

    data = {"train": [], "test": []}
    for data_type in data.keys():
        for label in ["neg", "pos"]:
            labelpath = data_path / data_type / label
            if not labelpath.exists():
                raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
            for filename in labelpath.glob("*.txt"):
                with filename.open(encoding='utf-8') as f:
                    text = f.read()
                data[data_type].append((text, [label]))

        if not data[data_type]:
            raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                               "probably data dirs are empty")

    return data
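# Hedged usage sketch (not part of the source): assumes the read() above belongs to an IMDb
# reviews reader class, here called ImdbReader. The aclImdb_v1.tar.gz archive unpacks into
# an 'aclImdb' subfolder, which the alternative_data_path check above accounts for.
reader = ImdbReader()
imdb = reader.read('./imdb_data')
text, label = imdb['train'][0]
print(label, text[:80])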
def download_resources(args):
    if args.all:
        urls = ALL_URLS
    else:
        urls = REQ_URLS

    for url in urls:
        download_path = Path('../download')
        download_path.mkdir(exist_ok=True)
        dest_path = download_path
        embeddings_path = download_path.joinpath('embeddings')

        if url in EMBEDDING_URLS:
            embeddings_path.mkdir(exist_ok=True)
            dest_path = embeddings_path.joinpath(url.split("/")[-1])
            download(dest_path, url)
        elif url in DATA_URLS:
            dest_path = download_path.joinpath(url.split("/")[-1].split(".")[0])
            download_decompress(url, dest_path)
        else:
            download_decompress(url, dest_path)
def read(self, data_path: Union[List, str], language: Optional[str] = None,
         data_types: Optional[List[str]] = None, **kwargs) -> Dict[str, List]:
    """Reads the UD dataset from data_path.

    Args:
        data_path: can be either
            1. a directory containing files. The file for data_type 'mode' is then
               data_path / {language}-ud-{mode}.conllu
            2. a list of files, containing the same number of items as data_types
        language: a language used to detect the filename when it is not given
        data_types: which dataset parts among 'train', 'dev', 'test' are returned

    Returns:
        a dictionary containing dataset fragments (see ``read_infile``) for the given data types
    """
    if data_types is None:
        data_types = ["train", "dev"]
    elif isinstance(data_types, str):
        data_types = [data_types]
    for data_type in data_types:
        if data_type not in ["train", "dev", "test"]:
            raise ValueError("Unknown data_type: {}, only train, dev and test "
                             "datatypes are allowed".format(data_type))

    if isinstance(data_path, str):
        data_path = Path(data_path)
    if isinstance(data_path, Path):
        if data_path.exists():
            is_file = data_path.is_file()
        else:
            is_file = (len(data_types) == 1)
        if is_file:
            # path to a single file
            data_path, reserve_data_path = [data_path], None
        else:
            # path to the data directory
            if language is None:
                raise ValueError("You must explicitly provide language "
                                 "when providing a data directory as source")
            reserve_data_path = data_path
            data_path = [data_path / "{}-ud-{}.conllu".format(language, mode)
                         for mode in data_types]
            reserve_data_path = [reserve_data_path / language / "{}-ud-{}.conllu".format(language, mode)
                                 for mode in data_types]
    else:
        data_path = [Path(data_path) for data_path in data_path]
        reserve_data_path = None
    if len(data_path) != len(data_types):
        raise ValueError("The number of input files in data_path and data types "
                         "in data_types must be equal")

    has_missing_files = any(not filepath.exists() for filepath in data_path)
    if has_missing_files and reserve_data_path is not None:
        has_missing_files = any(not filepath.exists() for filepath in reserve_data_path)
        if not has_missing_files:
            data_path = reserve_data_path
    if has_missing_files:
        # Files are downloaded from the Web repository
        dir_path = data_path[0].parent
        language = language or get_language(data_path[0].parts[-1])
        url = self.URL + "{}.tar.gz".format(language)
        log.info('[downloading data from {} to {}]'.format(url, dir_path))
        dir_path.mkdir(exist_ok=True, parents=True)
        download_decompress(url, dir_path)
        mark_done(dir_path)

    data = {}
    for mode, filepath in zip(data_types, data_path):
        if mode == "dev":
            mode = "valid"
        # if mode == "test":
        #     kwargs["read_only_words"] = True
        data[mode] = read_infile(filepath, **kwargs)
    return data
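# Hedged usage sketch (not part of the source): assumes the read() above belongs to a UD
# dataset reader, here called UDDatasetReader, whose URL attribute points at per-language
# archives; 'ru_syntagrus' is an illustrative language id.
reader = UDDatasetReader()
# 1. Directory form: language is required so that {language}-ud-{mode}.conllu can be located.
ud = reader.read('./ud_data', language='ru_syntagrus', data_types=['train', 'dev'])
# 2. Explicit file list form: one path per requested data_type, language not needed.
# ud = reader.read(['./ud_data/ru_syntagrus-ud-train.conllu',
#                   './ud_data/ru_syntagrus-ud-dev.conllu'], data_types=['train', 'dev'])
print(len(ud['train']), len(ud['valid']))  # 'dev' is returned under the 'valid' key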
def _download_data(self, data_path: str) -> None:
    """Download dataset"""
    url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip"
    download_decompress(url, data_path)
    mark_done(data_path)
def download_conll(self, dir_path):
    download_decompress(
        'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz', dir_path)
def download_conll(self, dir_path):
    download_decompress(
        'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz', dir_path)
def download_conll(self, dir_path):
    download_decompress(
        'http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz', dir_path)