Example #1
    def download_data(self, data_path):
        if not is_done(Path(data_path)):
            download_decompress(
                url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
                download_path=data_path)
            mark_done(data_path)
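The `is_done` / `mark_done` pair makes the download idempotent: the archive is fetched and unpacked only if the target folder has not already been marked as complete. Below is a minimal standalone sketch of the same pattern; the local path is illustrative, and it assumes `is_done` / `mark_done` live in `deeppavlov.core.data.utils` next to `download_decompress` (as the imports in later examples suggest), with `mark_done` writing a marker that `is_done` checks.

from pathlib import Path

from deeppavlov.core.data.utils import download_decompress, is_done, mark_done

data_path = Path('./insuranceQA')
if not is_done(data_path):
    download_decompress(
        url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
        download_path=data_path)
    mark_done(data_path)  # later runs see the marker and skip the download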
Example #2
    def read(self,
             data_path: str,
             dataset_name: str = None,
             provide_pos: bool = False,
             provide_doc_ids: bool = False,
             iob: bool = False,
             docstart_token: str = None):
        self.provide_pos = provide_pos
        self.provide_doc_ids = provide_doc_ids
        self.iob = iob
        self.docstart_token = docstart_token
        self.num_docs = 0
        self.x_is_tuple = self.provide_pos or self.provide_doc_ids
        data_path = Path(data_path)
        files = list(data_path.glob('*.txt'))
        if 'train.txt' not in {file_path.name for file_path in files}:
            if dataset_name == 'conll2003':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'
            elif dataset_name == 'collection_rus':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/collection3_v2.tar.gz'
            elif dataset_name == 'ontonotes':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz'
            else:
                raise RuntimeError(
                    'train.txt not found in "{}"'.format(data_path))
            data_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, data_path)
            files = list(data_path.glob('*.txt'))
        dataset = {}

        for file_name in files:
            name = file_name.with_suffix('').name
            dataset[name] = self.parse_ner_file(file_name)
        return dataset
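A hedged usage sketch for the reader above; the enclosing class is not shown in the snippet, so the name `Conll2003DatasetReader` is an assumption:

reader = Conll2003DatasetReader()  # hypothetical name for the snippet's enclosing class
data = reader.read('./conll2003', dataset_name='conll2003', provide_pos=True)
# keys mirror the *.txt file names found (or downloaded) in the folder,
# typically 'train', 'valid' and 'test'
print(list(data))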
Example #3
    def __init__(
            self,
            path: str,
            model_name: str,
            tokenizer: AutoTokenizer = None,
            url: str = None,
            train: bool = True
    ):
        self.url = url or URL_SBERSQUAD
        self.path = path
        dir_path = Path(path)
        required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir()

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        if train:
            self.dataset = load_and_cache_examples(
                model_name_or_path=model_name,
                tokenizer=tokenizer
            )
        else:
            self.dataset = load_and_cache_examples(
                model_name_or_path=model_name,
                tokenizer=tokenizer,
                evaluate=True
            )
Example #4
    def read(self, dir_path: str, dataset='SQuAD'):
        if dataset == 'SQuAD':
            self.url = self.url_squad
        elif dataset == 'SberSQuAD':
            self.url = self.url_sber_squad
        else:
            raise RuntimeError('Dataset {} is unknown'.format(dataset))

        dir_path = Path(dir_path)
        required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir()

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)

        dataset = {}
        for f in required_files:
            data = json.load((dir_path / f).open('r'))
            if f == 'dev-v1.1.json':
                dataset['valid'] = data
            else:
                dataset['train'] = data

        return dataset
Example #5
    def load_model(self):
        # options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        # options_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        options_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/options.json"
        # weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

        # custom weights
        # weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights.hdf5"
        # weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights_epoch_n_2.hdf5"
        weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights_epoch_n_3.hdf5"
        # weight_file = "http://files.deeppavlov.ai/lang_models/sexy_elmo/weights_epoch_n_4.hdf5"

        # allennlp realization with updating states
        # self._elmobilm = _ElmoBiLm(options_file, weight_file)
        # realization without updating states
        self._elmobilm = ELMOBiLM(options_file, weight_file)
        self._elmobilm.cuda()

        # TODO load head:
        # self._ff = torch.nn.Linear(1024, 1000000)
        self._ff = torch.nn.Linear(512, 1000000)
        self._ff.cuda()
        ##############################################################
        # TODO refactor
        # Load checkpoint of TF:
        # base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news"
        base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news/elmo_weights_ckpt3"
        # base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news/elmo_weights_ckpt3/elmo_ru_news_uptrained_ckpt"
        ckpt_prefixed_path = base_path + "/model.ckpt-0003"

        # metafile_path = base_path + "/model.ckpt-0003.meta"
        try:
            # matrix which holds embedding into words projection
            emb2words_w_matrix = tf.train.load_variable(
                ckpt_prefixed_path, 'lm/softmax/W')

        except Exception as e:
            # download it, then read it again
            # from deeppavlov.core.data.utils import download
            from deeppavlov.core.data.utils import download_decompress
            # TODO download all 3 files of checkpoint
            CKPTS_URL = "http://files.deeppavlov.ai/spelling_correctors/elmo_weights_ckpt3.tar.gz"
            download_decompress(CKPTS_URL, base_path)
            emb2words_w_matrix = tf.train.load_variable(
                ckpt_prefixed_path, 'lm/softmax/W')
        ##############################################################

        # torch_w = torch.from_numpy(np.concatenate((softmax_w, softmax_w), axis=1))
        torch_w = torch.from_numpy(emb2words_w_matrix)

        emb2words_bias = tf.train.load_variable(ckpt_prefixed_path,
                                                'lm/softmax/b')
        self._ff.load_state_dict(
            {
                'weight': torch_w,
                'bias': torch.from_numpy(emb2words_bias)
            },
            strict=False)

        self._softmax_fn = torch.nn.Softmax(dim=3)
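The checkpoint block above copies the TensorFlow softmax projection into a `torch.nn.Linear` head. A minimal sketch of that idea in isolation; the checkpoint path is illustrative and the variable names are the ones used in the snippet:

import tensorflow as tf
import torch

ckpt_prefix = "/path/to/model.ckpt-0003"  # checkpoint prefix, no file extension

# 'lm/softmax/W' has shape (vocab_size, hidden), matching torch's (out, in) weight layout
softmax_w = tf.train.load_variable(ckpt_prefix, 'lm/softmax/W')
softmax_b = tf.train.load_variable(ckpt_prefix, 'lm/softmax/b')

head = torch.nn.Linear(softmax_w.shape[1], softmax_w.shape[0])
head.load_state_dict({'weight': torch.from_numpy(softmax_w),
                      'bias': torch.from_numpy(softmax_b)})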
Example #6
    def read(self,
             data_path: str,
             language: str = 'en',
             *args,
             **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        """
        Reads BoolQ dataset from files.

        Args:
            data_path: A path to a folder with dataset files.
            language: The dataset language ('ru', 'en' are available)

        Returns:
            dataset: items of the dataset [(question, passage), label]
        """

        if language in self.urls:
            self.url = self.urls[language]
        else:
            raise RuntimeError(f'The dataset for {language} is unavailable')

        data_path = expand_path(data_path)
        if not data_path.exists():
            data_path.mkdir(parents=True)

        download_decompress(self.url, data_path)
        dataset = {}

        for filename in ['train.jsonl', 'valid.jsonl']:
            dataset[filename.split('.')[0]] = self._build_data(
                language, data_path / filename)

        return dataset
Example #7
def download_resources(args):
    if args.all:
        urls = ALL_URLS
    else:
        urls = REQ_URLS

    for url in urls:
        download_path = Path('../download')
        download_path.mkdir(exist_ok=True)
        dest_path = download_path

        embeddings_path = download_path.joinpath('embeddings')

        if url in EMBEDDING_URLS:
            embeddings_path.mkdir(exist_ok=True)
            dest_path = embeddings_path.joinpath(url.split("/")[-1])
            download(dest_path, url)

        elif url in DATA_URLS:
            dest_path = download_path.joinpath(
                url.split("/")[-1].split(".")[0])
            download_decompress(url, dest_path)

        else:
            download_decompress(url, dest_path)
Example #8
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
        decompresses and saves files to ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
            ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
            ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``.
        """
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train': self._read_from_file(
                Path(data_path, self._data_fname('trn')), dialogs),
            'valid': self._read_from_file(
                Path(data_path, self._data_fname('val')), dialogs),
            'test': self._read_from_file(
                Path(data_path, self._data_fname('tst')), dialogs)
        }
        return data
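A possible call, assuming the method belongs to DeepPavlov's DSTC2 reader (the class name `DSTC2DatasetReader` is an assumption) and that `self.url` points at the `'dstc2_v2.tar.gz'` archive:

reader = DSTC2DatasetReader()  # hypothetical name for the snippet's enclosing class
data = reader.read('./dstc2', dialogs=False)
print(len(data['train']), len(data['valid']), len(data['test']))  # counts of (x_i, y_i) turns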
Example #9
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads ``'kvrest_public.tar.gz'``, decompresses, saves files to ``data_path``.

        Parameters:
            data_path: path to save data
            dialogs: flag which indicates whether to output a list of turns or a list of dialogs

        Returns:
            dictionary with ``'train'`` containing dialogs from ``'kvret_train_public.json'``, ``'valid'`` containing dialogs from ``'kvret_valid_public.json'``, ``'test'`` containing dialogs from ``'kvret_test_public.json'``. Each fields is a list of tuples ``(x_i, y_i)``.
        """

        required_files = (self._data_fname(dt)
                          for dt in ('train', 'dev', 'test'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(
                self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('train')),
                                 dialogs),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('dev')),
                                 dialogs),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('test')),
                                 dialogs)
        }
        return data
Example #10
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
        decompresses and saves files to ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
            ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
            ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``.
        """
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info(f"[downloading data from {self.url} to {data_path}]")
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('trn')),
                                 dialogs),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('val')),
                                 dialogs),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('tst')),
                                 dialogs)
        }
        return data
Example #11
def download_resource(url, dest_paths):
    dest_paths = list(dest_paths)

    if url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
Example #12
    def read(self, data_path: str, queries_per_intent: Optional[int] = None, test_validate_split: float = 0.5,
             *args, **kwargs) -> \
            Dict[str, List[Dict[str, Any]]]:
        """
        Each query in the output has the following form:
            { 'intent': intent_name,
              'data': [ { 'text': text, ('entity': slot_name)? } ]
            }

        Args:
            data_path: A path to a folder with dataset files.
            queries_per_intent: Number of queries to load for each intent. None to load all.
                If the requested number is greater than the number of queries available in the file, all queries are returned.
            test_validate_split: Proportion of `_validate` files to be used as test dataset (since Snips
                is split into training and validation sets without a separate test set).
        """
        data_path = Path(data_path)
        intents = [
            'AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
            'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'
        ]

        if not is_done(data_path):
            url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        use_full_file = queries_per_intent is None or queries_per_intent > 70
        training_data = []
        validation_data = []
        test_data = []

        for intent in intents:
            intent_path = data_path / intent
            train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
            validate_file_name = f"validate_{intent}.json"

            train_queries = self._load_file(intent_path / train_file_name,
                                            intent, queries_per_intent)
            validate_queries = self._load_file(
                intent_path / validate_file_name, intent, queries_per_intent)
            num_test_queries = round(
                len(validate_queries) * test_validate_split)

            training_data.extend(train_queries)
            validation_data.extend(validate_queries[num_test_queries:])
            test_data.extend(validate_queries[:num_test_queries])

        return {
            'train': training_data,
            'valid': validation_data,
            'test': test_data
        }
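The `test_validate_split` argument carves the test set out of the validation files, since Snips ships no separate test set. A small worked example of the arithmetic used above:

validate_queries = list(range(100))  # stand-in for queries loaded from one validate_<intent>.json
num_test = round(len(validate_queries) * 0.5)  # 50 with the default split of 0.5
test_part = validate_queries[:num_test]        # first half goes to 'test'
valid_part = validate_queries[num_test:]       # remaining half stays in 'valid'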
Example #13
def download_resource(url: str, dest_paths: Iterable[Path]) -> None:
    dest_paths = list(dest_paths)

    if check_md5(url, dest_paths):
        log.info(f'Skipped {url} download because of matching hashes')
    elif url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
Example #14
    def _download_data(self, data_path):
        """Download archive with the InsuranceQA dataset files and decompress if there is no dataset files in `data_path`.

        Args:
            data_path: A path to a folder where dataset files are stored.
        """
        if not is_done(Path(data_path)):
            download_decompress(
                url="http://files.deeppavlov.ai/datasets/insuranceQA-master.zip",
                download_path=data_path)
            mark_done(data_path)
Example #15
def download_resource(url: str, dest_paths: Iterable[Path]) -> None:
    dest_paths = list(dest_paths)

    if check_md5(url, dest_paths):
        log.info(f'Skipped {url} download because of matching hashes')
    elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1].split('?')[0]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
Example #16
    def read(self, data_path: str):
        data_path = Path(data_path)
        files = list(data_path.glob('*.txt'))
        test_set_filename = "test_set_with_answers.txt"
        if test_set_filename not in {file_path.name for file_path in files}:
            url = 'http://files.deeppavlov.ai/kbqa/test_set_with_answers.zip'
            data_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, data_path)
        dataset = {}

        dataset["test"] = self.parse_ner_file(data_path / test_set_filename)
        dataset["train"] = []
        dataset["valid"] = []
        return dataset
Example #17
def download_resource(url: str, dest_paths: Iterable[Union[Path, str]]) -> None:
    dest_paths = [Path(dest) for dest in dest_paths]
    download_path = dest_paths[0].parent
    download_path.mkdir(parents=True, exist_ok=True)
    file_name = urlparse(url).path.split('/')[-1]
    lockfile = download_path / f'.{file_name}.lock'

    with FileLock(lockfile).acquire(poll_intervall=10):
        if check_md5(url, dest_paths):
            log.info(f'Skipped {url} download because of matching hashes')
        elif any(ext in url for ext in ('.tar.gz', '.gz', '.zip')):
            download_decompress(url, download_path, dest_paths)
        else:
            dest_files = [dest_path / file_name for dest_path in dest_paths]
            download(dest_files, url)
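Compared to the earlier variants, this one wraps the whole check-and-download in a per-archive `FileLock` (`.{file_name}.lock` next to the destination), so concurrent processes requesting the same resource wait for the first one to finish and then hit the md5 short-circuit instead of downloading again. A hedged call sketch with illustrative arguments:

from pathlib import Path

download_resource(
    'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz',
    dest_paths=[Path('./download/conll2003')],
)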
Example #18
    def __init__(self, **kwargs):
        self.opt = deepcopy(kwargs)
        vocabs = self.opt.pop('vocabs')
        self.opt.update(vocabs)

        # Find all input parameters of the network init
        network_parameter_names = list(
            inspect.signature(NerNetwork.__init__).parameters)

        # Fill all provided parameters from opt
        network_parameters = {
            par: self.opt[par]
            for par in network_parameter_names if par in self.opt
        }

        # Initialize the network
        self.sess = tf.Session()
        network_parameters['sess'] = self.sess
        self._ner_network = NerNetwork(**network_parameters)

        download_best_model = self.opt.get('download_best_model', False)
        if download_best_model:
            model_path = str(self.load_path.parent.absolute())
            best_model_url = 'http://lnsigo.mipt.ru/export/models/ner/ner_dstc_model.tar.gz'
            download_decompress(best_model_url, model_path)

        # Training parameters
        # Find all parameters for network train
        train_parameters_names = list(
            inspect.signature(NerNetwork.train_on_batch).parameters)
        train_parameters = {
            par: self.opt[par]
            for par in train_parameters_names if par in self.opt
        }
        self.train_parameters = train_parameters

        super().__init__(**kwargs)

        # Check existence of the file with slots, slot values, and corrupted (misspelled) slot values
        slot_vals_filepath = Path(self.save_path.parent) / 'slot_vals.json'
        if not slot_vals_filepath.is_file():
            self._download_slot_vals()

        with open(slot_vals_filepath) as f:
            self._slot_vals = json.load(f)

        if self.load_path is not None:
            self.load()
Example #19
def download_resource(resource, download_path):
    url = resource['url']
    sub_dirs = resource['subdir']
    dest_paths = []

    for sub_dir in sub_dirs:
        dest_path = download_path.joinpath(sub_dir)
        dest_paths.append(dest_path)

    if url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
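Here the URL and target sub-directories come from a resource dictionary rather than from separate arguments. An illustrative resource entry; the 'url' and 'subdir' field names are taken from the function above, the values are made up:

from pathlib import Path

resource = {
    'url': 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz',
    'subdir': ['downloads/conll2003'],
}
download_resource(resource, Path('~/.deeppavlov').expanduser())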
Example #20
    def read(self, dir_path: str):
        dir_path = Path(dir_path)
        required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir()

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)

        dataset = {}
        for f in required_files:
            data = json.load((dir_path / f).open('r'))
            if f == 'dev-v1.1.json':
                dataset['valid'] = data
            else:
                dataset['train'] = data

        return dataset
Example #21
    def read(self, data_path, dialogs=False):
        # TODO: mkdir if it doesn't exist

        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            print('Loading dstc2 from `{}` to `{}`'.format(self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train': self._read_from_file(
                Path(data_path, self._data_fname('trn')), dialogs),
            'valid': self._read_from_file(
                Path(data_path, self._data_fname('val')), dialogs),
            'test': self._read_from_file(
                Path(data_path, self._data_fname('tst')), dialogs)
        }
        return data
Example #22
    def read(self,
             dir_path: str,
             dataset: Optional[str] = 'MultiSQuADRetr',
             url: Optional[str] = None,
             *args,
             **kwargs) -> Dict[str, Dict[str, Any]]:
        """

        Args:
            dir_path: path to save data
            dataset: default dataset names: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``
            url: link to archive with dataset, use url argument if non-default dataset is used

        Returns:
            dataset split on train/valid

        Raises:
            RuntimeError: if `dataset` is not one of these: ``'MultiSQuADRetr'``, ``'MultiSQuADRuRetr'``.
        """
        if url is not None:
            self.url = url
        elif dataset == 'MultiSQuADRetr':
            self.url = self.url_multi_squad_retr
        elif dataset == 'MultiSQuADRuRetr':
            self.url = self.url_multi_squad_ru_retr
        else:
            raise RuntimeError('Dataset {} is unknown'.format(dataset))

        dir_path = Path(dir_path)
        required_files = ['{}.jsonl'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir(parents=True)

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)

        dataset = {}
        for f in required_files:
            if 'dev' in f:
                dataset['valid'] = dir_path.joinpath(f)
            else:
                dataset['train'] = dir_path.joinpath(f)

        return dataset
Example #23
    def read(self, dir_path: str, dataset: Optional[str] = 'SQuAD', url: Optional[str] = None, *args, **kwargs) \
            -> Dict[str, Dict[str, Any]]:
        """

        Args:
            dir_path: path to save data
            dataset: default dataset names: ``'SQuAD'``, ``'SberSQuAD'`` or ``'MultiSQuAD'``
            url: link to archive with dataset, use url argument if non-default dataset is used

        Returns:
            dataset split on train/valid

        Raises:
            RuntimeError: if `dataset` is not one of these: ``'SQuAD'``, ``'SberSQuAD'``, ``'MultiSQuAD'``.
        """
        if url is not None:
            self.url = url
        elif dataset == 'SQuAD':
            self.url = self.url_squad
        elif dataset == 'SberSQuAD':
            self.url = self.url_sber_squad
        elif dataset == 'MultiSQuAD':
            self.url = self.url_multi_squad
        else:
            raise RuntimeError('Dataset {} is unknown'.format(dataset))

        dir_path = Path(dir_path)
        required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir()

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)

        dataset = {}
        for f in required_files:
            with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
                data = json.load(fp)
            if f == 'dev-v1.1.json':
                dataset['valid'] = data
            else:
                dataset['train'] = data

        return dataset
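A usage sketch, assuming the enclosing class is a SQuAD-style dataset reader exposing `url_squad`, `url_sber_squad` and `url_multi_squad` attributes (the name `SquadDatasetReader` is assumed):

reader = SquadDatasetReader()  # hypothetical name for the snippet's enclosing class
data = reader.read('./squad', dataset='SQuAD')
# each split holds the raw SQuAD v1.1 JSON: {'version': ..., 'data': [...]}
print(len(data['train']['data']), len(data['valid']['data']))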
Example #24
    def read(self,
             data_path: str,
             dialogs: bool = False,
             encoding='utf-8') -> Dict[str, List]:
        """
        Downloads ``'simple_dstc2.tar.gz'`` archive from internet,
        decompresses and saves files to ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'simple-dstc2-trn.json'``, ``'valid'`` field with dialogs
            from ``'simple-dstc2-val.json'`` and ``'test'`` field with
            dialogs from ``'simple-dstc2-tst.json'``.
            Each field is a list of tuples ``(user turn, system turn)``.
        """
        required_files = [self._data_fname(dt) for dt in ('trn', 'val', 'tst')]
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info(f"{[Path(data_path, f) for f in required_files]}")
            log.info(f"[downloading data from {self.url} to {data_path}]")
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('trn')),
                                 dialogs, encoding),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('val')),
                                 dialogs, encoding),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('tst')),
                                 dialogs, encoding)
        }
        log.info(f"There are {len(data['train'])} samples in train split.")
        log.info(f"There are {len(data['valid'])} samples in valid split.")
        log.info(f"There are {len(data['test'])} samples in test split.")
        return data
Example #25
    def read(self, data_path, dialogs=False):
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(
                self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('trn')),
                                 dialogs),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('val')),
                                 dialogs),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('tst')),
                                 dialogs)
        }
        return data
Example #26
    def read(self, dir_path: str, dataset_name=None, provide_pos=False):
        self.provide_pos = provide_pos
        dir_path = Path(dir_path)
        files = list(dir_path.glob('*.txt'))
        if 'train.txt' not in {file_path.name for file_path in files}:
            if dataset_name == 'conll2003':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'
            elif dataset_name == 'collection_rus':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/collection5.tar.gz'
            else:
                raise RuntimeError(
                    'train.txt not found in "{}"'.format(dir_path))
            dir_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, dir_path)
            files = list(dir_path.glob('*.txt'))
        dataset = {}
        for file_name in files:
            name = file_name.with_suffix('').name
            dataset[name] = self.parse_ner_file(file_name)
        return dataset
Example #27
    def read(self, dir_path: str, dataset: str = 'SQuAD', *args, **kwargs) -> Dict[str, Dict[str, Any]]:
        """

        Args:
            dir_path: path to save data
            dataset: dataset name: ``'SQuAD'``, ``'SberSQuAD'`` or ``'MultiSQuAD'``

        Returns:
            dataset split on train/valid

        Raises:
            RuntimeError: if `dataset` is not one of these: ``'SQuAD'``, ``'SberSQuAD'``, ``'MultiSQuAD'``.
        """
        if dataset == 'SQuAD':
            self.url = self.url_squad
        elif dataset == 'SberSQuAD':
            self.url = self.url_sber_squad
        elif dataset == 'MultiSQuAD':
            self.url = self.url_multi_squad
        else:
            raise RuntimeError('Dataset {} is unknown'.format(dataset))

        dir_path = Path(dir_path)
        required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir()

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)

        dataset = {}
        for f in required_files:
            with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
                data = json.load(fp)
            if f == 'dev-v1.1.json':
                dataset['valid'] = data
            else:
                dataset['train'] = data

        return dataset
Example #28
    def read(self, data_path: str, dataset_name=None, provide_pos=False):
        self.provide_pos = provide_pos
        data_path = Path(data_path)
        files = list(data_path.glob('*.txt'))
        if 'train.txt' not in {file_path.name for file_path in files}:
            if dataset_name == 'conll2003':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz'
            elif dataset_name == 'collection_rus':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/collection5.tar.gz'
            elif dataset_name == 'ontonotes':
                url = 'http://files.deeppavlov.ai/deeppavlov_data/ontonotes_ner.tar.gz'
            else:
                raise RuntimeError('train.txt not found in "{}"'.format(data_path))
            data_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, data_path)
            files = list(data_path.glob('*.txt'))
        dataset = {}

        for file_name in files:
            name = file_name.with_suffix('').name
            dataset[name] = self.parse_ner_file(file_name)
        return dataset
Example #29
    def read(self,
             dir_path: str,
             dataset: str = 'SQuAD',
             *args,
             **kwargs) -> Dict[str, Dict[str, Any]]:
        """

        Args:
            dir_path: path to save data
            dataset: dataset name: ``'SQuAD'`` or ``'SberSQuAD'``

        Returns:
            dataset split on train/valid
        """
        if dataset == 'SQuAD':
            self.url = self.url_squad
        elif dataset == 'SberSQuAD':
            self.url = self.url_sber_squad
        else:
            raise RuntimeError('Dataset {} is unknown'.format(dataset))

        dir_path = Path(dir_path)
        required_files = ['{}-v1.1.json'.format(dt) for dt in ['train', 'dev']]
        if not dir_path.exists():
            dir_path.mkdir()

        if not all((dir_path / f).exists() for f in required_files):
            download_decompress(self.url, dir_path)

        dataset = {}
        for f in required_files:
            with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
                data = json.load(fp)
            if f == 'dev-v1.1.json':
                dataset['valid'] = data
            else:
                dataset['train'] = data

        return dataset
Example #30
    def read(self, data_path: str, url: Optional[str] = None,
             *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """
        Args:
            data_path: A path to a folder with dataset files.
            url: A url to the archive with the dataset to download if the data folder is empty.
        """
        data_path = Path(data_path)

        if url is None:
            url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

        if not is_done(data_path):
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        alternative_data_path = data_path / "aclImdb"
        if alternative_data_path.exists():
            data_path = alternative_data_path

        data = {"train": [],
                "test": []}
        for data_type in data.keys():
            for label in ["neg", "pos"]:
                labelpath = data_path / data_type / label
                if not labelpath.exists():
                    raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
                for filename in labelpath.glob("*.txt"):
                    with filename.open(encoding='utf-8') as f:
                        text = f.read()
                    data[data_type].append((text, [label]))

            if not data[data_type]:
                raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                                   "probably data dirs are empty")

        return data
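A usage sketch for the IMDb reader above (the class name `ImdbReader` is an assumption); each item is a `(text, [label])` pair as built in the loop:

reader = ImdbReader()  # hypothetical name for the snippet's enclosing class
data = reader.read('./aclImdb')
text, labels = data['train'][0]
print(labels)  # ['neg'] or ['pos']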
Example #31
def download_resources(args):
    if args.all:
        urls = ALL_URLS
    else:
        urls = REQ_URLS

    for url in urls:
        download_path = Path('../download')
        download_path.mkdir(exist_ok=True)
        dest_path = download_path

        embeddings_path = download_path.joinpath('embeddings')

        if url in EMBEDDING_URLS:
            embeddings_path.mkdir(exist_ok=True)
            dest_path = embeddings_path.joinpath(url.split("/")[-1])
            download(dest_path, url)

        elif url in DATA_URLS:
            dest_path = download_path.joinpath(url.split("/")[-1].split(".")[0])
            download_decompress(url, dest_path)

        else:
            download_decompress(url, dest_path)
Example #32
    def read(self,
             data_path: Union[List, str],
             language: Optional[str] = None,
             data_types: Optional[List[str]] = None,
             **kwargs) -> Dict[str, List]:
        """Reads UD dataset from data_path.

        Args:
            data_path: can be either
                1. a directory containing files. The file for data_type 'mode'
                is then data_path / {language}-ud-{mode}.conllu
                2. a list of files, containing the same number of items as data_types
            language: a language to detect filename when it is not given
            data_types: which dataset parts among 'train', 'dev', 'test' are returned

        Returns:
            a dictionary containing dataset fragments (see ``read_infile``) for given data types
        """
        if data_types is None:
            data_types = ["train", "dev"]
        elif isinstance(data_types, str):
            data_types = [data_types]
        for data_type in data_types:
            if data_type not in ["train", "dev", "test"]:
                raise ValueError(
                    "Unknown data_type: {}, only train, dev and test "
                    "datatypes are allowed".format(data_type))
        if isinstance(data_path, str):
            data_path = Path(data_path)
        if isinstance(data_path, Path):
            if data_path.exists():
                is_file = data_path.is_file()
            else:
                is_file = (len(data_types) == 1)
            if is_file:
                # path to a single file
                data_path, reserve_data_path = [data_path], None
            else:
                # path to data directory
                if language is None:
                    raise ValueError("You must implicitly provide language "
                                     "when providing data directory as source")
                reserve_data_path = data_path
                data_path = [
                    data_path / "{}-ud-{}.conllu".format(language, mode)
                    for mode in data_types
                ]
                reserve_data_path = [
                    reserve_data_path / language /
                    "{}-ud-{}.conllu".format(language, mode)
                    for mode in data_types
                ]
        else:
            data_path = [Path(data_path) for data_path in data_path]
            reserve_data_path = None
        if len(data_path) != len(data_types):
            raise ValueError(
                "The number of input files in data_path and data types "
                "in data_types must be equal")
        has_missing_files = any(not filepath.exists()
                                for filepath in data_path)
        if has_missing_files and reserve_data_path is not None:
            has_missing_files = any(not filepath.exists()
                                    for filepath in reserve_data_path)
            if not has_missing_files:
                data_path = reserve_data_path
        if has_missing_files:
            # Files are downloaded from the Web repository
            dir_path = data_path[0].parent
            language = language or get_language(data_path[0].parts[-1])
            url = self.URL + "{}.tar.gz".format(language)
            log.info('[downloading data from {} to {}]'.format(url, dir_path))
            dir_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, dir_path)
            mark_done(dir_path)
        data = {}
        for mode, filepath in zip(data_types, data_path):
            if mode == "dev":
                mode = "valid"
#             if mode == "test":
#                 kwargs["read_only_words"] = True
            data[mode] = read_infile(filepath, **kwargs)
        return data
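A usage sketch for the UD reader above (the class name `UDReader` is hypothetical). Passing a directory requires `language`, so that the file names `{language}-ud-{mode}.conllu` can be built, and 'dev' is remapped to 'valid' in the result:

reader = UDReader()  # hypothetical name for the snippet's enclosing class
data = reader.read('./ud_data', language='ru_syntagrus', data_types=['train', 'dev'])
print(list(data))  # ['train', 'valid']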
Example #33
    def _download_data(self, data_path: str) -> None:
        """Download dataset"""
        url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip"
        download_decompress(url, data_path)
        mark_done(data_path)
Example #34
    def download_conll(self, dir_path):
        download_decompress('http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz', dir_path)
Example #35
    def _download_data(self, data_path: str) -> None:
        """Download dataset"""
        url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip"
        download_decompress(url, data_path)
        mark_done(data_path)
Example #36
    def download_conll(self, dir_path):
        download_decompress(
            'http://files.deeppavlov.ai/deeppavlov_data/conll2003_v2.tar.gz',
            dir_path)
Example #37
    def download_conll(self, dir_path):
        download_decompress(
            'http://lnsigo.mipt.ru/export/deeppavlov_data/conll2003_v2.tar.gz',
            dir_path)