def build(data_path: str):
    data_path = Path(data_path) / 'typos_wiki'
    fname = data_path / 'misspelings.tsv'
    if not is_done(data_path):
        url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'
        download(fname, url)
        with fname.open() as f:
            data = []
            for line in f:
                if line.strip().endswith('<pre>'):
                    break
            for line in f:
                if line.strip().startswith('</pre>'):
                    break
                data.append(line.strip().split('->'))
        with fname.open('w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for line in data:
                writer.writerow(line)
        mark_done(data_path)
        print('Built', file=sys.stderr)
    return fname
def download_resource(url, dest_paths):
    dest_paths = list(dest_paths)
    if url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
def download_resource(url: str, dest_paths: Iterable[Path]) -> None:
    dest_paths = list(dest_paths)
    if check_md5(url, dest_paths):
        log.info(f'Skipped {url} download because of matching hashes')
    elif url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
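A minimal usage sketch for download_resource above; the destination directory is a hypothetical example, and the GloVe URL is the one reused from the last snippet of this section.

from pathlib import Path

# Sketch only: 'download/embeddings' is a hypothetical destination directory.
emb_dir = Path('download/embeddings')
emb_dir.mkdir(parents=True, exist_ok=True)
download_resource('http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt', [emb_dir])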
def __init__(self, data_dir: str = '', data_url: str = DB_URL, batch_size: int = None,
             shuffle: bool = None, seed: int = None, **kwargs):
    download_dir = expand_path(data_dir)
    download_path = download_dir.joinpath(data_url.split("/")[-1])
    download(download_path, data_url, force_download=False)
    self.connect = sqlite3.connect(str(download_path), check_same_thread=False)
    self.db_name = self.get_db_name()
    self.doc_ids = self.get_doc_ids()
    self.doc2index = self.map_doc2idx()
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.random = Random(seed)
def build(data_path: str):
    data_path = Path(data_path) / 'kartaslov'
    fname = data_path / 'orfo_and_typos.L1_5.csv'
    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'
        download(fname, url)
        mark_done(data_path)
        print('Built', file=sys.stderr)
    return fname
def read(self, data_path, file_name: str = 'ontonotes_senna.pckl',
         provide_senna_pos=False, provide_senna_ner=False):
    path = Path(data_path).resolve() / file_name
    if not path.exists():
        download(str(path), self.URL)
    with open(path, 'rb') as f:
        dataset = pickle.load(f)
    dataset_filtered = {}
    for key, data in dataset.items():
        dataset_filtered[key] = []
        for (toks, pos, ner), tags in data:
            if not provide_senna_pos and not provide_senna_ner:
                dataset_filtered[key].append((toks, tags))
            else:
                x = [toks]
                if provide_senna_pos:
                    x.append(pos)
                if provide_senna_ner:
                    x.append(ner)
                dataset_filtered[key].append((x, tags))
    return dataset_filtered
def build(data_path: str) -> Path:
    """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

    Args:
        data_path: target directory to download the data to

    Returns:
        path to the resulting csv-file
    """
    data_path = Path(data_path) / 'kartaslov'
    fname = data_path / 'orfo_and_typos.L1_5.csv'
    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'
        download(fname, url)
        mark_done(data_path)
        log.info('Built')
    return fname
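A hedged usage sketch for build above: it returns the path to the downloaded CSV, which can then be loaded with pandas; the target directory is a hypothetical example, and the file's column layout and separator are not specified by the snippet.

import pandas as pd

# Sketch: 'downloads' is a hypothetical target directory; pass sep=... to read_csv
# if the kartaslov file uses a non-default separator.
csv_path = build('downloads')
typos_df = pd.read_csv(csv_path)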
def read(self, data_path, data_types=["train"]): """ Read dataset from data_path directory. Reading files are all data_types + extension (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form data_path will be read) Args: data_path: directory with files data_types: types of considered data (possible: "train", "valid", "test") Returns: dictionary with types from data_types. Each field of dictionary is a list of tuples (x_i, y_i) """ for data_type in data_types: if not Path(data_path).joinpath(data_type + ".csv").exists(): print("Loading {} data from {} to {}".format(data_type, self.url, data_path)) download(source_url=self.url, dest_file_path=Path(data_path).joinpath(data_type + ".csv")) mark_done(data_path) data = {} for data_type in data_types: data[data_type] = pd.read_csv(Path(data_path).joinpath(data_type + ".csv")) new_data = {'train': [], 'valid': [], 'test': []} for field in data_types: for i in range(data[field].shape[0]): new_data[field].append( (data[field].loc[i, 'text'], data[field].loc[i, "intents"].split(","))) return new_data
def download_resources(args):
    if args.all:
        urls = ALL_URLS
    else:
        urls = REQ_URLS
    for url in urls:
        download_path = Path('../download')
        download_path.mkdir(exist_ok=True)
        dest_path = download_path
        embeddings_path = download_path.joinpath('embeddings')
        if url in EMBEDDING_URLS:
            embeddings_path.mkdir(exist_ok=True)
            dest_path = embeddings_path.joinpath(url.split("/")[-1])
            download(dest_path, url)
        elif url in DATA_URLS:
            dest_path = download_path.joinpath(url.split("/")[-1].split(".")[0])
            download_decompress(url, dest_path)
        else:
            download_decompress(url, dest_path)
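A sketch of a command-line entry point for download_resources above; only args.all is read by the function, and the '--all' flag name and parser description are assumptions for illustration.

import argparse

# Sketch: download_resources above only consumes args.all.
parser = argparse.ArgumentParser(description='Download DeepPavlov resources')
parser.add_argument('--all', action='store_true',
                    help='download optional resources (embeddings, datasets) as well')

if __name__ == '__main__':
    download_resources(parser.parse_args())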
def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str,
             context_limit: int = 450, question_limit: int = 150, char_limit: int = 16,
             level: str = 'token', *args, **kwargs):
    self.emb_folder = expand_path(emb_folder)
    self.level = level
    self.emb_url = emb_url
    self.emb_file_name = Path(emb_url).name
    self.save_path = expand_path(save_path)
    self.load_path = expand_path(load_path)
    self.context_limit = context_limit
    self.question_limit = question_limit
    self.char_limit = char_limit
    self.loaded = False
    self.NULL = "<NULL>"
    self.OOV = "<OOV>"
    self.emb_folder.mkdir(parents=True, exist_ok=True)
    if not (self.emb_folder / self.emb_file_name).exists():
        download(self.emb_folder / self.emb_file_name, self.emb_url)
    if self.load_path.exists():
        self.load()
def read(self, data_path: str, url: str = None, format: str = "csv", class_sep: str = ",", *args, **kwargs) -> dict: """ Read dataset from data_path directory. Reading files are all data_types + extension (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form data_path will be read) Args: data_path: directory with files url: download data files if data_path not exists or empty format: extension of files. Set of Values: ``"csv", "json"`` class_sep: string separator of labels in column with labels sep (str): delimeter for ``"csv"`` files. Default: ``","`` header (int): row number to use as the column names names (array): list of column names to use orient (str): indication of expected JSON string format lines (boolean): read the file as a json object per line. Default: ``False`` Returns: dictionary with types from data_types. Each field of dictionary is a list of tuples (x_i, y_i) """ data_types = ["train", "valid", "test"] train_file = kwargs.get('train', 'train.csv') if not Path(data_path, train_file).exists(): if url is None: raise Exception( "data path {} does not exist or is empty, and download url parameter not specified!" .format(data_path)) log.info("Loading train data from {} to {}".format(url, data_path)) download(source_url=url, dest_file_path=Path(data_path, train_file)) data = {"train": [], "valid": [], "test": []} for data_type in data_types: file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format)) file = Path(data_path).joinpath(file_name) if file.exists(): if format == 'csv': keys = ('sep', 'header', 'names') options = {k: kwargs[k] for k in keys if k in kwargs} df = pd.read_csv(file, **options) elif format == 'json': keys = ('orient', 'lines') options = {k: kwargs[k] for k in keys if k in kwargs} df = pd.read_json(file, **options) else: raise Exception( 'Unsupported file format: {}'.format(format)) x = kwargs.get("x", "text") y = kwargs.get('y', 'labels') if isinstance(x, list): data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] else: data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] else: log.warning("Cannot find {} file".format(file)) return data
def _build_slot_vals(slot_vals_json_path='data/'):
    url = 'http://files.deeppavlov.ai/datasets/dstc_slot_vals.json'
    download(slot_vals_json_path, url)
def _download_slot_vals(self):
    url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
    download(self.save_path, url)
def load(self):
    url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
    download(self.save_path, url)
def _download_slot_vals(self):
    url = 'http://files.deeppavlov.ai/datasets/dstc_slot_vals.json'
    download(self.save_path, url)
def read(self, data_path: str, url: str = None, format: str = "csv", class_sep: str = ",", *args, **kwargs) -> dict: """ Read dataset from data_path directory. Reading files are all data_types + extension (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form data_path will be read) Args: data_path: directory with files url: download data files if data_path not exists or empty format: extension of files. Set of Values: ``"csv", "json"`` class_sep: string separator of labels in column with labels sep (str): delimeter for ``"csv"`` files. Default: ``","`` header (int): row number to use as the column names names (array): list of column names to use orient (str): indication of expected JSON string format lines (boolean): read the file as a json object per line. Default: ``False`` Returns: dictionary with types from data_types. Each field of dictionary is a list of tuples (x_i, y_i) """ data_types = ["train", "valid", "test"] train_file = kwargs.get('train', 'train.csv') if not Path(data_path, train_file).exists(): if url is None: raise Exception("data path {} does not exist or is empty, and download url parameter not specified!".format(data_path)) log.info("Loading train data from {} to {}".format(url, data_path)) download(source_url=url, dest_file_path=Path(data_path, train_file)) data = {"train": [], "valid": [], "test": []} for data_type in data_types: file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format)) file = Path(data_path).joinpath(file_name) if file.exists(): if format == 'csv': keys = ('sep', 'header', 'names') options = {k: kwargs[k] for k in keys if k in kwargs} df = pd.read_csv(file, **options) elif format == 'json': keys = ('orient', 'lines') options = {k: kwargs[k] for k in keys if k in kwargs} df = pd.read_json(file, **options) else: raise Exception('Unsupported file format: {}'.format(format)) x = kwargs.get("x", "text") y = kwargs.get('y', 'labels') class_sep = kwargs.get('class_sep', ',') if isinstance(x, list): data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] else: data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] else: log.warning("Cannot find {} file".format(file)) return data
def _build_slot_vals(slot_vals_json_path='data/'):
    url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
    download(slot_vals_json_path, url)
from deeppavlov.core.data.utils import download
import os

if not os.path.exists('data/models/glove.txt'):
    download('data/models/glove.txt',
             source_url='http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt')