def build(data_path: str) -> Path:
    """Download and parse common misspellings list from `Wikipedia
    <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

    Args:
        data_path: target directory to download the data to

    Returns:
        path to the resulting tsv-file
    """
    data_path = Path(data_path) / 'typos_wiki'

    fname = data_path / 'misspelings.tsv'

    if not is_done(data_path):
        url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

        page = requests.get(url)
        tree = html.fromstring(page.content)
        raw = tree.xpath('//pre/text()')[0].splitlines()
        data = []
        for pair in raw:
            typo, corrects = pair.strip().split('->')
            for correct in corrects.split(','):
                data.append([typo.strip(), correct.strip()])

        fname.parent.mkdir(parents=True, exist_ok=True)
        with fname.open('w', newline='', encoding='utf8') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for line in data:
                writer.writerow(line)

        mark_done(data_path)

        log.info('Built')
    return fname
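# Usage sketch (not from the source): assumes this function and its helpers
# (is_done, mark_done, requests, lxml.html, csv, log) are importable, and that
# 'downloads' is any writable directory.
#
#     tsv_path = build('downloads')
#     # -> downloads/typos_wiki/misspelings.tsv, one "typo<TAB>correction" pair per row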
def download_data(self, data_path):
    if not is_done(Path(data_path)):
        download_decompress(
            url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
            download_path=data_path)
        mark_done(data_path)
def build(data_path: str):
    data_path = Path(data_path) / 'typos_wiki'

    fname = data_path / 'misspelings.tsv'

    if not is_done(data_path):
        url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

        page = requests.get(url)
        tree = html.fromstring(page.content)
        raw = tree.xpath('//pre/text()')[0].splitlines()
        data = []
        for pair in raw:
            typo, corrects = pair.strip().split('->')
            for correct in corrects.split(','):
                data.append([typo.strip(), correct.strip()])

        fname.parent.mkdir(parents=True, exist_ok=True)
        with fname.open('w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for line in data:
                writer.writerow(line)

        mark_done(data_path)

        log.info('Built')
    return fname
def build(data_path: str):
    data_path = Path(data_path) / 'typos_wiki'

    fname = data_path / 'misspelings.tsv'

    if not is_done(data_path):
        url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

        download(fname, url)

        with fname.open() as f:
            data = []
            for line in f:
                if line.strip().endswith('<pre>'):
                    break
            for line in f:
                if line.strip().startswith('</pre>'):
                    break
                data.append(line.strip().split('->'))

        with fname.open('w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for line in data:
                writer.writerow(line)

        mark_done(data_path)

        print('Built', file=sys.stderr)
    return fname
def read(self, data_path: str, queries_per_intent: Optional[int] = None, test_validate_split: float = 0.5,
         *args, **kwargs) -> Dict[str, List[Dict[str, Any]]]:
    """
    Each query in the output has the following form:

        {
            'intent': intent_name,
            'data': [{'text': text, ('entity': slot_name)?}]
        }

    Args:
        data_path: A path to a folder with dataset files.
        queries_per_intent: Number of queries to load for each intent. None to load all.
            If the requested number is greater than available in file, all queries are returned.
        test_validate_split: Proportion of `_validate` files to be used as test dataset (since Snips
            is split into training and validation sets without a separate test set).
    """
    data_path = Path(data_path)
    intents = [
        'AddToPlaylist',
        'BookRestaurant',
        'GetWeather',
        'PlayMusic',
        'RateBook',
        'SearchCreativeWork',
        'SearchScreeningEvent'
    ]

    if not is_done(data_path):
        url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
        log.info('[downloading data from {} to {}]'.format(url, data_path))
        download_decompress(url, data_path)
        mark_done(data_path)

    use_full_file = queries_per_intent is None or queries_per_intent > 70
    training_data = []
    validation_data = []
    test_data = []

    for intent in intents:
        intent_path = data_path / intent
        train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
        validate_file_name = f"validate_{intent}.json"

        train_queries = self._load_file(intent_path / train_file_name, intent, queries_per_intent)
        validate_queries = self._load_file(intent_path / validate_file_name, intent, queries_per_intent)
        num_test_queries = round(len(validate_queries) * test_validate_split)

        training_data.extend(train_queries)
        validation_data.extend(validate_queries[num_test_queries:])
        test_data.extend(validate_queries[:num_test_queries])

    return {'train': training_data, 'valid': validation_data, 'test': test_data}
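# Usage sketch (not from the source): `SnipsReader` is an assumed name for the
# class that defines the read() method above.
#
#     reader = SnipsReader()
#     dataset = reader.read('snips_data', queries_per_intent=10, test_validate_split=0.5)
#     # dataset['train'], dataset['valid'] and dataset['test'] are lists of
#     # {'intent': ..., 'data': [...]} dicts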
def _download_data(self, data_path):
    """Download the archive with the InsuranceQA dataset files and decompress it
    if there are no dataset files in `data_path`.

    Args:
        data_path: A path to a folder where dataset files are stored.
    """
    if not is_done(Path(data_path)):
        download_decompress(
            url="http://files.deeppavlov.ai/datasets/insuranceQA-master.zip",
            download_path=data_path)
        mark_done(data_path)
def build(data_path: str):
    data_path = Path(data_path) / 'kartaslov'

    fname = data_path / 'orfo_and_typos.L1_5.csv'

    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

        download(fname, url)

        mark_done(data_path)

        print('Built', file=sys.stderr)
    return fname
def build(data_path: str):
    data_path = Path(data_path) / 'kartaslov'

    fname = data_path / 'orfo_and_typos.L1_5.csv'

    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

        download(fname, url)

        mark_done(data_path)

        log.info('Built')
    return fname
def __init__(self, data_dir=None, *args, **kwargs):
    if data_dir is None:
        data_dir = paths.USR_PATH
    data_dir = Path(data_dir)
    if self.dict_name is None:
        self.dict_name = args[0] if args else kwargs.get('dictionary_name', 'dictionary')

    data_dir = data_dir / self.dict_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        print('Trying to build a dictionary in {}'.format(data_dir), file=sys.stderr)
        if data_dir.is_dir():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')
        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i + 1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}
        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        print('built', file=sys.stderr)
    else:
        print('Loading a dictionary from {}'.format(data_dir), file=sys.stderr)

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
def __init__(self, data_dir=None, *args, **kwargs):
    if data_dir is None:
        data_dir = paths.USR_PATH
    data_dir = Path(data_dir)
    if self.dict_name is None:
        self.dict_name = args[0] if args else kwargs.get('dictionary_name', 'dictionary')

    data_dir = data_dir / self.dict_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        print('Trying to build a dictionary in {}'.format(data_dir), file=sys.stderr)
        if data_dir.is_dir():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')
        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i+1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}
        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        print('built', file=sys.stderr)
    else:
        print('Loading a dictionary from {}'.format(data_dir), file=sys.stderr)

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
    data_dir = expand_path(data_dir) / dictionary_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        # Build the dictionary from scratch: collect and normalize the source words,
        # derive the alphabet and a prefix trie, and pickle everything to data_dir.
        log.info('Trying to build a dictionary in {}'.format(data_dir))
        if data_dir.is_dir():
            shutil.rmtree(str(data_dir))
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')
        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        # Map every prefix of every word to the set of its one-character extensions.
        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i + 1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}
        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        log.info('built')
    else:
        log.info('Loading a dictionary from {}'.format(data_dir))

    # Load the (possibly just built) pickled artifacts.
    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
def __init__(self, data_dir: [Path, str]='', *args, dictionary_name: str='dictionary', **kwargs):
    data_dir = expand_path(data_dir) / dictionary_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        log.info('Trying to build a dictionary in {}'.format(data_dir))
        if data_dir.is_dir():
            shutil.rmtree(str(data_dir))
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')
        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i+1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}
        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        log.info('built')
    else:
        log.info('Loading a dictionary from {}'.format(data_dir))

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
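# Usage sketch (not from the source): `StaticDictionary` is an assumed name for the
# class whose __init__ is shown above, and the path and dictionary_name values are
# arbitrary examples; a subclass is expected to provide _get_source() (an iterable
# of words) and _normalize().
#
#     dictionary = StaticDictionary('~/.deeppavlov', dictionary_name='russian_words')
#     # dictionary.alphabet, dictionary.words_set and dictionary.words_trie are now loaded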
def build(data_path: str) -> Path:
    """Download misspellings list from `github
    <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

    Args:
        data_path: target directory to download the data to

    Returns:
        path to the resulting csv-file
    """
    data_path = Path(data_path) / 'kartaslov'

    fname = data_path / 'orfo_and_typos.L1_5.csv'

    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

        download(fname, url)

        mark_done(data_path)

        log.info('Built')
    return fname
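# Usage sketch (not from the source): the target directory is arbitrary; the CSV is
# downloaded only once, and the path to the cached file is returned on later calls.
#
#     csv_path = build('downloads')
#     # -> downloads/kartaslov/orfo_and_typos.L1_5.csv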
def read(self, data_path: str, catalog: list, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
    """Load data from specific catalog

    Parameters:
        data_path: where the dataset is located
        catalog: names of the specific subcategories

    Returns:
        dataset: loaded dataset
    """
    logger.info(f"Ecommerce loader is loaded with catalog {catalog}")

    if not isinstance(catalog, list):
        catalog = [catalog]

    ec_data_global: List[Any] = []
    data_path = Path(expand_path(data_path))

    if not is_done(data_path):
        self._download_data(data_path)

    if data_path.is_dir():
        for fname in data_path.rglob("*.txt"):
            if any(cat in fname.name for cat in catalog):
                logger.info(f"File {fname.name} is loaded")
                ec_data_global += self._load_amazon_ecommerce_file(fname)

    dataset = {
        'train': [((item['Title'], [], {}), item) for item in ec_data_global],
        'valid': [],
        'test': []
    }

    logger.info(f"In total {len(ec_data_global)} items are loaded")
    return dataset
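# Usage sketch (not from the source): `EcommerceReader` is an assumed name for the
# class that defines read() above, and the catalog value is a made-up example; each
# catalog entry is matched against the *.txt file names under data_path.
#
#     reader = EcommerceReader()
#     data = reader.read('amazon_ecommerce', catalog=['Electronics'])
#     # data['train'] holds ((title, [], {}), item) tuples; 'valid' and 'test' are empty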
def read(self, data_path: str, url: Optional[str] = None,
         *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
    """
    Args:
        data_path: A path to a folder with dataset files.
        url: A url to the archive with the dataset to download if the data folder is empty.
    """
    data_path = Path(data_path)

    if url is None:
        url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    if not is_done(data_path):
        log.info('[downloading data from {} to {}]'.format(url, data_path))
        download_decompress(url, data_path)
        mark_done(data_path)

    alternative_data_path = data_path / "aclImdb"
    if alternative_data_path.exists():
        data_path = alternative_data_path

    data = {"train": [], "test": []}
    for data_type in data.keys():
        for label in ["neg", "pos"]:
            labelpath = data_path / data_type / label
            if not labelpath.exists():
                raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
            for filename in labelpath.glob("*.txt"):
                with filename.open(encoding='utf-8') as f:
                    text = f.read()
                data[data_type].append((text, [label]))

        if not data[data_type]:
            raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                               "probably data dirs are empty")

    return data
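# Usage sketch (not from the source): `ImdbReader` is an assumed name for the class
# that defines read() above.
#
#     reader = ImdbReader()
#     data = reader.read('imdb_data')
#     # data['train'] and data['test'] are lists of (review_text, [label]) pairs,
#     # with label in {'neg', 'pos'}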