def setUp(self):
    """Create a scratch directory holding a bz2-compressed wiki dump and open a reader over it."""
    here = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
    dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)
    self.wikireader = WikiReader(dump_path)
def test_read_write_file_bytes(self):
    """Round-trip bytes content through each supported (compressed) file extension."""
    expected = unicode_to_bytes(self.text)
    for ext in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_bytes' + ext)
        fileio.write_file(expected, path, mode='wb', auto_make_dirs=True)
        # What we read back must be byte-identical to what we wrote
        self.assertEqual(fileio.read_file(path, mode='rb'), expected)
def test_read_write_file_unicode(self):
    """Round-trip unicode content through each extension; compressed text mode must raise on PY2."""
    expected = self.text
    for ext in ('.txt', '.gz', '.bz2', '.xz'):
        path = os.path.join(
            self.tempdir, 'test_read_write_file_unicode' + ext)
        if PY2 is True and ext != '.txt':
            # Python 2 cannot open compressed files in text mode
            self.assertRaises(
                ValueError, fileio.open_sesame, path, 'wt', 'utf-8', True)
            continue
        fileio.write_file(expected, path, mode='wt', auto_make_dirs=True)
        self.assertEqual(fileio.read_file(path, mode='rt'), expected)
def _download_bernie_and_hillary(data_dir):
    """
    Download the Bernie & Hillary corpus from S3, save to disk as JSON lines.

    Args:
        data_dir (str): path on disk where corpus will be saved

    Raises:
        HTTPError: if something goes wrong with the download
    """
    try:
        raw = urlopen(URL).read()
    except HTTPError as e:
        # log full traceback before re-raising so callers see the failure
        logger.exception(
            'unable to download corpus from %s; status code %s', URL, e.code)
        raise
    logger.info('corpus downloaded from %s (10 MB)', URL)
    text = raw.decode('utf8')
    write_file(text, os.path.join(data_dir, FNAME), mode='wt', encoding=None)
def _download_bernie_and_hillary(data_dir):
    """
    Download the Bernie & Hillary corpus from S3, save to disk as JSON lines.

    Args:
        data_dir (str): path on disk where corpus will be saved

    Raises:
        HTTPError: if something goes wrong with the download
    """
    # NOTE(review): this definition appears twice in the file (see earlier copy);
    # the second one wins at import time — confirm the duplication is intentional.
    try:
        payload = urlopen(URL).read()
    except HTTPError as e:
        logger.exception('unable to download corpus from %s; status code %s', URL, e.code)
        raise
    logger.info('corpus downloaded from %s (10 MB)', URL)
    fname = os.path.join(data_dir, FNAME)
    write_file(payload.decode('utf8'), fname, mode='wt', encoding=None)
def setUp(self):
    """Build a temp directory next to this test file, write the wiki fixture, and wrap it in a WikiReader."""
    self.tempdir = tempfile.mkdtemp(
        prefix='test_corpora', dir=os.path.dirname(os.path.abspath(__file__)))
    fixture_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, fixture_path, mode='wb', auto_make_dirs=True)
    self.wikireader = WikiReader(fixture_path)