def setUp(self):
     """
     Prepare the fixture: a fresh temp directory beside this test module,
     with WIKITEXT written to 'wikitext.xml.bz2' and a WikiReader on that file.
     """
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
     dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
     # WIKITEXT is written in binary mode; the reader is then pointed at it
     write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)
     self.wikireader = WikiReader(dump_path)
예제 #2
0
 def test_read_write_file_bytes(self):
     """
     Write the byte form of ``self.text`` in 'wb' mode and read it back in
     'rb' mode for each listed extension, asserting the round trip is equal.
     """
     payload = unicode_to_bytes(self.text)
     stem = os.path.join(self.tempdir, 'test_read_write_file_bytes')
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = stem + suffix
         fileio.write_file(payload, path, mode='wb', auto_make_dirs=True)
         self.assertEqual(fileio.read_file(path, mode='rb'), payload)
예제 #3
0
 def test_read_write_file_unicode(self):
     """
     Round-trip ``self.text`` through 'wt'/'rt' modes for each listed
     extension. When ``PY2 is True`` and the extension is not '.txt', the
     test instead expects ``fileio.open_sesame`` to raise ValueError.
     """
     text = self.text
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(
             self.tempdir, 'test_read_write_file_unicode' + suffix)
         if PY2 is not True or suffix == '.txt':
             fileio.write_file(text, path, mode='wt', auto_make_dirs=True)
             self.assertEqual(fileio.read_file(path, mode='rt'), text)
             continue
         # Remaining case: PY2 with a non-'.txt' extension must be rejected
         with self.assertRaises(ValueError):
             fileio.open_sesame(path, 'wt', 'utf-8', True)
예제 #4
0
 def test_read_write_file_unicode(self):
     """
     Check text-mode write/read round trips of ``self.text`` per extension;
     under ``PY2 is True`` every extension other than '.txt' is expected to
     make ``fileio.open_sesame`` raise ValueError instead.
     """
     expected_text = self.text
     base = 'test_read_write_file_unicode'
     for ext in ('.txt', '.gz', '.bz2', '.xz'):
         target = os.path.join(self.tempdir, base + ext)
         py2_non_txt = PY2 is True and ext != '.txt'
         if not py2_non_txt:
             fileio.write_file(
                 expected_text, target, mode='wt', auto_make_dirs=True)
             roundtrip = fileio.read_file(target, mode='rt')
             self.assertEqual(roundtrip, expected_text)
         else:
             with self.assertRaises(ValueError):
                 fileio.open_sesame(target, 'wt', 'utf-8', True)
예제 #5
0
def _download_bernie_and_hillary(data_dir):
    """
    Download the Bernie & Hillary corpus from S3, save to disk as JSON lines.

    The response body is read from ``URL``, decoded as UTF-8, and written to
    ``FNAME`` inside ``data_dir``.

    Args:
        data_dir (str): path on disk where corpus will be saved

    Raises:
        HTTPError: if something goes wrong with the download
    """
    # Function-scope import so this block brings its own dependency;
    # ``closing`` only requires that the response object has ``close()``.
    from contextlib import closing

    try:
        # Close the HTTP response deterministically rather than leaving the
        # connection open until the object happens to be garbage collected.
        with closing(urlopen(URL)) as response:
            data = response.read()
    except HTTPError as e:
        logger.exception(
            'unable to download corpus from %s; status code %s', URL, e.code)
        raise
    logger.info('corpus downloaded from %s (10 MB)', URL)
    data = data.decode('utf8')
    fname = os.path.join(data_dir, FNAME)
    # NOTE(review): encoding=None presumably defers to write_file's default
    # text encoding -- confirm it can represent the UTF-8-decoded payload.
    write_file(data, fname, mode='wt', encoding=None)
예제 #6
0
def _download_bernie_and_hillary(data_dir):
    """
    Download the Bernie & Hillary corpus from S3, save to disk as JSON lines.

    Fetches ``URL``, decodes the bytes as UTF-8, and writes the text to
    ``FNAME`` under ``data_dir``.

    Args:
        data_dir (str): path on disk where corpus will be saved

    Raises:
        HTTPError: if something goes wrong with the download
    """
    # Imported locally so the fix does not depend on the file's import block.
    from contextlib import closing

    try:
        # Wrap the response so its connection is released as soon as the
        # body has been read, even if ``read()`` itself raises.
        with closing(urlopen(URL)) as response:
            data = response.read()
    except HTTPError as e:
        logger.exception('unable to download corpus from %s; status code %s',
                         URL, e.code)
        raise
    logger.info('corpus downloaded from %s (10 MB)', URL)
    data = data.decode('utf8')
    fname = os.path.join(data_dir, FNAME)
    # NOTE(review): encoding=None leaves the output encoding to write_file;
    # verify that default handles non-ASCII text on all target platforms.
    write_file(data, fname, mode='wt', encoding=None)
 def setUp(self):
     """
     Build the per-test fixture: make a temp directory in this module's
     directory, write WIKITEXT into it, and open a WikiReader on that file.
     """
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(dir=module_dir, prefix='test_corpora')
     wiki_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
     write_file(WIKITEXT, wiki_path, mode='wb', auto_make_dirs=True)
     self.wikireader = WikiReader(wiki_path)