def test_compress_error(self, paths):
    feature_reader = FeatureReader(paths, compressed=False)
    with pytest.raises(ValueError):
        next(feature_reader.volumes())

    paths = [path.replace('.bz2', '') for path in paths]
    feature_reader = FeatureReader(paths, compressed=True)
    with pytest.raises(IOError):
        next(feature_reader.volumes())
def create_corpus(ids, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2', '')
                    for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    fr = FeatureReader(paths)
    corpus = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        vols = [executor.submit(process_pages, vol)
                for id_n, vol in enumerate(fr.volumes())]

        if verbose:
            for f in concurrent.futures.as_completed(vols):
                n += 1
                pbar.update(n)

        corpus = map(concurrent.futures.Future.result, vols)
        pbar.finish()

    corpus = list(corpus)

    c = corpus_fromlist(corpus, context_type='book')
    c = apply_stoplist(c, nltk_stop=True, freq=5)
    c.context_data[0]['book_label'] = filtered_ids

    return c
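# Example usage (sketch): building a corpus from a short list of HathiTrust
# volume ids. The ids below are placeholders, and download_vols()/process_pages()
# are assumed to be defined alongside create_corpus as in the function above.
if __name__ == '__main__':
    sample_ids = ['<htid-1>', '<htid-2>']
    c = create_corpus(sample_ids, verbose=1)
    print(c.context_data[0]['book_label'])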
def test_id_list_remote_load(self, ids, titles):
    feature_reader = FeatureReader(ids=ids)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume

    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == titles[i]
def test_list_load(self, paths):
    feature_reader = FeatureReader(paths)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume

    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == self.TITLES[i]
def test_parquet_reading(self, ids, titles):
    dirpath = os.path.join('tests', 'data', 'partialparq')
    feature_reader = FeatureReader(ids=ids, format='parquet', dir=dirpath)

    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume

    for i, vol in enumerate(feature_reader):
        assert type(vol) == htrc_features.feature_reader.Volume
        assert vol.title == titles[i]
def test_internal_tokencount_representation(self, paths):
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    vol = next(feature_reader.volumes())

    assert vol._tokencounts.empty
    vol.tokenlist()
    assert vol._tokencounts.index.names == ['page', 'section', 'token', 'pos']
    vol.tokenlist(case=False)
    assert vol._tokencounts.index.names == ['page', 'section', 'token', 'pos']
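# Illustration (sketch): the index names asserted in the test above are also
# what the public tokenlist() call exposes with default arguments. The path
# below is a placeholder for any Extracted Features file.
from htrc_features import FeatureReader

fr = FeatureReader(['<volume>.json.bz2'])
vol = next(fr.volumes())
tl = vol.tokenlist()           # counts per (page, section, token, pos)
print(tl.index.names)          # ['page', 'section', 'token', 'pos']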
def test_caching(self, paths):
    import time
    # Load new volume specifically for this test
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    vol = next(feature_reader.volumes())

    # Systems are different, the rough test here simply checks whether
    # the first run is much slower than later runs.
    tokenlist_times = []
    for i in range(0, 6):
        start = time.time()
        vol.tokenlist()
        passed = time.time() - start
        tokenlist_times.append(passed)
    assert 2 * tokenlist_times[0] > sum(tokenlist_times[1:])
def make_hashes(vocab, ids=None, paths=None, **kwargs):
    if ids and paths:
        raise ValueError("Can't include both ids and paths")
    elif ids:
        fr = FeatureReader(ids=ids)
    elif paths:
        fr = FeatureReader(paths=paths)
    else:
        raise ValueError("Need either a list of ids or paths")

    i = 0
    for vol in fr.volumes():
        tokens = set(vol.tokens()).intersection(vocab)
        i += 1
        if i % 100 == 0:
            print(os.getpid(), i, 'files processed')
        yield make_hash(vol.id, tokens, **kwargs)
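# Example usage (sketch): make_hashes is a generator, so hashes are produced
# lazily as volumes stream through FeatureReader. `make_hash()` is assumed to
# be defined elsewhere in this module; the ids below are placeholders.
vocab = {'whale', 'ship', 'sea'}
hashes = list(make_hashes(vocab, ids=['<htid-1>', '<htid-2>']))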
def create_corpus(ids, nltk_stop=False, freq=0, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2', '')
                    for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    if sys.version_info[0] == 2:
        TD = backports.tempfile.TemporaryDirectory
    else:
        TD = tempfile.TemporaryDirectory

    with TD(prefix='vsm-') as pickle_dir:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            fr = FeatureReader(paths)

            corpus = []
            with concurrent.futures.ProcessPoolExecutor() as executor:
                vols = [executor.submit(process_pages, vol, pickle_dir)
                        for id_n, vol in enumerate(fr.volumes())]

                if verbose:
                    for _ in concurrent.futures.as_completed(vols):
                        n += 1
                        pbar.update(n)
                    pbar.finish()

                corpus_files = [vol.result() for vol in vols]

            corpus = [PickledWords(filename) for filename in corpus_files]

        c = corpus_fromlist(corpus, context_type='book')
        c = apply_stoplist(c, nltk_stop=nltk_stop, freq=freq)
        c.context_data[0]['book_label'] = filtered_ids

        return c
import tensorflow as tf
import pandas as pd
from htrc_features import FeatureReader, utils
import itertools
import glob

from ef_utils import *

ef_root = "data/ef-files/comedy/"
ef_file_paths = glob.glob(ef_root + "/*.bz2")
ef_files = FeatureReader(paths=list(ef_file_paths))

token_ref = load_tokenref('eng-vocab-1.txt.bz2', trim_head=0)

volumes = ef_files.volumes()
i = 0
writer = tf.python_io.TFRecordWriter(
    'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))

for vol in volumes:
    i += 1
    if i % 100 == 0:
        # Start a new shard every 100 volumes
        writer.close()
        writer = tf.python_io.TFRecordWriter(
            'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))
    print(vol.id)

    # Keep only pages tagged as entirely English
    pages_en = [p for p in vol.pages() if {'en': '1.00'} in p.languages]
    for page in pages_en:
        page_body_tokens = page.tokenlist(section='body', case=False)
def volume(paths):
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    return next(feature_reader.volumes())
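# Sketch: a helper like the one above is typically wired up as a pytest
# fixture so a test can ask for a loaded Volume by argument name. The
# decorator, fixture name, and test below are assumptions for illustration.
import pytest
from htrc_features import FeatureReader


@pytest.fixture(scope="module")
def sample_volume(paths):
    feature_reader = FeatureReader(paths[0], compression=None)
    return next(feature_reader.volumes())


def test_sample_volume_has_id(sample_volume):
    assert sample_volume.id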
def test_iteration(self, paths):
    feature_reader = FeatureReader(paths)
    for vol in feature_reader:
        assert type(vol) == htrc_features.feature_reader.Volume
    for vol in feature_reader.volumes():
        assert type(vol) == htrc_features.feature_reader.Volume
def test_id_remote_load(self, ids):
    id = ids[0]
    feature_reader = FeatureReader(ids=id)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
def test_single_path_load(self, paths):
    path = paths[0]
    feature_reader = FeatureReader(path)
    vol = next(feature_reader.volumes())
    assert type(vol) == htrc_features.feature_reader.Volume
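# Sketch of a conftest.py that could supply the `paths`, `ids`, and `titles`
# fixtures the tests above request. File names, ids, and titles here are
# placeholders, not the actual test data shipped with the library.
import os
import pytest


@pytest.fixture(scope="module")
def paths():
    return [os.path.join('tests', 'data', name)
            for name in ('<vol-1>.json.bz2', '<vol-2>.json.bz2')]


@pytest.fixture(scope="module")
def ids():
    return ['<htid-1>', '<htid-2>']


@pytest.fixture(scope="module")
def titles():
    return ['<title-1>', '<title-2>']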
def get_token_counts(vol, hat, tail):
    df_tl = vol.tokenlist().reset_index()  # convert to dataframe
    df_tl = df_tl[df_tl['section'] == 'body']  # get rid of header and footer; keep only body
    page_count = df_tl['page'].tolist()[-1]  # get total page number
    page_hat = round(page_count * hat)  # find the 15% page
    page_tail = page_count - round(page_count * tail)  # find the "counter-5%" page
    df_tl = df_tl[df_tl['page'].between(page_hat, page_tail, inclusive=False)]  # locate the pages in between
    series_tl = df_tl.groupby(["token"]).size()  # group the tokens across pages
    new_df_tl = series_tl.to_frame().reset_index()  # convert to df
    return new_df_tl


docfreqs = Counter()
termfreqs = dict()
ctr = 0

fr = FeatureReader(paths)
for vol in fr.volumes():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)

    output = get_token_counts(vol, 0.15, 0.05)
    docid = str(vol.id)

    thesewords = Counter()
    for row in output.itertuples(index=False):
        if pd.isnull(row[0]):
            continue
        word = row[0].lower().strip('.",')  # we're lowercasing everything and also stripping punctuation