Example #1
    def test_compress_error(self, paths):
        feature_reader = FeatureReader(paths, compressed=False)
        with pytest.raises(ValueError):
            next(feature_reader.volumes())

        paths = [path.replace('.bz2', '') for path in paths]
        feature_reader = FeatureReader(paths, compressed=True)
        with pytest.raises(IOError):
            next(feature_reader.volumes())
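
For contrast, a minimal happy-path sketch: the compression flag has to match the files on disk. The path below is a placeholder, and the keyword is the older compressed= form used in this test (later examples below spell it compression=).

from htrc_features import FeatureReader

path = 'data/sample.json.bz2'  # placeholder path to a bz2-compressed Extracted Features file
fr = FeatureReader([path], compressed=True)  # flag matches the .bz2 extension
vol = next(fr.volumes())
print(vol.title)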
Example #2
    def test_compress_error(self, paths):
        feature_reader = FeatureReader(paths, compressed=False)
        with pytest.raises(ValueError):
            next(feature_reader.volumes())

        paths = [path.replace('.bz2', '') for path in paths]
        feature_reader = FeatureReader(paths, compressed=True)
        with pytest.raises(IOError):
            next(feature_reader.volumes())
Example #3
def create_corpus(ids, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2','') for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    fr = FeatureReader(paths)
    corpus = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        vols = [
            executor.submit(process_pages, vol)
            for id_n, vol in enumerate(fr.volumes())
        ]

        if verbose:
            for f in concurrent.futures.as_completed(vols):
                n += 1
                pbar.update(n)
            pbar.finish()

        corpus = map(concurrent.futures.Future.result, vols)
    corpus = list(corpus)
    
    c = corpus_fromlist(corpus, context_type='book')
    c = apply_stoplist(c, nltk_stop=True, freq=5)
    c.context_data[0]['book_label'] = filtered_ids

    return c
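
A hedged usage sketch for create_corpus; the volume IDs are placeholders, and download_vols, process_pages, corpus_fromlist, and apply_stoplist are assumed to come from the same project as the function above.

ids = ['mdp.39015000000001', 'uc1.b000000000']  # placeholder HathiTrust volume IDs

c = create_corpus(ids, verbose=1)
print(c.context_data[0]['book_label'])  # one label per downloaded volume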
Example #4
    def test_id_list_remote_load(self, ids, titles):
        feature_reader = FeatureReader(ids=ids)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == titles[i]
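
Passing ids= makes FeatureReader fetch the Extracted Features files remotely. If you prefer to download them yourself first, htrc_features.utils can map an ID to the rsync path of its EF file; a sketch with a placeholder ID (the exact pairtree layout depends on the library version):

from htrc_features import utils

htid = 'mdp.39015000000001'  # placeholder HathiTrust ID
print(utils.id_to_rsync(htid))  # relative rsync path of the matching .json.bz2 file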
Example #5
    def test_list_load(self, paths):
        feature_reader = FeatureReader(paths)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == self.TITLES[i]
Example #6
    def test_list_load(self, paths):
        feature_reader = FeatureReader(paths)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == self.TITLES[i]
Example #7
    def test_parquet_reading(self, ids, titles):
        dirpath = os.path.join('tests', 'data', 'partialparq')
        feature_reader = FeatureReader(ids=ids, format='parquet', dir=dirpath)

        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume

        for i, vol in enumerate(feature_reader):
            assert type(vol) == htrc_features.feature_reader.Volume
            assert vol.title == titles[i]
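
A hedged sketch of the same parquet-backed reading outside a test, with a placeholder ID and directory; the parquet files must already exist under dir in the layout the library expects:

from htrc_features import FeatureReader

ids = ['uc1.b000000000']  # placeholder volume ID
fr = FeatureReader(ids=ids, format='parquet', dir='data/parquet')  # placeholder directory
for vol in fr.volumes():
    print(vol.title)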
Example #8
    def test_internal_tokencount_representation(self, paths):
        paths = paths[0]
        feature_reader = FeatureReader(paths, compression=None)
        vol = next(feature_reader.volumes())

        assert vol._tokencounts.empty
        vol.tokenlist()
        assert vol._tokencounts.index.names == [
            'page', 'section', 'token', 'pos'
        ]
        vol.tokenlist(case=False)
        assert vol._tokencounts.index.names == [
            'page', 'section', 'token', 'pos'
        ]
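
The test above inspects the private _tokencounts cache; everyday code goes through Volume.tokenlist(), whose keyword arguments (case, pos, section in the releases these tests target) control the shape of the returned DataFrame. A hedged sketch with a placeholder path:

from htrc_features import FeatureReader

vol = next(FeatureReader('data/sample.json', compression=None).volumes())  # placeholder path

tl = vol.tokenlist()                     # counts indexed by page / section / token / pos
tl_body = vol.tokenlist(section='body')  # body section only
tl_folded = vol.tokenlist(case=False)    # case-folded counts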
Example #9
    def test_caching(self, paths):
        import time
        # Load a new volume specifically for this test
        paths = paths[0]
        feature_reader = FeatureReader(paths, compression=None)
        vol = next(feature_reader.volumes())
        # Systems differ, so this rough test simply checks whether
        # the first run is much slower than later runs.
        tokenlist_times = []
        for i in range(0, 6):
            start = time.time()
            vol.tokenlist()
            passed = time.time() - start
            tokenlist_times.append(passed)
        assert 2 * tokenlist_times[0] > sum(tokenlist_times[1:])
Example #10
def make_hashes(vocab, ids=None, paths=None, **kwargs):
    if ids and paths:
        raise ValueError("Can't include both ids and paths")
    elif ids:
        fr = FeatureReader(ids=ids)
    elif paths:
        fr = FeatureReader(paths=paths)
    else:
        raise ValueError("Need either a list of ids or paths")

    i = 0
    for vol in fr.volumes():
        tokens = set(vol.tokens()).intersection(vocab)
        i += 1
        if i % 100 == 0:
            print(os.getpid(), i, 'files processed')
        yield make_hash(vol.id, tokens, **kwargs)
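
A usage sketch for the generator above; vocab and paths are placeholders, and make_hash is assumed to be defined alongside make_hashes in the same module.

vocab = {'whale', 'ship', 'sea'}  # placeholder vocabulary
paths = ['data/vol1.json.bz2', 'data/vol2.json.bz2']  # placeholder EF file paths

hashes = list(make_hashes(vocab, paths=paths))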
Example #11
def create_corpus(ids, nltk_stop=False, freq=0, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [
        os.path.basename(p).replace('.json.bz2', '') for p in paths
    ]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    if sys.version_info[0] == 2:
        TD = backports.tempfile.TemporaryDirectory
    else:
        TD = tempfile.TemporaryDirectory
    with TD(prefix='vsm-') as pickle_dir:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            fr = FeatureReader(paths)
            corpus = []
            with concurrent.futures.ProcessPoolExecutor() as executor:
                vols = [
                    executor.submit(process_pages, vol, pickle_dir)
                    for id_n, vol in enumerate(fr.volumes())
                ]

                if verbose:
                    for _ in concurrent.futures.as_completed(vols):
                        n += 1
                        pbar.update(n)
                    pbar.finish()

                corpus_files = [vol.result() for vol in vols]

            corpus = [PickledWords(filename) for filename in corpus_files]

        c = corpus_fromlist(corpus, context_type='book')
        c = apply_stoplist(c, nltk_stop=nltk_stop, freq=freq)
        c.context_data[0]['book_label'] = filtered_ids

    return c
Example #12
def create_corpus(ids, nltk_stop=False, freq=0, verbose=1):
    paths = download_vols(ids)
    filtered_ids = [os.path.basename(p).replace('.json.bz2','') for p in paths]

    if verbose:
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(ids))
        pbar = pbar.start()
        n = 0

    if sys.version_info[0] == 2:
        TD = backports.tempfile.TemporaryDirectory 
    else:
        TD = tempfile.TemporaryDirectory
    with TD(prefix='vsm-') as pickle_dir:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            fr = FeatureReader(paths)
            corpus = []
            with concurrent.futures.ProcessPoolExecutor() as executor:
                vols = [
                    executor.submit(process_pages, vol, pickle_dir)
                    for id_n, vol in enumerate(fr.volumes())
                ]

                if verbose:
                    for _ in concurrent.futures.as_completed(vols):
                        n += 1
                        pbar.update(n)
                    pbar.finish()

                corpus_files = [vol.result() for vol in vols]

            corpus = [PickledWords(filename) for filename in corpus_files]
    
        c = corpus_fromlist(corpus, context_type='book')
        c = apply_stoplist(c, nltk_stop=nltk_stop, freq=freq)
        c.context_data[0]['book_label'] = filtered_ids

    return c
Example #13
import tensorflow as tf
import pandas as pd
from htrc_features import FeatureReader, utils
import itertools
import glob
from ef_utils import *

ef_root = "data/ef-files/comedy/"
ef_file_paths = glob.glob(ef_root + "/*.bz2")
ef_files = FeatureReader(paths=list(ef_file_paths))

token_ref = load_tokenref('eng-vocab-1.txt.bz2', trim_head=0)

volumes = ef_files.volumes()

i = 0
writer = tf.python_io.TFRecordWriter(
    'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))

for vol in volumes:
    i += 1
    if i % 100 == 0:
        writer.close()
        writer = tf.python_io.TFRecordWriter(
            'data/literature/tfrecords/lit-%d.tfrecord' % int(i / 100))

    print(vol.id)
    pages_en = [p for p in vol.pages() if {'en': '1.00'} in p.languages]
    for page in pages_en:
        page_body_tokens = page.tokenlist(section='body',
                                          case=False,
Example #14
def volume(paths):
    paths = paths[0]
    feature_reader = FeatureReader(paths, compression=None)
    return next(feature_reader.volumes())
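
This helper reads like a pytest fixture (the tests in the other examples take paths the same way), so a plausible wiring, written as an assumption rather than the project's actual conftest, would be:

import pytest
import htrc_features
from htrc_features import FeatureReader

@pytest.fixture(scope='module')
def volume(paths):
    feature_reader = FeatureReader(paths[0], compression=None)
    return next(feature_reader.volumes())

def test_volume_type(volume):  # hypothetical test consuming the fixture
    assert type(volume) == htrc_features.feature_reader.Volume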
Example #15
    def test_iteration(self, paths):
        feature_reader = FeatureReader(paths)
        for vol in feature_reader:
            assert type(vol) == htrc_features.feature_reader.Volume
        for vol in feature_reader.volumes():
            assert type(vol) == htrc_features.feature_reader.Volume
Example #16
    def test_iteration(self, paths):
        feature_reader = FeatureReader(paths)
        for vol in feature_reader:
            assert type(vol) == htrc_features.feature_reader.Volume
        for vol in feature_reader.volumes():
            assert type(vol) == htrc_features.feature_reader.Volume
Example #17
    def test_id_remote_load(self, ids):
        id = ids[0]
        feature_reader = FeatureReader(ids=id)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume
Example #18
    def test_single_path_load(self, paths):
        path = paths[0]
        feature_reader = FeatureReader(path)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume
Example #19
    def test_single_path_load(self, paths):
        path = paths[0]
        feature_reader = FeatureReader(path)
        vol = next(feature_reader.volumes())
        assert type(vol) == htrc_features.feature_reader.Volume
Example #20
def get_token_counts(vol, hat, tail):
    df_tl = vol.tokenlist().reset_index()  # convert to a dataframe
    df_tl = df_tl[df_tl['section'] == 'body']  # keep only the body; drop header and footer
    page_count = df_tl['page'].tolist()[-1]  # total number of pages
    page_hat = round(page_count * hat)  # page index `hat` of the way through the volume
    page_tail = page_count - round(page_count * tail)  # page index `tail` of the way from the end
    df_tl = df_tl[df_tl['page'].between(page_hat, page_tail, inclusive=False)]  # keep only the pages strictly between them
    series_tl = df_tl.groupby(["token"]).size()  # sum token counts across the kept pages
    new_df_tl = series_tl.to_frame().reset_index()  # convert back to a dataframe
    return new_df_tl

docfreqs = Counter()
termfreqs = dict()
ctr = 0

fr = FeatureReader(paths)
for vol in fr.volumes():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)

    output = get_token_counts(vol, 0.15, 0.05)
    docid = str(vol.id)

    thesewords = Counter()

    for row in output.itertuples(index=False):
        if pd.isnull(row[0]):
            continue
        word = row[0].lower().strip('.",')

            # we're lowercasing everything and also