def compile_text(idx, by_page=False):
    from htrc_features import Volume
    from urllib.error import HTTPError
    try:
        path_freqs = os.path.join(load_corpus('Hathi').path_freqs, idx + '.json')
        path_freqs_dir = os.path.dirname(path_freqs)
        if os.path.exists(path_freqs):
            return
        if not os.path.exists(path_freqs_dir):
            try:
                os.makedirs(path_freqs_dir)
            except FileExistsError:
                pass
        # print('compiling!')
        htid = idx.replace('/', '.', 1)
        # print('Getting: ', htid)
        vol = Volume(htid)
        vol_freqs = vol.term_volume_freqs(pos=False, case=True)
        vol_freqs_d = dict(zip(vol_freqs['token'], vol_freqs['count']))
        with open(path_freqs, 'w') as of:
            json.dump(vol_freqs_d, of)
    except (HTTPError, FileNotFoundError, KeyError) as e:
        # print('!!', e)
        pass
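# Hedged usage sketch for compile_text: the slash-delimited IDs below are
# illustrative (not from the source) and match the idx.replace('/', '.', 1)
# conversion above. Assumes load_corpus('Hathi') is configured with a
# path_freqs directory; volumes that are already compiled or that fail to
# download are skipped silently.
def compile_texts(ids):
    for idx in ids:
        compile_text(idx)

# compile_texts(['uc2/ark:/13960/t1xd0sc6x', 'aeu/ark:/13960/t1rf63t52'])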
def yielder(ids, thread_no, totalthreads, chunk_size=10000, already_imported_list=[]):
    """
    ids: a list of htids to iterate over.
    chunk_size: the target chunk size.

    returns: an iterable of tuples of (id, chunk number, start page, end page,
    grouped token counts).
    """
    locs = [id for (i, id) in enumerate(ids) if i % totalthreads == thread_no]
    locs = [loc for loc in locs if loc not in already_imported_list]

    for i, id in enumerate(locs):
        vol = Volume(id, id_resolver=customizable_resolver)
        try:
            if chunk_size == -1:
                # Artificially create a 'chunk', which is actually the full book.
                chunks = vol.tokenlist(pages=False, pos=False, case=False)
                old_idx = chunks.index.to_frame()
                old_idx.insert(0, 'chunk', 1)
                old_idx.insert(1, 'pstart', 1)
                old_idx.insert(2, 'pend', vol.page_count)
                chunks.index = pd.MultiIndex.from_frame(old_idx)
            else:
                chunks = vol.tokenlist(chunk=True, chunk_target=chunk_size,
                                       overflow='ends', case=False, pos=False,
                                       page_ref=True)
            if chunks.empty:
                continue
            for (chunk, start, end), group in chunks.reset_index().groupby(['chunk', 'pstart', 'pend']):
                yield (id, chunk, start, end, group)
        except Exception:
            print("Error chunking {}... skipping\n".format(id))
            continue
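# Hedged usage sketch for yielder: a single-threaded pass (thread_no=0,
# totalthreads=1) that prints the per-chunk token totals. The htids argument
# is supplied by the caller; nothing here is taken from the source beyond the
# signature above.
def example_yielder_usage(htids):
    for htid, chunk, start, end, group in yielder(htids, thread_no=0, totalthreads=1):
        print(htid, chunk, start, end, group['count'].sum())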
def test_full_parquet(self):
    dir = os.path.join('tests', 'data', 'fullparquet')
    vol = Volume(id='uc2.ark:/13960/t1xd0sc6x', format='parquet', dir=dir)

    assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'
    assert type(vol.tokenlist()) is pd.core.frame.DataFrame
    assert type(vol.begin_line_chars()) is pd.core.frame.DataFrame
    assert type(vol.section_features(section='all')) is pd.core.frame.DataFrame
def test_write_to_chunked_parquet(self, tmpdir):
    dir = "tests/data"
    vol_in = Volume(id='aeu.ark:/13960/t1rf63t52', dir=str(dir), id_resolver='local')
    output = Volume(id='foo.123', dir=tmpdir, format='parquet', mode='wb')
    output.write(vol_in, token_kwargs={"chunk": True})
    read = pd.read_parquet(Path(tmpdir, "foo.123.tokens.parquet")).reset_index()
    assert "chunk" in read.columns
def test_token_only_parquet(self):
    htid = 'uc2.ark:/13960/t1xd0sc6x'
    filepath = os.path.join('tests', 'data', 'justtokens')
    vol = Volume(id=htid, format='parquet', dir=filepath)

    # Should be inferred from path
    assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'

    # Only basic metadata is inferred from ID
    with pytest.raises(KeyError):
        vol.parser.meta['language']
    with pytest.raises(AttributeError):
        vol.language

    assert type(vol.tokenlist()) is pd.core.frame.DataFrame

    for method in ['section_features', 'begin_line_chars']:
        with pytest.raises(MissingDataError):
            getattr(vol, method)()
def test_bad_parser(self):
    '''Tests whether a format mismatch with the data raises an error.'''
    dir = os.path.join('tests', 'data', 'fullparquet')
    with pytest.raises(ValueError):
        # This tries to load the ID from
        vol = Volume(id='uc2.ark:/13960/t1xd0sc6x', format='json',
                     dir=dir, id_resolver="http")
def combo_test(self, format, resolver, compression):
    id = "aeu.ark:/13960/t1rf63t52"
    print(format, resolver, compression)
    basic_resolver = resolvers.LocalResolver(dir="tests/data", format="json", compression="bz2")
    with tempfile.TemporaryDirectory() as tempdir:
        testing_resolver_write = resolver(dir=tempdir, format=format, compression=compression)
        copy_between_resolvers(id, basic_resolver, testing_resolver_write)
        # Test read on a freshly made resolver just in case there's entanglement.
        testing_resolver_read = resolver(dir=tempdir, format=format, compression=compression)
        assert Volume(id, id_resolver=testing_resolver_read).tokenlist()['count'].sum() == 97691
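# Hedged sketch of how combo_test might be driven: a pytest parametrization
# over (format, resolver, compression). The matrix below is illustrative and
# only reuses combinations seen elsewhere in these tests; it is not the
# project's actual test configuration.
@pytest.mark.parametrize("format,resolver,compression", [
    ("json", resolvers.LocalResolver, "bz2"),
    ("parquet", resolvers.LocalResolver, "snappy"),
])
def test_resolver_combinations(self, format, resolver, compression):
    self.combo_test(format, resolver, compression)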
def test_meta_only_parquet(self):
    htid = 'uc2.ark:/13960/t1xd0sc6x'
    filepath = os.path.join('tests', 'data', 'justmeta')
    vol = Volume(htid, dir=filepath, format='parquet', id_resolver="local")

    assert vol.id == 'uc2.ark:/13960/t1xd0sc6x'
    assert vol.language == 'eng'

    for method in ['section_features', 'tokenlist', 'begin_line_chars']:
        with pytest.raises(MissingDataError):
            getattr(vol, method)()
def test_chunked_parq_tokenlist(self):
    htid = 'uc2.ark+=13960=t1xd0sc6x'
    dirpath = os.path.join('tests', 'data', 'chunkedparq')
    vol = Volume(id=htid, format='parquet', dir=dirpath)

    assert vol.tokenlist(case=False, pos=True).reset_index().columns.tolist() == [
        'chunk', 'section', 'lowercase', 'pos', 'count'
    ]
    assert vol.tokenlist(case=True, pos=False).reset_index().columns.tolist() == [
        'chunk', 'section', 'token', 'count'
    ]
    assert vol.tokenlist().reset_index().columns.tolist() == [
        'chunk', 'section', 'token', 'pos', 'count'
    ]
    assert vol.tokenlist(drop_section=True).reset_index().columns.tolist() == [
        'chunk', 'token', 'pos', 'count'
    ]
def test_local_to_pairtree_to_parquet(self):
    """An elaborate trip."""
    with tempfile.TemporaryDirectory() as first_new_dir:
        with tempfile.TemporaryDirectory() as second_new_dir:
            resolver1 = htrc_features.resolvers.LocalResolver(
                dir=Path(project_root, "tests", "data"), format="json", compression="bz2")
            resolver2 = htrc_features.resolvers.PairtreeResolver(
                dir=first_new_dir, format="json", compression="gz")
            resolver3 = htrc_features.resolvers.LocalResolver(
                dir=second_new_dir, format="parquet", compression="snappy")

            copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver1, resolver2)
            copy_between_resolvers("aeu.ark:/13960/t1rf63t52", resolver2, resolver3)

            all_files = []
            for loc, dir, files in os.walk(first_new_dir):
                for file in files:
                    all_files.append(os.path.join(loc, file))

            assert len(all_files) == 1
            assert all_files[0].endswith("aeu/pairtree_root/ar/k+/=1/39/60/=t/1r/f6/3t/52/ark+=13960=t1rf63t52/aeu.ark+=13960=t1rf63t52.json.gz")

            # This assertion ensures that the data has made it all the way through.
            assert Volume("aeu.ark:/13960/t1rf63t52", id_resolver=resolver3).tokenlist()['count'].sum() == 97691
def test_partial_parq_tokenlist(self):
    '''
    Test loading of tokenlists saved with less information. In this case,

    vol.save_parquet('tests/data/partialparq/',
                     token_kwargs=dict(case=False, pos=False, drop_section=False))
    '''
    htid = 'uc2.ark:/13960/t1xd0sc6x'
    dirpath = os.path.join('tests', 'data', 'partialparq')
    vol = Volume(id=htid, format='parquet', dir=dirpath)

    tl = vol.tokenlist(case=False, pos=False)
    assert tl.reset_index().columns.tolist() == ['page', 'lowercase', 'count']

    with pytest.raises(MissingFieldError):
        tl = vol.tokenlist(case=True, pos=False)
    with pytest.raises(MissingFieldError):
        tl = vol.tokenlist(case=False, pos=True)
    with pytest.raises(MissingFieldError):
        tl = vol.tokenlist(case=False, pos=False, section='header')
def main():
    parser = argparse.ArgumentParser(
        description='Convert EF files to Parquet compressed with Snappy')
    parser.add_argument('--efdir', type=str, default='/data/extracted-features/',
                        help='Location of the EF files')
    parser.add_argument('--outdir', type=str, default='/data/extracted-features-parquet/',
                        help='Output location for parquet files.')
    parser.add_argument('--parser', type=str, default='json',
                        help="Allows you to change the parser for the input files - e.g. if "
                             "you're opening EF files that are already parquet, with the "
                             "intent of chunking or lowercasing them.")
    parser.add_argument('--chunked', action='store_true',
                        help='Whether to chunk the internal tokenlist.')
    parser.add_argument('--page-ref', action='store_true',
                        help='Store page reference when chunking.')
    parser.add_argument('--chunk-size', type=int, default=5000,
                        help='Word target for chunks.')
    parser.add_argument('--lowercase', action='store_true', help='Lowercase tokens.')
    parser.add_argument('filepaths', type=str, nargs='+', help='files to convert')
    args = parser.parse_args()

    for efpath in args.filepaths:
        try:
            vol = Volume(os.path.join(args.efdir, efpath), parser=args.parser)
            path = args.outdir + utils.id_to_rsync(vol.id)
            path, filename = os.path.split(path)
            os.makedirs(path, exist_ok=True)

            token_kwargs = dict(section='body', drop_section=True,
                                pos=False, case=(not args.lowercase))
            if args.chunked:
                token_kwargs['chunk_target'] = args.chunk_size
                token_kwargs['page_ref'] = args.page_ref

            vol.save_parquet(path, chunked=args.chunked, token_kwargs=token_kwargs)
        except Exception:
            with open('errs.txt', mode='a') as f:
                f.write(efpath + "\n")
            print("Error", efpath)
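# Example invocation (hypothetical script name and input file name; --efdir
# and --outdir fall back to the defaults declared above):
#
#   python ef_to_parquet.py --chunked --page-ref --lowercase \
#       uc2.ark+=13960=t1xd0sc6x.json.bz2
#
# Conventional entry-point guard, assumed rather than taken from the source:
if __name__ == '__main__':
    main()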
def combine_books(ids, style='anthology'):
    '''Create a fake book from a set of HTIDs.'''
    # Use the front and back matter from one book, and munge the centers.
    fake_front = None
    fake_back = None
    centers = []

    # New metadata
    meta = dict(names=[], pub_date=[], source_htids=[], notes="", language="eng",
                schema_version="1.3", enumeration_chronology="")
    if style == "anthology":
        meta['title'] = "Fake Anthology: "
    elif style == "multivol":
        meta['title'] = "Fake Combined Vol: "

    for i, htid in enumerate(ids):
        vol = Volume(htid, dir='/data/extracted-features-parquet-stubby/',
                     format='parquet', id_resolver='stubbytree')
        tl = vol.tokenlist(case=False, pos=False)
        split_details, front, center, back = split_tokenlist(tl)

        if i == 0:
            # Keep the first volume's front and back matter for the fake book.
            fake_front = front
            fake_back = back
            meta['notes'] += "Beginning: {} of {}; ".format(page_range(front), vol.id)
            meta['notes'] += "Ending: {} of {}; ".format(page_range(back), vol.id)

        centers.append(center)
        meta['names'] += vol.author
        meta['title'] += '{}) {}...'.format(i, vol.title[:40])
        meta['source_htids'].append(vol.id)
        meta['notes'] += "{} of {};".format(page_range(center), vol.id)

        if style == "multivol":
            if i == 0:
                meta['enumeration_chronology'] = vol.enumeration_chronology
            else:
                meta['enumeration_chronology'] = (
                    meta['enumeration_chronology'].split('-')[0] + '-' +
                    vol.enumeration_chronology.lower().replace('v.', ''))

        try:
            year = int(vol.year)
            meta['pub_date'].append(year)
        except ValueError:
            pass

    meta['names'] = list(set(meta['names']))
    meta['pub_date'] = np.mean(meta['pub_date'])

    # Combine the saved first-volume front and back matter with every volume's center.
    new_tl = combine_tokenlist([fake_front] + centers + [fake_back])
    meta['page_count'] = int(new_tl.index.get_level_values('page').max())

    m = hashlib.md5()
    m.update(",".join(meta['source_htids']).encode('utf-8'))
    meta['id'] = "fake.{}".format(m.hexdigest()[:6])

    return meta, new_tl
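# Hedged usage sketch for combine_books: the two HTIDs are illustrative, and
# the output directory handed to save_fake_vol (defined further below) is a
# placeholder, not a path from the source.
def example_fake_anthology():
    ids = ['uc2.ark:/13960/t1xd0sc6x', 'aeu.ark:/13960/t1rf63t52']
    meta, tl = combine_books(ids, style='anthology')
    return save_fake_vol(meta, tl, dir='/data/fake-vols/')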
def copy_between_resolvers(id, resolver1, resolver2):
    input = Volume(id, id_resolver=resolver1)
    output = Volume(id, id_resolver=resolver2, mode='wb')
    output.write(input)
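# Hedged sketch: converting one locally stored JSON volume to parquet via
# copy_between_resolvers. The destination directory is illustrative; the
# resolver arguments mirror those used in the tests above.
def example_copy(id="aeu.ark:/13960/t1rf63t52"):
    src = resolvers.LocalResolver(dir="tests/data", format="json", compression="bz2")
    dst = resolvers.LocalResolver(dir="/tmp/parquet-out", format="parquet", compression="snappy")
    copy_between_resolvers(id, src, dst)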
def save_fake_vol(meta, tokenlist, dir, id_resolver='stubbytree',
                  token_kwargs=dict(case=False, pos=False)):
    vol = Volume(meta['id'], dir=dir, id_resolver=id_resolver, format='parquet', mode='wb')
    vol._tokencounts = tokenlist
    vol.parser.meta = meta
    vol._pagecolname = 'page'
    vol._update_meta_attrs()
    vol.write(vol, token_kwargs=token_kwargs)
    return meta['id']


def pairwise_title_similarity(titles, bpemb_en=None):
    '''Clean titles and use BPE encodings to compare their similarity.'''
    if bpemb_en is None:
        bpemb_en = BPEmb(lang="en")
    # Convert cleaned title to BPE encodings and keep those vectors
def fullvolume(paths):
    return Volume(os.path.join('tests', 'data', 'green-gables-full.json'), compression=None)
def chunked_volume(self):
    if not self._chunked_volume:
        self._chunked_volume = Volume(self.htid, id_resolver=self.chunked_resolver)
    return self._chunked_volume
def volume(self):
    if not self._volume:
        self._volume = Volume(self.htid, id_resolver=self.id_resolver)
    return self._volume
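# Hedged sketch of the kind of holder class the two lazy accessors above
# assume: the attribute names mirror those used in the methods, but the class
# itself is illustrative and not taken from the source.
class VolumeHolder:
    def __init__(self, htid, id_resolver, chunked_resolver):
        self.htid = htid
        self.id_resolver = id_resolver
        self.chunked_resolver = chunked_resolver
        self._volume = None
        self._chunked_volume = None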
def test_direct_loading(self, paths):
    import time
    # Load new volume specifically for this test
    vol = Volume(paths[0], compression=None)
    assert type(vol) == htrc_features.feature_reader.Volume
# You can change this here to get a different resolver.
default = """
resolver:
  - id_resolver: pairtree
    dir: /drobo/feature-counts
    format: json
    compression: bz2
"""

config = None
try:
    for path in ["~/.htrc-config.yaml", "local.yaml"]:
        if os.path.exists(os.path.expanduser(path)):
            config = yaml.safe_load(Path(path).expanduser().open())
            break
    if not config:
        raise FileNotFoundError
except FileNotFoundError:
    # Fall back to the built-in default configuration.
    config = yaml.safe_load(default)

resolver = config['resolver']
my_resolver = combine_resolvers(resolver)

if __name__ == "__main__":
    print(Volume(id="mdp.39015012434786",
                 id_resolver=my_resolver).tokenlist(pos=False, section="default"))
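# A hedged example of what a local.yaml or ~/.htrc-config.yaml override might
# contain, mirroring the structure of `default` above (the directory, format,
# and compression values are illustrative):
#
# resolver:
#   - id_resolver: local
#     dir: /data/extracted-features-parquet
#     format: parquet
#     compression: snappy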