def gen_section_types():
    """Tally section types across EEBO-TCP and ECCO-TCP and write them out.

    For every corpus, each entry of ``corpus.sections.meta`` is counted by
    its ``'section_type'`` value.  One row per distinct section type is then
    handed to ``tools.writegen`` for ``data.section_types.txt``; a row holds
    ``section_type``, the overall ``count`` and one ``count_<corpus name>``
    field per corpus that contributed at least one section.
    """
    from collections import defaultdict, Counter
    import tools
    from llp.corpus.eebo import EEBO_TCP
    from llp.corpus.ecco import ECCO_TCP

    # corpus name -> Counter(section type -> number of sections)
    section_types = defaultdict(Counter)
    for corpus in [EEBO_TCP(), ECCO_TCP()]:
        sections = corpus.sections
        for section_meta in sections.meta:
            section_types[corpus.name][section_meta['section_type']] += 1

    def writegen():
        # Every section type seen in any corpus.
        all_stypes = {
            stype
            for per_corpus in section_types
            for stype in section_types[per_corpus]
        }
        for stype in all_stypes:
            row = {'section_type': stype, 'count': 0}
            for corpus_name, counts in section_types.items():
                in_corpus = counts.get(stype, 0)
                row['count_' + corpus_name] = in_corpus
                row['count'] += in_corpus
            yield row

    tools.writegen('data.section_types.txt', writegen)
def compile_metadata(self, collection=DEFAULT_COLLECTION):
    """Rebuild the metadata file from the items of ``collection``.

    The existing file at ``self.path_metadata`` is first passed to
    ``tools.iter_move`` with the ``'bak/'`` prefix (presumably to keep a
    backup -- confirm against ``tools.iter_move``), then each item's
    ``metadata`` is printed and written through ``tools.writegen``.
    """
    def _iter_metadata():
        items = self.get_collection_ids(
            collection=collection, iter_as_items=True)
        for entry in items:
            record = entry.metadata
            print(record)
            yield record

    # Move the old file out of the way before writing the new one.
    tools.iter_move(self.path_metadata, prefix='bak/')
    tools.writegen(self.path_metadata, _iter_metadata)
def save_metadata(self):
    """Write the ``meta`` of every text in this corpus to a metadata file.

    The ``meta`` attribute of each text is read through a gevent pool and
    the results are streamed to ``tools.writegen``, which receives the file
    name ``'corpus-metadata.<self.name>.txt'``.
    """
    print('>> generating metadata...')
    texts = self.texts()

    def meta(text):
        return text.meta

    def writegen():
        # Imported here so gevent is only required when metadata is saved.
        from gevent.pool import Pool
        # Pool size of 50, as in the original; see gevent's Pool docs for
        # the exact concurrency semantics.
        pool = Pool(50)
        for metad in pool.imap(meta, texts):
            yield metad

    tools.writegen('corpus-metadata.' + self.name + '.txt', writegen)
def save_metadata(
        self,
        estc_ids_path='/Users/ryan/DH/18C/titles/estc/estc_ids_in_ecco.txt'):
    """Write per-file metadata for every text, flagged with ECCO membership.

    Each text's ``meta_by_file`` dict gets an ``'in_ecco'`` entry that is
    true when its ``'id_estc'`` value appears in the ID list read from
    ``estc_ids_path``.  The dicts are streamed to ``tools.writegen``, which
    receives the file name ``'corpus-metadata.<self.name>.txt'``.

    Args:
        estc_ids_path: Path to a whitespace-separated list of ESTC IDs.
            Defaults to the location that used to be hard-coded here.

    Raises:
        KeyError: If a text's ``meta_by_file`` has no ``'id_estc'`` key.
    """
    print('>> generating metadata...')
    texts = self.texts()

    # Read the ID list once; the context manager closes the file handle.
    with open(estc_ids_path) as id_file:
        estc_ids_in_ecco = set(id_file.read().split())

    def meta(text):
        # NOTE: this writes 'in_ecco' into the dict returned by
        # meta_by_file rather than into a copy.
        dx = text.meta_by_file
        dx['in_ecco'] = dx['id_estc'] in estc_ids_in_ecco
        return dx

    def writegen():
        for i, t in enumerate(texts):
            # Progress indicator every 1000 texts.
            if not i % 1000:
                print(i)
            yield meta(t)

    tools.writegen('corpus-metadata.' + self.name + '.txt', writegen)
def make_mini_db(
        keys=('author', 'title', 'year', 'genre', 'medium'),
        extra_keys=()):
    """Export a reduced copy of the metadata table via ``tools.writegen``.

    Every record returned by ``get_table().find()`` is cut down to the
    requested fields (missing fields become ``''``) plus an ``'_addr'``
    field built as ``<corpus> + ADDR_SEP + <id>``, with ``'Corpus'`` and
    ``'ID'`` as fallbacks.  Rows go to ``tools.writegen`` under the name
    ``'data.llp_mini_db.txt.gz'``.

    Args:
        keys: Iterable of field names to keep from each record.
        extra_keys: Further field names appended after ``keys``.
    """
    from llp import tools
    from tqdm import tqdm

    # Presumably a MongoDB collection (see the progress-bar label) -- confirm.
    dbtable = get_table()
    total = dbtable.count()

    # Built once rather than per row; list() accepts any iterable, so callers
    # that pass lists keep working alongside the tuple defaults.
    columns = list(keys) + list(extra_keys)

    def _writegen():
        rows = tqdm(dbtable.find(), total=total,
                    desc='>> saving tsv from mongo')
        for dx in rows:
            minidx = {k: dx.get(k, '') for k in columns}
            minidx['_addr'] = (str(dx.get('corpus', 'Corpus')) + ADDR_SEP +
                               str(dx.get('id', 'ID')))
            yield minidx

    tools.writegen('data.llp_mini_db.txt.gz', _writegen)