Пример #1
0
def gen_section_types():
    """Tabulate how often each section type occurs in each corpus.

    Instantiates ``EEBO_TCP`` and ``ECCO_TCP``, walks the records in each
    corpus's ``sections.meta`` and counts them by their ``'section_type'``
    value.  One row per distinct section type is then handed to
    ``tools.writegen`` under the name ``data.section_types.txt``.  Each row
    has the keys ``section_type``, ``count`` (sum over all corpora) and one
    ``count_<corpus name>`` per corpus that contributed at least one record.
    """
    import tools
    from collections import defaultdict, Counter
    from llp.corpus.eebo import EEBO_TCP
    from llp.corpus.ecco import ECCO_TCP

    corpora = [EEBO_TCP(), ECCO_TCP()]

    # corpus name -> Counter(section type -> number of section records)
    counts_by_corpus = defaultdict(Counter)
    for corpus in corpora:
        for record in corpus.sections.meta:
            counts_by_corpus[corpus.name][record['section_type']] += 1

    def _rows():
        # Union of the section types seen in any corpus.
        every_type = {
            stype
            for corpus_name in counts_by_corpus
            for stype in counts_by_corpus[corpus_name]
        }
        for stype in every_type:
            row = {'section_type': stype, 'count': 0}
            for corpus_name, counter in counts_by_corpus.items():
                column = 'count_' + corpus_name
                # .get() so a type absent from this corpus reads as 0
                # without inserting a new Counter entry.
                row[column] = counter.get(stype, 0)
                row['count'] += row[column]
            yield row

    tools.writegen('data.section_types.txt', _rows)
Пример #2
0
    def compile_metadata(self, collection=DEFAULT_COLLECTION):
        """Regenerate the metadata file from the items of ``collection``.

        Calls ``tools.iter_move`` on ``self.path_metadata`` with the prefix
        ``'bak/'`` (presumably moving any existing file aside as a backup --
        confirm against ``tools``), then passes a generator of each item's
        ``metadata`` to ``tools.writegen`` for the same path.

        :param collection: forwarded to ``self.get_collection_ids``.
        """
        def _iter_metadata():
            items = self.get_collection_ids(collection=collection,
                                            iter_as_items=True)
            for entry in items:
                record = entry.metadata
                # Each record is echoed to stdout as it is produced.
                print(record)
                yield record

        tools.iter_move(self.path_metadata, prefix='bak/')
        tools.writegen(self.path_metadata, _iter_metadata)
Пример #3
0
    def save_metadata(self):
        """Collect the ``meta`` of every text and pass it to ``tools.writegen``.

        The texts come from ``self.texts()``; their ``meta`` attributes are
        fetched through a gevent ``Pool`` of size 50 and streamed to
        ``tools.writegen`` under the name
        ``corpus-metadata.<self.name>.txt``.
        """
        print('>> generating metadata...')
        texts = self.texts()
        # Not used below; kept so the len() call on the texts still happens.
        num_texts = len(texts)

        def _iter_meta():
            from gevent.pool import Pool
            workers = Pool(50)
            results = workers.imap(lambda text: text.meta, texts)
            for record in results:
                yield record

        out_name = 'corpus-metadata.' + self.name + '.txt'
        tools.writegen(out_name, _iter_meta)
Пример #4
0
    def save_metadata(
            self,
            estc_ids_path='/Users/ryan/DH/18C/titles/estc/estc_ids_in_ecco.txt'):
        """Write per-text metadata, flagging which ESTC ids appear in ECCO.

        For every text returned by ``self.texts()`` the ``meta_by_file``
        dict is taken, an ``'in_ecco'`` boolean is added to it (whether its
        ``'id_estc'`` value is in the id list read from ``estc_ids_path``),
        and the dict is streamed to ``tools.writegen`` under the name
        ``corpus-metadata.<self.name>.txt``.

        :param estc_ids_path: path of a whitespace-separated list of ESTC
            ids; defaults to the location that was previously hard-coded.
        :raises KeyError: if a text's ``meta_by_file`` has no ``'id_estc'``.
        """
        print('>> generating metadata...')

        # Use a context manager so the id list's file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(estc_ids_path) as id_file:
            estc_ids_in_ecco = set(id_file.read().split())

        def meta(text):
            # NOTE: this adds the key to the dict returned by meta_by_file
            # itself, not to a copy.
            dx = text.meta_by_file
            dx['in_ecco'] = dx['id_estc'] in estc_ids_in_ecco
            return dx

        def writegen():
            for i, t in enumerate(self.texts()):
                # Progress marker every 1000 texts (including the first).
                if not i % 1000:
                    print(i)
                yield meta(t)

        tools.writegen('corpus-metadata.' + self.name + '.txt', writegen)
Пример #5
0
def make_mini_db(
        keys=['author', 'title', 'year', 'genre', 'medium'],
        extra_keys=[]):
    """Export a reduced view of every record in the table via ``tools.writegen``.

    Each record returned by ``get_table().find()`` is cut down to the
    fields named in ``keys + extra_keys`` (missing fields become ``''``)
    plus an ``'_addr'`` field built from the record's ``'corpus'`` and
    ``'id'`` values joined by ``ADDR_SEP``.  The rows are passed to
    ``tools.writegen`` under the name ``data.llp_mini_db.txt.gz``.

    :param keys: field names to copy from every record.
    :param extra_keys: further field names appended to ``keys``.

    The list defaults are only read here, never modified.
    """
    from llp import tools
    from tqdm import tqdm

    dbtable = get_table()
    total = dbtable.count()

    def _rows():
        wanted = keys + extra_keys
        progress = tqdm(dbtable.find(),
                        total=total,
                        desc='>> saving tsv from mongo')
        for record in progress:
            row = {field: record.get(field, '') for field in wanted}
            # Fall back to placeholder text when corpus or id is absent.
            corpus_part = str(record.get('corpus', 'Corpus'))
            id_part = str(record.get('id', 'ID'))
            row['_addr'] = corpus_part + ADDR_SEP + id_part
            yield row

    tools.writegen('data.llp_mini_db.txt.gz', _rows)