def args(self):
    """Generate BPO archive paths.

    Yields:
        Every item that ``scan_paths`` produces for ``self.corpus_dir`` and
        the zip-extension pattern, in the order ``scan_paths`` emits them.
    """
    # Raw string: backslash-dot in a plain literal is an invalid escape
    # sequence, which recent Python versions warn about at compile time.
    # NOTE(review): the pattern is presumably treated as a regex by
    # scan_paths -- confirm against its definition.
    yield from scan_paths(self.corpus_dir, r'\.zip')
def args(self):
    """Generate BPO paths.

    Yields:
        Every item that ``scan_paths`` produces for ``self.corpus_dir`` and
        the xml-extension pattern, in the order ``scan_paths`` emits them.
    """
    # Raw string: backslash-dot in a plain literal is an invalid escape
    # sequence, which recent Python versions warn about at compile time.
    # NOTE(review): the pattern is presumably treated as a regex by
    # scan_paths -- confirm against its definition.
    yield from scan_paths(self.corpus_dir, r'\.xml')
def slugs(self):
    """Generate slugs from the corpus.

    A slug is a file's base name with its final extension stripped.

    Yields:
        One slug per path that ``scan_paths`` produces for ``self.path``
        and the txt-extension pattern.
    """
    # Raw string avoids the invalid backslash-dot escape in a plain literal.
    for path in scan_paths(self.path, r'\.txt'):
        filename = os.path.basename(path)
        slug, _extension = os.path.splitext(filename)
        yield slug
def ingest(cls, result_dir: str):
    """Ingest BPO articles.

    Each JSON file found under ``result_dir`` is parsed and its contents are
    bulk-inserted as rows of ``cls``; the session is committed after every
    file and a timestamped progress line is printed.

    Args:
        result_dir: Directory handed to ``scan_paths`` to locate the JSON
            result files.
    """
    # NOTE(review): ``session`` looks like a SQLAlchemy session and each file
    # is presumably a JSON array of column-value mappings -- confirm.
    # Raw string avoids the invalid backslash-dot escape in a plain literal.
    paths = scan_paths(result_dir, r'\.json')

    # Walk paths.
    for i, path in enumerate(paths):

        # Parse the file, then release the handle before touching the DB.
        with open(path) as fh:
            mappings = ujson.load(fh)

        # Bulk-insert articles.
        session.bulk_insert_mappings(cls, mappings)
        session.commit()

        # Progress: ISO timestamp plus the zero-based file index.
        print(dt.now().isoformat(), i)
def ingest(cls, corpus_dir: str):
    """Ingest texts.

    For every text file found under ``corpus_dir``, build a ``cls`` instance
    from the file's slug, the year embedded in that slug, and the file's full
    contents, and add it to the session. The session is committed once after
    all files have been added.

    Args:
        corpus_dir: Directory handed to ``scan_paths`` to locate the text
            files.

    Raises:
        AttributeError: If a slug contains no run of four digits, because
            ``re.search`` then returns ``None``.
    """
    # Raw string avoids the invalid backslash-dot escape in a plain literal.
    for path in scan_paths(corpus_dir, r'\.txt'):

        # Slug: base file name without its final extension.
        slug = os.path.splitext(os.path.basename(path))[0]

        # The first run of four digits in the slug is used as the year.
        year = int(re.search('[0-9]{4}', slug).group())

        with open(path) as fh:
            novel = cls(slug=slug, year=year, text=fh.read())

        session.add(novel)

    session.commit()
def gather(cls, result_dir: str):
    """Bulk-insert alignments.

    Each JSON file found under ``result_dir`` is read in binary mode, parsed,
    and bulk-inserted as rows of ``cls``; the session is committed after
    every file and a timestamped progress line is printed.

    Args:
        result_dir: Directory handed to ``scan_paths`` to locate the JSON
            result files.
    """
    # NOTE(review): ``session`` looks like a SQLAlchemy session and each file
    # is presumably a JSON array of column-value mappings -- confirm.
    # Raw string avoids the invalid backslash-dot escape in a plain literal.
    paths = scan_paths(result_dir, r'\.json')

    # Walk paths.
    for i, path in enumerate(paths):

        # The handle is released as soon as the payload is parsed.
        with open(path, 'rb') as fh:
            mappings = ujson.load(fh)

        # Bulk-insert matches.
        session.bulk_insert_mappings(cls, mappings)
        session.commit()

        # Progress: ISO timestamp plus the zero-based file index.
        print(dt.now().isoformat(), i)