Пример #1
0
    def args(self):

        """
        Generate BPO archive paths.

        Yields:
            Every item produced by ``scan_paths`` when it is called with
            ``self.corpus_dir`` and the ``.zip`` pattern. ``scan_paths`` is
            defined outside this block; presumably it walks the directory and
            yields paths matching the regex -- confirm against its definition.
        """

        # Raw string: in a plain literal, backslash-dot is an invalid escape
        # sequence that newer Python versions warn about. The runtime value
        # of the pattern is unchanged.
        yield from scan_paths(self.corpus_dir, r'\.zip')
Пример #2
0
    def args(self):

        """
        Generate BPO paths.

        Yields:
            Every item produced by ``scan_paths`` when it is called with
            ``self.corpus_dir`` and the ``.xml`` pattern. ``scan_paths`` is
            defined outside this block; presumably it walks the directory and
            yields paths matching the regex -- confirm against its definition.
        """

        # Raw string: in a plain literal, backslash-dot is an invalid escape
        # sequence that newer Python versions warn about. The runtime value
        # of the pattern is unchanged.
        yield from scan_paths(self.corpus_dir, r'\.xml')
Пример #3
0
    def slugs(self):

        """
        Generate slugs from the corpus.

        This is a generator: one slug is yielded for each path that
        ``scan_paths`` produces for ``self.path`` with the ``.txt`` pattern.

        Yields:
            The base name of each path with its final extension removed.
        """

        # Raw string: in a plain literal, backslash-dot is an invalid escape
        # sequence that newer Python versions warn about.
        for path in scan_paths(self.path, r'\.txt'):
            filename = os.path.basename(path)
            slug, _ext = os.path.splitext(filename)
            yield slug
Пример #4
0
    def ingest(cls, result_dir: str):
        """Ingest BPO articles.

        Parses every file that ``scan_paths`` produces for ``result_dir`` with
        the ``.json`` pattern and hands the parsed contents to
        ``session.bulk_insert_mappings`` for ``cls``. A commit is issued after
        each file, followed by a progress line (ISO timestamp and file index).

        Args:
            result_dir: Directory handed to ``scan_paths``.
        """
        # Raw string: in a plain literal, backslash-dot is an invalid escape
        # sequence that newer Python versions warn about.
        paths = scan_paths(result_dir, r'\.json')

        # Walk paths.
        for i, path in enumerate(paths):

            # Parse the file first so the handle is closed before the
            # (potentially slow) database work starts.
            with open(path) as fh:
                mappings = ujson.load(fh)

            # Bulk-insert articles.
            session.bulk_insert_mappings(cls, mappings)
            session.commit()

            print(dt.now().isoformat(), i)
Пример #5
0
    def ingest(cls, corpus_dir: str):
        """Ingest texts.

        For every path that ``scan_paths`` produces for ``corpus_dir`` with
        the ``.txt`` pattern, builds one ``cls`` instance from the file and
        adds it to ``session``. A single commit is issued after all files
        have been added.

        Args:
            corpus_dir: Directory handed to ``scan_paths``.

        Raises:
            AttributeError: If a slug contains no run of four digits, because
                ``re.search`` then returns ``None``.
        """
        # Raw string: in a plain literal, backslash-dot is an invalid escape
        # sequence that newer Python versions warn about.
        for path in scan_paths(corpus_dir, r'\.txt'):

            # Slug is the file's base name without its final extension.
            slug = os.path.splitext(os.path.basename(path))[0]

            # The year is taken from the first run of four digits in the slug.
            year = int(re.search(r'[0-9]{4}', slug).group())

            # Read the text and release the handle before building the row.
            with open(path) as fh:
                text = fh.read()

            session.add(cls(slug=slug, year=year, text=text))

        session.commit()
Пример #6
0
    def gather(cls, result_dir: str):

        """
        Bulk-insert alignments.

        Parses every file that ``scan_paths`` produces for ``result_dir`` with
        the ``.json`` pattern and hands the parsed contents to
        ``session.bulk_insert_mappings`` for ``cls``. A commit is issued after
        each file, followed by a progress line (ISO timestamp and file index).

        Args:
            result_dir: Directory handed to ``scan_paths``.
        """

        # Raw string: in a plain literal, backslash-dot is an invalid escape
        # sequence that newer Python versions warn about.
        paths = scan_paths(result_dir, r'\.json')

        # Walk paths.
        for i, path in enumerate(paths):

            # Parse the file first so the handle is closed before the
            # (potentially slow) database work starts.
            with open(path, 'rb') as fh:
                mappings = ujson.load(fh)

            # Bulk-insert matches.
            session.bulk_insert_mappings(cls, mappings)
            session.commit()

            print(dt.now().isoformat(), i)