def create_queries_vec(project):
    """Build (or load from cache) the ordered query corpus for *project*.

    Reads issue ids from ``<project.full_path>/ids.txt`` and, for each id,
    concatenates the ``ShortDescription<id>.txt`` and ``LongDescription<id>.txt``
    files under ``queries/``, preprocesses the combined text, and serializes
    the result to ``<project.full_path>Queries.ordered.gz``.  If that file
    already exists, the expensive build is skipped and it is simply loaded.

    :param project: project descriptor; must expose ``full_path``.
    :return: an ``OrderedCorpus`` over the preprocessed queries.
    """
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        preprocessor = GeneralCorpus(lazy_dict=True)

        # One id per line; iterate the file directly instead of readlines().
        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [line.strip() for line in f]

        queries = []
        # ``query_id`` avoids shadowing the builtin ``id``.
        for query_id in ids:
            short_path = os.path.join(project.full_path, 'queries',
                                      'ShortDescription' + query_id + '.txt')
            with open(short_path) as f:
                short_desc = f.read()

            long_path = os.path.join(project.full_path, 'queries',
                                     'LongDescription' + query_id + '.txt')
            with open(long_path) as f:
                long_desc = f.read()

            text = ' '.join([short_desc, long_desc])
            tokens = list(preprocessor.preprocess(text))
            queries.append((tokens, (query_id, 'query')))

        # This will take a while for large projects.
        OrderedCorpus.serialize(corpus_fname, queries, metadata=True)

    return OrderedCorpus(corpus_fname)
def create_corpus_vec(project, repos, Kind, use_level=True, forced_ref=None):
    """Build (or load from cache) the combined ordered corpus for *project*.

    Tries to construct a *Kind* corpus for each repository in *repos* and
    merges the successes into one ``CorpusCombiner``, then serializes the
    result to ``<base>.ordered.gz``.  If that file already exists, the build
    is skipped and it is simply loaded.

    :param project: project descriptor; must expose ``full_path`` and
        (when *use_level* is true) ``level``.
    :param repos: sequence of repository handles; falsy entries fall back to
        a repo-less ``Kind(project=..., lazy_dict=True)`` construction unless
        *forced_ref* is given.
    :param Kind: corpus class to instantiate; its name is part of the cache
        file name.
    :param use_level: include ``project.level`` in the cache file name.
    :param forced_ref: optional ref to extract; its first 8 characters are
        added to the cache file name.
    :return: an ``OrderedCorpus`` over the combined corpora.
    :raises TaserError: if every repository fails to yield a corpus.
    """
    corpus_fname_base = project.full_path + Kind.__name__
    if use_level:
        corpus_fname_base += project.level
    if forced_ref:
        corpus_fname_base += forced_ref[:8]
    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()
        made_one = False

        # Track the position so "last repo" is detected by index, not by
        # value equality (repo == repos[-1] misfires on duplicate entries).
        last_index = len(repos) - 1
        for index, repo in enumerate(repos):
            try:
                if repo or forced_ref:
                    corpus = Kind(
                        project=project,
                        repo=repo,
                        lazy_dict=True,
                        ref=forced_ref,
                    )
                else:
                    corpus = Kind(project=project, lazy_dict=True)
            except KeyError:
                continue
            except TaserError:
                # If we are at the last repo and STILL haven't successfully
                # extracted a corpus, ring some bells; a bare ``raise``
                # preserves the original traceback.
                if index == last_index and not made_one:
                    raise
                # Otherwise, keep trying. Winners never quit.
                continue

            combiner.add(corpus)
            made_one = True

        # Write the corpus and dictionary to disk. This will take a while.
        combiner.metadata = True
        OrderedCorpus.serialize(corpus_fname, combiner, metadata=True)
        combiner.metadata = False

    return OrderedCorpus(corpus_fname)