Пример #1
0
def create_queries_vec(project):
    """Build the ordered corpus of queries, or load it if already cached.

    The cache file is ``project.full_path + 'Queries.ordered.gz'``. When it
    does not exist, one document is produced per id listed in ``ids.txt``
    (located under ``project.full_path``): the contents of
    ``queries/ShortDescription<id>.txt`` and
    ``queries/LongDescription<id>.txt`` are joined with a single space,
    run through ``GeneralCorpus.preprocess`` and stored together with the
    metadata tuple ``(id, 'query')``. The documents are then written to the
    cache file with ``OrderedCorpus.serialize``.

    :param project: object exposing ``full_path``. The cache name is built
        by plain string concatenation, so ``full_path`` presumably ends with
        a path separator -- TODO confirm against the caller.
    :returns: an ``OrderedCorpus`` opened on the cache file.
    """
    corpus_fname = project.full_path + 'Queries' + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        pp = GeneralCorpus(lazy_dict=True)
        queries_dir = os.path.join(project.full_path, 'queries')

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # would otherwise become an empty id and make us look for
            # 'ShortDescription.txt', so drop them here.
            stripped = (line.strip() for line in f)
            query_ids = [qid for qid in stripped if qid]

        queries = []
        for qid in query_ids:
            short_path = os.path.join(
                queries_dir, 'ShortDescription' + qid + '.txt')
            with open(short_path) as f:
                short_text = f.read()

            long_path = os.path.join(
                queries_dir, 'LongDescription' + qid + '.txt')
            with open(long_path) as f:
                long_text = f.read()

            words = list(pp.preprocess(' '.join([short_text, long_text])))
            queries.append((words, (qid, 'query')))

        OrderedCorpus.serialize(corpus_fname, queries, metadata=True)

    return OrderedCorpus(corpus_fname)
Пример #2
0
def create_corpus_vec(project, repos, Kind, use_level=True, forced_ref=None):
    """Build the combined ordered corpus for ``Kind``, or load it if cached.

    The cache file name is ``project.full_path + Kind.__name__``, followed
    by ``project.level`` when ``use_level`` is true, followed by the first
    eight characters of ``forced_ref`` when one is given, followed by
    ``'.ordered.gz'``. When that file does not exist, a ``Kind`` corpus is
    constructed for each entry of ``repos``, the successful ones are added
    to a ``CorpusCombiner``, and the combiner is written to the cache file
    with ``OrderedCorpus.serialize``.

    :param project: object exposing ``full_path`` and, when ``use_level``
        is true, ``level``; it is also handed to ``Kind``.
    :param repos: sequence of repositories to try; must support ``len()``
        if a ``TaserError`` has to be handled.
    :param Kind: corpus class, called with ``project``, ``lazy_dict=True``
        and, when either ``repo`` or ``forced_ref`` is truthy, ``repo`` and
        ``ref``.
    :param use_level: whether ``project.level`` is part of the cache name.
    :param forced_ref: optional ref passed to ``Kind`` as ``ref``.
    :returns: an ``OrderedCorpus`` opened on the cache file.
    :raises TaserError: when constructing the corpus for the last entry of
        ``repos`` fails and no earlier entry produced a corpus.
    """
    corpus_fname_base = project.full_path + Kind.__name__

    if use_level:
        corpus_fname_base += project.level

    if forced_ref:
        corpus_fname_base += forced_ref[:8]

    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()
        made_one = False

        for index, repo in enumerate(repos):
            try:
                if repo or forced_ref:
                    corpus = Kind(
                        project=project,
                        repo=repo,
                        lazy_dict=True,
                        ref=forced_ref,
                    )
                else:
                    corpus = Kind(project=project, lazy_dict=True)
            except KeyError:
                # No corpus could be built for this repo; try the next one.
                continue
            except TaserError:
                # Basically, if we are at the last repo and we STILL haven't
                # successfully extracted a corpus, ring some bells. The
                # position is checked by index so that an earlier entry that
                # merely compares equal to the last one is not mistaken for
                # it. Bare ``raise`` keeps the original traceback.
                if index == len(repos) - 1 and not made_one:
                    raise
                # Otherwise, keep trying. Winners never quit.
                continue

            combiner.add(corpus)
            made_one = True

        # Write the corpus to disk. This will take a while. The combiner's
        # metadata flag is switched on only for the duration of the write.
        combiner.metadata = True
        OrderedCorpus.serialize(corpus_fname, combiner, metadata=True)
        combiner.metadata = False

    return OrderedCorpus(corpus_fname)