def create_queries_vec(project):
    """Build (or load from cache) the ordered query corpus for *project*.

    Reads issue ids from ``<project.full_path>/ids.txt`` and, for each id,
    concatenates the ``ShortDescription<id>.txt`` and ``LongDescription<id>.txt``
    files under ``queries/``, preprocesses the combined text, and serializes
    the result to ``<project.full_path>Queries.ordered.gz``.  If that file
    already exists, the expensive build is skipped and it is simply loaded.

    :param project: project descriptor; must expose ``full_path``.
    :return: an ``OrderedCorpus`` over the preprocessed queries.
    """
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        preprocessor = GeneralCorpus(lazy_dict=True)

        # One id per line; iterate the file directly instead of readlines().
        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [line.strip() for line in f]

        queries = []
        # ``query_id`` avoids shadowing the builtin ``id``.
        for query_id in ids:
            short_path = os.path.join(project.full_path, 'queries',
                                      'ShortDescription' + query_id + '.txt')
            with open(short_path) as f:
                short_desc = f.read()

            long_path = os.path.join(project.full_path, 'queries',
                                     'LongDescription' + query_id + '.txt')
            with open(long_path) as f:
                long_desc = f.read()

            text = ' '.join([short_desc, long_desc])
            tokens = list(preprocessor.preprocess(text))
            queries.append((tokens, (query_id, 'query')))

        # This will take a while for large projects.
        OrderedCorpus.serialize(corpus_fname, queries, metadata=True)

    return OrderedCorpus(corpus_fname)
def create_corpus_vec(project, repos, Kind, use_level=True, forced_ref=None):
    """Build (or load from cache) the combined ordered corpus for *project*.

    Tries to construct a *Kind* corpus for each repository in *repos* and
    merges the successes into one ``CorpusCombiner``, then serializes the
    result to ``<base>.ordered.gz``.  If that file already exists, the build
    is skipped and it is simply loaded.

    :param project: project descriptor; must expose ``full_path`` and
        (when *use_level* is true) ``level``.
    :param repos: sequence of repository handles; falsy entries fall back to
        a repo-less ``Kind(project=..., lazy_dict=True)`` construction unless
        *forced_ref* is given.
    :param Kind: corpus class to instantiate; its name is part of the cache
        file name.
    :param use_level: include ``project.level`` in the cache file name.
    :param forced_ref: optional ref to extract; its first 8 characters are
        added to the cache file name.
    :return: an ``OrderedCorpus`` over the combined corpora.
    :raises TaserError: if every repository fails to yield a corpus.
    """
    corpus_fname_base = project.full_path + Kind.__name__
    if use_level:
        corpus_fname_base += project.level
    if forced_ref:
        corpus_fname_base += forced_ref[:8]
    corpus_fname = corpus_fname_base + '.ordered.gz'

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()
        made_one = False

        # Track the position so "last repo" is detected by index, not by
        # value equality (repo == repos[-1] misfires on duplicate entries).
        last_index = len(repos) - 1
        for index, repo in enumerate(repos):
            try:
                if repo or forced_ref:
                    corpus = Kind(
                        project=project,
                        repo=repo,
                        lazy_dict=True,
                        ref=forced_ref,
                    )
                else:
                    corpus = Kind(project=project, lazy_dict=True)
            except KeyError:
                continue
            except TaserError:
                # If we are at the last repo and STILL haven't successfully
                # extracted a corpus, ring some bells; a bare ``raise``
                # preserves the original traceback.
                if index == last_index and not made_one:
                    raise
                # Otherwise, keep trying. Winners never quit.
                continue

            combiner.add(corpus)
            made_one = True

        # Write the corpus and dictionary to disk. This will take a while.
        combiner.metadata = True
        OrderedCorpus.serialize(corpus_fname, combiner, metadata=True)
        combiner.metadata = False

    return OrderedCorpus(corpus_fname)