import copy
import time

import gensim
import pyndri
import pyndri.compat


def train_word2vec(iterations):
    word2vec_init = gensim.models.Word2Vec(
        size=300,  # Embedding size.
        window=5,  # One-sided window size.
        sg=True,  # Skip-gram.
        min_count=5,  # Minimum word frequency.
        sample=1e-3,  # Sub-sample threshold.
        hs=False,  # Hierarchical softmax.
        negative=10,  # Number of negative examples.
        iter=1,  # Number of iterations per train() call.
        workers=8,  # Number of workers.
    )

    with pyndri.open('index/') as index:
        dictionary = pyndri.extract_dictionary(index)
        sentences = pyndri.compat.IndriSentences(index, dictionary)

        # Build the vocabulary from the indexed documents.
        word2vec_init.build_vocab(sentences, trim_rule=None)

        models = [word2vec_init]

        for epoch in range(iterations):
            start_time = time.time()
            print('Epoch {} started..'.format(epoch + 1))

            # Continue training from a copy of the previous epoch's model,
            # so every intermediate model is kept in `models`.
            model = copy.deepcopy(models[-1])
            model.train(sentences,
                        total_examples=len(sentences),
                        epochs=model.iter)
            models.append(model)

            print('Epoch {} finished in {}'.format(
                epoch + 1, time.time() - start_time))

    return models[-1]
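# A minimal driver for the function above, assuming the Indri index lives at
# 'index/' as hard-coded in train_word2vec() and that a pre-4.0 gensim is
# installed (the size/iter keyword arguments were renamed in gensim 4.0).
# The output filename below is only illustrative.
if __name__ == '__main__':
    final_model = train_word2vec(5)
    final_model.save('word2vec_indri.model')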
def main(argv):
    if len(argv) < 2:
        print("Invalid configuration file.")
        sys.exit(0)

    query_log_fold = argv[1]

    print("Generating candidate queries")
    candidate_queries = []
    pattern = re.compile(r'([^\s\w]|_)+')
    for query_log_file in glob.glob(query_log_fold + "user-ct-test-collection-*"):
        f = gzip.open(query_log_file)
        # Skip the header line.
        f.readline()
        for line in f:
            line = line.decode("utf-8").split("\t")
            query_string = line[1]
            if is_url_substring(query_string):
                continue
            query_string = pattern.sub('', query_string)
            candidate_queries.append(query_string)
        f.close()
    candidate_queries = set(candidate_queries)
    print("Found {} candidate queries".format(len(candidate_queries)))

    print("Generating pseudo labels")
    f_query = open("training_query_set.txt", encoding='utf-8', mode="w")
    f_label = open("training_pseudo_labels.txt", "w")
    with pyndri.open(config["index"]) as index:
        i = 0
        bm25_query_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)
        for candidate_query in candidate_queries:
            try:
                results = bm25_query_env.query(candidate_query, results_requested=1000)
            except Exception:
                print(candidate_query)
                continue
            if len(results) < 10:
                continue
            f_query.write("{} {}\n".format(i, candidate_query))
            for docid, score in results:
                docno, _ = index.document(docid)
                f_label.write("{} {} {}\n".format(i, docno, score))
            i += 1
    f_query.close()
    f_label.close()
    print("Finished with {} queries".format(i))
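# The filter above relies on an is_url_substring helper that is not shown in
# this snippet. The sketch below is only an assumption about what such a
# check could look like; the original heuristic may differ.
import re as _re

_URL_LIKE = _re.compile(
    r'^(https?://|www\.)|\.(com|net|org|edu|gov)(\W|$)', _re.IGNORECASE)


def is_url_substring(query_string):
    # Treat the query as a URL fragment if it starts with a scheme or
    # "www." prefix, or contains a common top-level domain.
    return bool(_URL_LIKE.search(query_string.strip()))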
import pyndri
import sys

import numpy as np
import scipy.stats.mstats

if len(sys.argv) <= 1:
    print('Usage: python {0} <path-to-indri-index> [<index-name>]'.format(
        sys.argv[0]))
    sys.exit(0)

with pyndri.open(sys.argv[1]) as index:
    num_documents = 0

    # Welford's online algorithm for the running mean and variance
    # of the document lengths.
    mean = 0.0
    M2 = 0.0

    min_ = np.inf
    max_ = -np.inf

    lengths = []

    for document_id in range(index.document_base(),
                             index.maximum_document()):
        x = float(index.document_length(document_id))
        lengths.append(x)

        num_documents += 1

        delta = x - mean
        mean += delta / num_documents
        M2 += delta * (x - mean)

        min_ = min(min_, x)
        max_ = max(max_, x)
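    # The snippet above stops inside the loop; the reporting step below is a
    # sketch of how the collected quantities could be printed, using the
    # already-imported scipy.stats.mstats for the geometric mean. The exact
    # output format is an assumption, not the original script's.
    variance = M2 / (num_documents - 1) if num_documents > 1 else 0.0

    print('Documents: {}'.format(num_documents))
    print('Mean length: {:.2f} (std {:.2f})'.format(mean, variance ** 0.5))
    print('Min/median/max length: {:.0f}/{:.0f}/{:.0f}'.format(
        min_, np.median(lengths), max_))
    print('Geometric mean length: {:.2f}'.format(
        scipy.stats.mstats.gmean(lengths)))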
def test_with(self):
    with pyndri.open(self.index_path) as index:
        self.assertTrue(isinstance(index, pyndri.Index))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-pairs')
    parser.add_argument('-run')
    parser.add_argument('-queries')
    parser.add_argument('-index')
    parser.add_argument('-o')
    args = parser.parse_args()

    if args.pairs:
        docs_train = get_docs_train(args.pairs)
    if args.run:
        docs_test = get_docs_test(args.run)

    queries = load_queries(args.queries)

    if not os.path.exists(args.o):
        os.makedirs(args.o)

    max_ngraph_len = 5

    with pyndri.open(args.index) as index:
        token2id, id2token, id2df = index.get_dictionary()

        top_ngraphs = get_top_ngraph(index)
        with open(args.o + "/top_ngraphs.txt", "w") as f:
            for ngraph in top_ngraphs:
                f.write("{} {}\n".format(top_ngraphs[ngraph],
                                         ngraph.encode("utf-8")))

        with open(args.o + "/ngraphs.txt", "w") as f:
            for term_id in id2token:
                term = id2token[term_id]
                ngraphs = get_top_ngraphs(top_ngraphs, max_ngraph_len, term)
                f.write("{} {}\n".format(str(term_id), " ".join(ngraphs)))
        print("Got top ngraphs.")

        def get_query_obj(qid):
            query = queries[qid]
            query_terms = [
                term for term in pyndri.tokenize(escape(query.lower()))
                if term in token2id
            ]
            ids = [str(token2id[term]) for term in query_terms]
            query_obj = " ".join(ids)
            return query_obj

        def get_doc_obj(docids, docno):
            docno, doc = index.document(docids[docno])
            # Keep at most the first 1000 non-OOV token ids.
            doc = [str(w) for w in doc if w > 0][:1000]
            doc_obj = " ".join(doc)
            return doc_obj

        if args.pairs:
            with open(args.o + "/train_data.txt", "w") as f:
                print("Processing train...")
                docids = docids_from_index(index, docs_train)
                for line in open(args.pairs):
                    qid, docs = line.strip().split(" ", 1)
                    if qid not in queries:
                        continue
                    docs = docs.split()
                    query_obj = get_query_obj(qid)
                    f.write("{}\t{}\t{}\n".format(
                        qid, query_obj,
                        "\t".join([get_doc_obj(docids, d) for d in docs])))

        if args.run:
            with open(args.o + "/test_data.txt", "w") as f:
                docids = docids_from_index(index, docs_test)
                print("Processing test...")
                for line in open(args.run):
                    qid, _, doc, rank, score, label = line.strip().split()
                    query_obj = get_query_obj(qid)
                    doc_obj = get_doc_obj(docids, doc)
                    f.write("{}\t{}\t{}\t{}\n".format(qid, query_obj, doc,
                                                      doc_obj))
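# get_top_ngraph and get_top_ngraphs are not defined in this snippet. The
# helpers below are only a sketch of what they could look like, assuming
# character n-graphs of length 1..max_ngraph_len counted over the index
# vocabulary; names, parameters and the top_k cut-off are assumptions, not
# the original implementation.
from collections import Counter


def get_top_ngraph(index, max_ngraph_len=5, top_k=2000):
    # Count character n-graphs over all vocabulary terms and keep the top_k
    # most frequent ones, mapped to integer ids.
    _, id2token, _ = index.get_dictionary()
    counts = Counter()
    for term in id2token.values():
        for n in range(1, max_ngraph_len + 1):
            for start in range(len(term) - n + 1):
                counts[term[start:start + n]] += 1
    return {ngraph: idx
            for idx, (ngraph, _) in enumerate(counts.most_common(top_k))}


def get_top_ngraphs(top_ngraphs, max_ngraph_len, term):
    # Return the ids of the term's n-graphs that occur in the top_ngraphs
    # vocabulary, as strings ready to be joined into a line.
    found = []
    for n in range(1, max_ngraph_len + 1):
        for start in range(len(term) - n + 1):
            ngraph = term[start:start + n]
            if ngraph in top_ngraphs:
                found.append(str(top_ngraphs[ngraph]))
    return found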
features = {}
events = os.listdir(path_events)

tree = etree.parse(path_queries)
for q in tree.xpath("/parameters/query"):
    tobedone = False
    for element in q.iter():
        # print("%s - %s" % (element.tag, element.text))
        # If the query is one of the queries to be tested, Indri has to be run.
        if element.tag == "number" and element.text in events:
            tobedone = True
            query = element.text
        if element.tag == "text" and tobedone:
            print("----------------" + element.text)
            with pyndri.open(path_index + query) as index:
                features = {}
                num_documents = 0
                for document_id in range(index.document_base(),
                                          index.maximum_document()):
                    num_documents += 1
                    ext_document_id, _ = index.document(document_id)
                    features[ext_document_id] = {}
                    features[ext_document_id]["lm"] = 0
                    features[ext_document_id]["tfidf"] = 0
                    features[ext_document_id]["bm25"] = 0

                # Constructs a QueryEnvironment that uses a
                # language model with Dirichlet smoothing.
                lm_query_env = pyndri.QueryEnvironment(
                    index, rules=('method:dirichlet,mu:5000',))
                # results = lm_query_env.query(element.text,
                #                              results_requested=num_documents,
                #                              include_snippets=True)
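# The snippet ends with the Dirichlet language-model environment and a
# commented-out query call. The helper below is a sketch of how the features
# dict could be filled for the three keys it initialises; OkapiQueryEnvironment
# with these parameters appears in the pseudo-label script above, while
# TFIDFQueryEnvironment and this exact continuation are assumptions.
def compute_features(index, query_text, features, num_documents):
    lm_env = pyndri.QueryEnvironment(
        index, rules=('method:dirichlet,mu:5000',))
    tfidf_env = pyndri.TFIDFQueryEnvironment(index)
    bm25_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)

    # Score every document of the per-query index with each retrieval model
    # and store the scores under the corresponding feature name.
    for name, env in (('lm', lm_env), ('tfidf', tfidf_env), ('bm25', bm25_env)):
        for document_id, score in env.query(query_text,
                                            results_requested=num_documents):
            ext_document_id, _ = index.document(document_id)
            features[ext_document_id][name] = score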