Example No. 1
import copy
import time

import gensim
import pyndri
import pyndri.compat


def train_word2vec(iterations):
    word2vec_init = gensim.models.Word2Vec(
        size=300,  # Embedding size
        window=5,  # One-sided window size
        sg=True,  # Skip-gram.
        min_count=5,  # Minimum word frequency.
        sample=1e-3,  # Sub-sample threshold.
        hs=False,  # Hierarchical softmax.
        negative=10,  # Number of negative examples.
        iter=1,  # Number of iterations.
        workers=8,  # Number of workers.
    )

    with pyndri.open('index/') as index:
        dictionary = pyndri.extract_dictionary(index)
        sentences = pyndri.compat.IndriSentences(index, dictionary)

        # Build vocab
        word2vec_init.build_vocab(sentences, trim_rule=None)
        models = [word2vec_init]

        for epoch in range(iterations):
            start_time = time.time()
            print('Epoch {} started..'.format(epoch + 1))

            model = copy.deepcopy(models[-1])
            model.train(sentences,
                        total_examples=len(sentences),
                        epochs=model.iter)

            models.append(model)
            print('Epoch {} finished in {}'.format(epoch + 1,
                                                   time.time() - start_time))
    return models[-1]
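
A minimal usage sketch (hypothetical; the iteration count and output filename are assumptions) that persists the final model with gensim's standard save method:

model = train_word2vec(iterations=5)
model.save('word2vec_indri.model')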
Example No. 2
import glob
import gzip
import re
import sys

import pyndri

# `config` (with the index path) and `is_url_substring` are assumed to be
# defined elsewhere in the original module.


def main(argv):

    if len(argv) < 2:
        print("Usage: {} <query-log-folder>".format(argv[0]))
        sys.exit(1)

    query_log_fold = argv[1]

    print("Generating candidate queries")
    candidate_queries = []
    pattern = re.compile(r'([^\s\w]|_)+')
    for query_log_file in glob.glob(query_log_fold +
                                    "user-ct-test-collection-*"):
        with gzip.open(query_log_file) as f:
            # Skip the header line.
            f.readline()
            for line in f:
                line = line.decode("utf-8").split("\t")
                query_string = line[1]
                if is_url_substring(query_string):
                    continue
                query_string = pattern.sub('', query_string)
                candidate_queries.append(query_string)
    candidate_queries = set(candidate_queries)
    print("Found {} candidate queries".format(len(candidate_queries)))

    print("Generating pseudo labels")
    f_query = open("training_query_set.txt", encoding='utf-8', mode="w")
    f_label = open("training_pseudo_labels.txt", "w")
    with pyndri.open(config["index"]) as index:
        i = 0
        bm25_query_env = pyndri.OkapiQueryEnvironment(index,
                                                      k1=1.2,
                                                      b=0.75,
                                                      k3=1000)
        for candidate_query in candidate_queries:
            try:
                results = index.query(candidate_query, results_requested=1000)
            except Exception:
                # Some candidate queries contain tokens Indri cannot parse.
                print(candidate_query)
                continue
            if len(results) < 10:
                continue
            f_query.write("{} {}\n".format(i, candidate_query))
            for docid, score in results:
                docno, _ = index.document(docid)
                f_label.write("{} {} {}\n".format(i, docno, score))
            i += 1
        f_query.close()
        f_label.close()
        print("Finished with {} queries".format(i))
Example No. 3
import pyndri
import sys

import numpy as np
import scipy.stats.mstats


if len(sys.argv) <= 1:
    print('Usage: python {0} <path-to-indri-index> [<index-name>]'.format(
        sys.argv[0]))

    sys.exit(0)

with pyndri.open(sys.argv[1]) as index:
    num_documents = 0
    mean = 0.0
    M2 = 0.0

    min_ = np.inf
    max_ = -np.inf

    lengths = []

    for document_id in range(index.document_base(), index.maximum_document()):
        x = float(index.document_length(document_id))
        lengths.append(x)

        num_documents += 1
        delta = x - mean
        mean += delta / num_documents
        # Welford's online variance update and running min/max document length.
        M2 += delta * (x - mean)
        min_ = min(min_, x)
        max_ = max(max_, x)
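    # Hedged completion (not part of the original snippet): report the
    # statistics gathered above; the geometric mean uses the scipy import.
    std = (M2 / (num_documents - 1)) ** 0.5 if num_documents > 1 else 0.0
    print('Documents: {}'.format(num_documents))
    print('Mean document length: {:.2f} (std: {:.2f})'.format(mean, std))
    print('Geometric mean length: {:.2f}'.format(
        scipy.stats.mstats.gmean(lengths)))
    print('Min/max document length: {:.0f}/{:.0f}'.format(min_, max_))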
Example No. 4
    def test_with(self):
        with pyndri.open(self.index_path) as index:
            self.assertTrue(isinstance(index, pyndri.Index))
Example No. 5
import argparse
import os

import pyndri

# Helper functions used below (get_docs_train, get_docs_test, load_queries,
# get_top_ngraph, get_top_ngraphs, docids_from_index, escape) come from the
# original module.


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-pairs')
    parser.add_argument('-run')
    parser.add_argument('-queries')
    parser.add_argument('-index')
    parser.add_argument('-o')

    args = parser.parse_args()

    if args.pairs:
        docs_train = get_docs_train(args.pairs)

    if args.run:
        docs_test = get_docs_test(args.run)

    queries = load_queries(args.queries)

    if not os.path.exists(args.o):
        os.makedirs(args.o)

    max_ngraph_len = 5

    with pyndri.open(args.index) as index:
        token2id, id2token, id2df = index.get_dictionary()
        top_ngraphs = get_top_ngraph(index)
        with open(args.o + "/top_ngraphs.txt", "w") as f:
            for ngraph in top_ngraphs:
                f.write("{} {}\n".format(top_ngraphs[ngraph],
                                         ngraph.encode("utf-8")))
        with open(args.o + "/ngraphs.txt", "w") as f:
            for term_id in id2token:
                term = id2token[term_id]
                ngraphs = get_top_ngraphs(top_ngraphs, max_ngraph_len, term)
                f.write("{} {}\n".format(str(term_id), " ".join(ngraphs)))

        print("Got top ngraphs.")

        def get_query_obj(qid):
            query = queries[qid]
            query_terms = [
                term for term in pyndri.tokenize(escape(query.lower()))
                if term in token2id
            ]
            ids = [str(token2id[term]) for term in query_terms]
            query_obj = " ".join(ids)
            return query_obj

        def get_doc_obj(docids, docno):
            docno, doc = index.document(docids[docno])
            doc = [str(w) for w in doc if w > 0][:1000]
            doc_obj = " ".join(doc)
            return doc_obj

        if args.pairs:

            with open(args.o + "/train_data.txt", "w") as f:

                print("Processing train...")

                docids = docids_from_index(index, docs_train)

                for line in open(args.pairs):
                    qid, docs = line.strip().split(" ", 1)
                    if qid not in queries:
                        continue
                    docs = docs.split()
                    query_obj = get_query_obj(qid)
                    f.write("{}\t{}\t{}\n".format(
                        qid, query_obj,
                        "\t".join([get_doc_obj(docids, d) for d in docs])))

        if args.run:

            with open(args.o + "/test_data.txt", "w") as f:
                docids = docids_from_index(index, docs_test)

                print("Processing test...")

                for line in open(args.run):
                    qid, _, doc, rank, score, label = line.strip().split()
                    query_obj = get_query_obj(qid)
                    doc_obj = get_doc_obj(docids, doc)
                    f.write("{}\t{}\t{}\t{}\n".format(qid, query_obj, doc,
                                                      doc_obj))
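
Example No. 5 also assumes a docids_from_index helper from the original module. As an illustration, a hypothetical version that maps external document numbers to internal pyndri ids, using only the pyndri calls already seen above:

def docids_from_index(index, docnos):
    # Map external document numbers (as used in the pairs/run files) to the
    # internal document ids expected by index.document().
    wanted = set(docnos)
    mapping = {}
    for int_doc_id in range(index.document_base(), index.maximum_document()):
        ext_doc_id, _ = index.document(int_doc_id)
        if ext_doc_id in wanted:
            mapping[ext_doc_id] = int_doc_id
    return mapping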
Example No. 6

import os

import pyndri
from lxml import etree

# path_events, path_queries and path_index are assumed to be defined earlier
# in the original script.
features = {}

events = os.listdir(path_events)

tree = etree.parse(path_queries)
for q in tree.xpath("/parameters/query"):
	tobedone = False
	for element in q.iter():
		# print("%s - %s" % (element.tag, element.text))
		# If this query is one of the queries to be tested, Indri must be run.
		if (element.tag == "number") and (element.text in events):
			tobedone = True
			query = element.text
		if (element.tag == "text") and tobedone:
			print("----------------" + element.text)
			with pyndri.open(path_index+query) as index:
				features = {}
				num_documents = 0
				for document_id in range(index.document_base(), index.maximum_document()):
					num_documents += 1
					ext_document_id, _ = index.document(document_id)
					features[ext_document_id] = {}
					features[ext_document_id]["lm"] = 0
					features[ext_document_id]["tfidf"] = 0
					features[ext_document_id]["bm25"] = 0


				# Constructs a QueryEnvironment that uses a
				# language model with Dirichlet smoothing.
				lm_query_env = pyndri.QueryEnvironment(index, rules=('method:dirichlet,mu:5000',))
				#results = lm_query_env.query(element.text, results_requested=num_documents, include_snippets=True)
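				# Hedged continuation (not in the original snippet): run the
				# language-model query and store the score per document. This
				# assumes query results are (internal_docid, score) pairs, as
				# with index.query in the examples above.
				results = lm_query_env.query(element.text, results_requested=num_documents)
				for int_document_id, score in results:
					ext_document_id, _ = index.document(int_document_id)
					features[ext_document_id]["lm"] = score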