Example #1
    def setUp(self):
        super(IndriTest, self).setUp()

        self.test_dir = tempfile.mkdtemp()

        with open(os.path.join(self.test_dir,
                               'corpus.trectext'),
                  'w', encoding='latin1') as f:
            f.write(self.CORPUS)

        with open(os.path.join(self.test_dir,
                               'IndriBuildIndex.conf'), 'w') as f:
            f.write(self.INDRI_CONFIG)

        with open(os.devnull, "w") as f:
            ret = subprocess.call(['IndriBuildIndex', 'IndriBuildIndex.conf'],
                                  stdout=f,
                                  cwd=self.test_dir)

        self.assertEqual(ret, 0)

        self.index_path = os.path.join(self.test_dir, 'index')
        self.assertTrue(os.path.exists(self.index_path))

        self.index = pyndri.Index(self.index_path)
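
    # The CORPUS and INDRI_CONFIG class attributes used above are not shown in
    # this snippet. A minimal sketch of what they might contain, assuming a
    # standard TRECTEXT corpus and the usual IndriBuildIndex XML parameter file
    # (the document id and text below are illustrative only):
    CORPUS = (
        '<DOC>\n'
        '<DOCNO>doc1</DOCNO>\n'
        '<TEXT>hello world lorem ipsum</TEXT>\n'
        '</DOC>\n'
    )

    INDRI_CONFIG = (
        '<parameters>\n'
        '  <corpus>\n'
        '    <path>corpus.trectext</path>\n'
        '    <class>trectext</class>\n'
        '  </corpus>\n'
        '  <index>index</index>\n'
        '</parameters>\n'
    )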
Example #2
def test_run_queries():
    with mock.patch('pyndri.Index') as mock_index:
        with mock.patch('pyndri.QueryEnvironment') as mock_qenv:
            mock_index.return_value = MockIndex()
            mock_qenv.return_value = MockQueryEnv()
            index = pyndri.Index('/index/path')

            token2id, id2token, id2df = index.get_dictionary()
            total_terms = index.total_terms()
            id2tf = index.get_term_frequencies()

            queries = list_from_xml('retrievable/tests/test_queries.yaml',
                                    token2id, id2tf, total_terms)

            (num, text, qv, cp) = queries[0]
            assert num == '51'
            assert text == 'airbus subsidies'
            assert qv == {6146: 1, 3313: 1}
            assert cp == {6146: 1086 / 76148180, 3313: 2608 / 76148180}

            output = run_queries('/index/path', 'retrievable.scorers.api',
                                 'ScorerDirichlet', {'mu': 1000}, queries[0])

            res = output.result()

            assert len(res) == 2
Example #3
def main(args=None):

    parser = argparse.ArgumentParser(
            description='Create term timeseries index')
    parser.add_argument('-i', '--index', dest='index', help='Input index path')
    parser.add_argument('-o', '--output', dest='output', help='Output path')
    parser.add_argument("-v", "--verbose", help='Verbose logging',
                        action="store_true")

    args = parser.parse_args(args)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    index = pyndri.Index(args.index)

    logging.info('Get dictionary')
    token2id, id2token, id2df = index.get_dictionary()

    doc_ids = range(index.document_base(), index.maximum_document())

    logging.info('Building index')
    ts = {}
    for doc_id in tqdm(doc_ids):

        epoch = int(index.field(doc_id, 'epoch'))
        date = datetime.fromtimestamp(epoch).date()

        docno, token_ids = index.document(doc_id)

        for token_id in token_ids:
            if token_id > 0 and id2df[token_id] > 1000:
                if date not in ts:
                    ts[date] = {}

                if token_id not in ts[date]:
                    ts[date][token_id] = 0

                ts[date][token_id] += 1

    logging.info('Creating dataframe')
    t0 = time.time()
    df = pd.DataFrame.from_dict(ts, orient='index', dtype=int)
    t1 = time.time()
    logging.debug("time: %s" % (t1 - t0))

    logging.info('Serializing dataframe')
    t0 = time.time()
    df.to_csv(args.output, compression="gzip")
    t1 = time.time()
    logging.debug("time: %s" % (t1 - t0))
Example #4
def indri_doc_extractor(path):
    import pyndri
    index = pyndri.Index(path)
    id2token = index.get_dictionary()[1]

    def wrapped(docid):
        doc_id_tuples = index.document_ids([docid])
        if not doc_id_tuples:
            return None  # not found
        int_docid = doc_id_tuples[0][1]
        _, doc_toks = index.document(int_docid)
        return ' '.join(id2token[tok] for tok in doc_toks if tok != 0)

    return wrapped
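
# A hypothetical usage sketch; the index path and external document id below
# are placeholders, not values taken from the snippet above:
get_text = indri_doc_extractor('/path/to/indri/index')
doc_text = get_text('clueweb09-en0000-23-00000')
if doc_text is None:
    print('document not found')
else:
    print(doc_text[:80])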
Example #5
    def __init__(self, env: str = 'default', verbose: bool = False, avg_len=False):
        if verbose:
            helpers.log(f'Loading index {INDRI_INDEX_DIR} with {env} query environment.')
        start = datetime.now()

        self.index = pyndri.Index(f'{INDRI_INDEX_DIR}')
        self.token2id, self.id2token, self.id2df = self.index.get_dictionary()
        self.id2tf = self.index.get_term_frequencies()

        if avg_len:
            # Average document length, computed over every document in the index:
            doc_lengths = np.empty(self.index.document_count(), dtype=np.float64)
            for (idx, doc_iid) in enumerate(range(self.index.document_base(), self.index.maximum_document())):
                doc_lengths[idx] = self.index.document_length(doc_iid)
            self.avg_doc_len = float(doc_lengths.mean())
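            # Note: the loop above visits every document, so this is the exact
            # collection average rather than a sampled estimate. Assuming
            # total_terms() counts all indexed term occurrences (as it is used
            # in other snippets on this page), an equivalent shortcut would be:
            #     self.avg_doc_len = self.index.total_terms() / self.index.document_count()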

        self.tokenizer = Tokenizer()

        if os.path.isfile(TITLE2WID):
            with open(TITLE2WID, 'rb') as file:
                self.title2wid = pickle.load(file)

        if os.path.isfile(WID2TITLE):
            with open(WID2TITLE, 'rb') as file:
                self.wid2title = pickle.load(file)
        try:
            if os.path.isfile(WID2INT):
                with open(WID2INT, 'rb') as file:
                    self.wid2int = pickle.load(file)

            if os.path.isfile(INT2WID):
                with open(INT2WID, 'rb') as file:
                    self.int2wid = pickle.load(file)
        except FileNotFoundError:
            helpers.log('ID mappings do not exist yet. Not loaded.')

        if env == 'default':
            self.env = pyndri.QueryEnvironment(self.index)
        elif env == 'tfidf':
            self.env = pyndri.TFIDFQueryEnvironment(self.index, k1=1.2, b=0.75)
        elif env == 'prf':
            env = pyndri.QueryEnvironment(self.index)
            self.env = pyndri.PRFQueryEnvironment(env, fb_docs=10, fb_terms=10)
        else:
            raise ValueError(f'Unknown environment configuration {env}')

        stop = datetime.now()
        if verbose:
            helpers.log(f'Loaded index in {stop - start}.')
Example #6
def run_queries(index_path, scorer_module, scorer_class, params, queries=()):
    """
    Parsl app instantiates a scorer, sets the parameters,
    runs the query, returns the result
    """

    module = importlib.import_module(scorer_module)
    class_ = getattr(module, scorer_class)
    scorer_instance = class_()

    # set parameter

    # open index. Assumes access to index_path
    index = pyndri.Index(index_path)
    term_count = index.total_terms()

    # initial retrieval
    try:
        rule = 'method:dirichlet,mu:%s' % params['mu']
        query_env = pyndri.QueryEnvironment(index, rules=(rule,))
        hits = query_env.query(queries[1], results_requested=1000)
        # hits = index.query(queries[1], rules=(rule,), results_requested=1000)

        results = []
        for doc_id, score in hits:
            docno, tokens = index.document(doc_id)
            doc_vector = Counter(tokens)
            doc_len = float(index.document_length(doc_id))

            new_score = scorer_instance.score(query_vector=queries[2],
                                              document_vector=doc_vector,
                                              doc_length=doc_len,
                                              term_count=term_count,
                                              col_prob=queries[3],
                                              params=params)

            # TODO: rescore
            results.append((queries[0], docno, new_score))
    finally:
        index.close()

    return results
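
# A hypothetical call, mirroring the (query_id, query_text, query_vector,
# collection_prob) tuples used in the test snippet earlier on this page; the
# index path is a placeholder:
query = ('51', 'airbus subsidies',
         {6146: 1, 3313: 1},
         {6146: 1086 / 76148180, 3313: 2608 / 76148180})
run_results = run_queries('/path/to/index', 'retrievable.scorers.api',
                          'ScorerDirichlet', {'mu': 1000}, query)
for topic, docno, score in run_results:
    print(topic, docno, score)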
Example #7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--loglevel', type=str, default='INFO')

    parser.add_argument('--index',
                        type=argparse_utils.existing_directory_path,
                        required=True)
    parser.add_argument('--model',
                        type=argparse_utils.existing_file_path,
                        required=True)
    parser.add_argument('--vocabulary_list',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    args.index = pyndri.Index(args.index)

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    logging.info('Loading dictionary.')
    dictionary = pyndri.extract_dictionary(args.index)

    logging.info('Loading model.')
    model_base, epoch_and_ext = args.model.rsplit('_', 1)
    epoch = int(epoch_and_ext.split('.')[0])

    if not os.path.exists('{}_meta'.format(model_base)):
        model_meta_base, batch_idx = model_base.rsplit('_', 1)
    else:
        model_meta_base = model_base

    model = nvsm.load_model(nvsm.load_meta(model_meta_base), model_base, epoch)

    with open(args.vocabulary_list, 'w') as f_vocabulary_list:
        for index_term_id in model.term_mapping:
            f_vocabulary_list.write(dictionary[index_term_id])
            f_vocabulary_list.write('\n')
Example #8
def main():
    options = argparse.ArgumentParser()
    options.add_argument('pseudo_queries')
    options.add_argument('expansion_index')
    options.add_argument('stoplist')
    args = options.parse_args()

    pseudo_queries = collections.defaultdict(collections.Counter)
    with open(args.pseudo_queries) as f:
        for line in f:
            docno, term, weight = line.strip().split(',')
            pseudo_queries[docno][term] = float(weight)

    stopper = Stopper(file=args.stoplist)

    index = IndexWrapper(pyndri.Index(args.expansion_index))
    for docno in pseudo_queries:
        query = Query(docno, vector=pseudo_queries[docno])

        top_results = index.query(query, count=10)

        rm1 = build_rm1(top_results, index, stopper=stopper)

        # Features
        rm1_clarity = clarity(rm1.vector, index)
        weighted_ig = wig(query, index, top_results=top_results)
        normalized_qc = nqc(query, index, top_results=top_results)
        average_idf = avg_idf(query.vector.keys(), index)
        simple_clarity = scs(query, index)
        average_scq = statistics.mean(scqs(query, index))

        print(query.title,
              rm1_clarity,
              weighted_ig,
              normalized_qc,
              average_idf,
              simple_clarity,
              average_scq,
              sep=',')
Example #9
def get_index():
    index = getattr(g, 'index', None)

    if index is None:
        logging.info('Loading index.')

        index_path = os.environ.get('INDEX_PATH', None)
        assert index_path is not None and os.path.isdir(index_path)

        index = pyndri.Index(index_path)
        g.index = index

        logging.info('Opened index %s.', index)

    dictionary = getattr(g, 'dictionary', None)

    if dictionary is None:
        logging.info('Extracting dictionary.')

        dictionary = pyndri.extract_dictionary(index)
        g.dictionary = dictionary

    return index, dictionary
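
# A minimal sketch of how get_index() might be wired into a view, assuming `g`
# above is Flask's application-context global and INDEX_PATH is set in the
# environment; the route and result format here are illustrative only:
from flask import Flask

app = Flask(__name__)


@app.route('/search/<query>')
def search(query):
    index, dictionary = get_index()
    hits = index.query(query, results_requested=10)
    # Map internal document ids to external ids for the response.
    return {index.document(int_doc_id)[0]: score for int_doc_id, score in hits}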
Example #10
File: indri.py  Project: zuacubd/macaw
    def __init__(self, params):
        """
		The Indri retrieval model. Indri is an open-source search engine implemented as part of the Lemur project by
		UMass Amherst and CMU. Refer to http://lemurproject.org/indri.php for more information on the Lemur toolkit.
		The retrieval model used here is based on the language modeling framework and retrieves documents using the
		query likelihood retrieval model [Ponte & Croft; SIGIR 1998] with Dirichlet prior smoothing [Zhai & Lafferty;
		SIGIR 2001]. It is implemented using Pyndri [Van Gysel et al.; ECIR 2017], a Python interface to Indri.

		Args:
			params(dict): A dict containing some parameters. Here is the list of all required parameters:
			'indri_path': The path to the installed Indri toolkit.
			'index': The path to the Indri index constructed from the collection.
			'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1.
			'text_format': The text format for document collection (e.g., 'trectext').
			Note that the parameters 'query_generation' and 'logger' are required by the parent class.
		"""
        super().__init__(params)
        self.results_requested = self.params.get('results_requested', 1)
        self.indri_path = self.params['indri_path']
        self.index = pyndri.Index(self.params['index'])
        self.term2id, self.id2term, self.id2df = self.index.get_dictionary()
        self.id2tf = self.index.get_term_frequencies()
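
    def retrieve(self, query_text, mu=1000):
        # A sketch of how a retrieval call could look for this class, reusing
        # the 'method:dirichlet,mu:...' rule syntax from the run_queries
        # snippet on this page; the method name and default mu are
        # illustrative, not part of the original code.
        query_env = pyndri.QueryEnvironment(
            self.index, rules=('method:dirichlet,mu:{}'.format(mu),))
        hits = query_env.query(query_text,
                               results_requested=self.results_requested)
        # Map internal document ids back to external document ids.
        return [(self.index.document(int_doc_id)[0], score)
                for int_doc_id, score in hits]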
Example #11
# tests of functions and programs
from os import listdir
from os.path import isfile, join
from re import sub
import ast
import numpy
import pyndri

collection = 'C:/Users/Thiziri/Desktop/govExt'
index = pyndri.Index(collection)

for document_id in range(index.document_base(), index.maximum_document()):
    print(index.document(document_id))

# Queries the index with 'hello world' and returns the first 1000 results.
results = index.query('hello world', results_requested=1000)

for int_document_id, score in results:
    ext_document_id, _ = index.document(int_document_id)
    print(ext_document_id, score)

token2id, id2token, id2df = index.get_dictionary()

id2tf = index.get_term_frequencies()
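
# The token ids returned by index.document() can be mapped back to strings with
# id2token; id 0 marks stopped/out-of-vocabulary positions (as in the document
# extractor snippet earlier on this page):
ext_document_id, token_ids = index.document(index.document_base())
print(' '.join(id2token[token_id] for token_id in token_ids if token_id != 0))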
Example #12
import pyndri
import sys

if len(sys.argv) <= 1:
    print('Usage: python {0} <path-to-indri-index>'.format(sys.argv[0]))

    sys.exit(0)

index = pyndri.Index(sys.argv[1])

for document_id in range(index.document_base(), index.maximum_document()):
    # Prints pairs of form (external_document_id, terms).
    #
    # Example:
    #   ('eUK950521', (877, 2171, 797, 877, 2171, 2771, 1768, 1262, 2171))
    #   ('eUK436208', (381, 3346))
    print(index.document(document_id))

# The following line will raise an exception, as there is no document
# with internal identifier 0.
print(index.document(0))
Example #13
        ])
        for relation in relations
    }

    uniq_documents = set()
    for rel in relations:
        uniq_documents.add(rel[1])

    # extracted queries in .txt files
    queries = get_queries(config["queries"])
    # print(queries)

    queries_length = {q: len(queries[q].split()) for q in queries}

    out = config["output"]  # output folder
    index = pyndri.Index(config["index"])  # documents index

    print("Reading data index ...")
    externalDocId = {}
    documents_length = {}
    for doc_id in range(index.document_base(),
                        index.maximum_document()):  # type: int
        extD_id, content = index.document(doc_id)
        if extD_id in uniq_documents:
            externalDocId[extD_id] = doc_id
            documents_length[extD_id] = len(content)

    for fold in listdir(config["split_data"]):
        print(fold + "########################")
        train = [
            l.strip()
Example #14
                  metrics=config_model_train["metrics"])
    print(model.summary())
    plot_model(model,
               to_file=join(config_model_train["train_details"],
                            config_model_param['model_name'] + ".png"))
    # save model and resume

    print("Reading training data:")
    print("[First]:\nRead label files to relations...")
    relations, relation_labeler = read_lablers_to_relations(
        config_data["labels"])

    print("[Second]:\nSet relations as train instances...")

    print("Reading data index ...")
    index = pyndri.Index(config_data["index"])
    token2id, _, _ = index.get_dictionary()
    externalDocId = {}
    for doc_id in range(index.document_base(),
                        index.maximum_document()):  # type: int
        extD_id, _ = index.document(doc_id)
        externalDocId[extD_id] = doc_id
    train_queries = get_queries(config_data["train_queries"])

    print("x_train preparation...")
    # the model needs list of 3 input arrays :
    v_q_words = []
    v_d_words = []
    v_rel_labels = []

    # print(train_queries)
Example #15
def main():
    options = argparse.ArgumentParser()
    options.add_argument('pseudo_queries')
    options.add_argument('queries')
    options.add_argument('qrels')
    options.add_argument('stoplist')
    options.add_argument('--index')
    args = options.parse_args()

    if args.index:
        index = IndexWrapper(pyndri.Index(args.index))
        scorer = DirichletTermScorer(index)

    qrels = Qrels(file=args.qrels)

    stopper = Stopper(file=args.stoplist)

    judged = collections.defaultdict(set)
    with open(args.qrels) as f:
        for line in f:
            query, _, doc, _ = line.split()
            judged[doc].add(query)

    pq = collections.defaultdict(dict)
    with open(args.pseudo_queries) as f:
        for line in f:
            doc, term, weight = line.strip().split(',')
            pq[doc][term] = float(weight)

    q = collections.defaultdict(set)
    with open(args.queries) as f:
        for line in f:
            query, term = line.strip().split(',')
            q[query].add(term)

    def normalize_results_scores(results):
        total = sum([score for _, score in results])
        return [(doc, score / total) for doc, score in results]

    col_names = 'doc,query,pq_q_recall,pq_q_ap,q_weight_perc'
    if args.index:
        col_names += ',pq_q_results_jacc,pq_q_results_cosine,pq_results_ap,q_results_ap,pq_results_prec,q_results_prec'
    print(col_names)
    for doc in pq:
        pq_query = Query(doc, vector=collections.Counter(pq[doc]))
        if args.index:
            pq_results = index.query(pq_query, 10)
            pq_results_set = set([r.docno for r, _ in pq_results])
        for associated_query in judged[doc]:
            q_query = Query(associated_query,
                            vector=stopper.stop(
                                collections.Counter(q[associated_query])))
            if args.index:
                q_results = index.query(q_query, 10)
                q_results_set = set([r.docno for r, _ in q_results])
                results_jacc = jaccard_similarity(pq_results_set,
                                                  q_results_set)

                pq_results_ap = average_precision(
                    associated_query, [r.docno for r, _ in pq_results], qrels)
                q_results_ap = average_precision(
                    associated_query, [r.docno for r, _ in q_results], qrels)
                pq_results_prec = precision(pq_results_set,
                                            qrels.rel_docs(associated_query))
                q_results_prec = precision(q_results_set,
                                           qrels.rel_docs(associated_query))

                pq_vocab = build_vocab(
                    *[r.document_vector() for r, _ in pq_results])
                q_vocab = build_vocab(
                    *[r.document_vector() for r, _ in q_results])

                pq_pseudo_doc = {
                    term: sum([
                        exp_score * scorer.score(term, exp_doc) for exp_doc,
                        exp_score in normalize_results_scores(pq_results)
                    ])
                    for term in pq_vocab
                }
                q_pseudo_doc = {
                    term: sum([
                        exp_score * scorer.score(term, exp_doc) for exp_doc,
                        exp_score in normalize_results_scores(q_results)
                    ])
                    for term in q_vocab
                }
                cosine = cosine_similarity(pq_pseudo_doc, q_pseudo_doc)

            q_qrels = Qrels()
            q_qrels._qrels[associated_query] = q_query.vector

            pseudo_ap = average_precision(
                associated_query,
                sorted(pq[doc].keys(), key=lambda k: pq[doc][k], reverse=True),
                q_qrels)
            pq_q_recall = recall(set(pq[doc].keys()), q[associated_query])
            q_weight_perc = sum([pq[doc][term] if term in pq[doc] else 0.0 for term in q[associated_query]]) / \
                            sum([pq[doc][term] for term in pq[doc]])

            output = [
                doc, associated_query,
                str(pq_q_recall),
                str(pseudo_ap),
                str(q_weight_perc)
            ]
            if args.index:
                output += [
                    str(results_jacc),
                    str(cosine),
                    str(pq_results_ap),
                    str(q_results_ap),
                    str(pq_results_prec),
                    str(q_results_prec)
                ]
            print(','.join(output))
	    """)

    config_file = sys.argv[1]
    config = json.load(open(config_file))
    print(json.dumps(config, indent=2))
    configuration = config["word2vec_config"].copy()
    print("Word2Vec will be trained with the following configuration:")
    print(json.dumps(configuration, indent=2))
    stopWordsList = set(
        stopwords.words('english')) if not bool(config["stop_file"]) else set(
            [line.strip() for line in open(config["stop_file"]).readlines()])

    text_in = ""
    if bool(config['index']):
        print("Index reading ...")
        index = pyndri.Index(config["index"])
        _, id2token, _ = index.get_dictionary()
        documents = [
            document_id for document_id in range(index.document_base(),
                                                 index.maximum_document())
        ]

        text_in = os.path.join(config["out"], "Sentences.txt")
        intxt = open(
            text_in, "w"
        )  #construct a file of text lines, each line is a document content as one sentence
        for id_d in documents:
            _, terms = index.document(id_d)
            txt_line = ""
            if config["stopping"]:
                txt_line = " ".join([
Example #17
import sys
import collections
import pickle
import pyndri
import gensim
import numpy as np

from collections import defaultdict
from math import log, exp
from pprint import pprint

from gensim import corpora, similarities
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel

from scipy.stats import entropy as kl_divergence


index = pyndri.Index('index/')
token2id, id2token, _ = index.get_dictionary()


def parse_topics(file_or_files, max_topics=sys.maxsize, delimiter=';'):
    assert max_topics >= 0 or max_topics is None

    topics = collections.OrderedDict()

    if not isinstance(file_or_files, list) and \
            not isinstance(file_or_files, tuple):
        if hasattr(file_or_files, '__iter__'):
            file_or_files = list(file_or_files)
        else:
            file_or_files = [file_or_files]
Example #18
return: dict
"""


def get_queries(query_file):
    with open(query_file, "r") as f:
        return {l.strip().split("\t")[0]: l.strip().split("\t")[1] for l in f}
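
# get_queries expects one tab-separated "query_id<TAB>query text" pair per
# line; a tiny illustrative example (file name and contents are hypothetical):
#
#     with open("queries.tsv", "w") as f:
#         f.write("51\tairbus subsidies\n52\tsouth african sanctions\n")
#     get_queries("queries.tsv")
#     # -> {"51": "airbus subsidies", "52": "south african sanctions"}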


if __name__ == "__main__":
    print("[First]:\nRead label files to relations...")
    relations, _ = read_lablers_to_relations(
        sys.argv[1])  # relation .label file
    queries = get_queries(sys.argv[2])  # extracted queries
    out = sys.argv[3]  # output folder
    index = pyndri.Index(sys.argv[4])  # index
    print("Reading data index ...")
    token2id, _, _ = index.get_dictionary()
    print(len(token2id))
    externalDocId = {}
    for doc_id in range(index.document_base(),
                        index.maximum_document()):  # type: int
        extD_id, _ = index.document(doc_id)
        externalDocId[extD_id] = doc_id
    q_max_len, d_max_len = int(sys.argv[5]), int(
        sys.argv[6])  # query and document max length respectively

    relations_list = list(relations)
    queries_list = list(queries.keys())
    reader = ContentReader(relations_list,
                           token2id,
Example #19
if __name__ == '__main__':
    X_cols = [
        'TF-IDF', 'LDA', 'LSI', 'dp_mu_500', 'GLM_top1000docs_sigma50_mu1000',
        'doc_len', 'query_len'
    ]
    y_cols = ['relevance_label']

    # Load training data
    print("Loading training data... ", end='')
    try:
        training_data = load_pickle('../pickles/LTR_DF_Training.pkl')
    except FileNotFoundError:
        tfidf_data = dict(
            load_pickle('../pickles/prepro_doc_col_q50_top1000_tfidf.pkl'))
        index = pyndri.Index('../index/')

        models_files = ['TF-IDF', 'LDA', 'LSI', 'dp_mu_500', 'GLM']
        training_rel_file = '../ap_88_89/qrel_test'

        data_loader = LTR_Process_Data.TrainingDataLoader(
            ranked_data=tfidf_data,
            index=index,
            models=models_files,
            rel_file=training_rel_file,
            doc_len=Helper.document_lengths,
            int_to_ext_dict=Helper.int_to_ext_dict,
            ext_to_int_dict=Helper.ext_to_int_dict,
            queries=Helper.tokenized_queries)
        training_data = data_loader.data
Example #20
    algo = ""
    if bool(args["--p"]):
        algo = "porter"
    else:
        algo = "krovetz"

    print(
        "Please wait while we are collecting the {k} neighbors of each word ...  \n"
        .format(k=int(args["--n"])))

    #print("\nWord2vec loading ...")
    #model=Word2Vec.load_word2vec_format(args["<embedding_model>"], binary=bool(args["--binary"]))
    #print("\nOK")
    #filtering with the collection vocabulary
    print("Cleaning word embeddings ...")
    word2vec_intersect_dataset(pyndri.Index(args["--index_dataset"]),
                               args["<embedding_model>"],
                               args["<outputfolder>"], bool(args["--binary"]),
                               args["--dataset"], algo)
    #open the new word2vec
    model = Word2Vec.load_word2vec_format(join(
        args["<outputfolder>"], "word2vec_of_" + args["--dataset"]),
                                          binary=False)
    #model=Word2Vec.load_word2vec_format(args["<embedding_model>"], binary=False)
    print("Word embeddings OK.")

    # Processing of the input text before neighbors finding
    prog = re.compile(r"[_\-\(]*([A-Z]\.)*[_\-\(]*")

    for t in toProcess:
        terms = toProcess[t].split()  #stem(algo,toProcess[t]).split()
Example #21
def main():
    options = argparse.ArgumentParser()
    options.add_argument('topic_terms')
    options.add_argument('queries')
    options.add_argument('qrels')
    options.add_argument('index')
    options.add_argument('stoplist')
    options.add_argument('--skip-retrieval', action='store_true')
    args = options.parse_args()

    index = IndexWrapper(pyndri.Index(args.index))
    stopper = Stopper(file=args.stoplist)

    topic_terms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    with open(args.topic_terms) as f:
        for line in f:
            user, docno, _, term = line.strip().split(',')
            topic_terms[docno][user].append(term)

    queries = read_queries(args.queries, format=args.queries.split('.')[-1])
    qrels = Qrels(file=args.qrels)

    for query in queries:
        judged_docs = qrels.judged_docs(query.title)
        judged_with_tt = judged_docs & set(topic_terms.keys())

        if judged_with_tt and not args.skip_retrieval:
            query_results = index.query(query, count=10)
            query_results_docs = [r[0].docno for r in query_results]

        for docno in judged_with_tt:
            for user in topic_terms[docno]:
                tt_set = set(topic_terms[docno][user]) - stopper.stopwords
                qt_set = set(query.vector.keys()) - stopper.stopwords

                tt_query_jaccard = jaccard_similarity(tt_set, qt_set)
                tt_query_recall = recall(tt_set, qt_set)
                results_jaccard = -1
                results_recall = -1

                if not args.skip_retrieval:
                    tt_query = Query(docno,
                                     vector=collections.Counter(
                                         topic_terms[docno][user]))
                    tt_results = index.query(tt_query, count=10)
                    tt_results_docs = [r[0].docno for r in tt_results]

                    results_jaccard = jaccard_similarity(
                        set(tt_results_docs), set(query_results_docs))
                    results_recall = recall(set(tt_results_docs),
                                            set(query_results_docs))

                print(user,
                      docno,
                      query.title,
                      qrels.relevance_of(docno, query.title),
                      tt_query_jaccard,
                      tt_query_recall,
                      results_jaccard,
                      results_recall,
                      sep=',')
Example #22
    args = docopt.docopt("""
        Usage:
            embed_idf.py --i=<indexed_data> --d=<word_dict_file> --o=<output_folder>

        Example:
            embed_idf.py --i=/home/thiziri/Documents/DOCTORAT/COLLECTION/Indri_index/AP88  

        Options:
            --i=<indexed_data>    Gives the INDRI index of the collection.
            --d=<word_dict_file>    Gives the word_dict.txt file generated by MatchZoo. 
            --o=<output_folder>    Gives the output folder where constructed embed.idf file will be stored.

        """)

    print("Reading index ...")
    index = pyndri.Index(args["--i"])
    token2id, id2token, id2df = index.get_dictionary()
    id2tf = index.get_term_frequencies()

    print("Reading word_dict_file ...")
    w_dict = read_word_dict(args["--d"])

    out = open(join(args["--o"], "embed.idf"), "w")

    for w in tqdm(w_dict):
        try:
            # idf = log((index.maximum_document()-1-id2df[token2id[w_dict[w]]]+0.0)/id2df[token2id[w_dict[w]]]+0.0)
            idf = log(
                (index.maximum_document() - 1) / id2df[token2id[w_dict[w]]])
        except KeyError:  # term missing from the index vocabulary
            idf = 0.0
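        # For illustration with made-up numbers: if maximum_document() == 100001
        # and the term's document frequency is 1000, then
        # idf = log(100000 / 1000) = log(100) ≈ 4.61 (natural log). Terms absent
        # from token2id or id2df raise KeyError and fall through to idf = 0.0.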
Example #23
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('model')

    parser.add_argument('index', type=argparse_utils.existing_directory_path)

    parser.add_argument('--limit',
                        type=argparse_utils.positive_int,
                        default=None)

    parser.add_argument('--object_classification',
                        type=argparse_utils.existing_file_path,
                        nargs='+',
                        default=None)

    parser.add_argument('--filter_unclassified',
                        action='store_true',
                        default=False)

    parser.add_argument('--l2_normalize',
                        action='store_true',
                        default=False)

    parser.add_argument('--mode',
                        choices=('tsne', 'embedding_projector'),
                        default='tsne')

    parser.add_argument('--legend',
                        action='store_true',
                        default=False)

    parser.add_argument('--tick_labels',
                        action='store_true',
                        default=False)

    parser.add_argument('--edges',
                        action='store_true',
                        default=False)

    parser.add_argument('--border',
                        action='store_true',
                        default=False)

    parser.add_argument('--plot_out',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    # Set matplotlib style.
    plt.style.use('bmh')

    logging.info('Loading index.')
    index = pyndri.Index(args.index)

    logging.info('Loading cuNVSM model.')
    model_base, epoch_and_ext = args.model.rsplit('_', 1)
    epoch = int(epoch_and_ext.split('.')[0])

    if not os.path.exists('{}_meta'.format(model_base)):
        model_meta_base, batch_idx = model_base.rsplit('_', 1)
    else:
        model_meta_base = model_base

    model = nvsm.load_model(
        nvsm.load_meta(model_meta_base),
        model_base, epoch,
        only_object_embeddings=True)

    raw_object_representations = np.copy(model.object_representations)

    if args.limit:
        raw_object_representations = raw_object_representations[:args.limit, :]

    for object_classification in args.object_classification:
        root, ext = os.path.splitext(args.plot_out)

        plot_out = '{}-{}.{}'.format(
            root, os.path.basename(object_classification), ext.lstrip('.'))

        if object_classification and args.filter_unclassified:
            logging.info('Filtering unclassified.')

            with open(object_classification, 'r') as f_objects:
                object_ids = [line.strip().split()[0] for line in f_objects]
                indices = sorted(model.inv_object_mapping[idx]
                                 for _, idx in index.document_ids(object_ids)
                                 if idx in model.inv_object_mapping)

                logging.info('Considering %d out of %d representations.',
                             len(indices), len(object_ids))

                translation_table = {idx: i for i, idx in enumerate(indices)}

                object_representations = raw_object_representations[indices]

                assert object_representations.shape[0] == \
                    len(translation_table)
        else:
            translation_table = None

            raise NotImplementedError()

        logging.info('Loading object clusters.')

        cluster_id_to_product_ids = {}

        if object_classification:
            with open(object_classification, 'r') as f_objects:
                for line in f_objects:
                    object_id, cluster_id = line.strip().split()

                    if cluster_id not in cluster_id_to_product_ids:
                        cluster_id_to_product_ids[cluster_id] = set()

                    cluster_id_to_product_ids[cluster_id].add(object_id)

                for cluster_id in list(cluster_id_to_product_ids.keys()):
                    object_ids = list(cluster_id_to_product_ids[cluster_id])

                    cluster_id_to_product_ids[cluster_id] = set(
                        (model.inv_object_mapping[int_object_id]
                            if translation_table is None
                            else translation_table[
                                model.inv_object_mapping[int_object_id]])
                        for ext_object_id, int_object_id in
                        index.document_ids(object_ids)
                        if int_object_id in model.inv_object_mapping and
                        (args.limit is None or
                         (model.inv_object_mapping[int_object_id] <
                             args.limit)))
        else:
            raise NotImplementedError()

        assert len(cluster_id_to_product_ids) < len(MARKERS)

        if args.l2_normalize:
            logging.info('L2-normalizing representations.')

            object_representations /= np.linalg.norm(
                object_representations,
                axis=1, keepdims=True)

        if args.mode == 'tsne':
            logging.info('Running t-SNE.')

            twodim_object_representations = \
                TSNE(n_components=2, init='pca', random_state=0).\
                fit_transform(object_representations)

            logging.info('Plotting %s.', twodim_object_representations.shape)

            colors = cm.rainbow(
                np.linspace(0, 1, len(cluster_id_to_product_ids)))

            for idx, cluster_id in enumerate(
                    sorted(cluster_id_to_product_ids.keys(),
                           key=lambda cluster_id: len(
                               cluster_id_to_product_ids[cluster_id]),
                           reverse=True)):
                row_ids = list(cluster_id_to_product_ids[cluster_id])

                plt.scatter(
                    twodim_object_representations[row_ids, 0],
                    twodim_object_representations[row_ids, 1],
                    marker=MARKERS[idx],
                    edgecolors='grey' if args.edges else None,
                    cmap=plt.cm.Spectral,
                    color=colors[idx],
                    alpha=0.3,
                    label=pylatex.utils.escape_latex(cluster_id))

            plt.grid()

            plt.tight_layout()

            if args.legend:
                plt.legend(bbox_to_anchor=(0, -0.15, 1, 0),
                           loc=2,
                           ncol=2,
                           mode='expand',
                           borderaxespad=0)

            if not args.tick_labels:
                plt.gca().get_xaxis().set_visible(False)
                plt.gca().get_yaxis().set_visible(False)

            if not args.border:
                # plt.gcf().patch.set_visible(False)
                plt.gca().axis('off')

            logging.info('Writing %s.', plot_out)

            plt.savefig(plot_out,
                        bbox_inches='tight',
                        transparent=True,
                        pad_inches=0,
                        dpi=200)
        elif args.mode == 'embedding_projector':
            logging.info('Dumping to TensorFlow embedding projector format.')

            with open('{}_vectors.tsv'.format(plot_out), 'w') as f_vectors, \
                    open('{}_meta.tsv'.format(plot_out), 'w') as f_meta:
                f_meta.write('document_id\tclass\n')

                def write_rowids(row_ids, cluster_id):
                    for row_id in row_ids:
                        f_vectors.write(
                            '{}\n'.format('\t'.join(
                                '{:.5f}'.format(x)
                                for x in object_representations[row_id])))

                        f_meta.write('{}\t{}\n'.format(
                            index.ext_document_id(
                                model.object_mapping[row_id]),
                            cluster_id))

                for cluster_id in cluster_id_to_product_ids.keys():
                    row_ids = list(cluster_id_to_product_ids[cluster_id])
                    write_rowids(row_ids, cluster_id)

    logging.info('All done!')
Example #24
def main(args=None):
    """
    Given a config file defining a set of collections and scoreres,
    generate run a parallel Parl workflow to generate run and
    eval output.
    """

    parser = argparse.ArgumentParser(description='Query runner.')
    parser.add_argument('-c', '--config-file', dest='config_file',
                        default='config/scorers.yaml')
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')

    args = parser.parse_args(args)

    overwrite = False

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    cfg = Config()
    cfg.read_config(args.config_file)

    run_prefix = cfg.get_run_prefix()
    output_dir = cfg.get_output_dir()
    eval_dir = cfg.get_eval_dir()

    for col in cfg.get_collections():
        logging.info("Processing collection %s" % col['name'])

        qrels_path = col['qrels']

        index_path = "%s/%s" % (cfg.get_index_root(), col['index'])
        index = pyndri.Index(index_path)
        token2id, id2token, id2df = index.get_dictionary()
        total_terms = index.total_terms()
        id2tf = index.get_term_frequencies()

        for query_file in col['queries']:
            logging.info("Processing query_file %s" % query_file)

            # read the queries as a list
            # TODO: need queries as feature vector
            queries = list_from_xml(col['queries'][query_file], token2id,
                                    id2tf, total_terms)

            for scorer in cfg.get_scorers():

                params_list, params_str_list = cfg.get_param_combinations(
                                                    scorer['name'])

                for idx, params in enumerate(params_list):

                    param_str = params_str_list[idx]

                    # Create output file
                    results_file = "{}/{}/{}/{}.out".format(
                            output_dir, col['name'],
                            scorer['name'], param_str)
                    eval_file = "{}/{}/{}/{}.eval".format(
                            eval_dir, col['name'],
                            scorer['name'], param_str)

                    # skip if exists
                    if not overwrite and os.path.exists(results_file):
                        logging.info("Found existing output file, skipping")
                        continue

                    results_dir = os.path.dirname(results_file)
                    if not os.path.exists(results_dir):
                        os.makedirs(results_dir)

                    trec_eval_dir = os.path.dirname(eval_file)
                    if not os.path.exists(trec_eval_dir):
                        os.makedirs(trec_eval_dir)

                    # For each col (index + topics + qrels), scorer, paramset
                    r = []
                    for query in queries:
                        r.append(run_queries(index_path, scorer['module'],
                                             scorer['class'], params, query))

                    outputs = [x.result() for x in r]

                    with open(results_file, 'w') as f:
                        for output in outputs:
                            for idx, res in enumerate(output):
                                row = "{} Q0 {} {} {} {}\n".format(
                                        res[0], res[1], idx+1, res[2],
                                        run_prefix)
                                f.write(row)

                    trec_eval('all_trec', qrels_path, results_file,
                              eval_file)
Example #25
    return list_of_series


t = time.time()

# read query validation set
filename = "data/validation_set/query_validation_set.txt"
base_filename, file_extension = os.path.splitext(filename)
output = f'{base_filename}.csv'
with open(filename, "r") as query_file:
    lines = query_file.readlines()

# index of corpus
index = pyndri.Index('Vol45/Vol45-index')

# define bm25 query environment
bm25_query_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)

# retrieve documents and bm25 score
df = pd.DataFrame()
for i in range(len(lines)):
    query = lines[i].rstrip()
    list_of_series = getDocuments(index, bm25_query_env, query)
    df = pd.concat([df, pd.DataFrame(list_of_series)])

df.columns = ['topic', 'query', 'document_name', 'document_score']

# uncomment if you want to write queries and documents to csv
#df.to_csv(output, index=False, chunksize=1000)
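
# A single BM25 retrieval against this environment, assuming
# OkapiQueryEnvironment exposes the same query() interface as the
# QueryEnvironment used elsewhere on this page; the query text is illustrative:
hits = bm25_query_env.query('hello world', results_requested=10)
for int_document_id, score in hits:
    ext_document_id, _ = index.document(int_document_id)
    print(ext_document_id, score)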
Example #26
def create_graph_from_sentences(sentences, path_to_index):
    index = pyndri.Index(path_to_index)
    token2id, id2token, id2df = index.get_dictionary()
Example #27
from tools4text import extractTopics, clean, get_qrels, save_corpus, get_docs_from_run, run2relations
from tools4text import rank_to_relevance, path_leaf, remove_extension, extract_trec_million_queries

logging.basicConfig(filename='collect2MZinpuText.log', level=logging.DEBUG)

if __name__ == '__main__':
    config_file = sys.argv[1]
    config = json.load(open(config_file))
    logging.info('Config: ' + json.dumps(config, indent=2))

    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    print("Reading index ...")
    index = pyndri.Index(config["indexed_data"])
    _, id2token, _ = index.get_dictionary()
    externalDocId = {}
    for doc in range(index.document_base(), index.maximum_document()):
        extD, _ = index.document(doc)
        externalDocId[extD] = doc
    print("Extract queries ...")
    queries = {}
    if config["train_queries"] == config["test_queries"]:
        queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec"\
            else extract_trec_million_queries(config["train_queries"])
    else:
        train_queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
            else extract_trec_million_queries(config["train_queries"])
        test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
            else extract_trec_million_queries(config["test_queries"])
Example #28
import pyndri
import params
import pickle
import sys

print("uploading index")

index = pyndri.Index(params.path_to_index)
dic = {}
for document_id in range(index.document_base(), index.maximum_document()):
    if document_id % 1000000 == 0:
        print("in document", document_id)
        sys.stdout.flush()
    docno, _ = index.document(document_id)
    if "ROUND-04" in docno:
        dic[docno] = document_id
print("loading index finished")
f = open("dic4.pickle", "wb")
pickle.dump(dic, f)
f.close()
if not dic:
    print("empty dictionary")
Example #29
def create_index_resources(index_path="index/"):
    index = pyndri.Index(index_path)
    token2id, id2token, id2df = index.get_dictionary()
    dictionary = pyndri.extract_dictionary(index)
    document_ids = list(range(index.document_base(), index.maximum_document()))
    return index, token2id, id2token, id2df, dictionary, document_ids
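
# A hypothetical call; "index/" must point at an existing Indri index:
index, token2id, id2token, id2df, dictionary, document_ids = \
    create_index_resources("index/")
print(len(document_ids), 'documents,', len(token2id), 'unique terms')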
Example #30
from os.path import join
import os.path
from gensim.models import Word2Vec
import numpy
import docopt
import pyndri

if __name__ == "__main__":
	print("\n----BEGIN----\n")
	args = docopt.docopt("""
	    Usage:
	        get_idf_vocab_dataset.py  <outputfolder> [--dataset=<val2>] [--index_dataset=<val3>] [--b=<val4>] 
	    
	    Options:
	        --dataset=<val2>	Specify the collection name that corresponds to the topics you are processing.
	        --index_dataset=<val3>	Provides the index of your dataset, used to filter the word-embedding vocabulary while computing the neighbors.
	        --b=<val4>	The b value used to compute the alpha parameter of the NWT model while computing the neighbors [default: 2].
	        
	    """)

	index = pyndri.Index(args["--index_dataset"])
	token2id, _, id2df = index.get_dictionary()
	file = open(join(args["<outputfolder>"], args["--dataset"]) + "wv.idf.txt", "w")
	b = 2  # args["--b"]

	for word in token2id:
		alpha = (index.maximum_document() - id2df[token2id[word]] + 0.5) / (id2df[token2id[word]] + 0.5) + float(b)
		file.write(word + "\t" + str(alpha) + "\n")
	file.close()
	print("Finished.")