def setUp(self):
    super(IndriTest, self).setUp()

    self.test_dir = tempfile.mkdtemp()

    with open(os.path.join(self.test_dir, 'corpus.trectext'), 'w',
              encoding='latin1') as f:
        f.write(self.CORPUS)

    with open(os.path.join(self.test_dir, 'IndriBuildIndex.conf'), 'w') as f:
        f.write(self.INDRI_CONFIG)

    with open(os.devnull, "w") as f:
        ret = subprocess.call(['IndriBuildIndex', 'IndriBuildIndex.conf'],
                              stdout=f,
                              cwd=self.test_dir)
    self.assertEqual(ret, 0)

    self.index_path = os.path.join(self.test_dir, 'index')
    self.assertTrue(os.path.exists(self.index_path))

    self.index = pyndri.Index(self.index_path)
def test_run_queries():
    with mock.patch('pyndri.Index') as mock_index:
        with mock.patch('pyndri.QueryEnvironment') as mock_qenv:
            mock_index.return_value = MockIndex()
            mock_qenv.return_value = MockQueryEnv()

            index = pyndri.Index('/index/path')
            token2id, id2token, id2df = index.get_dictionary()
            total_terms = index.total_terms()
            id2tf = index.get_term_frequencies()

            queries = list_from_xml('retrievable/tests/test_queries.yaml',
                                    token2id, id2tf, total_terms)

            (num, text, qv, cp) = queries[0]
            assert num == '51'
            assert text == 'airbus subsidies'
            assert qv == {6146: 1, 3313: 1}
            assert cp == {6146: 1086 / 76148180, 3313: 2608 / 76148180}

            output = run_queries('/index/path', 'retrievable.scorers.api',
                                 'ScorerDirichlet', {'mu': 1000}, queries[0])
            res = output.result()
            assert len(res) == 2
def main(args=None):
    parser = argparse.ArgumentParser(
        description='Create term timeseries index')
    parser.add_argument('-i', '--index', dest='index', help='Input index path')
    parser.add_argument('-o', '--output', dest='output', help='Output path')
    parser.add_argument("-v", "--verbose", help='Verbose logging',
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    index = pyndri.Index(args.index)

    logging.info('Get dictionary')
    token2id, id2token, id2df = index.get_dictionary()
    doc_ids = range(index.document_base(), index.maximum_document())

    logging.info('Building index')
    ts = {}
    for doc_id in tqdm(doc_ids):
        epoch = int(index.field(doc_id, 'epoch'))
        date = datetime.fromtimestamp(epoch).date()
        docno, token_ids = index.document(doc_id)
        for token_id in token_ids:
            if token_id > 0 and id2df[token_id] > 1000:
                if date not in ts:
                    ts[date] = {}
                if token_id not in ts[date]:
                    ts[date][token_id] = 0
                ts[date][token_id] += 1

    logging.info('Creating dataframe')
    t0 = time.time()
    df = pd.DataFrame.from_dict(ts, orient='index', dtype=int)
    t1 = time.time()
    logging.debug("time: %s" % (t1 - t0))

    logging.info('Serializing dataframe')
    t0 = time.time()
    df.to_csv(args.output, compression="gzip")
    t1 = time.time()
    logging.debug("time: %s" % (t1 - t0))
def indri_doc_extractor(path):
    import pyndri
    index = pyndri.Index(path)
    id2token = index.get_dictionary()[1]

    def wrapped(docid):
        doc_id_tuples = index.document_ids([docid])
        if not doc_id_tuples:
            return None  # not found
        int_docid = doc_id_tuples[0][1]
        _, doc_toks = index.document(int_docid)
        return ' '.join(id2token[tok] for tok in doc_toks if tok != 0)

    return wrapped
def __init__(self, env: str = 'default', verbose: bool = False, avg_len=False):
    if verbose:
        helpers.log(f'Loading index {INDRI_INDEX_DIR} with {env} query environment.')
    start = datetime.now()

    self.index = pyndri.Index(f'{INDRI_INDEX_DIR}')
    self.token2id, self.id2token, self.id2df = self.index.get_dictionary()
    self.id2tf = self.index.get_term_frequencies()

    if avg_len:
        # Monte Carlo Estimation for document length:
        doc_lengths = np.empty(self.index.document_count(), dtype=np.float)
        for (idx, doc_iid) in enumerate(range(self.index.document_base(),
                                              self.index.maximum_document())):
            doc_lengths[idx] = self.index.document_length(doc_iid)
        self.avg_doc_len = float(doc_lengths.mean())

    self.tokenizer = Tokenizer()

    if os.path.isfile(TITLE2WID):
        with open(TITLE2WID, 'rb') as file:
            self.title2wid = pickle.load(file)

    if os.path.isfile(WID2TITLE):
        with open(WID2TITLE, 'rb') as file:
            self.wid2title = pickle.load(file)

    try:
        if os.path.isfile(WID2INT):
            with open(WID2INT, 'rb') as file:
                self.wid2int = pickle.load(file)

        if os.path.isfile(INT2WID):
            with open(INT2WID, 'rb') as file:
                self.int2wid = pickle.load(file)
    except FileNotFoundError:
        helpers.log('ID mappings do not exist yet. Not loaded.')

    if env == 'default':
        self.env = pyndri.QueryEnvironment(self.index)
    elif env == 'tfidf':
        self.env = pyndri.TFIDFQueryEnvironment(self.index, k1=1.2, b=0.75)
    elif env == 'prf':
        env = pyndri.QueryEnvironment(self.index)
        self.env = pyndri.PRFQueryEnvironment(env, fb_docs=10, fb_terms=10)
    else:
        raise ValueError(f'Unknown environment configuration {env}')

    stop = datetime.now()
    if verbose:
        helpers.log(f'Loaded index in {stop - start}.')
def run_queries(index_path, scorer_module, scorer_class, params, queries=[]):
    """
    Parsl app: instantiates a scorer, sets the parameters, runs the query,
    and returns the result.
    """
    module = importlib.import_module(scorer_module)
    class_ = getattr(module, scorer_class)
    scorer_instance = class_()  # set parameters

    # Open the index. Assumes access to index_path.
    index = pyndri.Index(index_path)
    term_count = index.total_terms()

    # Initial retrieval.
    try:
        rule = 'method:dirichlet,mu:%s' % params['mu']
        query_env = pyndri.QueryEnvironment(index, rules=(rule,))
        hits = query_env.query(queries[1], results_requested=1000)
        # hits = index.query(queries[1], rules=(rule,), results_requested=1000)

        results = []
        for doc_id, score in hits:
            docno, tokens = index.document(doc_id)
            doc_vector = Counter(tokens)
            doc_len = float(index.document_length(doc_id))
            new_score = scorer_instance.score(query_vector=queries[2],
                                              document_vector=doc_vector,
                                              doc_length=doc_len,
                                              term_count=term_count,
                                              col_prob=queries[3],
                                              params=params)  # TODO: rescore
            results.append((queries[0], docno, new_score))
    finally:
        index.close()

    return results
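# Illustrative call only (not part of the snippet above): the query tuple
# layout (query id, query text, query term-frequency vector, collection
# probabilities) mirrors the test and workflow snippets elsewhere in this
# collection; the index path, scorer module/class names, and numbers are
# placeholders. If run_queries is decorated as a Parsl app, the call returns
# a future and the result list is obtained via .result().
example_query = ('51', 'airbus subsidies',
                 {6146: 1, 3313: 1},
                 {6146: 1086 / 76148180, 3313: 2608 / 76148180})
output = run_queries('/index/path', 'retrievable.scorers.api',
                     'ScorerDirichlet', {'mu': 1000}, example_query)
for query_id, docno, score in output.result():
    print(query_id, docno, score)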
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--loglevel', type=str, default='INFO')

    parser.add_argument('--index',
                        type=argparse_utils.existing_directory_path,
                        required=True)
    parser.add_argument('--model',
                        type=argparse_utils.existing_file_path,
                        required=True)

    parser.add_argument('--vocabulary_list',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    args.index = pyndri.Index(args.index)

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    logging.info('Loading dictionary.')
    dictionary = pyndri.extract_dictionary(args.index)

    logging.info('Loading model.')
    model_base, epoch_and_ext = args.model.rsplit('_', 1)
    epoch = int(epoch_and_ext.split('.')[0])

    if not os.path.exists('{}_meta'.format(model_base)):
        model_meta_base, batch_idx = model_base.rsplit('_', 1)
    else:
        model_meta_base = model_base

    model = nvsm.load_model(nvsm.load_meta(model_meta_base),
                            model_base, epoch)

    with open(args.vocabulary_list, 'w') as f_vocabulary_list:
        for index_term_id in model.term_mapping:
            f_vocabulary_list.write(dictionary[index_term_id])
            f_vocabulary_list.write('\n')
def main():
    options = argparse.ArgumentParser()
    options.add_argument('pseudo_queries')
    options.add_argument('expansion_index')
    options.add_argument('stoplist')
    args = options.parse_args()

    pseudo_queries = collections.defaultdict(collections.Counter)
    with open(args.pseudo_queries) as f:
        for line in f:
            docno, term, weight = line.strip().split(',')
            pseudo_queries[docno][term] = float(weight)

    stopper = Stopper(file=args.stoplist)
    index = IndexWrapper(pyndri.Index(args.expansion_index))

    for docno in pseudo_queries:
        query = Query(docno, vector=pseudo_queries[docno])
        top_results = index.query(query, count=10)

        rm1 = build_rm1(top_results, index, stopper=stopper)

        # Features
        rm1_clarity = clarity(rm1.vector, index)
        weighted_ig = wig(query, index, top_results=top_results)
        normalized_qc = nqc(query, index, top_results=top_results)
        average_idf = avg_idf(query.vector.keys(), index)
        simple_clarity = scs(query, index)
        average_scq = statistics.mean(scqs(query, index))

        print(query.title, rm1_clarity, weighted_ig, normalized_qc,
              average_idf, simple_clarity, average_scq, sep=',')
def get_index():
    index = getattr(g, 'index', None)

    if index is None:
        logging.info('Loading index.')

        index_path = os.environ.get('INDEX_PATH', None)
        assert index_path is not None and os.path.isdir(index_path)

        index = pyndri.Index(index_path)
        g.index = index

        logging.info('Opened index %s.', index)

    dictionary = getattr(g, 'dictionary', None)

    if dictionary is None:
        logging.info('Extracting dictionary.')

        dictionary = pyndri.extract_dictionary(index)
        g.dictionary = dictionary

    return index, dictionary
def __init__(self, params):
    """
    The Indri retrieval model. Indri is an open-source search engine
    implemented as part of the Lemur project by UMass Amherst and CMU;
    refer to http://lemurproject.org/indri.php for more information on the
    Lemur toolkit. The retrieval model used here is based on the language
    modeling framework and retrieves documents using the query likelihood
    retrieval model [Ponte & Croft; SIGIR 1998] with Dirichlet prior
    smoothing [Zhai & Lafferty; SIGIR 2001]. It is implemented using Pyndri
    [Van Gysel et al.; ECIR 2017], a Python interface to Indri.

    Args:
        params(dict): A dict containing some parameters. Here is the list of
            all required parameters:
            'indri_path': The path to the installed Indri toolkit.
            'index': The path to the Indri index constructed from the
                collection.
            'results_requested': The maximum number of requested documents
                for retrieval. If not given, it is set to 1.
            'text_format': The text format of the document collection
                (e.g., 'trectext').
            Note that the parameters 'query_generation' and 'logger' are
            required by the parent class.
    """
    super().__init__(params)
    self.results_requested = self.params['results_requested'] \
        if 'results_requested' in self.params else 1
    self.indri_path = self.params['indri_path']
    self.index = pyndri.Index(self.params['index'])
    self.term2id, self.id2term, self.id2df = self.index.get_dictionary()
    self.id2tf = self.index.get_term_frequencies()
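# A minimal standalone sketch (not part of the class above) of the retrieval
# step its docstring describes: query-likelihood scoring with Dirichlet prior
# smoothing via pyndri's QueryEnvironment, following the same
# 'method:dirichlet,mu:...' rule pattern used in run_queries earlier in this
# collection. The index path, mu value, and query text are placeholders.
index = pyndri.Index('/path/to/indri/index')
query_env = pyndri.QueryEnvironment(index, rules=('method:dirichlet,mu:1000',))
for int_doc_id, score in query_env.query('international organized crime',
                                         results_requested=10):
    ext_doc_id, _ = index.document(int_doc_id)
    print(ext_doc_id, score)  # external document id and retrieval score
index.close()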
# Tests of functions and programs
from os import listdir
from os.path import isfile, join
from re import sub
import ast
import numpy
import pyndri

collection = 'C:/Users/Thiziri/Desktop/govExt'
index = pyndri.Index(collection)

for document_id in range(index.document_base(), index.maximum_document()):
    print(index.document(document_id))

# Queries the index with 'hello world' and returns the first 1000 results.
results = index.query('hello world', results_requested=1000)

for int_document_id, score in results:
    ext_document_id, _ = index.document(int_document_id)
    print(ext_document_id, score)

token2id, id2token, id2df = index.get_dictionary()
id2tf = index.get_term_frequencies()
import pyndri
import sys

if len(sys.argv) <= 1:
    print('Usage: python {0} <path-to-indri-index>'.format(sys.argv[0]))
    sys.exit(0)

index = pyndri.Index(sys.argv[1])

for document_id in range(index.document_base(), index.maximum_document()):
    # Prints pairs of form (external_document_id, terms).
    #
    # Example:
    # ('eUK950521', (877, 2171, 797, 877, 2171, 2771, 1768, 1262, 2171))
    # ('eUK436208', (381, 3346))
    print(index.document(document_id))

# The following line will raise an exception, as there is no document
# with internal identifier 0.
print(index.document(0))
    ])
    for relation in relations
}

uniq_documents = set()
for rel in relations:
    uniq_documents.add(rel[1])

# extracted queries in .txt files
queries = get_queries(config["queries"])
# print(queries)
queries_length = {q: len(queries[q].split()) for q in queries}
out = config["output"]  # output folder
index = pyndri.Index(config["index"])  # documents index

print("Reading data index ...")
externalDocId = {}
documents_length = {}
for doc_id in range(index.document_base(), index.maximum_document()):  # type: int
    extD_id, content = index.document(doc_id)
    if extD_id in uniq_documents:
        externalDocId[extD_id] = doc_id
        documents_length[extD_id] = len(content)

for fold in listdir(config["split_data"]):
    print(fold + "########################")
    train = [
        l.strip()
              metrics=config_model_train["metrics"])
print(model.summary())
plot_model(model, to_file=join(config_model_train["train_details"],
                               config_model_param['model_name'] + ".png"))  # save model and resume

print("Reading training data:")
print("[First]:\nRead label files to relations...")
relations, relation_labeler = read_lablers_to_relations(config_data["labels"])

print("[Second]:\nSet relations as train instances...")
print("Reading data index ...")
index = pyndri.Index(config_data["index"])
token2id, _, _ = index.get_dictionary()
externalDocId = {}
for doc_id in range(index.document_base(), index.maximum_document()):  # type: int
    extD_id, _ = index.document(doc_id)
    externalDocId[extD_id] = doc_id

train_queries = get_queries(config_data["train_queries"])

print("x_train preparation...")
# the model needs a list of 3 input arrays:
v_q_words = []
v_d_words = []
v_rel_labels = []
# print(train_queries)
def main():
    options = argparse.ArgumentParser()
    options.add_argument('pseudo_queries')
    options.add_argument('queries')
    options.add_argument('qrels')
    options.add_argument('stoplist')
    options.add_argument('--index')
    args = options.parse_args()

    if args.index:
        index = IndexWrapper(pyndri.Index(args.index))
        scorer = DirichletTermScorer(index)

    qrels = Qrels(file=args.qrels)
    stopper = Stopper(file=args.stoplist)

    judged = collections.defaultdict(set)
    with open(args.qrels) as f:
        for line in f:
            query, _, doc, _ = line.split()
            judged[doc].add(query)

    pq = collections.defaultdict(dict)
    with open(args.pseudo_queries) as f:
        for line in f:
            doc, term, weight = line.strip().split(',')
            pq[doc][term] = float(weight)

    q = collections.defaultdict(set)
    with open(args.queries) as f:
        for line in f:
            query, term = line.strip().split(',')
            q[query].add(term)

    def normalize_results_scores(results):
        total = sum([score for _, score in results])
        return [(doc, score / total) for doc, score in results]

    col_names = 'doc,query,pq_q_recall,pq_q_ap,q_weight_perc'
    if args.index:
        col_names += ',pq_q_results_jacc,pq_q_results_cosine,pq_results_ap,q_results_ap,pq_results_prec,q_results_prec'
    print(col_names)

    for doc in pq:
        pq_query = Query(doc, vector=collections.Counter(pq[doc]))
        if args.index:
            pq_results = index.query(pq_query, 10)
            pq_results_set = set([r.docno for r, _ in pq_results])

        for associated_query in judged[doc]:
            q_query = Query(associated_query,
                            vector=stopper.stop(collections.Counter(q[associated_query])))

            if args.index:
                q_results = index.query(q_query, 10)
                q_results_set = set([r.docno for r, _ in q_results])

                results_jacc = jaccard_similarity(pq_results_set, q_results_set)

                pq_results_ap = average_precision(
                    associated_query, [r.docno for r, _ in pq_results], qrels)
                q_results_ap = average_precision(
                    associated_query, [r.docno for r, _ in q_results], qrels)

                pq_results_prec = precision(pq_results_set,
                                            qrels.rel_docs(associated_query))
                q_results_prec = precision(q_results_set,
                                           qrels.rel_docs(associated_query))

                pq_vocab = build_vocab(*[r.document_vector() for r, _ in pq_results])
                q_vocab = build_vocab(*[r.document_vector() for r, _ in q_results])

                pq_pseudo_doc = {
                    term: sum([
                        exp_score * scorer.score(term, exp_doc)
                        for exp_doc, exp_score in normalize_results_scores(pq_results)
                    ])
                    for term in pq_vocab
                }
                q_pseudo_doc = {
                    term: sum([
                        exp_score * scorer.score(term, exp_doc)
                        for exp_doc, exp_score in normalize_results_scores(q_results)
                    ])
                    for term in q_vocab
                }

                cosine = cosine_similarity(pq_pseudo_doc, q_pseudo_doc)

            q_qrels = Qrels()
            q_qrels._qrels[associated_query] = q_query.vector

            pseudo_ap = average_precision(
                associated_query,
                sorted(pq[doc].keys(), key=lambda k: pq[doc][k], reverse=True),
                q_qrels)

            pq_q_recall = recall(set(pq[doc].keys()), q[associated_query])
            q_weight_perc = sum([pq[doc][term] if term in pq[doc] else 0.0
                                 for term in q[associated_query]]) / \
                sum([pq[doc][term] for term in pq[doc]])

            output = [
                doc, associated_query,
                str(pq_q_recall),
                str(pseudo_ap),
                str(q_weight_perc)
            ]
            if args.index:
                output += [
                    str(results_jacc),
                    str(cosine),
                    str(pq_results_ap),
                    str(q_results_ap),
                    str(pq_results_prec),
                    str(q_results_prec)
                ]
            print(','.join(output))
""") config_file = sys.argv[1] config = json.load(open(config_file)) print(json.dumps(config, indent=2)) configuration = config["word2vec_config"].copy() print("Word2Vec will be trained with the following configuration:") print(json.dumps(configuration, indent=2)) stopWordsList = set( stopwords.words('english')) if not bool(config["stop_file"]) else set( [line.strip() for line in open(config["stop_file"]).readlines()]) text_in = "" if bool(config['index']): print("Index reading ...") index = pyndri.Index(config["index"]) _, id2token, _ = index.get_dictionary() documents = [ document_id for document_id in range(index.document_base(), index.maximum_document()) ] text_in = os.path.join(config["out"], "Sentences.txt") intxt = open( text_in, "w" ) #construct a file of text lines, each line is a document content as one sentence for id_d in documents: _, terms = index.document(id_d) txt_line = "" if config["stopping"]: txt_line = " ".join([
import pickle
import gensim
import numpy

from collections import defaultdict
from math import log, exp
from pprint import pprint
from gensim import corpora, similarities
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel
from scipy.stats import entropy as kl_divergence

index = pyndri.Index('index/')
token2id, id2token, _ = index.get_dictionary()


def parse_topics(file_or_files, max_topics=sys.maxsize, delimiter=';'):
    assert max_topics >= 0 or max_topics is None

    topics = collections.OrderedDict()

    if not isinstance(file_or_files, list) and \
            not isinstance(file_or_files, tuple):
        if hasattr(file_or_files, '__iter__'):
            file_or_files = list(file_or_files)
        else:
            file_or_files = [file_or_files]
    return: dict
    """


def get_queries(query_file):
    with open(query_file, "r") as f:
        return {l.strip().split("\t")[0]: l.strip().split("\t")[1] for l in f}


if __name__ == "__main__":
    print("[First]:\nRead label files to relations...")
    relations, _ = read_lablers_to_relations(sys.argv[1])  # relation .label file
    queries = get_queries(sys.argv[2])  # extracted queries
    out = sys.argv[3]  # output folder
    index = pyndri.Index(sys.argv[4])  # index

    print("Reading data index ...")
    token2id, _, _ = index.get_dictionary()
    print(len(token2id))
    externalDocId = {}
    for doc_id in range(index.document_base(), index.maximum_document()):  # type: int
        extD_id, _ = index.document(doc_id)
        externalDocId[extD_id] = doc_id

    q_max_len, d_max_len = int(sys.argv[5]), int(sys.argv[6])  # query and document max length respectively

    relations_list = list(relations)
    queries_list = list(queries.keys())
    reader = ContentReader(relations_list, token2id,
if __name__ == '__main__':
    X_cols = [
        'TF-IDF', 'LDA', 'LSI', 'dp_mu_500',
        'GLM_top1000docs_sigma50_mu1000', 'doc_len', 'query_len'
    ]
    y_cols = ['relevance_label']

    # Load training data
    print("Loading training data... ", end='')
    try:
        training_data = load_pickle('../pickles/LTR_DF_Training.pkl')
    except FileNotFoundError:
        tfidf_data = dict(
            load_pickle('../pickles/prepro_doc_col_q50_top1000_tfidf.pkl'))
        index = pyndri.Index('../index/')
        models_files = ['TF-IDF', 'LDA', 'LSI', 'dp_mu_500', 'GLM']
        training_rel_file = '../ap_88_89/qrel_test'
        data_loader = LTR_Process_Data.TrainingDataLoader(
            ranked_data=tfidf_data,
            index=index,
            models=models_files,
            rel_file=training_rel_file,
            doc_len=Helper.document_lengths,
            int_to_ext_dict=Helper.int_to_ext_dict,
            ext_to_int_dict=Helper.ext_to_int_dict,
            queries=Helper.tokenized_queries)
        training_data = data_loader.data
algo = "" if bool(args["--p"]): algo = "porter" else: algo = "krovetz" print( "Please wait while we are collecting the {k} neighbors of each word ... \n" .format(k=int(args["--n"]))) #print("\nWord2vec loading ...") #model=Word2Vec.load_word2vec_format(args["<embedding_model>"], binary=bool(args["--binary"])) #print("\nOK") #filtering with the collection vocabulary print("Cleaning word embeddings ...") word2vec_intersect_dataset(pyndri.Index(args["--index_dataset"]), args["<embedding_model>"], args["<outputfolder>"], bool(args["--binary"]), args["--dataset"], algo) #open the new word2vec model = Word2Vec.load_word2vec_format(join( args["<outputfolder>"], "word2vec_of_" + args["--dataset"]), binary=False) #model=Word2Vec.load_word2vec_format(args["<embedding_model>"], binary=False) print("Word embeddings OK.") # Processing of the input text before neighbors finding prog = re.compile("[_\-\(]*([A-Z]\.)*[_\-\(]*") for t in toProcess: terms = toProcess[t].split() #stem(algo,toProcess[t]).split()
def main():
    options = argparse.ArgumentParser()
    options.add_argument('topic_terms')
    options.add_argument('queries')
    options.add_argument('qrels')
    options.add_argument('index')
    options.add_argument('stoplist')
    options.add_argument('--skip-retrieval', action='store_true')
    args = options.parse_args()

    index = IndexWrapper(pyndri.Index(args.index))
    stopper = Stopper(file=args.stoplist)

    topic_terms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    with open(args.topic_terms) as f:
        for line in f:
            user, docno, _, term = line.strip().split(',')
            topic_terms[docno][user].append(term)

    queries = read_queries(args.queries, format=args.queries.split('.')[-1])
    qrels = Qrels(file=args.qrels)

    for query in queries:
        judged_docs = qrels.judged_docs(query.title)
        judged_with_tt = judged_docs & set(topic_terms.keys())

        if judged_with_tt and not args.skip_retrieval:
            query_results = index.query(query, count=10)
            query_results_docs = [r[0].docno for r in query_results]

        for docno in judged_with_tt:
            for user in topic_terms[docno]:
                tt_set = set(topic_terms[docno][user]) - stopper.stopwords
                qt_set = set(query.vector.keys()) - stopper.stopwords

                tt_query_jaccard = jaccard_similarity(tt_set, qt_set)
                tt_query_recall = recall(tt_set, qt_set)

                results_jaccard = -1
                results_recall = -1
                if not args.skip_retrieval:
                    tt_query = Query(docno,
                                     vector=collections.Counter(topic_terms[docno][user]))
                    tt_results = index.query(tt_query, count=10)
                    tt_results_docs = [r[0].docno for r in tt_results]

                    results_jaccard = jaccard_similarity(set(tt_results_docs),
                                                         set(query_results_docs))
                    results_recall = recall(set(tt_results_docs),
                                            set(query_results_docs))

                print(user, docno, query.title,
                      qrels.relevance_of(docno, query.title),
                      tt_query_jaccard, tt_query_recall,
                      results_jaccard, results_recall,
                      sep=',')
args = docopt.docopt("""
    Usage:
        embed_idf.py --i=<indexed_data> --d=<word_dict_file> --o=<output_folder>

    Example:
        embed_idf.py --i=/home/thiziri/Documents/DOCTORAT/COLLECTION/Indri_index/AP88

    Options:
        --i=<indexed_data>    Gives the INDRI index of the collection.
        --d=<word_dict_file>  Gives the word_dict.txt file generated by MatchZoo.
        --o=<output_folder>   Gives the output folder where the constructed embed.idf file will be stored.
    """)

print("Reading index ...")
index = pyndri.Index(args["--i"])
token2id, id2token, id2df = index.get_dictionary()
id2tf = index.get_term_frequencies()

print("Reading word_dict_file ...")
w_dict = read_word_dict(args["--d"])

out = open(join(args["--o"], "embed.idf"), "w")
for w in tqdm(w_dict):
    try:
        # idf = log((index.maximum_document()-1-id2df[token2id[w_dict[w]]]+0.0)/id2df[token2id[w_dict[w]]]+0.0)
        idf = log((index.maximum_document() - 1) / id2df[token2id[w_dict[w]]])
    except:
        idf = 0.0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('model')
    parser.add_argument('index', type=argparse_utils.existing_directory_path)

    parser.add_argument('--limit',
                        type=argparse_utils.positive_int, default=None)

    parser.add_argument('--object_classification',
                        type=argparse_utils.existing_file_path,
                        nargs='+', default=None)

    parser.add_argument('--filter_unclassified',
                        action='store_true', default=False)
    parser.add_argument('--l2_normalize',
                        action='store_true', default=False)

    parser.add_argument('--mode',
                        choices=('tsne', 'embedding_projector'),
                        default='tsne')

    parser.add_argument('--legend', action='store_true', default=False)
    parser.add_argument('--tick_labels', action='store_true', default=False)
    parser.add_argument('--edges', action='store_true', default=False)
    parser.add_argument('--border', action='store_true', default=False)

    parser.add_argument('--plot_out',
                        type=argparse_utils.nonexisting_file_path,
                        required=True)

    args = parser.parse_args()

    try:
        logging_utils.configure_logging(args)
    except IOError:
        return -1

    # Set matplotlib style.
    plt.style.use('bmh')

    logging.info('Loading index.')
    index = pyndri.Index(args.index)

    logging.info('Loading cuNVSM model.')
    model_base, epoch_and_ext = args.model.rsplit('_', 1)
    epoch = int(epoch_and_ext.split('.')[0])

    if not os.path.exists('{}_meta'.format(model_base)):
        model_meta_base, batch_idx = model_base.rsplit('_', 1)
    else:
        model_meta_base = model_base

    model = nvsm.load_model(
        nvsm.load_meta(model_meta_base),
        model_base, epoch,
        only_object_embeddings=True)

    raw_object_representations = np.copy(model.object_representations)

    if args.limit:
        raw_object_representations = raw_object_representations[:args.limit, :]

    for object_classification in args.object_classification:
        root, ext = os.path.splitext(args.plot_out)
        plot_out = '{}-{}.{}'.format(
            root, os.path.basename(object_classification), ext.lstrip('.'))

        if object_classification and args.filter_unclassified:
            logging.info('Filtering unclassified.')

            with open(object_classification, 'r') as f_objects:
                object_ids = [line.strip().split()[0] for line in f_objects]

            indices = sorted(model.inv_object_mapping[idx]
                             for _, idx in index.document_ids(object_ids)
                             if idx in model.inv_object_mapping)

            logging.info('Considering %d out of %d representations.',
                         len(indices), len(object_ids))

            translation_table = {idx: i for i, idx in enumerate(indices)}

            object_representations = raw_object_representations[indices]

            assert object_representations.shape[0] == \
                len(translation_table)
        else:
            translation_table = None

            raise NotImplementedError()

        logging.info('Loading object clusters.')

        cluster_id_to_product_ids = {}

        if object_classification:
            with open(object_classification, 'r') as f_objects:
                for line in f_objects:
                    object_id, cluster_id = line.strip().split()

                    if cluster_id not in cluster_id_to_product_ids:
                        cluster_id_to_product_ids[cluster_id] = set()

                    cluster_id_to_product_ids[cluster_id].add(object_id)

            for cluster_id in list(cluster_id_to_product_ids.keys()):
                object_ids = list(cluster_id_to_product_ids[cluster_id])

                cluster_id_to_product_ids[cluster_id] = set(
                    (model.inv_object_mapping[int_object_id]
                     if translation_table is None
                     else translation_table[
                         model.inv_object_mapping[int_object_id]])
                    for ext_object_id, int_object_id in
                    index.document_ids(object_ids)
                    if int_object_id in model.inv_object_mapping and
                    (args.limit is None or
                     (model.inv_object_mapping[int_object_id] < args.limit)))
        else:
            raise NotImplementedError()

        assert len(cluster_id_to_product_ids) < len(MARKERS)

        if args.l2_normalize:
            logging.info('L2-normalizing representations.')

            object_representations /= np.linalg.norm(
                object_representations, axis=1, keepdims=True)

        if args.mode == 'tsne':
            logging.info('Running t-SNE.')

            twodim_object_representations = \
                TSNE(n_components=2, init='pca', random_state=0).\
                fit_transform(object_representations)

            logging.info('Plotting %s.', twodim_object_representations.shape)

            colors = cm.rainbow(
                np.linspace(0, 1, len(cluster_id_to_product_ids)))

            for idx, cluster_id in enumerate(
                    sorted(cluster_id_to_product_ids.keys(),
                           key=lambda cluster_id: len(
                               cluster_id_to_product_ids[cluster_id]),
                           reverse=True)):
                row_ids = list(cluster_id_to_product_ids[cluster_id])

                plt.scatter(
                    twodim_object_representations[row_ids, 0],
                    twodim_object_representations[row_ids, 1],
                    marker=MARKERS[idx],
                    edgecolors='grey' if args.edges else None,
                    cmap=plt.cm.Spectral,
                    color=colors[idx],
                    alpha=0.3,
                    label=pylatex.utils.escape_latex(cluster_id))

            plt.grid()
            plt.tight_layout()

            if args.legend:
                plt.legend(bbox_to_anchor=(0, -0.15, 1, 0),
                           loc=2, ncol=2, mode='expand',
                           borderaxespad=0)

            if not args.tick_labels:
                plt.gca().get_xaxis().set_visible(False)
                plt.gca().get_yaxis().set_visible(False)

            if not args.border:
                # plt.gcf().patch.set_visible(False)
                plt.gca().axis('off')

            logging.info('Writing %s.', plot_out)

            plt.savefig(plot_out,
                        bbox_inches='tight',
                        transparent=True,
                        pad_inches=0,
                        dpi=200)
        elif args.mode == 'embedding_projector':
            logging.info('Dumping to TensorFlow embedding projector format.')

            with open('{}_vectors.tsv'.format(plot_out), 'w') as f_vectors, \
                    open('{}_meta.tsv'.format(plot_out), 'w') as f_meta:
                f_meta.write('document_id\tclass\n')

                def write_rowids(row_ids, cluster_id):
                    for row_id in row_ids:
                        f_vectors.write(
                            '{}\n'.format('\t'.join(
                                '{:.5f}'.format(x)
                                for x in object_representations[row_id])))

                        f_meta.write('{}\t{}\n'.format(
                            index.ext_document_id(
                                model.object_mapping[row_id]),
                            cluster_id))

                for cluster_id in cluster_id_to_product_ids.keys():
                    row_ids = list(cluster_id_to_product_ids[cluster_id])

                    write_rowids(row_ids, cluster_id)

    logging.info('All done!')
def main(args=None):
    """
    Given a config file defining a set of collections and scorers, run a
    parallel Parsl workflow to generate run and eval output.
    """
    parser = argparse.ArgumentParser(description='Query runner.')
    parser.add_argument('-c', '--config-file', dest='config_file',
                        default='config/scorers.yaml')
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true')
    args = parser.parse_args()

    overwrite = False

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    cfg = Config()
    cfg.read_config(args.config_file)
    run_prefix = cfg.get_run_prefix()
    output_dir = cfg.get_output_dir()
    eval_dir = cfg.get_eval_dir()

    for col in cfg.get_collections():
        logging.info("Processing collection %s" % col['name'])
        qrels_path = col['qrels']
        index_path = "%s/%s" % (cfg.get_index_root(), col['index'])

        index = pyndri.Index(index_path)
        token2id, id2token, id2df = index.get_dictionary()
        total_terms = index.total_terms()
        id2tf = index.get_term_frequencies()

        for query_file in col['queries']:
            logging.info("Processing query_file %s" % query_file)

            # Read the queries as a list.
            # TODO: need queries as feature vector
            queries = list_from_xml(col['queries'][query_file], token2id,
                                    id2tf, total_terms)

            for scorer in cfg.get_scorers():
                params_list, params_str_list = cfg.get_param_combinations(
                    scorer['name'])

                for idx, params in enumerate(params_list):
                    param_str = params_str_list[idx]

                    # Create output file paths.
                    results_file = "{}/{}/{}/{}.out".format(
                        output_dir, col['name'], scorer['name'], param_str)
                    eval_file = "{}/{}/{}/{}.eval".format(
                        eval_dir, col['name'], scorer['name'], param_str)

                    # Skip if the output already exists.
                    if not overwrite and os.path.exists(results_file):
                        logging.info("Found existing output file, skipping")
                        pass

                    results_dir = os.path.dirname(results_file)
                    if not os.path.exists(results_dir):
                        os.makedirs(results_dir)

                    trec_eval_dir = os.path.dirname(eval_file)
                    if not os.path.exists(trec_eval_dir):
                        os.makedirs(trec_eval_dir)

                    # For each col (index + topics + qrels), scorer, paramset.
                    r = []
                    for query in queries:
                        r.append(run_queries(index_path, scorer['module'],
                                             scorer['class'], params, query))
                    outputs = [x.result() for x in r]

                    with open(results_file, 'w') as f:
                        for output in outputs:
                            for idx, res in enumerate(output):
                                row = "{} Q0 {} {} {} {}\n".format(
                                    res[0], res[1], idx + 1, res[2], run_prefix)
                                f.write(row)
                        f.close()

                    trec_eval('all_trec', qrels_path, results_file, eval_file)
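# Hypothetical shapes of the objects the loop above consumes, inferred only
# from the keys it reads (col['name'], col['qrels'], col['index'],
# col['queries'], scorer['name'], scorer['module'], scorer['class']); the
# real Config class and config/scorers.yaml layout may differ.
example_collection = {
    'name': 'robust04',                           # placeholder collection name
    'qrels': 'qrels/robust04.qrels',              # TREC qrels file
    'index': 'robust04_indri',                    # joined with cfg.get_index_root()
    'queries': {'title': 'topics/robust04.xml'},  # query_file -> topic file path
}
example_scorer = {
    'name': 'dirichlet',
    'module': 'retrievable.scorers.api',          # importable module path
    'class': 'ScorerDirichlet',                   # class resolved via getattr
}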
    return list_of_series


t = time.time()

# read query validation set
filename = "data/validation_set/query_validation_set.txt"
base_filename, file_extension = os.path.splitext(filename)
output = f'{base_filename}.csv'

input = open(filename, "r")
lines = input.readlines()
input.close()

# index of corpus
index = pyndri.Index('Vol45/Vol45-index')

# define bm25 query environment
bm25_query_env = pyndri.OkapiQueryEnvironment(index, k1=1.2, b=0.75, k3=1000)

# retrieve documents and bm25 score
df = pd.DataFrame()
for i in range(len(lines)):
    query = lines[i].rstrip()
    list_of_series = getDocuments(index, bm25_query_env, query)
    df = pd.concat([df, pd.DataFrame(list_of_series)])

df.columns = ['topic', 'query', 'document_name', 'document_score']

# uncomment if you want to write queries and documents to csv
# df.to_csv(output, index=False, chunksize=1000)
def create_graph_from_sentences(sentences, path_to_index):
    index = pyndri.Index(path_to_index)
    token2id, id2token, id2df = index.get_dictionary()
from tools4text import extractTopics, clean, get_qrels, save_corpus, get_docs_from_run, run2relations
from tools4text import rank_to_relevance, path_leaf, remove_extension, extract_trec_million_queries

logging.basicConfig(filename='collect2MZinpuText.log', level=logging.DEBUG)

if __name__ == '__main__':
    config_file = sys.argv[1]
    config = json.load(open(config_file))
    logging.info('Config: ' + json.dumps(config, indent=2))
    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    print("Reading index ...")
    index = pyndri.Index(config["indexed_data"])
    _, id2token, _ = index.get_dictionary()
    externalDocId = {}
    for doc in range(index.document_base(), index.maximum_document()):
        extD, _ = index.document(doc)
        externalDocId[extD] = doc

    print("Extract queries ...")
    queries = {}
    if config["train_queries"] == config["test_queries"]:
        queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
            else extract_trec_million_queries(config["train_queries"])
    else:
        train_queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
            else extract_trec_million_queries(config["train_queries"])
        test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
            else extract_trec_million_queries(config["test_queries"])
import pyndri
import params
import pickle
import sys

print("uploading index")
index = pyndri.Index(params.path_to_index)

dic = {}
for document_id in range(index.document_base(), index.maximum_document()):
    if document_id % 1000000 == 0:
        print("in document", document_id)
        sys.stdout.flush()
    if index.document(document_id)[0].__contains__("ROUND-04"):
        dic[index.document(document_id)[0]] = document_id

print("loading index finished")

f = open("dic4.pickle", "wb")
pickle.dump(dic, f)
f.close()

if not dic:
    print("empty dictionary")
def create_index_resources(index_path="index/"):
    index = pyndri.Index(index_path)
    token2id, id2token, id2df = index.get_dictionary()
    dictionary = pyndri.extract_dictionary(index)
    document_ids = list(range(index.document_base(), index.maximum_document()))

    return index, token2id, id2token, id2df, dictionary, document_ids
from os.path import join
import os.path

from gensim.models import Word2Vec
import numpy
import docopt
import pyndri

if __name__ == "__main__":
    print("\n----BEGIN----\n")

    args = docopt.docopt("""
        Usage:
            get_idf_vocab_dataset.py <outputfolder> [--dataset=<val2>] [--index_dataset=<val3>] [--b=<val4>]

        Options:
            --dataset=<val2>        The collection name that corresponds to the topics you are processing.
            --index_dataset=<val3>  The index of your dataset, used to filter the word-embedding vocabulary while computing the neighbors.
            --b=<val4>              The b value used to compute the alpha parameter of the NWT model while computing the neighbors [default: 2].
        """)

    index = pyndri.Index(args["--index_dataset"])
    token2id, _, id2df = index.get_dictionary()

    file = open(join(args["<outputfolder>"], args["--dataset"]) + "wv.idf.txt", "w")
    b = 2  # args["--b"]
    for word in token2id:
        alpha = (index.maximum_document() - id2df[token2id[word]] + 0.5) / (id2df[token2id[word]] + 0.5) + float(b)
        file.write(word + "\t" + str(alpha) + "\n")
    file.close()

    print("Finished.")