def __init__(self, qrels_input_path, qrels_output_path, topics_input_path, topics_output_path,
             corpus_input_path, corpus_output_path, include_linked=False, query_sample_size=None):
    self.qrels_input_path = qrels_input_path
    self.qrels_output_path = qrels_output_path
    self.topics_input_path = topics_input_path
    self.topics_output_path = topics_output_path
    self.corpus_input_path = corpus_input_path
    self.corpus_output_path = corpus_output_path
    self.include_linked = include_linked
    self.query_sample_size = query_sample_size

    if os.path.exists(self.corpus_output_path):
        raise ArmyAntException("%s already exists" % self.corpus_output_path)

    qrels_dir = os.path.dirname(self.qrels_output_path)
    if os.path.exists(qrels_dir):
        raise ArmyAntException("%s already exists" % qrels_dir)

    topics_dir = os.path.dirname(self.topics_output_path)
    if os.path.exists(topics_dir):
        raise ArmyAntException("%s already exists" % topics_dir)

    os.makedirs(self.corpus_output_path)
    os.makedirs(qrels_dir)
    os.makedirs(topics_dir)
def __init__(self, task, eval_location):
    super().__init__(task, eval_location)

    self.o_results_path = os.path.join(eval_location, 'results', task._id)
    self.o_assessments_path = os.path.join(eval_location, 'assessments', task._id)

    try:
        os.makedirs(self.o_results_path)
    except FileExistsError:
        raise ArmyAntException("Results directory '%s' already exists" % self.o_results_path)

    try:
        os.makedirs(self.o_assessments_path)
    except FileExistsError:
        raise ArmyAntException("Assessments directory '%s' already exists" % self.o_assessments_path)
async def index(self, features_location=None, pgonly=True):
    if os.path.exists(self.index_location):
        raise ArmyAntException("%s already exists" % self.index_location)

    os.mkdir(self.index_location)

    async for item in super().index(pgonly=pgonly):
        yield item

    conn = psycopg2.connect("dbname='army_ant' user='******' host='localhost'")
    c = conn.cursor()

    logging.info("Creating term nodes CSV file")
    with open(os.path.join(self.index_location, 'term-nodes.csv'), 'w') as f:
        c.copy_expert(
            """COPY (SELECT node_id AS "node_id:ID", attributes->'name'->0->>'value' AS name, """
            """label AS ":LABEL" FROM nodes) TO STDOUT WITH CSV HEADER""", f)

    logging.info("Creating in_window_of edges CSV file")
    with open(os.path.join(self.index_location, 'in_window_of-edges.csv'), 'w') as f:
        c.copy_expert(
            """COPY (SELECT source_node_id AS ":START_ID", attributes->>'doc_id' AS doc_id, """
            """target_node_id AS ":END_ID", label AS ":TYPE" FROM edges) TO STDOUT WITH CSV HEADER""", f)
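# Note (a reading of the column aliases above, not stated elsewhere in this code):
# headers such as "node_id:ID", ":LABEL", ":START_ID", ":END_ID" and ":TYPE" follow
# the Neo4j bulk-import CSV header conventions, so the files written here appear to
# be intended for loading into Neo4j with its bulk importer.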
def __init__(self, db_location, db_name, eval_location):
    self.tasks = []
    self.running = None

    self.eval_location = eval_location
    self.results_dirname = os.path.join(eval_location, 'results')
    self.assessments_dirname = os.path.join(eval_location, 'assessments')
    self.spool_dirname = os.path.join(eval_location, 'spool')

    os.makedirs(self.results_dirname, exist_ok=True)
    os.makedirs(self.assessments_dirname, exist_ok=True)
    os.makedirs(self.spool_dirname, exist_ok=True)

    db_location_parts = db_location.split(':')
    if len(db_location_parts) > 1:
        db_location = db_location_parts[0]
        db_port = int(db_location_parts[1])
    else:
        db_port = 27017

    try:
        self.client = MongoClient(db_location, db_port)
    except ConnectionFailure:
        raise ArmyAntException("Could not connect to MongoDB instance on %s:%s" % (db_location, db_port))

    self.db = self.client[db_name]
    self.db['evaluation_tasks'].create_index('run_id', unique=True)
def __init__(self, source_path):
    super(MongoDBReader, self).__init__(source_path)

    db_location_parts = re.split(r'[:/]', source_path)

    if len(db_location_parts) >= 3:
        db_host = db_location_parts[0]
        db_port = int(db_location_parts[1])
        db_name = db_location_parts[2]
    elif len(db_location_parts) == 2:
        db_host = db_location_parts[0]
        db_port = 27017
        db_name = db_location_parts[1]
    else:
        db_host = 'localhost'
        db_port = 27017
        db_name = db_location_parts[0]

    try:
        self.client = MongoClient(db_host, db_port)
    except ConnectionFailure:
        raise ArmyAntException("Could not connect to MongoDB instance on %s:%s" % (db_host, db_port))

    self.db = self.client[db_name]
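# A minimal sketch (hypothetical values, not from the original codebase) of the three
# source_path formats the host/port/name parsing above accepts: 'host:port/name',
# 'host/name' and a bare database name.
assert re.split(r'[:/]', 'localhost:27017/army_ant') == ['localhost', '27017', 'army_ant']
assert re.split(r'[:/]', 'localhost/army_ant') == ['localhost', 'army_ant']
assert re.split(r'[:/]', 'army_ant') == ['army_ant']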
async def index(self, features_location=None):
    if not features_location:
        raise ArmyAntException("Must provide a features location with topics.txt and qrels.txt files")

    topics = self.load_topics(features_location)
    qrels = self.load_qrels(features_location)

    async for doc in self.lucene_engine.index(features_location=features_location):
        self.process_web_features(doc)
        yield doc

    ltr_helper = TensorFlowRanking.JLearningToRankHelper(self.lucene_index_location)
    ltr_helper.computeDocumentFeatures()

    j_graph_based_features = self.j_build_graph_based_features()
    ltr_helper.updateDocumentFeatures(j_graph_based_features)

    train_set = self.build_train_set(ltr_helper, topics, qrels)
    pd_train_generator = self.pandas_generator(train_set)

    logger.info("Training model")
    hparams = tf.contrib.training.HParams(learning_rate=0.05)
    ranker = self.get_estimator(hparams)
    ranker.train(input_fn=lambda: self.input_fn(pd_train_generator), steps=100)
async def search(self, query, offset, limit, query_type=None, task=None,
                 base_index_location=None, base_index_type=None,
                 ranking_function=None, ranking_params=None, debug=False):
    try:
        self.cluster = await Cluster.open(self.loop, hosts=[self.index_host], port=self.index_port)
    except ClientConnectorError:
        raise ArmyAntException("Could not connect to Gremlin Server on %s:%s" % (self.index_host, self.index_port))

    self.client = await self.cluster.connect()

    query_tokens = GraphOfWord.analyze(query)

    result_set = await self.client.submit(
        ('g = %s.traversal()\n' % self.graph) + load_gremlin_script('graph_of_word_query'), {
            'queryTokens': query_tokens,
            'offset': offset,
            'limit': limit
        })
    results = await result_set.one()

    await self.cluster.close()

    return results
def get_topic_assessments(self):
    topic_doc_judgements = {}

    if not os.path.exists(self.task.assessments_path):
        raise ArmyAntException("Topic assessments file not found: %s" % self.task.assessments_path)

    with open(self.task.assessments_path, 'r') as f:
        for line in f:
            if self.retrieval_task == Index.RetrievalTask.entity_retrieval:
                topic_id, _, id, _, judgement = line.split(' ', 4)
                judgement = int(judgement)
                if judgement == 2:
                    judgement = 0
            else:
                topic_id, _, id, judgement, _ = line.split(' ', 4)
                judgement = int(judgement)
                if judgement > 0:
                    judgement = 1

            if topic_id not in topic_doc_judgements:
                topic_doc_judgements[topic_id] = {}
            topic_doc_judgements[topic_id][id] = judgement

    return topic_doc_judgements
def factory(source_path, source_reader, features_location=None, limit=None):
    import army_ant.reader as rd

    if source_reader == 'wikipedia_data':
        return rd.WikipediaDataReader(source_path)
    elif source_reader == 'inex':
        return rd.INEXReader(source_path, include_dbpedia=False, limit=limit)
    elif source_reader == 'inex_dbpedia':
        return rd.INEXReader(source_path, include_dbpedia=True, limit=limit)
    elif source_reader == 'inex_dir':
        return rd.INEXDirectoryReader(source_path, include_dbpedia=False, limit=limit)
    elif source_reader == 'inex_dir_dbpedia':
        return rd.INEXDirectoryReader(source_path, include_dbpedia=True, limit=limit)
    elif source_reader == 'living_labs':
        return rd.LivingLabsReader(source_path, limit)
    elif source_reader == 'wapo':
        return rd.TRECWashingtonPostReader(source_path, limit=limit)
    elif source_reader == 'wapo_doc_profile':
        return rd.TRECWashingtonPostReader(
            source_path, features_location=features_location, include_ae_doc_profile=True, limit=limit)
    elif source_reader == 'wapo_dbpedia':
        return rd.TRECWashingtonPostReader(source_path, include_dbpedia=True, limit=limit)
    elif source_reader == 'wapo_doc_profile_dbpedia':
        return rd.TRECWashingtonPostReader(
            source_path, features_location=features_location, include_ae_doc_profile=True,
            include_dbpedia=True, limit=limit)
    elif source_reader == 'csv':
        return rd.CSVReader(source_path)
    # elif source_reader == 'gremlin':
    #     return rd.GremlinReader(source_path)
    else:
        raise ArmyAntException("Unsupported source reader %s" % source_reader)
async def search(self, query, offset, limit, query_type=None, task=None,
                 base_index_location=None, base_index_type=None,
                 ranking_function=None, ranking_params=None, debug=False):
    raise ArmyAntException("Search not implemented for %s" % self.__class__.__name__)
def queue(self):
    duplicate_error = False
    run_id_error = False

    inserted_ids = []

    for task in self.tasks:
        run_id_error = task.run_id is None or task.run_id.strip() == ''
        if run_id_error:
            continue

        try:
            task.time = int(round(time.time() * 1000))
            result = self.db['evaluation_tasks'].insert_one(task.__dict__)
            inserted_ids.append(result.inserted_id)
        except DuplicateKeyError:
            duplicate_error = True

    if duplicate_error:
        raise ArmyAntException("The Run ID must be unique.")

    if run_id_error:
        raise ArmyAntException("Tasks without a Run ID are not accepted")

    return inserted_ids
async def index(self, features_location=None):
    if not features_location:
        raise ArmyAntException("Must provide a features location with topics.txt and qrels.txt files")

    async for doc in self.lucene_engine.index(features_location=features_location):
        yield doc

    features_helper = LuceneFeaturesEngine.JFeaturesHelper(self.lucene_index_location)
    j_features = self.j_load_features(features_location)
    features_helper.setDocumentFeatures(j_features)
def kendall_w(pd_dfs):
    pd_dfs = fill_missing(pd_dfs, 'id', rank=FillMethod.INC_MAX, score=FillMethod.ZERO)

    rankings = np.stack([df.sort_values('id')['rank'] for df in pd_dfs], axis=0)

    if rankings.ndim != 2:
        raise ArmyAntException('Rankings matrix must be 2-dimensional')

    m = rankings.shape[0]  # rankers
    n = rankings.shape[1]  # documents

    return (12 * n * np.var(np.sum(rankings, axis=0))) / (m**2 * (n**3 - n))
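# A minimal usage sketch with hypothetical data: two rankers that agree perfectly on
# three documents should give W = 1.0. The 'id', 'rank' and 'score' column names are
# the ones kendall_w itself expects; the sketch assumes the fill_missing/FillMethod
# helpers defined alongside it leave complete rankings such as these unchanged.
import pandas as pd

df_a = pd.DataFrame({'id': ['d1', 'd2', 'd3'], 'rank': [1, 2, 3], 'score': [0.9, 0.5, 0.1]})
df_b = pd.DataFrame({'id': ['d1', 'd2', 'd3'], 'rank': [1, 2, 3], 'score': [0.8, 0.6, 0.2]})

assert abs(kendall_w([df_a, df_b]) - 1.0) < 1e-9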
def get_topic_assessments(self):
    topic_doc_judgements = {}

    if not os.path.exists(self.task.assessments_path):
        raise ArmyAntException("Topic assessments file not found: %s" % self.task.assessments_path)

    with open(self.task.assessments_path, 'r') as f:
        for line in f:
            topic_id, _, id, judgement = line.split(' ')
            if topic_id not in topic_doc_judgements:
                topic_doc_judgements[topic_id] = {}
            topic_doc_judgements[topic_id][id] = int(judgement)

    return topic_doc_judgements
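# A minimal sketch of the space-separated, four-field assessments format assumed by
# the parser above (hypothetical topic and document identifiers):
line = "201 Q0 WAPO-0001 1"
topic_id, _, doc_id, judgement = line.split(' ')
assert (topic_id, doc_id, int(judgement)) == ('201', 'WAPO-0001', 1)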
def __init__(self, source_path, doc_id_suffix=':doc_id', text_suffix=':text'):
    super(CSVReader, self).__init__(source_path)
    self.reader = csv.DictReader(open(source_path, newline=''))
    self.doc_id_suffix = doc_id_suffix
    self.text_suffix = text_suffix

    if not any([fieldname.endswith(self.text_suffix) for fieldname in self.reader.fieldnames]):
        raise ArmyAntException(
            "CSV must have at least one column name with a %s suffix (other supported suffixes include %s)"
            % (self.text_suffix, self.doc_id_suffix))
def __init__(self, db_location, db_name, loop):
    super().__init__(db_location, db_name, loop)

    db_location_parts = db_location.split(':')
    if len(db_location_parts) > 1:
        db_location = db_location_parts[0]
        db_port = int(db_location_parts[1])
    else:
        db_port = 27017

    try:
        self.client = MongoClient(db_location, db_port)
    except ConnectionFailure:
        raise ArmyAntException("Could not connect to MongoDB instance on %s:%s" % (db_location, db_port))

    self.db = self.client[self.db_name]
def factory(task, eval_location):
    import army_ant.evaluation as evl

    if task.eval_format == 'inex':
        return evl.INEXEvaluator(task, eval_location, Index.QueryType.keyword,
                                 Index.RetrievalTask.document_retrieval)
    elif task.eval_format == 'inex-xer':
        return evl.INEXEvaluator(task, eval_location, Index.QueryType.keyword,
                                 Index.RetrievalTask.entity_retrieval)
    elif task.eval_format == 'inex-xer-elc':
        return evl.INEXEvaluator(task, eval_location, Index.QueryType.entity,
                                 Index.RetrievalTask.entity_retrieval)
    elif task.eval_format == 'trec':
        return evl.TRECEvaluator(task, eval_location)
    elif task.eval_format == 'll-api':
        return evl.LivingLabsEvaluator(task, eval_location)
    else:
        raise ArmyAntException("Unsupported evaluator format")
def open(index_location, index_type, loop):
    import army_ant.index as idx

    key = Index.__preloaded_key__(index_location, index_type)
    if key in Index.PRELOADED:
        return Index.PRELOADED[key]

    index_features = index_type.split(':')[1:]

    if index_type == 'gow':
        return idx.GraphOfWord(None, index_location, loop)
    elif index_type == 'goe':
        return idx.GraphOfEntity(None, index_location, loop)
    elif index_type == 'gow_batch':
        return idx.GraphOfWordBatch(None, index_location, loop)
    elif index_type == 'goe_batch':
        return idx.GraphOfEntityBatch(None, index_location, loop)
    elif index_type == 'gow_csv':
        return idx.GraphOfWordCSV(None, index_location, loop)
    elif index_type == 'goe_csv':
        return idx.GraphOfEntityCSV(None, index_location, loop)
    elif index_type == 'gremlin':
        return idx.GremlinServerIndex(None, index_location, loop)
    elif index_type.startswith('hgoe'):
        return idx.HypergraphOfEntity(None, index_location, index_features, loop)
    elif index_type.startswith('lucene_features'):
        return idx.LuceneEngine(None, index_location, index_features, loop)
    elif index_type.startswith('lucene_entities'):
        return idx.LuceneEntitiesEngine(None, index_location, index_features, loop)
    elif index_type.startswith('lucene'):
        return idx.LuceneEngine(None, index_location, index_features, loop)
    elif index_type.startswith('tfr'):
        return idx.TensorFlowRanking(None, index_location, index_features, loop)
    else:
        raise ArmyAntException("Unsupported index type %s" % index_type)
def __init__(self, task, eval_location):
    super().__init__(task, eval_location)

    try:
        base_url, api_key, run_id = eval_location.split('::')
    except ValueError:
        raise ArmyAntException("Must provide the base_url, api_key and run_id, separated by '::'")

    self.base_url = urljoin(base_url, 'api/v2/participant/')
    self.auth = HTTPBasicAuth(api_key, '')
    self.headers = {'Content-Type': 'application/json'}

    requests_cache.install_cache('living_labs_cache', expire_after=10800)

    self.loop = asyncio.get_event_loop()
    self.index = Index.open(task.index_location, task.index_type, self.loop)

    self.run_id = run_id
    self.pickle_dir = '/opt/army-ant/cache/%s' % run_id
    if not os.path.exists(self.pickle_dir):
        os.mkdir(self.pickle_dir)
def factory(reader, index_location, index_type, loop):
    import army_ant.index as idx

    index_features = index_type.split(':')[1:]

    if index_type == 'gow':
        return idx.GraphOfWord(reader, index_location, loop)
    elif index_type == 'goe':
        return idx.GraphOfEntity(reader, index_location, loop)
    elif index_type == 'gow_batch':
        return idx.GraphOfWordBatch(reader, index_location, loop)
    elif index_type == 'goe_batch':
        return idx.GraphOfEntityBatch(reader, index_location, loop)
    elif index_type == 'gow_csv':
        return idx.GraphOfWordCSV(reader, index_location, loop)
    elif index_type == 'goe_csv':
        return idx.GraphOfEntityCSV(reader, index_location, loop)
    elif index_type.startswith('hgoe'):
        return idx.HypergraphOfEntity(reader, index_location, index_features, loop)
    elif index_type.startswith('lucene_features'):
        return idx.LuceneFeaturesEngine(reader, index_location, index_features, loop)
    elif index_type.startswith('lucene_entities'):
        return idx.LuceneEntitiesEngine(reader, index_location, index_features, loop)
    elif index_type.startswith('lucene'):
        return idx.LuceneEngine(reader, index_location, index_features, loop)
    elif index_type.startswith('tfr'):
        return idx.TensorFlowRanking(reader, index_location, index_features, loop)
    elif index_type.startswith('null_index'):
        return idx.NullIndex(reader, index_location, loop)
    elif index_type.startswith('text_index'):
        return idx.TextIndex(reader, index_location, index_features, loop)
    else:
        raise ArmyAntException("Unsupported index type %s" % index_type)
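# A minimal usage sketch tying the reader and index factories together. The paths are
# hypothetical, 'csv' and 'lucene' are two of the type strings dispatched above, and the
# Reader.factory / Index.factory attachment points are assumptions, since the enclosing
# classes are not shown in these excerpts.
import asyncio
from army_ant.index import Index
from army_ant.reader import Reader

loop = asyncio.get_event_loop()
reader = Reader.factory('/tmp/corpus.csv', 'csv')
index = Index.factory(reader, '/tmp/lucene-index', 'lucene', loop)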
def load_to_postgres(self, conn, doc):
    raise ArmyAntException("Load function not implemented for %s" % self.__class__.__name__)
async def retrieve(self, results):
    raise ArmyAntException("Retrieve not implemented for %s" % self.__class__.__name__)
async def store(self, index):
    raise ArmyAntException("Store not implemented for %s" % self.__class__.__name__)
def factory(db_location, db_name, db_type, loop):
    if db_type == 'mongo':
        return MongoDatabase(db_location, db_name, loop)
    else:
        raise ArmyAntException("Unsupported database type %s" % db_type)
async def index(self, features_location=None):
    try:
        if HypergraphOfEntity.Feature.keywords in self.index_features:
            logger.info("Indexing top %.0f%% keywords per document based on TextRank" % (Index.KW_RATIO * 100))

        index_features_str = ':'.join([index_feature.value for index_feature in self.index_features])
        features = [
            HypergraphOfEntity.JFeature.valueOf(index_feature.value)
            for index_feature in self.index_features
            if index_feature != HypergraphOfEntity.Feature.keywords
        ]

        if HypergraphOfEntity.Feature.context in self.index_features:
            if features_location is None:
                raise ArmyAntException("Must provide a features_location pointing to a directory")

            if 'word2vec_simnet.graphml.gz' not in os.listdir(features_location):
                raise ArmyAntException("Must provide a 'word2vec_simnet.graphml.gz' file within features directory")

        hgoe = HypergraphOfEntity.JHypergraphOfEntityInMemory(
            self.index_location, java.util.Arrays.asList(features), features_location, True)

        corpus = []
        for doc in self.reader:
            logger.debug("Preloading document %s (%d triples)" % (doc.doc_id, len(doc.triples)))

            entities = []
            if doc.entities:
                for entity in doc.entities:
                    try:
                        entities.append(HypergraphOfEntity.JEntity(entity.label, entity.uri))
                    except Exception as e:
                        logger.warning("Entity %s skipped" % entity)
                        logger.exception(e)

            triples = []
            for s, p, o in doc.triples:
                try:
                    triples.append(
                        HypergraphOfEntity.JTriple(
                            HypergraphOfEntity.JEntity(s.label, s.uri),
                            HypergraphOfEntity.JEntity(p.label, p.uri),
                            HypergraphOfEntity.JEntity(o.label, o.uri)))
                except Exception as e:
                    logger.warning("Triple (%s, %s, %s) skipped" % (s, p, o))
                    logger.exception(e)

            if HypergraphOfEntity.Feature.keywords in self.index_features:
                doc.text = textrank(doc.text, ratio=Index.KW_RATIO)

            jDoc = HypergraphOfEntity.JDocument(
                JString(doc.doc_id), JString(doc.title), JString(doc.text),
                java.util.Arrays.asList(triples), java.util.Arrays.asList(entities))
            corpus.append(jDoc)

            if len(corpus) % (JavaIndex.BLOCK_SIZE // 10) == 0:
                logger.info("%d documents preloaded" % len(corpus))

            if len(corpus) >= JavaIndex.BLOCK_SIZE:
                logger.info("Indexing batch of %d documents using %s" % (len(corpus), index_features_str))
                hgoe.indexCorpus(java.util.Arrays.asList(corpus))
                corpus = []

            yield Document(
                doc_id=doc.doc_id,
                metadata={'url': doc.metadata.get('url'), 'name': doc.metadata.get('name')})

        if len(corpus) > 0:
            logger.info("Indexing batch of %d documents using %s" % (len(corpus), index_features_str))
            hgoe.indexCorpus(java.util.Arrays.asList(corpus))

        hgoe.postProcessing()

        hgoe.save()
    except JException as e:
        logger.error("Java Exception: %s" % e.stacktrace())
def __next__(self):
    raise ArmyAntException("Reader __next__ not implemented")
def extract(self):
    raise ArmyAntException("Extract not implemented for %s" % self.__class__.__name__)
async def run(self):
    raise ArmyAntException("Unsupported evaluator format %s" % self.task.eval_format)
def evaluation(self, index_location, index_type, eval_format, topics_filename=None, assessments_filename=None,
               base_url=None, api_key=None, run_id=None, output_dir='/opt/army-ant/eval'):
    if eval_format == 'inex' and (topics_filename is None or assessments_filename is None):
        raise ArmyAntException("Must include the arguments --topics-filename and --assessments-filename")

    if eval_format == 'll-api' and (base_url is None or api_key is None or run_id is None):
        raise ArmyAntException("Must include the arguments --base-url, --api-key and --run-id")

    if eval_format == 'inex':
        spool_dir = os.path.join(output_dir, 'spool')

        with open(topics_filename, 'rb') as fsrc, \
                tempfile.NamedTemporaryFile(dir=spool_dir, prefix='eval_topics_', delete=False) as fdst:
            shutil.copyfileobj(fsrc, fdst)
            topics_path = fdst.name

        with open(assessments_filename, 'rb') as fsrc, \
                tempfile.NamedTemporaryFile(dir=spool_dir, prefix='eval_assessments_', delete=False) as fdst:
            shutil.copyfileobj(fsrc, fdst)
            assessments_path = fdst.name
    else:
        topics_path = None
        assessments_path = None

    # TODO must add query_type, base_indexes, ranking_function and ranking_params
    task = EvaluationTask(
        index_location=index_location,
        index_type=index_type,
        eval_format=eval_format,
        query_type=None,
        base_indexes=None,
        ranking_function=None,
        ranking_params=None,
        topics_filename=topics_filename,
        topics_path=topics_path,
        assessments_filename=assessments_filename,
        assessments_path=assessments_path,
        base_url=base_url,
        api_key=api_key,
        run_id=run_id)

    config = yaml.load(open('config.yaml'))
    db_location = config['default'].get('db', {}).get('location', 'localhost')
    db_name = config['default'].get('db', {}).get('name', 'army_ant')

    manager = EvaluationTaskManager(db_location, db_name, output_dir)
    manager.add_task(task)

    inserted_ids = manager.queue()
    if len(inserted_ids) < 1:
        raise ArmyAntException("Could not queue task")

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(manager.process(task_id=inserted_ids[0]))
    except KeyboardInterrupt:
        for task in asyncio.Task.all_tasks():
            task.cancel()
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
async def index(self, features_location=None, pgonly=True):
    if os.path.exists(self.index_location):
        raise ArmyAntException("%s already exists" % self.index_location)

    os.mkdir(self.index_location)

    async for item in super().index(pgonly=pgonly):
        yield item

    conn = psycopg2.connect("dbname='army_ant' user='******' host='localhost'")
    c = conn.cursor()

    logging.info("Creating term nodes CSV file")
    with open(os.path.join(self.index_location, 'term-nodes.csv'), 'w') as f:
        c.copy_expert(
            """
            COPY (
                SELECT node_id AS "node_id:ID",
                       attributes->'name'->0->>'value' AS name,
                       attributes->'type'->0->>'value' AS type,
                       label AS ":LABEL"
                FROM nodes
                WHERE label = 'term'
            ) TO STDOUT WITH CSV HEADER
            """, f)

    logging.info("Creating entity nodes CSV file")
    with open(os.path.join(self.index_location, 'entity-nodes.csv'), 'w') as f:
        c.copy_expert(
            """
            COPY (
                SELECT node_id AS "node_id:ID",
                       regexp_replace(attributes->'name'->0->>'value', E'[\\n\\r]', ' ', 'g') AS name,
                       attributes->'type'->0->>'value' AS type,
                       attributes->'doc_id'->0->>'value' AS doc_id,
                       label AS ":LABEL"
                FROM nodes
                WHERE label = 'entity'
            ) TO STDOUT WITH CSV HEADER
            """, f)

    logging.info("Creating before edges CSV file")
    with open(os.path.join(self.index_location, 'before-edges.csv'), 'w') as f:
        c.copy_expert(
            """
            COPY (
                SELECT source_node_id AS ":START_ID",
                       attributes->>'doc_id' AS doc_id,
                       target_node_id AS ":END_ID",
                       label AS ":TYPE"
                FROM edges
                WHERE label = 'before'
            ) TO STDOUT WITH CSV HEADER
            """, f)

    logging.info("Creating related_to edges CSV file")
    with open(os.path.join(self.index_location, 'related_to-edges.csv'), 'w') as f:
        c.copy_expert(
            """
            COPY (
                SELECT source_node_id AS ":START_ID",
                       target_node_id AS ":END_ID",
                       label AS ":TYPE"
                FROM edges
                WHERE label = 'related_to'
            ) TO STDOUT WITH CSV HEADER
            """, f)

    logging.info("Creating contained_in edges CSV file")
    with open(os.path.join(self.index_location, 'contained_in-edges.csv'), 'w') as f:
        c.copy_expert(
            """
            COPY (
                SELECT source_node_id AS ":START_ID",
                       target_node_id AS ":END_ID",
                       label AS ":TYPE"
                FROM edges
                WHERE label = 'contained_in'
            ) TO STDOUT WITH CSV HEADER
            """, f)