import logging

import pandas as pd

# Project-local helpers; exact import paths are assumed from the calls below.
import adjudication
import convert_data
import get_doc_info
import round_robin


def round_robin_alg(runs, qrels, pool_size, log_path):
	"""Build a judgment pool of `pool_size` documents by round-robin adjudication.

	Cycles through the runs, pooling the next unseen document from each in turn,
	and writes the intermediate tracking structures to JSON under `log_path`.
	"""

	# identify the names of all unique runs, and how many there are
	runs_ids = runs["RUN"].unique().tolist()
	runs_ids.sort()
	runs_count = len(runs_ids)

	# keep track of which run we're retrieving from
	next_run_index = -1


	### HELPER STRUCTURES
	# dict containing all occurrences of all documents
	docs_occurrences = adjudication.get_occurrences(runs, log_path)
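	# (assumed shape, inferred from its use as OCCURRENCES below: a mapping
	# like {'doc123': 3}, i.e. how many runs each document appears in)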

	# dict keeping track of which docs have been retrieved from where, and if they are unique
	retrieved_docs = {}

	# dict keeping track of how many docs have been retrieved from each run
	runs_status = {}
	for item in runs_ids:
		runs_status[item] = 1
	
	# pandas' dataframe keeping track of how many docs for each run are relevant/unique relevant/nonrelevant
	run_relevancies = pd.DataFrame(columns=['RUN', 'DOCS_RETRIEVED', 'REL', 'REL_UNIQUE', 'NON_REL'])
	run_relevancies.set_index('RUN', inplace=True)

	### OUTPUT STRUCTURES
	retrieved_docs_order = []			# ordered array of each doc retrieved at each step
	run_relevancies_order = []		# ordered array of the relevancies status at each step


	print("➡️ Retrieving documents to be adjudicated...")
	for i in range(1, pool_size+1):
		# repeat the process until we have retrieved the desired number of documents

		# print("\n### DOCUMENT", i)																												### LOGGING
		# logging.debug("\n#### DOCUMENT %d", i)																						### LOGGING


		next_doc_found = False
		while not next_doc_found:
			# find next_run
			next_run, next_run_index = round_robin.get_next_run(runs_ids, runs_status, next_run_index)


			# print("next_run", next_run)																											### LOGGING
			# logging.debug("next_run:: %s", next_run)																				### LOGGING


			while True:
				# find next_doc
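			# (get_next_doc is assumed to return the highest-ranked row of
			# next_run that has not been pooled yet, or an empty DataFrame
			# when the run is exhausted)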
				next_doc = adjudication.get_next_doc(runs, next_run, runs_status, retrieved_docs)

				if next_doc.empty:
					# if it's empty, next_run must have run out of documents

					# update runs_status to track that
					runs_status[next_run] = -1

					# print("Run", next_run, "is empty. Trying again.")														### LOGGING
					# logging.debug("Run %s appears to be empty. Trying again.", next_run)				### LOGGING

					# break to go look for another run
					break
			

				# since the proposed next_doc is not empty, we can gather its info
				next_doc_info = {
					'DOCUMENT': next_doc.iloc[0].DOCUMENT,
					'RANK': next_doc.iloc[0].RANK
				}

				### LOGGING
				# print("next_doc:: DOCUMENT:", next_doc_info['DOCUMENT'], "RANK:", next_doc_info['RANK'])
				# logging.debug("next_doc:: DOCUMENT: %s RANK %s", next_doc_info['DOCUMENT'], next_doc_info['RANK'])

				if next_doc_info['DOCUMENT'] not in retrieved_docs:
					# if the document found has not been retrieved before
					next_doc_found = True
					break

				# otherwise doc has already been retrieved

				# print("This document is already retrieved. Retrieving a new one:")							### LOGGING
				# logging.debug("This document is already retrieved. Retrieving a new one:")			### LOGGING

				# update run_relevancies to track which documents are unique relevants
				adjudication.check_for_unique_relevants(next_doc_info, retrieved_docs, run_relevancies)
					
				# loop to look for another next_doc

		# finally new document is retrieved

		# get corresponding qrel
		next_doc_info['RELEVANCY'] = get_doc_info.relevancy(qrels, next_doc_info['DOCUMENT'])

		# track which documents have already been retrieved
		retrieved_docs[next_doc_info['DOCUMENT']] = {
			'RELEVANCY': next_doc_info['RELEVANCY'],
			'RETRIEVED_FROM': next_run,
			'UNIQUE': 1
		}

		# update info for first graph
		retrieved_docs_order.append({
			"DOCUMENT": next_doc_info['DOCUMENT'],
			"RANK": next_doc_info['RANK'],
			"RELEVANCY": next_doc_info['RELEVANCY'],
			"RETRIEVED_FROM": next_run,
			"OCCURRENCES": docs_occurrences[next_doc_info['DOCUMENT']]
		})

		# update info for second graph
		run_relevancies = adjudication.update_run_relevancies(run_relevancies, next_run, next_doc_info)
		run_relevancies_order.append(convert_data.get_dict_from_df(run_relevancies))


	out_dir = log_path + 'adjudication/round_robin/'
	convert_data.write_dict_into_json(runs_status, out_dir, "runs_status.json", 0)
	convert_data.write_dict_into_json(retrieved_docs, out_dir, "retrieved_docs.json", 0)
	convert_data.write_dict_into_json(retrieved_docs_order, out_dir, "retrieved_docs_order.json", 0)
	convert_data.write_dict_into_json(convert_data.get_dict_from_df(run_relevancies), out_dir, "run_relevancies.json", 0)
	convert_data.write_dict_into_json(run_relevancies_order, out_dir, "run_relevancies_order.json", 0)
	
	print("✅ Complete.")

	return {
		'retrieved_docs_order': retrieved_docs_order,
		'run_relevancies_order': run_relevancies_order
	}
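

# For reference, a minimal sketch of the behaviour assumed of
# round_robin.get_next_run above: advance the index cyclically and skip runs
# whose status was set to -1 (exhausted). This is an illustration inferred
# from the call site, not the project's implementation.
def _get_next_run_sketch(runs_ids, runs_status, next_run_index):
	# try each run at most once so an all-exhausted pool cannot loop forever
	for _ in range(len(runs_ids)):
		next_run_index = (next_run_index + 1) % len(runs_ids)
		candidate = runs_ids[next_run_index]
		if runs_status[candidate] != -1:
			return candidate, next_run_index
	raise RuntimeError("all runs are exhausted before the pool was filled")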
def round_robin_alg_og(runs, qrels, pool_size, log_path):
    """Earlier round-robin variant; same inputs and return shape as round_robin_alg."""

    # identify the names of all unique runs, and how many there are
    runs_ids = runs["RUN"].unique().tolist()
    runs_ids.sort()
    runs_count = len(runs_ids)
    print(runs_ids)

    ### HELPER STRUCTURES
    # dict keeping track of which docs have been retrieved from where, and if they are unique
    retrieved_docs = {}

    # dict keeping track of how many docs have been retrieved from each run
    runs_status = {}
    for item in runs_ids:
        runs_status[item] = 1

    # pandas' dataframe keeping track of how many docs for each run are relevant/unique relevant/nonrelevant
    run_relevancies = pd.DataFrame(
        columns=['RUN', 'DOCS_RETRIEVED', 'REL', 'REL_UNIQUE', 'NON_REL'])
    run_relevancies.set_index('RUN', inplace=True)

    ### OUTPUT STRUCTURES
    retrieved_docs_order = [
    ]  # ordered array of each doc retrieved at each step
    run_relevancies_order = [
    ]  # ordered array of the relevancies status at each step

    print("➡️ Retrieving documents to be adjudicated...")
    for i in range(1, pool_size + 1):
        # repeat the process until we have retrieved the desired number of documents
        # print("Retrieving document", i, "\n")

        next_run = round_robin.get_next_run_og(runs_ids)
        next_doc = round_robin.get_next_doc(runs, next_run, runs_status,
                                            retrieved_docs)

        # proposed next_doc
        next_doc_info = {
            'DOCUMENT': next_doc.iloc[0].DOCUMENT,
            'RANK': next_doc.iloc[0].RANK
        }

        logging.debug("#### DOCUMENT %d", i)
        logging.debug("next_run: %s", next_run)
        logging.debug("DOCUMENT: %s \n RANK %s", next_doc_info['DOCUMENT'],
                      next_doc_info['RANK'])

        while next_doc_info['DOCUMENT'] in retrieved_docs:
            # if document was already retrieved

            # update run_relevancies to track which documents are unique relevants
            round_robin.check_for_unique_relevants(next_doc_info,
                                                   retrieved_docs,
                                                   run_relevancies)

            # find new proposed next_doc
            next_doc = round_robin.get_next_doc(runs, next_run, runs_status,
                                                retrieved_docs)

            logging.debug(
                "This document is already retrieved. Retrieving a new one:")

            while next_doc.empty:
                print("run", next_run, "is empty. Trying again.")
                logging.debug("Run %s appears to be empty. Trying again.",
                              next_run)

                # current next_run has run out of documents; drop it from the
                # rotation before picking the next run
                runs_ids.remove(next_run)
                next_run = round_robin.get_next_run_og(runs_ids)

                print("next_run", next_run)
                logging.debug("next_run: %s", next_run)

                next_doc = round_robin.get_next_doc(runs, next_run,
                                                    runs_status,
                                                    retrieved_docs)

            next_doc_info = {
                'DOCUMENT': next_doc.iloc[0].DOCUMENT,
                'RANK': next_doc.iloc[0].RANK
            }

            logging.debug("DOCUMENT: %s \n RANK %s", next_doc_info['DOCUMENT'],
                          next_doc_info['RANK'])

        # finally new document is retrieved

        # get corresponding qrel
        next_doc_info['RELEVANCY'] = get_doc_info.relevancy(
            qrels, next_doc_info['DOCUMENT'])

        # track which documents have already been retrieved
        retrieved_docs[next_doc_info['DOCUMENT']] = {
            'RELEVANCY': next_doc_info['RELEVANCY'],
            'RETRIEVED_FROM': next_run,
            'UNIQUE': 1
        }

        # update info for first graph
        retrieved_docs_order.append({
            "DOCUMENT": next_doc_info['DOCUMENT'],
            "RANK": next_doc_info['RANK'],
            "RELEVANCY": next_doc_info['RELEVANCY'],
            "RETRIEVED_FROM": next_run,
            "OCCURRENCES": get_doc_info.occurrences(runs, next_doc_info['DOCUMENT'])
        })

        # update info for second graph
        run_relevancies = round_robin.update_run_relevancies(
            run_relevancies, next_run, next_doc_info)
        run_relevancies_order.append(
            convert_data.get_dict_from_df(run_relevancies))

    out_dir = log_path + 'adjudication/round_robin_og/'
    convert_data.write_dict_into_json(runs_status, out_dir, "runs_status.json", 0)
    convert_data.write_dict_into_json(retrieved_docs, out_dir, "retrieved_docs.json", 0)
    convert_data.write_dict_into_json(retrieved_docs_order, out_dir, "retrieved_docs_order.json", 0)
    convert_data.write_dict_into_json(convert_data.get_dict_from_df(run_relevancies), out_dir, "run_relevancies.json", 0)
    convert_data.write_dict_into_json(run_relevancies_order, out_dir, "run_relevancies_order.json", 0)

    print("✅ Complete.")

    return {
        'retrieved_docs_order': retrieved_docs_order,
        'run_relevancies_order': run_relevancies_order
    }
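

# A minimal usage sketch, assuming the input schema implied above: `runs` with
# RUN/DOCUMENT/RANK columns and a `qrels` table that get_doc_info.relevancy can
# look documents up in (the toy data below is hypothetical).
if __name__ == "__main__":
    toy_runs = pd.DataFrame({
        'RUN': ['run_a', 'run_a', 'run_b', 'run_b'],
        'DOCUMENT': ['d1', 'd2', 'd2', 'd3'],
        'RANK': [1, 2, 1, 2],
    })
    toy_qrels = pd.DataFrame({'DOCUMENT': ['d1', 'd2', 'd3'],
                              'RELEVANCY': [1, 0, 1]})
    result = round_robin_alg(toy_runs, toy_qrels, pool_size=3, log_path='logs/')
    print(result['retrieved_docs_order'])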