def load_pan_data(xmls_directory, truth_path, write_to_txt_files=False,
                  txts_destination_directory=None, exp_dict=None, exec_type='training'):
    """Load PAN data

    This function loads the PAN dataset and the truth, parses the XML files and returns:
    the merged tweets of the authors, the truth, the Author IDs, and the original lengths of
    the tweets. It can also write the tweets to TXT files (optional).

    Args:
        xmls_directory: The directory where the XML files of the dataset reside.
        truth_path: The path of the truth file.
        write_to_txt_files: (boolean) If True, the XML files will also be written as TXT files
            after being parsed.
        txts_destination_directory: The TXT files will be written to this directory.
        exp_dict: Experiment-configuration dictionary. The keys read here are 'ir_top_k' and
            'add_ir_variables'.
        exec_type: Either 'training' or 'testing'.

    Returns:
        merged_tweets_of_authors: List. Each item is all of the tweets of an author, merged into
            one string. Refer to the list of replacements in the remarks.
        truths: List of truths for the authors.
        author_ids: List of Author IDs.
        original_tweet_lengths: List of original tweet lengths.
        ir_vars_of_authors: List of aggregated IR variables per author (empty lists when
            'add_ir_variables' is off).

    Raises:
        RuntimeError: If a non-XML file exists inside *xmls_directory*.

    Remarks:
        - Since *xml_filenames* is sorted in ascending order, all the returned lists will also be
          in the same order (sorted in ascending order of the Author IDs).
        - List of replacements:
            Line feed    → <LineFeed>
            End of tweet → <EndOfTweet>
    """
    '''
    *os.listdir* returns a list containing the names of all files and folders in the given
    directory. Normally, the list is created in ascending order. However, the Python documentation
    states, “the list is in arbitrary order”. To ensure consistency and avoid errors in syncing the
    order of the items among different lists (e.g., *author_ids*, *truths*), we sort the list by
    calling *sorted*. *sorted()* returns a new sorted list (in ascending lexicographical order) of
    all the items in an iterable.
    '''
    global global_exp_dict
    global testTool

    # Copy the experiment configuration (avoid sharing/mutating the caller's dict or a mutable default).
    global_exp_dict = (exp_dict or {}).copy()
    testTool = IndexToolManager(indexName=str(authorprof_db_name),
                                top_k=global_exp_dict['ir_top_k'])

    xml_filenames = sorted(os.listdir(xmls_directory))

    if exec_type == 'testing':
        global_exp_dict['ignore_first_result'] = False

    # Store the Author IDs in a list.
    # The Author IDs list will have the same order as the XML filenames list.
    author_ids = []  # Create an empty list
    for xml_filename in xml_filenames:
        author_ids.append(xml_filename[:-4])

    # Skip loading the truth if the path input is None; otherwise, load the truth from the file.
    if truth_path is None:
        logger.info("*truth_path* is None => Skipped loading the truth")
        truths = None
        # ↳ This scenario happens when loading the test dataset for **TIRA** evaluation,
        #   where the truth of the test set is not provided.
    else:
        truths = load_truth(truth_path, author_ids)

    if write_to_txt_files:
        logger.info("The parsed XMLs will also be written to TXT files.")
        # Create the directory if it does not exist.
        os.makedirs(txts_destination_directory, exist_ok=True)

    # Initialize the lists.
    # The lists will have the same order as the XML filenames list (refer to: “Iterate over XML files”).
    original_tweet_lengths = []  # Create an empty list
    # ↳ Every row will represent an author, every column will represent a tweet.
    merged_tweets_of_authors = []  # Create an empty list
    # ↳ Each cell will contain all 100 tweets of an author, merged.
    ir_vars_of_authors = []
    time_query_list = []
    time_query = 0.0

    # Iterate over the XML files
    for author_index, xml_filename in enumerate(xml_filenames):
        # Make sure only XML files go through
        if not fnmatch.fnmatch(xml_filename, '*.xml'):
            logger.error("Encountered a non-XML file inside the directory: %s >>> "
                         "The program will now exit.", xml_filename)
            raise RuntimeError(
                'Encountered a non-XML file inside the directory: %s' % xml_filename)
            # ↳ This is printf-style string formatting.

        # Read the XML file and parse it into a tree.
        # The parser is explicitly defined to ensure UTF-8 encoding.
        tree = ElementTree.parse(os.path.join(xmls_directory, xml_filename),
                                 parser=ElementTree.XMLParser(encoding="utf-8"))
        root = tree.getroot()
        '''
        *root* is the root element of the parsed tree;
        root[0], ..., root[m-1] are the children of root—elements one level below the root;
        root[0][0], ..., root[0][n-1] are the children of root[0]; and so on.

        Each element has a tag, a dictionary of attributes, and sometimes some text:
            root[i][j].tag, root[i][j].attrib, root[i][j].text
        '''
        # Add an empty new row to the list. Each row represents an author.
        original_tweet_lengths.append([])

        # Initialize the list. Note that this list resets in every author (XML file) loop.
        tweets_of_this_author = []  # Create an empty list
        author_id = xml_filename[:-4]
        # print(int(author_id, base=16))
        # print(int('eb151ca9c0e31d615dd8c335bdbc9226', base=16))
        # if int(author_id, base=16) < int('4502f17f7a9d88f6a9594e82968740b0', base=16):
        #     continue

        ir_variables_of_this_author = []
        text_list = []
        # doc_id = 1

        logger.info(
            f'{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")} Author: {author_id}')

        # Iterate over the tweets within this parsed XML file:
        # record the tweet length, replace line feeds, and append the tweet to a list.
        for child in root[0]:
            # Element.text accesses the element's text content,
            # which is saved with the following format in the XML files: <![CDATA[some text]]>
            tweet = child.text
            text = tweet
            original_tweet_lengths[author_index].append(len(tweet))

            # Replace line feeds (LF = \n) with “ <LineFeed> ”.
            # Note: There were no carriage return (CR = \r) characters in any of the 3,000 XML files.
            tweet = tweet.replace('\n', " <LineFeed> ")

            # Create a list of the tweets of this author, to write to a text file and to merge
            # after the loop terminates.
            '''
            Google Python Style Guide:
                Avoid using the + and += operators to accumulate a string within a loop. Since
                strings are immutable, this creates unnecessary temporary objects and results in
                quadratic rather than linear running time.
                Avoid:   merged_tweets_of_authors[author_index] += tweet + " <EndOfTweet> "
                Instead, append each substring to a list and ''.join the list after the loop
                terminates.
            '''
            tweets_of_this_author.append(tweet)
            text_list.append(text)

            # ir_variables = {}
            # if exp_dict['add_ir_variables']:
            #     initial = None
            #     final = None
            #     if exp_dict['tool'] == 'arango':
            #         initial = time.time()
            #         ir_variables = testTool.arango_get_IR_variables(text, 'male')
            #         final = time.time()
            #     elif exp_dict['tool'] == 'elastic':
            #         initial = time.time()
            #         ir_variables = testTool.elastic_get_IR_variables(text, 'male')
            #         final = time.time()
            #     elif exp_dict['tool'] == 'zettair':
            #         initial = time.time()
            #         ir_variables = testTool.zettair_get_IR_variables(text, 'male')
            #         final = time.time()
            #     time_query_list.append(float(final - initial))
            #     ir_variables_of_this_author.append(ir_variables)
            # doc_id += 1

        # Retrieve the IR variables for all of this author's tweets in one batch.
        if global_exp_dict['add_ir_variables']:
            ir_variables_of_this_author, tq_list = get_ir_variables(text_list)
            # ir_variables_of_this_author = [{
            #     'CLASS_0_BM25_AVG': 0,
            #     'CLASS_0_BM25_COUNT': 0,
            #     'CLASS_0_BM25_SUM': 0,
            #     'CLASS_1_BM25_AVG': 0,
            #     'CLASS_1_BM25_COUNT': 0,
            #     'CLASS_1_BM25_SUM': 0,
            # }]
            time_query_list.extend(tq_list)

        # Aggregate the per-tweet IR variables into a single vector for this author.
        ir_vars_dict = []
        if global_exp_dict['add_ir_variables']:
            ir_vars = pd.DataFrame(ir_variables_of_this_author)
            ir_vars_mean = ir_vars.mean()
            ir_vars_sum = ir_vars.sum()
            # ir_vars_dict = {}
            # ir_vars_dict = {
            #     'CLASS_0_BM25_AVG': ir_vars_mean['CLASS_0_BM25_AVG'],
            #     'CLASS_0_BM25_COUNT': ir_vars_sum['CLASS_0_BM25_COUNT'],
            #     'CLASS_0_BM25_SUM': ir_vars_sum['CLASS_0_BM25_SUM'],
            #     'CLASS_1_BM25_AVG': ir_vars_mean['CLASS_1_BM25_AVG'],
            #     'CLASS_1_BM25_COUNT': ir_vars_sum['CLASS_1_BM25_COUNT'],
            #     'CLASS_1_BM25_SUM': ir_vars_sum['CLASS_1_BM25_SUM'],
            # }
            ir_vars_dict = [
                ir_vars_mean['CLASS_0_BM25_AVG'],
                ir_vars_sum['CLASS_0_BM25_COUNT'],
                ir_vars_sum['CLASS_0_BM25_SUM'],
                ir_vars_mean['CLASS_1_BM25_AVG'],
                ir_vars_sum['CLASS_1_BM25_COUNT'],
                ir_vars_sum['CLASS_1_BM25_SUM'],
            ]

        # Write the tweets of this author to a TXT file.
        # Note that in these tweets, the line feed characters are replaced with a tag.
        if write_to_txt_files:
            # Create a TXT file with the Author ID as the filename (same as the XML files) in write mode.
            with open(os.path.join(txts_destination_directory, author_ids[author_index] + ".txt"),
                      'w', encoding="utf-8") as txt_output_file:
                txt_output_file.write('\n'.join(tweets_of_this_author))
                # ↳ '\n'.join adds a newline character between every two strings,
                #   so there won't be any extra line feeds on the last line of the file.

        # Concatenate the tweets of this author, and append the result to the main list.
        merged_tweets_of_this_author = " <EndOfTweet> ".join(tweets_of_this_author) + " <EndOfTweet>"
        # ↳ " <EndOfTweet> ".join adds the tag between every two strings, so we need to add another
        #   tag to the end.
        merged_tweets_of_authors.append(merged_tweets_of_this_author)
        ir_vars_of_authors.append(ir_vars_dict)
        # print('\n\nir_vars_dict')
        # print(ir_vars_dict)

    logger.info("@ %.2f seconds: Finished loading the dataset", time.process_time())

    result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
    # Average query time; keep the 0.0 default when no IR queries were issued.
    if time_query_list:
        time_query = np.mean(time_query_list)
    # testTool.log_result(result_id, {
    #     'exp_id': exp_dict['exp_id'],
    #     'variable': 'TIME_QUERY',
    #     **testTool.get_parameters(),
    #     'db': exp_dict['db'],
    #     'tool': exp_dict['tool'],
    #     'db_name': exp_dict['db_name'],
    #     'add_ir_variables': 'true' if exp_dict['add_ir_variables'] else 'false',
    #     'solution_number': exp_dict['solution_number'],
    #     'solution_name': exp_dict['solution_name'],
    #     'train_data_folder': exp_dict['train_data_folder'],
    #     'test_data_folder': exp_dict['test_data_folder'],
    #     'execution_type': exec_type,
    #     'number_queries': str(len(time_query_list)),
    #     'value': str(time_query),
    # })
    testTool.log_result(
        result_id, {
            'variable': 'TIME_QUERY',
            **testTool.get_parameters(),
            **global_exp_dict,
            'execution_type': exec_type,
            'number_queries': str(len(time_query_list)),
            'value': str(time_query),
        })

    return merged_tweets_of_authors, truths, author_ids, original_tweet_lengths, ir_vars_of_authors
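
# A minimal, hedged usage sketch for *load_pan_data* (not called anywhere in the pipeline).
# The paths and the experiment dictionary are hypothetical placeholders; the dictionary keys
# mirror the ones read above ('ir_top_k', 'add_ir_variables').
def _example_load_pan_data_usage():
    """Illustrate the call signature and the shape of the returned values (example only)."""
    example_exp_dict = {'ir_top_k': 10, 'add_ir_variables': False}
    merged_tweets, truths, author_ids, tweet_lengths, ir_vars = load_pan_data(
        xmls_directory='data/pan18/en/text',    # hypothetical directory of <author_id>.xml files
        truth_path='data/pan18/en/truth.txt',   # hypothetical truth file
        write_to_txt_files=False,
        exp_dict=example_exp_dict,
        exec_type='training',
    )
    # All returned lists are aligned by author, in ascending Author-ID order.
    assert len(merged_tweets) == len(author_ids) == len(tweet_lengths) == len(ir_vars)
    return merged_tweets, truths, ir_vars
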
def index(idx_type='normal', db='authorprof', tool='arango', db_name='authorprof', exp_id='unamed'):
    mylogger.info('')
    mylogger.info(f'INDEX TYPE: {idx_type}')
    mylogger.info(f'DB: {db}')
    mylogger.info(f'TOOL: {tool}')
    mylogger.info(f'DB NAME: {db_name}')

    initial = time.time()
    testTool = IndexToolManager(indexName=db_name)

    start = time.time()
    append_class_to_id = False
    if tool == 'zettair':
        append_class_to_id = True
    bulk = testTool.get_documents(db, db_files[db]['xml_folder'],
                                  db_files[db]['truth_txt'], append_class_to_id)
    end = time.time()
    mylogger.info(f'get_documents {end - start}')
    mylogger.info(f'TOTAL documents {len(bulk)}')

    start = time.time()
    if tool == 'arango':
        documentList = testTool.bulkListGeneratorArango(bulk)
        end = time.time()
        mylogger.info(f'bulkListGeneratorArango {end - start}')

        if idx_type == 'normal':
            start = time.time()
            for doc in documentList:
                testTool.insertDocumentArango(doc)
            end = time.time()
            mylogger.info(f'for-loop insertDocumentArango {end - start}')

        if idx_type == 'bulk':
            start = time.time()
            testTool.bulkImportArango(documentList)
            end = time.time()
            mylogger.info(f'bulkImportArango {end - start}')

    if tool == 'elastic':
        if idx_type == 'normal':
            start = time.time()
            for doc in bulk:
                testTool.insertElastic(doc.pop('id'), doc)
            end = time.time()
            mylogger.info(f'for-loop insertElastic {end - start}')

        if idx_type == 'bulk':
            start = time.time()
            bulkBody = testTool.bulkHelperInsertGeneratorElastic(bulk)
            end = time.time()
            mylogger.info(f'bulkHelperInsertGeneratorElastic {end - start}')

            start = time.time()
            testTool.bulkHelperElastic(bulkBody)
            end = time.time()
            mylogger.info(f'bulkHelperElastic {end - start}')

        start = time.time()
        testTool.refreshElastic()
        end = time.time()
        mylogger.info(f'refreshElastic {end - start}')

    if tool == 'zettair':
        start = time.time()
        testTool.saveToTrecFileZettair(bulk)
        end = time.time()
        mylogger.info(f'saveToTrecFileZettair {end - start}')

        start = time.time()
        testTool.zettair_index()
        end = time.time()
        mylogger.info(f'zettair_index {end - start}')

    final = time.time()
    result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
    testTool.log_result(
        result_id, {
            'exp_id': exp_id,
            'variable': 'TIME_INDEX',
            'index_type': idx_type,
            'db': db,
            'tool': tool,
            'db_name': db_name,
            'value': str(final - initial),
        })
    mylogger.info(f'index TOTAL TIME: {final - initial}')
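
# A minimal, hedged usage sketch for *index* (not called anywhere in the pipeline).
# The argument values are placeholders; the handled options are the ones branched on above:
# tool in {'arango', 'elastic', 'zettair'} and idx_type in {'normal', 'bulk'}.
def _example_index_usage():
    """Index the 'authorprof' collection with the Elasticsearch bulk helper (example only)."""
    index(idx_type='bulk', db='authorprof', tool='elastic',
          db_name='authorprof', exp_id='example-run')  # 'example-run' is a placeholder exp_id
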