def create_dictionaries_from_wiki_tables(input_file, output_folder):
    """Build inverted word -> table-id indexes from a wiki-tables JSON file.

    Reads ``input_file`` (a JSON object mapping table_id to a table dict),
    collects the preprocessed tokens of each table's page title, section
    title, caption, column headers and body cells into five separate
    dictionaries, and writes each dictionary as JSON under ``output_folder``.

    :param input_file: path to the JSON file of wiki tables
    :param output_folder: directory where the word dictionaries are written
    :return: None (output is written to disk)
    """
    dict_headers = {}
    dict_page_titles = {}
    dict_captions = {}
    dict_section_titles = {}
    dict_data = {}

    with open(input_file) as json_file:
        wiki_tables = json.load(json_file)

    for table_id, wiki_table in wiki_tables.items():
        # Plain loops replace the original side-effect-only
        # list(map(lambda ...)) calls, which built throwaway lists.
        for token in preprocess_string(wiki_table['pgTitle']):
            add_to_dict(dict_page_titles, token, table_id)
        for token in preprocess_string(wiki_table['secondTitle']):
            add_to_dict(dict_section_titles, token, table_id)
        # str(token) kept to match the original behaviour for captions/headers.
        for token in preprocess_string(wiki_table['caption']):
            add_to_dict(dict_captions, str(token), table_id)
        for title in wiki_table['title']:
            for token in preprocess_string(title):
                add_to_dict(dict_headers, str(token), table_id)
        for row in wiki_table['data']:
            for cell in row:
                for token in preprocess_string(cell):
                    add_to_dict(dict_data, token, table_id)

    write_dictionary_to_file(dict_headers, output_folder + '/words_headers.json')
    write_dictionary_to_file(dict_page_titles, output_folder + '/words_page_titles.json')
    write_dictionary_to_file(dict_section_titles, output_folder + '/words_section_titles.json')
    write_dictionary_to_file(dict_captions, output_folder + '/words_captions.json')
    write_dictionary_to_file(dict_data, output_folder + '/words_data.json')
def ratio_query_terms_in_page_title(query, table):
    """Ratio of the number of query tokens found in the page title to the
    total number of query tokens.

    :param query: raw query string
    :param table: table dict with a 'pgTitle' field
    :return: ratio in [0, 1]; 0.0 when the query preprocesses to no tokens
    """
    tokenized_query = preprocess_string(query)
    # BUG FIX: guard against ZeroDivisionError when the query yields no tokens
    # (e.g. it is empty or entirely filtered out by preprocessing).
    if not tokenized_query:
        return 0.0
    tokenized_page_title = preprocess_string(table['pgTitle'])
    number_found = sum(
        1 for query_token in tokenized_query if query_token in tokenized_page_title
    )
    return number_found / len(tokenized_query)
def pmi(table):
    """Take the table and return the ACSDb-based schema coherency score.

    For every pair of header columns, averages the pointwise mutual
    information (PMI) of their token pairs, then returns the mean over all
    header pairs.

    :param table: table dict with a 'title' list of column header strings
    :return: average PMI score as a float (0.0 for fewer than two headers)
    """
    total_pmi = 0.0
    pair_count = 0
    preprocessed_headers = [preprocess_string(title) for title in table['title']]
    for i in range(len(preprocessed_headers) - 1):
        for j in range(i + 1, len(preprocessed_headers)):
            pair_count += 1
            # Renamed from `pmi`, which shadowed this function's name.
            pair_pmi = 0
            for h1 in preprocessed_headers[i]:
                for h2 in preprocessed_headers[j]:
                    pair_pmi += compute_pmi(h1, h2, n_documents, dict_headers)
            # BUG FIX: the original did `average_pmi = 0` whenever a pair
            # scored 0, which wiped out every previously accumulated pair
            # score. A zero-scoring pair should simply contribute nothing.
            if pair_pmi != 0:
                total_pmi += pair_pmi / (
                    len(preprocessed_headers[i]) * len(preprocessed_headers[j])
                )
    if pair_count == 0:
        return 0.0
    return total_pmi / pair_count
def tokenize_table(table, incl_headers=True):
    """Return all distinct lower-cased word tokens from the table's page
    title, caption and (optionally) column headers.

    :param table: table dict with 'pgTitle', 'caption' and 'title' fields
    :param incl_headers: include tokens from the column headers when True
    :return: list of unique lower-cased tokens (order unspecified)
    """
    page_title_tokens = word_tokenize(table['pgTitle'])
    caption_tokens = word_tokenize(table['caption'])
    if incl_headers:
        headers_tokens = [
            token for title in table['title'] for token in preprocess_string(title)
        ]
    else:
        headers_tokens = []
    # BUG FIX: deduplicate *after* lower-casing. The original deduplicated
    # first, so case variants like 'The' and 'the' both survived and the
    # result could contain duplicates despite the set().
    all_tokens = page_title_tokens + caption_tokens + headers_tokens
    return list({token.lower() for token in all_tokens})
def preprocess_field(field):
    """Preprocess a raw field string and drop English stop words.

    :param field: raw string to preprocess
    :return: list of preprocessed tokens with English stop words removed
    """
    tokens = preprocess_string(field)
    stop_words = set(stopwords.words('english'))
    # Comprehension replaces the manual append loop; the redundant
    # `field_result = field` copy in the original was dropped.
    return [token for token in tokens if token not in stop_words]
def idf_table_body(query):
    """Take the query and return the sum of the IDF scores of the query
    words that appear in the table bodies.

    :param query: raw query string
    :return: summed IDF score (0 when no query term occurs in any body)
    """
    # Generator-sum over the preprocessed query terms; dict_data maps each
    # term to the collection of tables whose body contains it.
    return sum(
        compute_idf_t(n_documents, len(dict_data[term]))
        for term in preprocess_string(query)
        if term in dict_data
    )
def idf_section_title(query):
    """Take the query and return the sum of the IDF scores of the query
    words that appear in the section titles.

    :param query: raw query string
    :return: summed IDF score (0 when no query term occurs in any section title)
    """
    # Generator-sum over the preprocessed query terms; dict_section_titles
    # maps each term to the collection of tables whose section title has it.
    return sum(
        compute_idf_t(n_documents, len(dict_section_titles[term]))
        for term in preprocess_string(query)
        if term in dict_section_titles
    )
def term_frequency_query_in_table_body(query, table):
    """Total query term frequency in the table body.

    Counts, for every body cell, how many query tokens occur among the
    cell's preprocessed tokens.

    :param query: raw query string
    :param table: table dict with a 'data' list of rows
    :return: total count, or -1 when the table has no data rows
    """
    # Guard clause replaces the original's wrapping `if` block.
    if not table['data']:
        return -1
    query_tokens = word_tokenize(query.lower())
    hits = 0
    for row in table['data']:
        for cell_tokens in (preprocess_string(cell) for cell in row):
            hits += sum(1 for token in query_tokens if token in cell_tokens)
    return hits
def term_frequency_query_in_left_column(query, table):
    """Total query term frequency in the leftmost column cells.

    :param query: raw query string
    :param table: table dict with a 'data' list of rows
    :return: total count, or -1 when the table has no data rows
    """
    if not table['data']:
        return -1
    query_tokens = word_tokenize(query.lower())
    # Robustness fix: skip empty rows instead of raising IndexError on row[0].
    first_column = [row[0] for row in table['data'] if row]
    tokenized_first_column = [preprocess_string(cell) for cell in first_column]
    number_found = 0
    for query_token in query_tokens:
        for cell_tokens in tokenized_first_column:
            if query_token in cell_tokens:
                number_found += 1
    return number_found
def idf_catch_all(query):
    """Take the query and return the sum of the IDF scores of the query
    words over all text fields of the tables (page titles, section titles,
    captions, headers and body data).

    :param query: raw query string
    :return: summed IDF score across all five word dictionaries
    """
    # DRY: iterate the five dictionaries instead of the original's five
    # copy-pasted lookup blocks; behaviour is identical.
    all_dicts = (
        dict_page_titles,
        dict_section_titles,
        dict_captions,
        dict_headers,
        dict_data,
    )
    final_idf = 0
    for term in preprocess_string(query):
        for word_dict in all_dicts:
            if term in word_dict:
                final_idf += compute_idf_t(n_documents, len(word_dict[term]))
    return final_idf