def upload_to_es(self, kgtk_file: str):
    """
    Main function call to upload the index.

    Runs three steps in order, passing intermediate results between them as temporary files:

    1. ``self.generate_index`` is called with ``kgtk_file`` and the path of a temporary index file.
    2. ``Utility.build_elasticsearch_file`` is called with that index file and the paths of two
       further temporary files (a mapping file and an output file).
    3. ``Utility.load_elasticsearch_index`` is called with the output file, ``self.es_server``,
       ``self.es_index`` and the mapping file.

    All temporary files are closed (and therefore deleted) when the method returns or raises.

    :param kgtk_file: path of the input file handed to ``self.generate_index``
    :return: None
    """
    # The helpers are given file *paths*, not the open handles, so the handles only exist to
    # reserve the names and to guarantee cleanup; no rewinding of the handles is required.
    with tempfile.NamedTemporaryFile(mode='r+') as output_json, \
            tempfile.NamedTemporaryFile(mode='r+') as map_json, \
            tempfile.NamedTemporaryFile(mode='r+') as kgtk_index:
        self.generate_index(kgtk_file, kgtk_index.name)

        # build index
        Utility.build_elasticsearch_file(kgtk_index.name, "preflabel,label", map_json.name,
                                         output_json.name, "aliases")

        # upload
        Utility.load_elasticsearch_index(output_json.name, self.es_server, self.es_index, map_json.name)
def produce(self, input_file: str = None, input_df: pd.DataFrame = None, target_column: str = None,
            output_column_name: str = None) -> pd.DataFrame:
    """
    Main function of wikifier, the input could either be a dataframe or a input path.

    If ``input_file`` is given it is read with ``pd.read_csv`` and takes precedence over
    ``input_df``. Otherwise ``input_df`` is written to a temporary csv file so that a file path
    can be handed to ``self.run_table_linker``; that temporary file is removed as soon as the
    linker call finishes, whether it succeeds or raises.

    :param input_file: path to a csv file, optional if ``input_df`` is given
    :param input_df: input dataframe, only used when ``input_file`` is None
    :param target_column: name of the column passed to ``self.run_table_linker``, required
    :param output_column_name: name of the added result column,
                               defaults to ``"<target_column>_wikifier"``
    :return: a copy of the input dataframe with one extra column holding the ``kg_id`` values
             of the selected candidates
    :raises ValueError: if both ``input_file`` and ``input_df`` are None, or if
                        ``target_column`` is None
    """
    if input_file is None and input_df is None:
        raise ValueError("input_file and input_df can't both be None!")

    if target_column is None:
        raise ValueError("A target column name is needed!")

    temp_file_obj = None
    try:
        if input_file is not None:
            input_df = pd.read_csv(input_file)
        else:
            temp_file_obj = tempfile.NamedTemporaryFile(mode='r+')
            input_df.to_csv(temp_file_obj, index=False)
            # push buffered csv text to disk so it is visible when the file is opened by name
            temp_file_obj.flush()
            input_file = temp_file_obj.name

        df_all = self.run_table_linker(input_file, target_column)
    finally:
        # the path is only handed to run_table_linker, so the temp file can go once it is done
        if temp_file_obj is not None:
            temp_file_obj.close()

    final_answer = self.find_best_candidates(df_all)
    final_answer = Utility.sort_by_col_and_row(final_answer).reset_index().drop(columns=["index"])

    # return output
    output_df = input_df.copy()
    if output_column_name is None:
        output_column_name = "{}_wikifier".format(target_column)
    output_df[output_column_name] = final_answer["kg_id"]

    # clear level memo
    self.level_memo = defaultdict(int)
    return output_df
def get_candidates(self, input_file_path: str, target_column: str) -> pd.DataFrame:
    """
    Main query to get most candidates.

    Builds a single ``tl`` pipeline command (canonicalize, clean, exact / phrase / fuzzy matches,
    score normalization, duplicate removal), runs it through ``Utility.execute_shell_code`` and
    parses the returned text as csv.

    :param input_file_path: input file path
    :param target_column: target column name
    :return: the command output parsed by ``pd.read_csv`` with every column read as ``object``
    :raises ValueError: if the command returns an empty string
    """
    import shlex

    # Quote every substituted value so that paths or column names containing spaces or shell
    # metacharacters stay a single argument. Values made only of "safe" characters are left
    # untouched by shlex.quote, so the command is unchanged for typical inputs.
    # NOTE(review): assumes Utility.execute_shell_code runs the string through a POSIX shell
    # (or shlex-style splitting) - confirm against its implementation.
    quoted_args = [
        shlex.quote(str(value))
        for value in (self.es_server, self.es_index, input_file_path, target_column)
    ]

    shell_code = """tl --url {} --index {} \
        canonicalize {} --csv -c {} --add-other-information \
        / clean -c label \
        / get-exact-matches -i -c label_clean \
        / get-phrase-matches -c label_clean -n 5 \
        / get-fuzzy-matches -c label_clean -n 5 \
        / normalize-scores -c retrieval_score \
        / drop-duplicate -c kg_id --keep-method exact-match --score-column retrieval_score_normalized""". \
        format(*quoted_args)

    res = Utility.execute_shell_code(shell_code)
    if res == "":
        raise ValueError("Executing first query error when running on {}!".format(input_file_path))

    res_io = StringIO(res)
    output_file = pd.read_csv(res_io, dtype=object)
    return output_file