예제 #1
0
 def upload_to_es(self, kgtk_file: str):
     """
     Build an Elasticsearch index from a KGTK file and upload it.

     Three temporary files are used as scratch space: one receives the index
     generated by ``self.generate_index``, and the other two are handed to
     ``Utility.build_elasticsearch_file`` and then to
     ``Utility.load_elasticsearch_index`` together with ``self.es_server`` and
     ``self.es_index``.

     :param kgtk_file: path of the KGTK file passed to ``self.generate_index``
     :return: None
     """
     # The helpers only ever receive the temp files' *paths*, so the handles
     # opened here are never written through and need no seek/rewind.  The
     # ``with`` block guarantees all three files are closed and deleted even
     # when one of the helpers raises.
     with tempfile.NamedTemporaryFile(mode='r+') as output_json, \
             tempfile.NamedTemporaryFile(mode='r+') as map_json, \
             tempfile.NamedTemporaryFile(mode='r+') as kgtk_index:
         self.generate_index(kgtk_file, kgtk_index.name)
         # build index
         Utility.build_elasticsearch_file(kgtk_index.name, "preflabel,label",
                                          map_json.name, output_json.name, "aliases")
         # upload
         Utility.load_elasticsearch_index(output_json.name, self.es_server,
                                          self.es_index, map_json.name)
예제 #2
0
    def produce(self, input_file: str = None, input_df: pd.DataFrame = None,
                target_column: str = None, output_column_name: str = None) -> pd.DataFrame:
        """
        Main function of the wikifier; the input is either a dataframe or a CSV path.

        :param input_file: path to a CSV file; when given it takes precedence
            over ``input_df`` (the dataframe is re-read from this file)
        :param input_df: dataframe to process; used only when ``input_file`` is None
        :param target_column: name of the column handed to ``run_table_linker``
        :param output_column_name: name of the column added to the result;
            defaults to ``"<target_column>_wikifier"``
        :return: a copy of the input dataframe with the extra result column
        :raises ValueError: if both ``input_file`` and ``input_df`` are None,
            or if ``target_column`` is None
        """
        if input_file is None and input_df is None:
            raise ValueError("input_file and input_df can't both be None!")

        if target_column is None:
            raise ValueError("A target column name is needed!")

        if input_file is not None:
            input_df = pd.read_csv(input_file)
            df_all = self.run_table_linker(input_file, target_column)
        else:
            # The linker takes a file path, so dump the dataframe to a temp CSV.
            # The context manager closes and removes the file once the linker
            # is done, including when it raises.
            with tempfile.NamedTemporaryFile(mode='r+') as temp_file_obj:
                input_df.to_csv(temp_file_obj, index=False)
                # Push buffered rows to disk before the file is read by name.
                temp_file_obj.flush()
                df_all = self.run_table_linker(temp_file_obj.name, target_column)

        final_answer = self.find_best_candidates(df_all)
        final_answer = Utility.sort_by_col_and_row(final_answer).reset_index().drop(columns=["index"])
        # return output
        output_df = input_df.copy()
        if output_column_name is None:
            output_column_name = "{}_wikifier".format(target_column)
        output_df[output_column_name] = final_answer["kg_id"]
        # clear level memo
        self.level_memo = defaultdict(int)
        return output_df
예제 #3
0
 def get_candidates(self, input_file_path: str, target_column: str) -> pd.DataFrame:
     """
     Run the candidate-retrieval ``tl`` pipeline and return its output as a dataframe.

     :param input_file_path: path of the input file given to ``canonicalize``
     :param target_column: name of the column given to ``canonicalize -c``
     :return: the pipeline's CSV output parsed with every column as ``object``
     :raises ValueError: if the command produced an empty string
     """
     # Placeholders, in order: ES server url, ES index, input path, target column.
     command_template = """tl --url {} --index {} \
     canonicalize {} --csv -c {} --add-other-information \
     / clean -c label \
     / get-exact-matches -i -c label_clean \
     / get-phrase-matches -c label_clean -n 5 \
     / get-fuzzy-matches -c label_clean -n 5 \
     / normalize-scores -c retrieval_score \
     / drop-duplicate -c kg_id --keep-method exact-match --score-column retrieval_score_normalized"""
     # NOTE(review): the values are interpolated without quoting; confirm how
     # Utility.execute_shell_code tokenises the command before relying on
     # paths or column names that contain spaces.
     shell_code = command_template.format(
         self.es_server,
         self.es_index,
         input_file_path,
         target_column,
     )
     raw_output = Utility.execute_shell_code(shell_code)
     if raw_output == "":
         raise ValueError("Executing first query error when running on {}!".format(input_file_path))
     return pd.read_csv(StringIO(raw_output), dtype=object)