import pandas as pd

# FFV, RequiredInputParameterMissingException and UnsupportTypeError are
# provided by the surrounding package (not shown in this snippet).


def generate_reciprocal_rank(score_column, output_column, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if score_column is None:
        raise RequiredInputParameterMissingException(
            'The input parameter is required: {}'.format('score_column'))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)
        df.fillna("", inplace=True)
        df = df.astype(dtype={score_column: "float64"})

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    final_list = []
    for key, cell_df in df.groupby(['row', 'column']):
        # Rank the candidates of each cell by descending score;
        # reciprocal rank is 1 / rank.
        reciprocal_rank = list(1 / cell_df[score_column].rank(method='first', ascending=False))
        cell_df[output_column] = reciprocal_rank
        final_list.extend(cell_df.to_dict(orient='records'))

    odf = pd.DataFrame(final_list)
    return odf
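# Usage sketch (illustrative data): within each (row, column) cell, the best
# score gets reciprocal rank 1.0, the next 0.5, then 1/3, and so on. The exact
# set of columns FFV requires for a candidates file is assumed here.
candidates = pd.DataFrame({
    'column': ['0', '0', '0'],
    'row': ['0', '0', '0'],
    'label': ['Boston', 'Boston', 'Boston'],
    'kg_id': ['Q100', 'Q2315', 'Q617468'],
    'kg_labels': ['Boston', 'Boston (band)', 'Boston, Lincolnshire'],
    'method': ['exact-match', 'exact-match', 'exact-match'],
    'retrieval_score': [12.3, 7.1, 3.4],
})
ranked = generate_reciprocal_rank('retrieval_score', 'rr', df=candidates)
# ranked['rr'] -> [1.0, 0.5, 0.333...]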
import unittest

import pandas as pd

# preprocess, FFV and parent_path come from the surrounding test module.


class TestCanonicalize(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestCanonicalize, self).__init__(*args, **kwargs)
        self.input_csv = pd.read_csv('{}/data/input.csv'.format(parent_path), dtype=object)
        self.input_tsv_path = '{}/data/input.tsv'.format(parent_path)
        self.ffv = FFV()

    def test_canonicalize_columns_1(self):
        odf = preprocess.canonicalize('col0,col1,col2,col3,col4', df=self.input_csv)
        odf.to_csv('{}/data/canonical.csv'.format(parent_path), index=False)
        self.assertTrue(self.ffv.is_canonical_file(odf))
        self.assertEqual(len(odf), 685)
        columns = odf.columns
        self.assertTrue('label' in columns)

    def test_canonicalize_columns_2(self):
        odf = preprocess.canonicalize('col0', df=self.input_csv, output_column='alias')
        self.assertTrue(self.ffv.is_canonical_file(odf))
        columns = odf.columns
        self.assertTrue('alias' in columns)

    def test_canonicalize_tsv(self):
        odf = preprocess.canonicalize('col0,col3', file_path=self.input_tsv_path, file_type='tsv')
        self.assertEqual(len(odf), 274)
        self.assertTrue(self.ffv.is_canonical_file(odf))
import pandas as pd


def mosaic_features(label_column, num_char, num_tokens, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not num_char and not num_tokens:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("num_char", "num_tokens"))

    if num_char:
        # Character count of the label; 0 for missing labels.
        df['num_char'] = df[label_column].apply(
            lambda label: len(label) if not pd.isna(label) else 0)
    if num_tokens:
        # Whitespace-separated token count of the label; 0 for missing labels.
        df['num_tokens'] = df[label_column].apply(
            lambda label: len(label.split()) if not pd.isna(label) else 0)
    return df
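# Usage sketch: add both length features to an existing candidates DataFrame
# (candidates_df is hypothetical). For a label "New York City" this yields
# num_char == 13 and num_tokens == 3.
feats = mosaic_features('kg_labels', num_char=True, num_tokens=True, df=candidates_df)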
def __init__(self, es, output_column_name: str = 'retrieval_score',
             previous_match_column_name: str = 'retrieval_score'):
    self.es = es
    self.previous_match_column_name = previous_match_column_name
    self.ffv = FFV(previous_match_column_name)
    self.score_column_name = output_column_name
import pickle

import pandas as pd
import torch

# PairwiseNetwork, FFV and the exception classes are provided by the
# surrounding package (not shown in this snippet).


def predict(features, output_column, ranking_model, min_max_scaler_path, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not ranking_model or not min_max_scaler_path:
        raise RequiredInputParameterMissingException(
            'Both input parameters are required: {} and {}'.format(
                "ranking_model", "min_max_scaler_path"))

    model = PairwiseNetwork(14)
    model.load_state_dict(torch.load(ranking_model))
    with open(min_max_scaler_path, 'rb') as fp:
        scaler = pickle.load(fp)

    normalize_features = features.split(",")
    df[normalize_features] = df[normalize_features].astype('float64')

    new_df_list = []
    pred = []
    for key, cell_df in df.groupby(['column', 'row']):
        # Min-max scale the feature columns per cell before scoring.
        cell_df[normalize_features] = scaler.transform(cell_df[normalize_features])
        df_copy = cell_df.copy()
        df_features = df_copy[normalize_features]
        new_df_list.append(df_copy)

        test_tensor = torch.tensor(df_features.to_numpy()).float()
        scores = model.predict(test_tensor)
        pred.extend(torch.squeeze(scores).tolist())

    out_df = pd.concat(new_df_list)
    out_df[output_column] = pred
    return out_df
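# Usage sketch (paths and feature names hypothetical): the comma-separated
# feature list must match what the model was trained on; PairwiseNetwork(14)
# above implies 14 feature columns.
scored = predict(
    features='pagerank,retrieval_score,num_char,num_tokens',  # ...plus the remaining trained features
    output_column='rank_score',
    ranking_model='models/pairwise_model.pt',
    min_max_scaler_path='models/min_max_scaler.pkl',
    file_path='data/candidates_with_features.csv')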
import pandas as pd


def create_singleton_feature(output_column, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    # Cells for which exact-match retrieval returned exactly one candidate.
    exact_match_count = df[df['method'] == 'exact-match'].groupby(
        ['column', 'row'])[['kg_id']].count()
    exact_match_singleton = list(
        exact_match_count[exact_match_count['kg_id'] == 1].index)
    df[output_column] = df.apply(
        lambda x: is_singleton(x.column, x.row, exact_match_singleton, x.method), axis=1)
    return df
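# The is_singleton helper is referenced above but not shown. A plausible sketch,
# assuming the feature is 1 for an exact-match candidate whose cell produced
# exactly one exact-match candidate, else 0:
def is_singleton(column, row, exact_match_singleton, method):
    if method == 'exact-match' and (column, row) in exact_match_singleton:
        return 1
    return 0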
def __init__(self, api_url='https://kgtk.isi.edu/api'):
    self.api_url = api_url
    self.ffv = FFV()
import pandas as pd
import requests


class KGTKSearchMatches(object):
    def __init__(self, api_url='https://kgtk.isi.edu/api'):
        self.api_url = api_url
        self.ffv = FFV()

    def get_matches(self, column, size=50, file_path=None, df=None,
                    output_column_name: str = "retrieval_score"):
        """
        Uses the KGTK search API to retrieve identifiers of KG entities matching
        the input search term.

        Args:
            column: the column used for retrieving candidates.
            size: maximum number of candidates to retrieve, default is 50.
            file_path: input file in canonical format.
            df: input dataframe in canonical format.
            output_column_name: the output column where the retrieval scores
              will be stored. Default is "retrieval_score".

        Returns: a dataframe in candidates format
        """
        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format("file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)
        columns = df.columns

        # Query the API once per unique value in the input column.
        uniq_labels = list(df[column].unique())
        results_dict = {}
        for uniq_label in uniq_labels:
            api_search_url = f"{self.api_url}/{uniq_label}" \
                             f"?extra_info=true&language=en&type=ngram&size={size}&lowercase=true"
            results_dict[uniq_label] = requests.get(api_search_url, verify=False).json()

        new_df_list = list()
        seen_dict = {}
        for i, row in df.iterrows():
            row_key = f"{row['column']}_{row['row']}_{row[column]}"
            if row_key not in seen_dict:
                search_results = results_dict.get(row[column], [])
                if len(search_results) > 0:
                    # One output row per returned candidate.
                    for sr in search_results:
                        new_row = {c: row[c] for c in columns}
                        new_row['kg_id'] = sr['qnode']
                        new_row['pagerank'] = sr['pagerank']
                        kg_label = []
                        kg_description = ''
                        if 'label' in sr and len(sr['label']) > 0:
                            kg_label.extend(sr['label'])
                        if 'alias' in sr and len(sr['alias']) > 0:
                            kg_label.extend(sr['alias'])
                        new_row['kg_labels'] = "|".join(kg_label)
                        new_row['method'] = 'kgtk-search'
                        if 'description' in sr and len(sr['description']) > 0:
                            kg_description = "|".join(sr['description'])
                        new_row['kg_descriptions'] = kg_description
                        new_row[output_column_name] = sr['score']
                        new_df_list.append(new_row)
                else:
                    # No candidates: emit an empty placeholder row for the cell.
                    new_row = {c: row[c] for c in columns}
                    new_row['kg_id'] = ''
                    new_row['pagerank'] = ''
                    new_row['kg_labels'] = ''
                    new_row['method'] = ''
                    new_row['kg_descriptions'] = ''
                    new_row[output_column_name] = ''
                    new_df_list.append(new_row)
                seen_dict[row_key] = 1

        if self.ffv.is_canonical_file(df):
            return pd.DataFrame(new_df_list)

        if self.ffv.is_candidates_file(df):
            return pd.concat([df, pd.DataFrame(new_df_list)
                              ]).sort_values(by=['column', 'row', column])

        raise UnsupportTypeError(
            "The input file is neither a canonical file nor a candidate file!")
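# Usage sketch: fetch up to 20 KGTK-search candidates per unique value of the
# 'label' column of a canonical file (file name hypothetical).
matcher = KGTKSearchMatches()
candidates = matcher.get_matches('label', size=20, file_path='data/canonical.csv')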
def __init__(self, *args, **kwargs):
    super(TestCanonicalize, self).__init__(*args, **kwargs)
    self.input_csv = pd.read_csv('{}/data/input.csv'.format(parent_path), dtype=object)
    self.input_tsv_path = '{}/data/input.tsv'.format(parent_path)
    self.ffv = FFV()
import pandas as pd


class Utility(object):
    def __init__(self, es, output_column_name: str = 'retrieval_score',
                 previous_match_column_name: str = 'retrieval_score'):
        self.es = es
        self.previous_match_column_name = previous_match_column_name
        self.ffv = FFV(previous_match_column_name)
        self.score_column_name = output_column_name

    def create_candidates_df(self, df, column, size, properties, method, lower_case=False):
        properties = properties.split(',')
        candidates_format = list()
        df_columns = df.columns

        if self.ffv.is_canonical_file(df):
            candidates_format = self.create_cfd_canonical(
                df, df_columns, column, size, properties, method, lower_case)
            return pd.DataFrame(candidates_format)

        elif self.ffv.is_candidates_file(df):
            grouped = df.groupby(by=['column', 'row', column])
            relevant_columns = [
                c for c in df_columns
                if c not in ['kg_id', 'kg_labels', 'method', self.previous_match_column_name]
            ]
            for key_tuple, gdf in grouped:
                gdf.reset_index(inplace=True)
                row_tuple = ((c, gdf.at[0, c]) for c in relevant_columns)
                candidates_format.extend(
                    self.create_cfd_candidates(row_tuple, column, size, properties,
                                               method, lower_case))
            return pd.concat([df, pd.DataFrame(candidates_format)])

        else:
            raise UnsupportTypeError(
                "The input df is neither in canonical format nor in candidate format!")

    def create_cfd_canonical(self, df, relevant_columns, column, size, properties,
                             method, lower_case):
        candidates_format = list()
        for i, row in df.iterrows():
            candidate_dict = self.es.search_term_candidates(
                row[column], size, properties, method, lower_case=lower_case)

            if not candidate_dict:
                # No candidates: emit an empty placeholder row for the cell.
                cf_dict = {}
                for key in relevant_columns:
                    if key not in ['kg_id', 'kg_labels', 'method', self.score_column_name]:
                        cf_dict[key] = row[key]
                cf_dict['kg_id'] = ""
                cf_dict['kg_labels'] = ""
                cf_dict['method'] = method
                cf_dict[self.score_column_name] = 0.0
                candidates_format.append(cf_dict)
            else:
                for kg_id in candidate_dict:
                    cf_dict = {}
                    for key in relevant_columns:
                        if key not in ['kg_id', 'kg_labels', 'method', self.score_column_name]:
                            cf_dict[key] = row[key]
                    cf_dict['kg_id'] = kg_id
                    cf_dict['kg_labels'] = candidate_dict[kg_id]['label_str']
                    cf_dict['method'] = method
                    cf_dict[self.score_column_name] = candidate_dict[kg_id]['score']
                    candidates_format.append(cf_dict)
        return candidates_format

    def create_cfd_candidates(self, key_tuple, column, size, properties, method, lower_case):
        candidates_format = list()
        row = dict(key_tuple)
        candidate_dict = self.es.search_term_candidates(
            row[column], size, properties, method, lower_case=lower_case)

        if not candidate_dict:
            cf_dict = dict(row)
            cf_dict['kg_id'] = ""
            cf_dict['kg_labels'] = ""
            cf_dict['method'] = method
            cf_dict[self.score_column_name] = 0.0
            candidates_format.append(cf_dict)
        else:
            for kg_id in candidate_dict:
                cf_dict = dict(row)
                cf_dict['kg_id'] = kg_id
                cf_dict['kg_labels'] = candidate_dict[kg_id]['label_str']
                cf_dict['method'] = method
                cf_dict[self.score_column_name] = candidate_dict[kg_id]['score']
                candidates_format.append(cf_dict)
        return candidates_format
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat

import pandas as pd


class Utility(object):
    def __init__(self, es, output_column_name: str = 'retrieval_score',
                 previous_match_column_name: str = 'retrieval_score'):
        self.es = es
        self.previous_match_column_name = previous_match_column_name
        self.ffv = FFV(previous_match_column_name)
        self.score_column_name = output_column_name

    def create_candidates_df(self, df, column, size, properties, method, lower_case=False,
                             auxiliary_fields=None, auxiliary_folder=None,
                             auxiliary_file_prefix='', extra_musts: dict = None,
                             max_threads=50):
        properties = [_.strip() for _ in properties.split(',')]
        candidates_format = list()
        df_columns = df.columns
        all_candidates_aux_dict = {}
        max_threads = min(df.shape[0], max_threads)

        if self.ffv.is_canonical_file(df):
            rows = df.to_dict("records")
            # Fan candidate generation out over a thread pool, one row per task.
            with ThreadPoolExecutor(max_workers=max_threads) as executor:
                for _candidates_format, candidates_aux_dict in executor.map(
                        self.create_candidates, rows, repeat(df_columns), repeat(column),
                        repeat(size), repeat(properties), repeat(method), repeat(lower_case),
                        repeat(auxiliary_fields), repeat(extra_musts)):
                    all_candidates_aux_dict = {**all_candidates_aux_dict, **candidates_aux_dict}
                    candidates_format.extend(_candidates_format)

            self.write_auxiliary_files(auxiliary_folder, all_candidates_aux_dict,
                                       auxiliary_fields, prefix=auxiliary_file_prefix)
            return pd.DataFrame(candidates_format)

        elif self.ffv.is_candidates_file(df):
            grouped = df.groupby(by=['column', 'row', column])
            relevant_columns = [
                c for c in df_columns
                if c not in ['kg_id', 'kg_labels', 'method', 'kg_descriptions',
                             self.previous_match_column_name]
            ]
            rows = list()
            for key_tuple, gdf in grouped:
                gdf.reset_index(inplace=True)
                rows.append({c: gdf.at[0, c] for c in relevant_columns})

            with ThreadPoolExecutor(max_workers=max_threads) as executor:
                for _candidates_format, candidates_aux_dict in executor.map(
                        self.create_candidates, rows, repeat(relevant_columns), repeat(column),
                        repeat(size), repeat(properties), repeat(method), repeat(lower_case),
                        repeat(auxiliary_fields), repeat(extra_musts)):
                    all_candidates_aux_dict = {**all_candidates_aux_dict, **candidates_aux_dict}
                    candidates_format.extend(_candidates_format)

            self.write_auxiliary_files(auxiliary_folder, all_candidates_aux_dict,
                                       auxiliary_fields, prefix=auxiliary_file_prefix)
            return pd.concat([df, pd.DataFrame(candidates_format)])

        else:
            raise UnsupportTypeError(
                "The input df is neither in canonical format nor in candidate format!")

    def create_candidates(self, row, relevant_columns, column, size, properties, method,
                          lower_case, auxiliary_fields=None, extra_musts=None):
        candidates_format = list()
        candidate_dict, candidate_aux_dict = self.es.search_term_candidates(
            row[column], size, properties, method, lower_case=lower_case,
            auxiliary_fields=auxiliary_fields, extra_musts=extra_musts)

        if not candidate_dict:
            # No candidates: emit an empty placeholder row for the cell.
            cf_dict = {k: row[k] for k in relevant_columns}
            cf_dict['kg_id'] = ""
            cf_dict['kg_labels'] = ""
            cf_dict['kg_aliases'] = ""
            cf_dict['method'] = method
            cf_dict['kg_descriptions'] = ""
            cf_dict['pagerank'] = 0.0
            cf_dict[self.score_column_name] = 0.0
            candidates_format.append(cf_dict)
        else:
            for kg_id in candidate_dict:
                cf_dict = {k: row[k] for k in relevant_columns}
                cf_dict['kg_id'] = kg_id
                cf_dict['kg_labels'] = candidate_dict[kg_id]['label_str']
                cf_dict['kg_aliases'] = candidate_dict[kg_id]['alias_str']
                cf_dict['method'] = method
                cf_dict['kg_descriptions'] = candidate_dict[kg_id]['description_str']
                cf_dict['pagerank'] = candidate_dict[kg_id]['pagerank_float']
                cf_dict[self.score_column_name] = candidate_dict[kg_id]['score']
                candidates_format.append(cf_dict)
        return candidates_format, candidate_aux_dict

    def write_auxiliary_files(self, auxiliary_folder, all_candidates_aux_dict,
                              auxiliary_fields, prefix=''):
        aux_rows = {}
        if auxiliary_fields is not None:
            for aux_field in auxiliary_fields:
                aux_rows[aux_field] = list()
            for qnode in all_candidates_aux_dict:
                qnode_dict = all_candidates_aux_dict[qnode]
                for aux_field in auxiliary_fields:
                    if aux_field in qnode_dict:
                        _val = qnode_dict[aux_field]
                        if isinstance(_val, list):
                            _val = ','.join([str(x) for x in _val])
                        aux_rows[aux_field].append({'qnode': qnode, aux_field: _val})

        # One TSV per auxiliary field, keyed by qnode.
        for key in aux_rows:
            aux_df = pd.DataFrame(aux_rows[key])
            if len(aux_df) > 0:
                aux_df.to_csv(f"{auxiliary_folder}/{prefix}{key}.tsv", sep='\t', index=False)
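# Usage sketch (all names hypothetical): given an Elasticsearch wrapper `es`
# exposing search_term_candidates(...) as called above, generate candidates for
# the 'label' column of a canonical DataFrame and write any requested auxiliary
# fields to TSV files.
util = Utility(es, output_column_name='retrieval_score')
candidates = util.create_candidates_df(
    canonical_df, column='label', size=50, properties='labels,aliases',
    method='fuzzy-augmented', auxiliary_fields=['graph_embedding_complex'],
    auxiliary_folder='/tmp/aux')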