Example #1
def generate_reciprocal_rank(score_column, output_column, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if score_column is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {}'.format('score_column'))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)
        
    df.fillna("", inplace=True)
    df = df.astype(dtype={score_column: "float64"})
    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    final_list = []
    grouped_obj = df.groupby(['row', 'column'])
    for key, group in grouped_obj:
        group = group.copy()
        # Rank candidates within the cell by descending score; the feature is 1/rank.
        group[output_column] = list(
            1 / group[score_column].rank(method='first', ascending=False))
        final_list.extend(group.to_dict(orient='records'))
    
    odf = pd.DataFrame(final_list)
    return odf
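
The core of the loop is a per-cell rank inversion: within each (row, column) group, candidates are ranked by descending score, so the top candidate gets 1.0, the second 0.5, and so on. A minimal, self-contained sketch of that step with plain pandas (the column names mirror the candidates format; the scores are made up):

import pandas as pd

toy = pd.DataFrame({
    'row':    [0, 0, 0, 1, 1],
    'column': [0, 0, 0, 0, 0],
    'kg_id':  ['Q1', 'Q2', 'Q3', 'Q4', 'Q5'],
    'score':  [0.9, 0.5, 0.7, 0.2, 0.8],
})

# Rank within each (row, column) cell by descending score, then invert.
toy['reciprocal_rank'] = (
    1 / toy.groupby(['row', 'column'])['score']
           .rank(method='first', ascending=False))
print(toy)

This vectorized form yields the same numbers as the per-group loop without rebuilding the frame row by row.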
Example #2
class TestCanonicalize(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestCanonicalize, self).__init__(*args, **kwargs)
        self.input_csv = pd.read_csv('{}/data/input.csv'.format(parent_path), dtype=object)
        self.input_tsv_path = '{}/data/input.tsv'.format(parent_path)
        self.ffv = FFV()

    def test_canonicalize_columns_1(self):
        odf = preprocess.canonicalize('col0,col1,col2,col3,col4', df=self.input_csv)
        odf.to_csv('{}/data/canonical.csv'.format(parent_path), index=False)
        self.assertTrue(self.ffv.is_canonical_file(odf))
        self.assertEqual(len(odf), 685)
        columns = odf.columns
        self.assertTrue('label' in columns)

    def test_canonicalize_columns_2(self):
        odf = preprocess.canonicalize('col0', df=self.input_csv, output_column='alias')
        self.assertTrue(self.ffv.is_canonical_file(odf))
        columns = odf.columns
        self.assertTrue('alias' in columns)

    def test_canonicalize_tsv(self):
        odf = preprocess.canonicalize('col0,col3', file_path=self.input_tsv_path, file_type='tsv')
        self.assertEqual(len(odf), 274)
        self.assertTrue(self.ffv.is_canonical_file(odf))
Example #3
def mosaic_features(label_column,
                    num_char,
                    num_tokens,
                    file_path=None,
                    df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not num_char and not num_tokens:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "num_char", "num_tokens"))

    if num_char:
        df['num_char'] = df[label_column].apply(
            lambda label: len(label) if not pd.isna(label) else 0)

    if num_tokens:
        df['num_tokens'] = df[label_column].apply(
            lambda label: len(label.split()) if not pd.isna(label) else 0)

    return df
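
Both features reduce to a null-safe apply over the label column. A tiny sketch of just that step on toy labels (not a tl candidates file):

import pandas as pd

labels = pd.Series(['Barack Obama', 'New York City', None])

# Character count, with missing labels mapped to 0.
num_char = labels.apply(lambda l: len(l) if not pd.isna(l) else 0)
# Whitespace-token count, same null handling.
num_tokens = labels.apply(lambda l: len(l.split()) if not pd.isna(l) else 0)

print(num_char.tolist())    # [12, 13, 0]
print(num_tokens.tolist())  # [2, 3, 0]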
Example #4
def __init__(self,
             es,
             output_column_name: str = 'retrieval_score',
             previous_match_column_name: str = 'retrieval_score'):
    self.es = es
    self.previous_match_column_name = previous_match_column_name
    self.ffv = FFV(previous_match_column_name)
    self.score_column_name = output_column_name
Example #5
def predict(features,
            output_column,
            ranking_model,
            min_max_scaler_path,
            file_path=None,
            df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not ranking_model and not min_max_scaler_path:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "ranking_model", "min_max_scaler_path"))

    model = PairwiseNetwork(14)
    model.load_state_dict(torch.load(ranking_model))
    with open(min_max_scaler_path, 'rb') as fd:
        scaler = pickle.load(fd)

    normalize_features = features.split(",")
    df[normalize_features] = df[normalize_features].astype('float64')
    grouped_obj = df.groupby(['column', 'row'])
    new_df_list = []
    pred = []
    for cell in grouped_obj:
        cell[1][normalize_features] = scaler.transform(
            cell[1][normalize_features])
        df_copy = cell[1].copy()
        df_features = df_copy[normalize_features]
        new_df_list.append(df_copy)
        test_tensor = torch.tensor(df_features.to_numpy()).float()
        scores = model.predict(test_tensor)
        pred.extend(torch.squeeze(scores).tolist())
    out_df = pd.concat(new_df_list)
    out_df[output_column] = pred

    return out_df
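
Inside the loop, each cell's feature block is scaled with the pre-fit scaler, converted to a float tensor, and scored by the model. A runnable sketch of that data path; the freshly fit MinMaxScaler and the stand-in linear layer replace the pickled scaler and the trained PairwiseNetwork purely for illustration:

import torch
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

feats = pd.DataFrame({'f1': [1.0, 2.0, 3.0], 'f2': [10.0, 20.0, 30.0]})

# predict() loads a pre-fit scaler from min_max_scaler_path; we fit one
# here only so the sketch runs on its own.
scaler = MinMaxScaler().fit(feats)
scaled = scaler.transform(feats)

# ndarray -> float tensor, as in the loop above.
test_tensor = torch.tensor(scaled).float()

# Stand-in for PairwiseNetwork(14): any module mapping features to one score.
model = torch.nn.Linear(2, 1)
with torch.no_grad():
    scores = torch.squeeze(model(test_tensor)).tolist()
print(scores)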
Example #6
def create_singleton_feature(output_column, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    exact_match_count = df[df['method'] == 'exact-match'].groupby(
        ['column', 'row'])[['kg_id']].count()
    exact_match_singleton = list(
        exact_match_count[exact_match_count['kg_id'] == 1].index)

    df[output_column] = df.apply(
        lambda x: is_singleton(x.column, x.row, exact_match_singleton, x.method),
        axis=1)
    return df
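
The singleton test is a groupby count: a cell qualifies when it has exactly one 'exact-match' candidate. A self-contained sketch of that computation on toy data; is_singleton itself is defined elsewhere in tl, so the final flag below is a plausible inlining of it, not its actual source:

import pandas as pd

toy = pd.DataFrame({
    'column': [0, 0, 0],
    'row':    [0, 1, 1],
    'kg_id':  ['Q1', 'Q2', 'Q3'],
    'method': ['exact-match'] * 3,
})

counts = toy[toy['method'] == 'exact-match'].groupby(
    ['column', 'row'])[['kg_id']].count()
singletons = list(counts[counts['kg_id'] == 1].index)
print(singletons)  # [(0, 0)] -- cell (0, 0) has one candidate, cell (0, 1) has two

# Plausible inlining of the per-row flag computed via is_singleton above.
toy['singleton'] = toy.apply(
    lambda x: 1 if (x.column, x.row) in singletons
    and x.method == 'exact-match' else 0,
    axis=1)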
Example #7
def __init__(self, api_url='https://kgtk.isi.edu/api'):
    self.api_url = api_url
    self.ffv = FFV()
Example #8
class KGTKSearchMatches(object):
    def __init__(self, api_url='https://kgtk.isi.edu/api'):
        self.api_url = api_url
        self.ffv = FFV()

    def get_matches(self,
                    column,
                    size=50,
                    file_path=None,
                    df=None,
                    output_column_name: str = "retrieval_score"):
        """
        Uses the KGTK search API to retrieve identifiers of KG entities matching the input search term.

        Args:
            column: the column used for retrieving candidates.
            size: maximum number of candidates to retrieve; default is 50.
            file_path: input file in canonical format.
            df: input dataframe in canonical format.
            output_column_name: the output column where the retrieval scores will be stored. Default is 'retrieval_score'.

        Returns: a dataframe in candidates format

        """
        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format(
                    "file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)
        columns = df.columns

        uniq_labels = list(df[column].unique())

        results_dict = {}
        for uniq_label in uniq_labels:
            api_search_url = f"{self.api_url}/" \
                             f"{uniq_label}?extra_info=true&language=en&type=ngram&size={size}&lowercase=true"
            results_dict[uniq_label] = requests.get(api_search_url,
                                                    verify=False).json()

        new_df_list = list()
        seen_dict = {}
        for i, row in df.iterrows():
            row_key = f"{row['column']}_{row['row']}_{row[column]}"
            if row_key not in seen_dict:
                search_results = results_dict.get(row[column], [])
                if len(search_results) > 0:
                    for sr in search_results:
                        _ = {}
                        for c in columns:
                            _[c] = row[c]

                        _['kg_id'] = sr['qnode']
                        _['pagerank'] = sr['pagerank']
                        kg_label = []
                        kg_description = ''

                        if 'label' in sr and len(sr['label']) > 0:
                            kg_label.extend(sr['label'])
                        if 'alias' in sr and len(sr['alias']) > 0:
                            kg_label.extend(sr['alias'])
                        _['kg_labels'] = "|".join(kg_label)

                        _['method'] = 'kgtk-search'

                        if 'description' in sr and len(sr['description']) > 0:
                            kg_description = "|".join(sr['description'])
                        _['kg_descriptions'] = kg_description

                        _[output_column_name] = sr['score']
                        new_df_list.append(_)
                else:
                    _ = {}
                    for c in columns:
                        _[c] = row[c]

                    _['kg_id'] = ''
                    _['pagerank'] = ''
                    _['kg_labels'] = ''
                    _['method'] = ''
                    _['kg_descriptions'] = ''
                    _[output_column_name] = ''
                    new_df_list.append(_)
                seen_dict[row_key] = 1

        if self.ffv.is_canonical_file(df):
            return pd.DataFrame(new_df_list)

        if self.ffv.is_candidates_file(df):
            return pd.concat([df, pd.DataFrame(new_df_list)]).sort_values(
                by=['column', 'row', column])

        raise UnsupportTypeError(
            "The input file is neither a canonical file nor a candidates file!")
Example #9
def __init__(self, *args, **kwargs):
    super(TestCanonicalize, self).__init__(*args, **kwargs)
    self.input_csv = pd.read_csv('{}/data/input.csv'.format(parent_path), dtype=object)
    self.input_tsv_path = '{}/data/input.tsv'.format(parent_path)
    self.ffv = FFV()
Example #10
class Utility(object):
    def __init__(self,
                 es,
                 output_column_name: str = 'retrieval_score',
                 previous_match_column_name: str = 'retrieval_score'):
        self.es = es
        self.previous_match_column_name = previous_match_column_name
        self.ffv = FFV(previous_match_column_name)
        self.score_column_name = output_column_name

    def create_candidates_df(self,
                             df,
                             column,
                             size,
                             properties,
                             method,
                             lower_case=False):
        properties = properties.split(',')
        candidates_format = list()
        df_columns = df.columns

        if self.ffv.is_canonical_file(df):
            candidates_format = self.create_cfd_canonical(
                df, df_columns, column, size, properties, method, lower_case)

            return pd.DataFrame(candidates_format)

        elif self.ffv.is_candidates_file(df):
            grouped = df.groupby(by=['column', 'row', column])
            relevant_columns = [
                c for c in df_columns if c not in [
                    'kg_id', 'kg_labels', 'method',
                    self.previous_match_column_name
                ]
            ]
            for key_tuple, gdf in grouped:
                gdf.reset_index(inplace=True)
                # (column name, value) pairs for the first row of the group.
                row_pairs = ((c, gdf.at[0, c]) for c in relevant_columns)

                candidates_format.extend(
                    self.create_cfd_candidates(row_pairs, column, size,
                                               properties, method, lower_case))
            return pd.concat([df, pd.DataFrame(candidates_format)])

        else:
            raise UnsupportTypeError(
                "The input df is neither in canonical format"
                " nor in candidates format!")

    def create_cfd_canonical(self, df, relevant_columns, column, size,
                             properties, method, lower_case):
        candidates_format = list()

        for i, row in df.iterrows():
            candidate_dict = self.es.search_term_candidates(
                row[column], size, properties, method, lower_case=lower_case)

            if not candidate_dict:
                cf_dict = {}
                for key in relevant_columns:
                    if key not in [
                            'kg_id', 'kg_labels', 'method',
                            self.score_column_name
                    ]:
                        cf_dict[key] = row[key]

                cf_dict['kg_id'] = ""
                cf_dict['kg_labels'] = ""
                cf_dict['method'] = method
                cf_dict[self.score_column_name] = 0.0
                candidates_format.append(cf_dict)
            else:
                for kg_id in candidate_dict:
                    cf_dict = {}
                    for key in relevant_columns:
                        if key not in [
                                'kg_id', 'kg_labels', 'method',
                                self.score_column_name
                        ]:
                            cf_dict[key] = row[key]

                    cf_dict['kg_id'] = kg_id
                    cf_dict['kg_labels'] = candidate_dict[kg_id]['label_str']
                    cf_dict['method'] = method
                    cf_dict[self.score_column_name] = candidate_dict[kg_id]['score']
                    candidates_format.append(cf_dict)
        return candidates_format

    def create_cfd_candidates(self, key_tuple, column, size, properties,
                              method, lower_case):
        candidates_format = list()

        _ = dict(key_tuple)

        candidate_dict = self.es.search_term_candidates(_[column],
                                                        size,
                                                        properties,
                                                        method,
                                                        lower_case=lower_case)

        if not candidate_dict:
            cf_dict = {}

            for k in _:
                cf_dict[k] = _[k]

            cf_dict['kg_id'] = ""
            cf_dict['kg_labels'] = ""
            cf_dict['method'] = method
            cf_dict[self.score_column_name] = 0.0
            candidates_format.append(cf_dict)
        else:
            for kg_id in candidate_dict:
                cf_dict = {}
                for k in _:
                    cf_dict[k] = _[k]

                cf_dict['kg_id'] = kg_id
                cf_dict['kg_labels'] = candidate_dict[kg_id]['label_str']
                cf_dict['method'] = method
                cf_dict[self.score_column_name] = candidate_dict[kg_id]['score']
                candidates_format.append(cf_dict)
        return candidates_format
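
create_candidates_df dispatches on the detected input format: a canonical file produces a fresh candidates DataFrame, while an existing candidates file has the new candidates appended to it. A hypothetical call, assuming es is an Elasticsearch wrapper exposing the search_term_candidates method used above and df is a canonical-format DataFrame:

util = Utility(es)
out_df = util.create_candidates_df(df,
                                   column='label',
                                   size=20,
                                   properties='labels,aliases',
                                   method='exact-match')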
Example #11
class Utility(object):
    def __init__(self,
                 es,
                 output_column_name: str = 'retrieval_score',
                 previous_match_column_name: str = 'retrieval_score'):
        self.es = es
        self.previous_match_column_name = previous_match_column_name
        self.ffv = FFV(previous_match_column_name)
        self.score_column_name = output_column_name

    def create_candidates_df(self,
                             df,
                             column,
                             size,
                             properties,
                             method,
                             lower_case=False,
                             auxiliary_fields=None,
                             auxiliary_folder=None,
                             auxiliary_file_prefix='',
                             extra_musts: dict = None,
                             max_threads=50):
        properties = [_.strip() for _ in properties.split(',')]
        candidates_format = list()
        df_columns = df.columns
        all_candidates_aux_dict = {}
        max_threads = min(df.shape[0], max_threads)

        if self.ffv.is_canonical_file(df):
            rows = df.to_dict("records")
            with ThreadPoolExecutor(max_workers=max_threads) as executor:
                for _candidates_format, candidates_aux_dict in executor.map(
                        self.create_candidates, rows, repeat(df_columns),
                        repeat(column), repeat(size), repeat(properties),
                        repeat(method), repeat(lower_case),
                        repeat(auxiliary_fields), repeat(extra_musts)):
                    all_candidates_aux_dict = {
                        **all_candidates_aux_dict,
                        **candidates_aux_dict
                    }
                    candidates_format.extend(_candidates_format)
            self.write_auxiliary_files(auxiliary_folder,
                                       all_candidates_aux_dict,
                                       auxiliary_fields,
                                       prefix=auxiliary_file_prefix)
            return pd.DataFrame(candidates_format)
        elif self.ffv.is_candidates_file(df):
            grouped = df.groupby(by=['column', 'row', column])
            relevant_columns = [
                c for c in df_columns if c not in [
                    'kg_id', 'kg_labels', 'method', 'kg_descriptions',
                    self.previous_match_column_name
                ]
            ]
            rows = list()
            for key_tuple, gdf in grouped:
                gdf.reset_index(inplace=True)
                rows.append({c: gdf.at[0, c] for c in relevant_columns})
            with ThreadPoolExecutor(max_workers=max_threads) as executor:
                for _candidates_format, candidates_aux_dict in executor.map(
                        self.create_candidates, rows, repeat(relevant_columns),
                        repeat(column), repeat(size), repeat(properties),
                        repeat(method), repeat(lower_case),
                        repeat(auxiliary_fields), repeat(extra_musts)):
                    all_candidates_aux_dict = {
                        **all_candidates_aux_dict,
                        **candidates_aux_dict
                    }
                    candidates_format.extend(_candidates_format)
            self.write_auxiliary_files(auxiliary_folder,
                                       all_candidates_aux_dict,
                                       auxiliary_fields,
                                       prefix=auxiliary_file_prefix)
            return pd.concat([df, pd.DataFrame(candidates_format)])
        else:
            raise UnsupportTypeError(
                "The input df is neither in canonical format"
                " nor in candidates format!")

    def create_candidates(self,
                          row,
                          relevant_columns,
                          column,
                          size,
                          properties,
                          method,
                          lower_case,
                          auxiliary_fields=None,
                          extra_musts=None):
        candidates_format = list()

        _ = dict(row)

        candidate_dict, candidate_aux_dict = self.es.search_term_candidates(
            _[column],
            size,
            properties,
            method,
            lower_case=lower_case,
            auxiliary_fields=auxiliary_fields,
            extra_musts=extra_musts)

        if not candidate_dict:
            cf_dict = {}

            for k in relevant_columns:
                cf_dict[k] = _[k]

            cf_dict['kg_id'] = ""
            cf_dict['kg_labels'] = ""
            cf_dict['kg_aliases'] = ""
            cf_dict['method'] = method
            cf_dict['kg_descriptions'] = ""
            cf_dict['pagerank'] = 0.0
            cf_dict[self.score_column_name] = 0.0
            candidates_format.append(cf_dict)
        else:
            for kg_id in candidate_dict:
                cf_dict = {}
                for k in relevant_columns:
                    cf_dict[k] = _[k]

                cf_dict['kg_id'] = kg_id
                cf_dict['kg_labels'] = candidate_dict[kg_id]['label_str']
                cf_dict['kg_aliases'] = candidate_dict[kg_id]['alias_str']
                cf_dict['method'] = method
                cf_dict['kg_descriptions'] = (
                    candidate_dict[kg_id]['description_str'])
                cf_dict['pagerank'] = candidate_dict[kg_id]['pagerank_float']
                cf_dict[self.score_column_name] = (
                    candidate_dict[kg_id]['score'])
                candidates_format.append(cf_dict)
        return candidates_format, candidate_aux_dict

    def write_auxiliary_files(self,
                              auxiliary_folder,
                              all_candidates_aux_dict,
                              auxiliary_fields,
                              prefix=''):
        _ = {}
        if auxiliary_fields is not None:
            for aux_field in auxiliary_fields:
                _[aux_field] = list()

            for qnode in all_candidates_aux_dict:
                qnode_dict = all_candidates_aux_dict[qnode]
                for aux_field in auxiliary_fields:
                    if aux_field in qnode_dict:
                        _val = qnode_dict[aux_field]
                        if isinstance(_val, list):
                            _val = ','.join([str(x) for x in _val])
                        _[aux_field].append({'qnode': qnode, aux_field: _val})

            for key in _:
                df = pd.DataFrame(_[key])
                if len(df) > 0:
                    df.to_csv(f"{auxiliary_folder}/{prefix}{key}.tsv",
                              sep='\t',
                              index=False)
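
The fan-out in create_candidates_df pairs each row with a fixed set of repeated arguments via executor.map and itertools.repeat, then merges the per-row results. A minimal, self-contained sketch of that pattern with a stand-in worker in place of create_candidates:

from concurrent.futures import ThreadPoolExecutor
from itertools import repeat

def lookup(row, size):
    # Stand-in for create_candidates(): returns (candidate list, aux dict).
    return [f"{row}-{i}" for i in range(2)], {row: size}

rows = ['a', 'b', 'c']
candidates, aux = [], {}
with ThreadPoolExecutor(max_workers=min(len(rows), 50)) as executor:
    # repeat() supplies the same size argument to every call, mirroring
    # repeat(column), repeat(size), ... in create_candidates_df above.
    for cands, aux_dict in executor.map(lookup, rows, repeat(20)):
        candidates.extend(cands)
        aux = {**aux, **aux_dict}
print(candidates, aux)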