Пример #1
0
    def block_candset(self, vtable):
        """
        Filter a candidate set (virtual MTable) with this blocker's rules

        Parameters
        ----------
        vtable : MTable
            Input candidate set. Its 'ltable', 'rtable', 'foreign_key_ltable'
            and 'foreign_key_rtable' properties are read to locate the source
            rows behind every candidate pair.

        Returns
        -------
        blocked_table : MTable
            The rows of vtable for which apply_rules returned True, built
            with key=vtable.get_key(). An empty vtable yields an empty MTable
            with vtable's columns.

        Notes
        -----
        The following properties are set on the returned table
            * ltable - ref to ltable
            * rtable - ref to rtable
            * foreign_key_ltable - copied from vtable
            * foreign_key_rtable - copied from vtable

        Progress is printed when mg._verbose is set; otherwise a pyprind
        progress bar is used when mg._progbar is set.
        """

        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')
        self.check_attrs(ltable, rtable, None, None)

        fk_ltable = vtable.get_property('foreign_key_ltable')
        fk_rtable = vtable.get_property('foreign_key_rtable')

        # index both source tables by their key (the key column is retained)
        left_indexed = ltable.set_index(ltable.get_key(), drop=False)
        right_indexed = rtable.set_index(rtable.get_key(), drop=False)

        # key value -> row, for quick access inside the main loop
        left_rows = {key: rec for key, rec in left_indexed.iterrows()}
        right_rows = {key: rec for key, rec in right_indexed.iterrows()}

        # one boolean per candidate pair, in vtable order
        keep_mask = []

        if mg._verbose:
            seen = 0
            step = math.ceil(mg._percent / 100.0 * len(vtable))
        elif mg._progbar:
            progress = pyprind.ProgBar(len(vtable))

        # positions of the two foreign-key columns inside each row tuple
        names = list(vtable.columns)
        l_pos = names.index(fk_ltable)
        r_pos = names.index(fk_rtable)

        for pair in vtable.itertuples(index=False):
            if mg._verbose:
                seen += 1
                if seen % step == 0:
                    print(str(mg._percent * seen / step) +
                          ' percentage done !!!')
            elif mg._progbar:
                progress.update()

            outcome = self.apply_rules(left_rows[pair[l_pos]],
                                       right_rows[pair[r_pos]])
            # only an exact True keeps the pair
            keep_mask.append(outcome is True)

        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[keep_mask], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())

        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        for prop in ('foreign_key_ltable', 'foreign_key_rtable'):
            out_table.set_property(prop, vtable.get_property(prop))
        return out_table
Пример #2
0
    def block_tables(self, ltable, rtable,
                     l_overlap_attr, r_overlap_attr,
                     rem_stop_words=False,
                     qgram=None, word_level=True, overlap_size=1,
                     l_output_attrs=None, r_output_attrs=None):
        """
        Block tables with overlap blocker

        Parameters
        ----------
        ltable, rtable : MTables, input MTables to block
        l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
        rem_stop_words : flag to indicate whether stop words should be removed
        qgram : int, value of q in qgram tokenizer. Default value is None
        word_level : boolean, flag to indicate to use word level tokenizer
        overlap_size : int, number of tokens to overlap
        l_output_attrs, r_output_attrs - list of attributes to be included in the output table

        Returns
        -------
        result : MTable
            Candidate pairs, sorted by the ltable and rtable key columns.
            An empty table (with the expected columns) is returned when no
            pair qualifies.

        Raises
        ------
        AssertionError
            If an overlap attribute is not a column of its table.
        SyntaxError
            If word_level is true and qgram is given as well.

        Notes
        -----
        The properties 'ltable', 'rtable', 'foreign_key_ltable' and
        'foreign_key_rtable' are set on the returned table.
        """
        # do some integrity checks
        if l_overlap_attr not in ltable.columns:
            raise AssertionError('Left overlap attribute  not in ltable columns')
        if r_overlap_attr not in rtable.columns:
            raise AssertionError('Right overlap attribute not in rtable columns')

        l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

        if word_level == True and qgram is not None:
            raise SyntaxError('Parameters word_level and qgram cannot be set together; Note that word_level is set '
                              'to true by default, so explicitly set word_level=False to use qgram')

        l_key = ltable.get_key()
        r_key = rtable.get_key()

        # remove nans
        l_df = self.rem_nan(ltable, l_overlap_attr)
        r_df = self.rem_nan(rtable, r_overlap_attr)

        # positional row numbers double as row ids in the inverted index
        l_df.reset_index(inplace=True, drop=True)
        r_df.reset_index(inplace=True, drop=True)

        # constant join column: merging on it pairs every selected left row
        # with the current right row
        l_df['_dummy_'] = 1
        r_df['_dummy_'] = 1

        if l_df.dtypes[l_overlap_attr] != object:
            logger.warning('Left overlap attribute is not of type string; converting to string temporarily')
            l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)

        if r_df.dtypes[r_overlap_attr] != object:
            logger.warning('Right overlap attribute is not of type string; converting to string temporarily')
            r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

        # row number -> row of the right table
        r_dict = {k: r for k, r in r_df.iterrows()}

        # tokenize the left column and tag every token list with its row number
        l_col_values_chopped = self.process_table(l_df, l_overlap_attr, qgram, rem_stop_words)
        appended_l_col_idx_values = [self.append_index_values(tokens, pos)
                                     for pos, tokens in enumerate(l_col_values_chopped)]

        inv_idx = {}
        if mg._verbose:
            print('Creating inverted index ')
        for tagged_tokens in appended_l_col_idx_values:
            for tagged_token in tagged_tokens:
                self.compute_inv_index(tagged_token, inv_idx)
        if mg._verbose:
            print('Done')

        r_col_values_chopped = self.process_table(r_df, r_overlap_attr, qgram, rem_stop_words)

        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent/100.0*len(rtable))
            # NOTE(review): looks like leftover debug output - kept so the
            # verbose output is unchanged
            print(per_count)
        elif mg._progbar:
            bar = pyprind.ProgBar(len(r_col_values_chopped))

        df_list = []
        for r_idx, col_values in enumerate(r_col_values_chopped):
            if mg._verbose:
                count += 1
                if count%per_count == 0:
                    print(str(mg._percent*count/per_count) + ' percentage done !!!')
            elif mg._progbar:
                bar.update()

            qualifying_ltable_indices = self.get_potential_match_indices(col_values, inv_idx, overlap_size)
            r_row_df = r_dict[r_idx].to_frame().T
            l_rows_df = l_df.iloc[qualifying_ltable_indices]
            df = l_rows_df.merge(r_row_df, on='_dummy_', suffixes=('_ltable', '_rtable'))
            if len(df) > 0:
                df_list.append(df)

        if df_list:
            candset = pd.concat(df_list)
        else:
            # pd.concat raises ValueError on an empty list; build an empty
            # frame carrying the same merged column names instead
            candset = l_df.iloc[0:0].merge(r_df.iloc[0:0], on='_dummy_', suffixes=('_ltable', '_rtable'))

        # get output columns
        retain_cols, final_cols = self.output_columns(l_key, r_key, list(candset.columns),
                                                      l_output_attrs, r_output_attrs)

        candset = candset[retain_cols]
        candset.columns = final_cols
        if len(candset) > 0:
            sort_cols = ['ltable.'+l_key, 'rtable.'+r_key]
            # DataFrame.sort was replaced by sort_values in later pandas releases
            if hasattr(candset, 'sort_values'):
                candset.sort_values(sort_cols, inplace=True)
            else:
                candset.sort(sort_cols, inplace=True)
            candset.reset_index(inplace=True, drop=True)
        candset = MTable(candset)

        # set metadata
        candset.set_property('ltable', ltable)
        candset.set_property('rtable', rtable)
        candset.set_property('foreign_key_ltable', 'ltable.'+l_key)
        candset.set_property('foreign_key_rtable', 'rtable.'+r_key)
        return candset
Пример #3
0
    def block_tables(self,
                     ltable,
                     rtable,
                     l_output_attrs=None,
                     r_output_attrs=None):
        """
        Block two tables

        Every (ltable row, rtable row) pair is passed to apply_rules; the
        pairs for which it returns True make up the output.

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables
        l_output_attrs, r_output_attrs : list (of strings), defaults to None
            attribute names to be included in the output table

        Returns
        -------
        blocked_table : MTable

        Notes
        -----
        Output MTable contains
            * 'ltable.' + ltable's key attribute
            * 'rtable.' + rtable's key attribute
            * the requested output attributes, prefixed with 'ltable.' /
              'rtable.'
        restricted to (and ordered by) the columns returned by
        get_attrs_to_retain.

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * foreign_key_ltable - string, 'ltable.' + ltable's key attribute name
            * foreign_key_rtable - string, 'rtable.' + rtable's key attribute name

        No key is explicitly added to the output by this method.
        """

        # do integrity checks
        l_output_attrs, r_output_attrs = self.check_attrs(
            ltable, rtable, l_output_attrs, r_output_attrs)
        block_list = []
        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent / 100.0 * len(ltable) *
                                  len(rtable))
        elif mg._progbar:
            bar = pyprind.ProgBar(len(ltable) * len(rtable))

        l_key = ltable.get_key()
        r_key = rtable.get_key()

        # key value -> row, for quick access inside the nested loop
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)
        l_dict = {k: row for k, row in l_df.iterrows()}
        r_dict = {k: row for k, row in r_df.iterrows()}

        lid_idx = ltable.get_attr_names().index(l_key)
        rid_idx = rtable.get_attr_names().index(r_key)

        # output labels of the two id columns do not change per pair
        ltable_id = 'ltable.' + l_key
        rtable_id = 'rtable.' + r_key

        for l_t in ltable.itertuples(index=False):
            # the left row is invariant across the inner loop
            l_row = l_dict[l_t[lid_idx]]
            # left output attributes, computed on the first surviving pair
            # of this left row and reused afterwards
            l_out = None

            for r_t in rtable.itertuples(index=False):
                if mg._verbose:
                    count += 1
                    if count % per_count == 0:
                        print(str(mg._percent * count /
                                  per_count) + ' percentage done !!!')
                elif mg._progbar:
                    bar.update()

                r_row = r_dict[r_t[rid_idx]]

                # check whether it passes
                res = self.apply_rules(l_row, r_row)
                if res is True:
                    d = OrderedDict()
                    # add left id first
                    d[ltable_id] = l_row[l_key]

                    # add right id
                    d[rtable_id] = r_row[r_key]

                    # add left attributes
                    if l_output_attrs:
                        if l_out is None:
                            l_out = l_row[l_output_attrs]
                            l_out.index = 'ltable.' + l_out.index
                        d.update(l_out)

                    # add right attributes
                    if r_output_attrs:
                        r_out = r_row[r_output_attrs]
                        r_out.index = 'rtable.' + r_out.index
                        d.update(r_out)
                    block_list.append(d)

        candset = pd.DataFrame(block_list)
        ret_cols = self.get_attrs_to_retain(l_key, r_key,
                                            l_output_attrs, r_output_attrs)

        if len(candset) > 0:
            candset = MTable(candset[ret_cols])
        else:
            # no surviving pair: still expose the expected columns
            candset = MTable(candset, columns=ret_cols)

        # set metadata
        candset.set_property('ltable', ltable)
        candset.set_property('rtable', rtable)
        candset.set_property('foreign_key_ltable', ltable_id)
        candset.set_property('foreign_key_rtable', rtable_id)
        return candset
Пример #4
0
    def block_candset(self, vtable):
        """
        Apply this blocker's rules to a candidate set (virtual MTable)

        Parameters
        ----------
        vtable : MTable
            Input candidate set; it must carry the 'ltable', 'rtable',
            'foreign_key_ltable' and 'foreign_key_rtable' properties, which
            are read here.

        Returns
        -------
        blocked_table : MTable
            Rows of vtable whose (ltable row, rtable row) pair made
            apply_rules return True. Built with key=vtable.get_key(); when
            vtable is empty an empty MTable with vtable's columns is returned.

        Notes
        -----
        The properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * foreign_key_ltable - same value as on vtable
            * foreign_key_rtable - same value as on vtable
        """

        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')
        self.check_attrs(ltable, rtable, None, None)

        l_fk_name = vtable.get_property('foreign_key_ltable')
        r_fk_name = vtable.get_property('foreign_key_rtable')

        # re-index the source tables by key, keeping the key column
        l_by_key = ltable.set_index(ltable.get_key(), drop=False)
        r_by_key = rtable.set_index(rtable.get_key(), drop=False)

        # look up tables: key value -> row
        l_lookup = {key: rec for key, rec in l_by_key.iterrows()}
        r_lookup = {key: rec for key, rec in r_by_key.iterrows()}

        # survival flag for every candidate pair, in order
        flags = []

        if mg._verbose:
            done = 0
            chunk = math.ceil(mg._percent/100.0*len(vtable))
        elif mg._progbar:
            prog = pyprind.ProgBar(len(vtable))

        # where the foreign keys sit inside each row tuple
        cols = list(vtable.columns)
        l_at = cols.index(l_fk_name)
        r_at = cols.index(r_fk_name)

        for tup in vtable.itertuples(index=False):
            if mg._verbose:
                done += 1
                if done % chunk == 0:
                    print(str(mg._percent*done/chunk) + ' percentage done !!!')
            elif mg._progbar:
                prog.update()

            l_rec = l_lookup[tup[l_at]]
            r_rec = r_lookup[tup[r_at]]

            # anything other than an exact True drops the pair
            flags.append(self.apply_rules(l_rec, r_rec) is True)

        # should be modified
        if len(vtable) > 0:
            out_table = MTable(vtable[flags], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())

        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        for name in ('foreign_key_ltable', 'foreign_key_rtable'):
            out_table.set_property(name, vtable.get_property(name))
        return out_table
Пример #5
0
    def block_candset(self, vtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False,
                      qgram=None, word_level=True, overlap_size=1):
        """
        Block candidate set with overlap blocker

        Parameters
        ----------
        vtable : MTable, candidate set to block
        l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
        rem_stop_words : flag to indicate whether stop words should be removed
        qgram : int, value of q in qgram tokenizer. Default value is None
        word_level : boolean, accepted but not read by this method
        overlap_size : int, minimum number of overlapping tokens for a pair to be kept

        Returns
        -------
        result : MTable
            Rows of vtable whose token overlap is at least overlap_size,
            built with key=vtable.get_key(). The 'ltable', 'rtable',
            'foreign_key_ltable' and 'foreign_key_rtable' properties are
            set on it.

        Raises
        ------
        AssertionError
            If an overlap attribute is not a column of its source table.
        """

        ltable = vtable.get_property('ltable')
        rtable = vtable.get_property('rtable')
        self.check_attrs(ltable, rtable, None, None)

        # do some integrity checks
        if l_overlap_attr not in ltable.columns:
            raise AssertionError('Left overlap attribute  not in ltable columns')
        if r_overlap_attr not in rtable.columns:
            raise AssertionError('Right overlap attribute not in rtable columns')

        fk_ltable = vtable.get_property('foreign_key_ltable')
        fk_rtable = vtable.get_property('foreign_key_rtable')

        # index the source tables by key, keeping the key column
        left_indexed = ltable.set_index(ltable.get_key(), drop=False)
        right_indexed = rtable.set_index(rtable.get_key(), drop=False)

        # key value -> row, for quick access of rows
        left_rows = {key: rec for key, rec in left_indexed.iterrows()}
        right_rows = {key: rec for key, rec in right_indexed.iterrows()}

        keep_mask = []

        # positions of the foreign-key columns inside each row tuple
        names = list(vtable.columns)
        l_pos = names.index(fk_ltable)
        r_pos = names.index(fk_rtable)

        if mg._verbose:
            seen = 0
            step = math.ceil(mg._percent/100.0*len(vtable))
            print(step)
        elif mg._progbar:
            progress = pyprind.ProgBar(len(vtable))

        for pair in vtable.itertuples(index=False):
            if mg._verbose:
                seen += 1
                if seen % step == 0:
                    print(str(mg._percent*seen/step) + ' percentage done !!!')
            elif mg._progbar:
                progress.update()

            l_row = left_rows[pair[l_pos]]
            r_row = right_rows[pair[r_pos]]

            shared = self.get_token_overlap_bt_two_tuples(l_row, r_row,
                                                          l_overlap_attr, r_overlap_attr,
                                                          qgram, rem_stop_words)

            # keep the pair only when enough tokens are shared
            keep_mask.append(True if shared >= overlap_size else False)

        if len(vtable) > 0:
            out_table = MTable(vtable[keep_mask], key=vtable.get_key())
        else:
            out_table = MTable(columns=vtable.columns, key=vtable.get_key())

        out_table.set_property('ltable', ltable)
        out_table.set_property('rtable', rtable)
        for prop in ('foreign_key_ltable', 'foreign_key_rtable'):
            out_table.set_property(prop, vtable.get_property(prop))
        return out_table
Пример #6
0
    def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None):
        """
        Block two tables

        Every (ltable row, rtable row) pair is passed to apply_rules; the
        pairs for which it returns True make up the output.

        Parameters
        ----------
        ltable, rtable : MTable
            Input MTables
        l_output_attrs, r_output_attrs : list (of strings), defaults to None
            attribute names to be included in the output table

        Returns
        -------
        blocked_table : MTable

        Notes
        -----
        Output MTable contains
            * 'ltable.' + ltable's key attribute
            * 'rtable.' + rtable's key attribute
            * the requested output attributes, prefixed with 'ltable.' /
              'rtable.'
        restricted to (and ordered by) the columns returned by
        get_attrs_to_retain.

        Also, the properties of blocked table is updated with following key-value pairs
            * ltable - ref to ltable
            * rtable - ref to rtable
            * foreign_key_ltable - string, 'ltable.' + ltable's key attribute name
            * foreign_key_rtable - string, 'rtable.' + rtable's key attribute name

        No key is explicitly added to the output by this method.
        """

        # do integrity checks
        l_output_attrs, r_output_attrs = self.check_attrs(ltable, rtable, l_output_attrs, r_output_attrs)
        block_list = []
        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent/100.0*len(ltable)*len(rtable))
        elif mg._progbar:
            bar = pyprind.ProgBar(len(ltable)*len(rtable))

        l_key = ltable.get_key()
        r_key = rtable.get_key()

        # key value -> row, for quick access inside the nested loop
        l_df = ltable.set_index(l_key, drop=False)
        r_df = rtable.set_index(r_key, drop=False)
        l_dict = {k: row for k, row in l_df.iterrows()}
        r_dict = {k: row for k, row in r_df.iterrows()}

        lid_idx = ltable.get_attr_names().index(l_key)
        rid_idx = rtable.get_attr_names().index(r_key)

        # output labels of the two id columns do not change per pair
        ltable_id = 'ltable.' + l_key
        rtable_id = 'rtable.' + r_key

        for l_t in ltable.itertuples(index=False):
            # the left row is invariant across the inner loop
            l_row = l_dict[l_t[lid_idx]]
            # left output attributes, computed on the first surviving pair
            # of this left row and reused afterwards
            l_out = None

            for r_t in rtable.itertuples(index=False):
                if mg._verbose:
                    count += 1
                    if count%per_count == 0:
                        print(str(mg._percent*count/per_count) + ' percentage done !!!')
                elif mg._progbar:
                    bar.update()

                r_row = r_dict[r_t[rid_idx]]

                # check whether it passes
                res = self.apply_rules(l_row, r_row)
                if res is True:
                    d = OrderedDict()
                    # add left id first
                    d[ltable_id] = l_row[l_key]

                    # add right id
                    d[rtable_id] = r_row[r_key]

                    # add left attributes
                    if l_output_attrs:
                        if l_out is None:
                            l_out = l_row[l_output_attrs]
                            l_out.index = 'ltable.'+l_out.index
                        d.update(l_out)

                    # add right attributes
                    if r_output_attrs:
                        r_out = r_row[r_output_attrs]
                        r_out.index = 'rtable.'+r_out.index
                        d.update(r_out)
                    block_list.append(d)

        candset = pd.DataFrame(block_list)
        ret_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs)

        if len(candset) > 0:
            candset = MTable(candset[ret_cols])
        else:
            # no surviving pair: still expose the expected columns
            candset = MTable(candset, columns=ret_cols)

        # set metadata
        candset.set_property('ltable', ltable)
        candset.set_property('rtable', rtable)
        candset.set_property('foreign_key_ltable', ltable_id)
        candset.set_property('foreign_key_rtable', rtable_id)
        return candset
Пример #7
0
def train_test_split(labeled_data, train_proportion=0.5, random_state=None):
    """
    Split MTable into Train and Test

    Parameters
    ----------
    labeled_data : MTable
    train_proportion : float, in the range 0-1. Proportion of train tuples, by default set to 0.5
    random_state : int, Pseudo-random number generator state for random sampling

    Returns
    -------
    result: Python dictionary (OrderedDict) with two keys: train, test. The value for each key is
    a MTable containing tuples for train and test respectively. Both tables
    carry the key, ltable, rtable, foreign_key_ltable and foreign_key_rtable
    properties of labeled_data.

    Raises
    ------
    AssertionError
        If train_proportion is outside the range 0-1.
    """

    num_rows = len(labeled_data)
    # raised explicitly (rather than via assert) so the check survives
    # running under python -O
    if not (0 <= train_proportion <= 1):
        raise AssertionError(" Train proportion is expected to be between 0 and 1")

    # the train part is rounded down; the test part takes the remainder
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    idx_values = np.array(labeled_data.index.values)
    idx_train, idx_test = cv.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    key = labeled_data.get_key()
    inherited_props = ('ltable', 'rtable',
                       'foreign_key_ltable', 'foreign_key_rtable')

    result = OrderedDict()
    for part_name, part_idx in (('train', idx_train), ('test', idx_test)):
        # the split values are index labels, so select by label
        part = MTable(labeled_data.loc[part_idx], key=key)

        # propagate properties
        part.set_property('key', key)
        for prop in inherited_props:
            part.set_property(prop, labeled_data.get_property(prop))
        result[part_name] = part

    return result
Пример #8
0
def train_test_split(labeled_data, train_proportion=0.5, random_state=None):
    """
    Split a labeled MTable into train and test MTables.

    Parameters
    ----------
    labeled_data : MTable
        Table to split. Its key and its 'ltable', 'rtable',
        'foreign_key_ltable' and 'foreign_key_rtable' properties are copied
        to both outputs.
    train_proportion : float, in the range 0-1. Proportion of train tuples,
        by default set to 0.5
    random_state : int, Pseudo-random number generator state for random
        sampling (forwarded to cv.train_test_split)

    Returns
    -------
    result : OrderedDict with two keys, 'train' and 'test' (in that order).
        The value for each key is a MTable containing the tuples for train
        and test respectively.

    Raises
    ------
    AssertionError
        If train_proportion is not between 0 and 1 (inclusive).
    """

    num_rows = len(labeled_data)
    # Raised explicitly instead of via `assert` so the check is not stripped
    # when Python runs with -O; exception type and message are unchanged.
    if not 0 <= train_proportion <= 1:
        raise AssertionError(
            " Train proportion is expected to be between 0 and 1")
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # split the index labels, then pull the matching rows for each part
    idx_values = np.array(labeled_data.index.values)
    idx_train, idx_test = cv.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # create a MTable for train and test data; the split values are index
    # labels, so label-based .loc is used (the deprecated .ix is avoided)
    key = labeled_data.get_key()
    lbl_train = MTable(labeled_data.loc[idx_train], key=key)
    lbl_test = MTable(labeled_data.loc[idx_test], key=key)

    # propagate properties from the input table to both outputs
    inherited = ('ltable', 'rtable',
                 'foreign_key_ltable', 'foreign_key_rtable')
    for part in (lbl_train, lbl_test):
        part.set_property('key', key)
        for prop_name in inherited:
            part.set_property(prop_name,
                              labeled_data.get_property(prop_name))

    result = OrderedDict()
    result['train'] = lbl_train
    result['test'] = lbl_test

    return result
Пример #9
0
    def block_tables(self,
                     ltable,
                     rtable,
                     l_overlap_attr,
                     r_overlap_attr,
                     rem_stop_words=False,
                     qgram=None,
                     word_level=True,
                     overlap_size=1,
                     l_output_attrs=None,
                     r_output_attrs=None):
        """
        Block tables with overlap blocker

        Parameters
        ----------
        ltable, rtable : MTables, input MTables to block
        l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
        rem_stop_words : flag to indicate whether stop words should be removed
            (forwarded to process_table)
        qgram : int, value of q in qgram tokenizer. Default value is None
        word_level : boolean, flag to indicate to use word level tokenizer.
            Must be set to False explicitly when qgram is given.
        overlap_size : int, number of tokens to overlap
        l_output_attrs, r_output_attrs : list of attributes to be included in the output table

        Returns
        -------
        result : MTable
            Candidate set with 'ltable', 'rtable', 'foreign_key_ltable' and
            'foreign_key_rtable' properties set. It has no rows when no pair
            qualifies.

        Raises
        ------
        AssertionError
            If an overlap attribute is not a column of its table.
        SyntaxError
            If word_level is True and qgram is given at the same time.
        """
        # do some integrity checks
        if l_overlap_attr not in ltable.columns:
            raise AssertionError(
                'Left overlap attribute  not in ltable columns')
        if r_overlap_attr not in rtable.columns:
            raise AssertionError(
                'Right overlap attribute not in rtable columns')

        l_output_attrs, r_output_attrs = self.check_attrs(
            ltable, rtable, l_output_attrs, r_output_attrs)

        # equality (not truthiness) is kept on purpose so that existing
        # callers see exactly the same accept/reject behaviour as before
        if word_level == True and qgram is not None:
            raise SyntaxError(
                'Parameters word_level and qgram cannot be set together; Note that word_level is set '
                'to true by default, so explicitly set word_level=False to use qgram'
            )

        # remove nans
        l_df = self.rem_nan(ltable, l_overlap_attr)
        r_df = self.rem_nan(rtable, r_overlap_attr)

        # positional 0..n-1 index: token lists are matched to rows by position
        l_df.reset_index(inplace=True, drop=True)
        r_df.reset_index(inplace=True, drop=True)

        # constant join column used to pair one right row with many left rows
        l_df['_dummy_'] = 1
        r_df['_dummy_'] = 1

        if l_df.dtypes[l_overlap_attr] != object:
            logger.warning(
                'Left overlap attribute is not of type string; converting to string temporarily'
            )
            l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)

        if r_df.dtypes[r_overlap_attr] != object:
            logger.warning(
                'Right overlap attribute is not of type string; converting to string temporarily'
            )
            r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

        # look up table for right rows, keyed by position; taken before the
        # tables are tokenized, exactly where the original snapshot was taken
        r_dict = dict(r_df.iterrows())

        # tokenize the left column and tag every token list with its row position
        l_col_values_chopped = self.process_table(
            l_df, l_overlap_attr, qgram, rem_stop_words)
        appended_l_col_idx_values = [
            self.append_index_values(tokens, pos)
            for pos, tokens in enumerate(l_col_values_chopped)
        ]

        inv_idx = {}
        if mg._verbose:
            print('Creating inverted index ')
        for tagged_tokens in appended_l_col_idx_values:
            for tagged_token in tagged_tokens:
                self.compute_inv_index(tagged_token, inv_idx)
        if mg._verbose:
            print('Done')

        r_col_values_chopped = self.process_table(r_df, r_overlap_attr, qgram,
                                                  rem_stop_words)
        l_key = ltable.get_key()
        r_key = rtable.get_key()
        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent / 100.0 * len(rtable))
            print(per_count)
        elif mg._progbar:
            bar = pyprind.ProgBar(len(r_col_values_chopped))

        df_list = []
        for r_idx, col_values in enumerate(r_col_values_chopped):
            if mg._verbose:
                count += 1
                if count % per_count == 0:
                    print(str(mg._percent * count /
                              per_count) + ' percentage done !!!')
            elif mg._progbar:
                bar.update()

            qualifying_ltable_indices = self.get_potential_match_indices(
                col_values, inv_idx, overlap_size)
            # one-row frame for the current right tuple
            r_row_dict = r_dict[r_idx].to_frame().T
            l_rows_dict = l_df.iloc[qualifying_ltable_indices]
            df = l_rows_dict.merge(r_row_dict,
                                   on='_dummy_',
                                   suffixes=('_ltable', '_rtable'))
            if len(df) > 0:
                df_list.append(df)

        if df_list:
            candset = pd.concat(df_list)
        else:
            # pd.concat rejects an empty list; when no pair qualifies, build
            # an empty frame that has the same merged column layout instead
            candset = l_df.iloc[0:0].merge(r_df.iloc[0:0],
                                           on='_dummy_',
                                           suffixes=('_ltable', '_rtable'))

        # get output columns
        retain_cols, final_cols = self.output_columns(l_key,
                                                      r_key,
                                                      list(candset.columns),
                                                      l_output_attrs,
                                                      r_output_attrs)

        candset = candset[retain_cols]
        candset.columns = final_cols
        if len(candset) > 0:
            # NOTE(review): DataFrame.sort is the legacy pandas spelling;
            # confirm the supported pandas version before switching to
            # sort_values.
            candset.sort(['ltable.' + l_key, 'rtable.' + r_key],
                         inplace=True)
            candset.reset_index(inplace=True, drop=True)
        candset = MTable(candset)

        # set metadata
        candset.set_property('ltable', ltable)
        candset.set_property('rtable', rtable)
        candset.set_property('foreign_key_ltable', 'ltable.' + l_key)
        candset.set_property('foreign_key_rtable', 'rtable.' + r_key)
        return candset
Пример #10
0
    def block_candset(self,
                      vtable,
                      l_overlap_attr,
                      r_overlap_attr,
                      rem_stop_words=False,
                      qgram=None,
                      word_level=True,
                      overlap_size=1):
        """
        Filter a candidate set with the overlap blocker

        Parameters
        ----------
        vtable : MTable, candidate set to block. Its 'ltable', 'rtable',
            'foreign_key_ltable' and 'foreign_key_rtable' properties are read.
        l_overlap_attr, r_overlap_attr : String, overlap attribute from left and right table
        rem_stop_words : flag forwarded to the token overlap computation
        qgram : value forwarded to the token overlap computation. Default value is None
        word_level : boolean, accepted but not referenced in this method's body
        overlap_size : int, minimum token overlap a pair needs to be kept

        Returns
        -------
        result : MTable
            The rows of vtable whose overlap is at least overlap_size, with
            vtable's key and the four properties listed above carried over.

        Raises
        ------
        AssertionError
            If an overlap attribute is not a column of its base table.
        """

        left_table = vtable.get_property('ltable')
        right_table = vtable.get_property('rtable')

        self.check_attrs(left_table, right_table, None, None)

        # both overlap attributes have to exist in their base tables
        if l_overlap_attr not in left_table.columns:
            raise AssertionError(
                'Left overlap attribute  not in ltable columns')
        if r_overlap_attr not in right_table.columns:
            raise AssertionError(
                'Right overlap attribute not in rtable columns')

        fk_left = vtable.get_property('foreign_key_ltable')
        fk_right = vtable.get_property('foreign_key_rtable')

        # index the base tables by their keys (key column is kept as data)
        left_by_key = left_table.set_index(left_table.get_key(), drop=False)
        right_by_key = right_table.set_index(right_table.get_key(),
                                             drop=False)

        # key -> row look up tables for quick access of rows
        left_rows = dict(left_by_key.iterrows())
        right_rows = dict(right_by_key.iterrows())

        # positions of the two foreign key columns inside a candidate tuple
        headers = list(vtable.columns)
        left_pos = headers.index(fk_left)
        right_pos = headers.index(fk_right)

        if mg._verbose:
            count = 0
            per_count = math.ceil(mg._percent / 100.0 * len(vtable))
            print(per_count)
        elif mg._progbar:
            bar = pyprind.ProgBar(len(vtable))

        # one boolean per candidate pair: True when the pair survives
        keep = []
        for pair in vtable.itertuples(index=False):
            if mg._verbose:
                count += 1
                if count % per_count == 0:
                    print(str(mg._percent * count /
                              per_count) + ' percentage done !!!')
            elif mg._progbar:
                bar.update()

            overlap = self.get_token_overlap_bt_two_tuples(
                left_rows[pair[left_pos]], right_rows[pair[right_pos]],
                l_overlap_attr, r_overlap_attr, qgram, rem_stop_words)
            keep.append(bool(overlap >= overlap_size))

        if len(vtable) == 0:
            # an empty mask cannot be used for row selection, so build an
            # empty table with the same columns
            survivors = MTable(columns=vtable.columns, key=vtable.get_key())
        else:
            survivors = MTable(vtable[keep], key=vtable.get_key())

        # carry the metadata over to the result
        survivors.set_property('ltable', left_table)
        survivors.set_property('rtable', right_table)
        for fk_name in ('foreign_key_ltable', 'foreign_key_rtable'):
            survivors.set_property(fk_name, vtable.get_property(fk_name))
        return survivors