Example #1
    def predict(self,
                table,
                exclude_attrs=None,
                target_attr=None,
                append=False,
                inplace=True,
                nchunks=1,
                scheduler=threaded.get,
                num_workers=None,
                cache_size=1e9,
                compute=False,
                show_progress=True):
        candset_splitted = delayed(candsplit_df)(table, nchunks)
        results = []
        for i in range(nchunks):
            result = delayed(self._predict_table_part)(candset_splitted[i],
                                                       exclude_attrs,
                                                       target_attr, append,
                                                       inplace)
            results.append(result)
        feat_vecs = delayed(concat_df)(results)

        if compute:
            feat_vecs = exec_dag(feat_vecs, num_workers, cache_size, scheduler,
                                 show_progress)
        return feat_vecs

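A minimal usage sketch for the lazy predict above, assuming a trained matcher object (called matcher here) that defines this method and a feature-vector table named feature_vectors; these names and the column names are placeholders, not part of the example.

# Hypothetical usage sketch: 'matcher', 'feature_vectors', and the column
# names below are assumptions, not taken from the example above.
predictions = matcher.predict(table=feature_vectors,
                              exclude_attrs=['_id', 'l_id', 'r_id'],
                              target_attr='predicted',
                              append=True,
                              nchunks=4,
                              compute=True)  # compute=True executes the dask graph now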
Example #2
    def block_candset(self, candset, ltable, rtable, fk_ltable, fk_rtable, l_key,
                      r_key, l_block_attr, r_block_attr, rem_stop_words=False, q_val=None,
                      word_level=True, overlap_size=1,
                      nchunks=1, scheduler=threaded.get,
                      num_workers=None, cache_size=1e9, compute=False,
                      show_progress=True):
        cand_splitted = delayed(candsplit_df)(candset, nchunks)

        # l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr)
        #
        # ltbl = (lproj_df)(ltable, l_proj_attrs)
        # rtbl = (rproj_df)(rtable, r_proj_attrs)

        if word_level:
            tokenizer = WhiteSpaceTokenizer()
        else:
            tokenizer = QgramTokenizer(q_val=q_val)

        results = []
        for i in range(nchunks):
            result = delayed(self._block_candset_part)(cand_splitted[i], ltable, rtable,
                                                       fk_ltable,
                                                       fk_rtable, l_key, r_key,
                                                       l_block_attr,
                                                       r_block_attr, rem_stop_words,
                                                       tokenizer, overlap_size)
            results.append(result)

        valid_candset = delayed(concat_df)(results)
        if compute:
            valid_candset = exec_dag(valid_candset, num_workers, cache_size, scheduler,
                                     show_progress)
        return valid_candset
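A usage sketch for this overlap-style block_candset, assuming an instance of the containing class (called overlap_blocker here), a candidate set produced by an earlier blocking step, and placeholder key/attribute names.

# Hypothetical usage sketch: 'overlap_blocker', 'candset', 'ltable', 'rtable',
# and the key/attribute names are assumptions.
refined_candset = overlap_blocker.block_candset(
    candset, ltable, rtable,
    fk_ltable='l_id', fk_rtable='r_id',
    l_key='id', r_key='id',
    l_block_attr='name', r_block_attr='name',
    rem_stop_words=True,   # drop stop words before tokenizing
    word_level=True,       # whitespace tokenizer; set q_val for q-grams instead
    overlap_size=2,        # keep pairs sharing at least 2 tokens
    nchunks=4,
    compute=True)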
Example #3
def extract_feature_vecs(candset,
                         ltable,
                         rtable,
                         key,
                         fk_ltable,
                         fk_rtable,
                         l_key,
                         r_key,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         nchunks=1,
                         scheduler=get,
                         num_workers=None,
                         cache_size=1e9,
                         compute=False,
                         show_progress=True):
    # candset_splitted = delayed(candsplit_df)(candset, nchunks)
    results = []
    # for i in xrange(nchunks):
    for i in range(candset.npartitions):
        result = delayed(_extract_feature_vecs_part)(
            candset.get_partition(i), ltable, rtable, key, fk_ltable,
            fk_rtable, l_key, r_key, attrs_before, feature_table, attrs_after)
        results.append(result)
    feat_vecs = delayed(concat_df)(results)

    if compute:
        feat_vecs = exec_dag(feat_vecs, num_workers, cache_size, scheduler,
                             show_progress)
    return feat_vecs
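A usage sketch for extract_feature_vecs; since the loop iterates candset.npartitions, candset is assumed to be a dask DataFrame of candidate pairs, and feature_table is assumed to have been built beforehand. All names below are placeholders.

# Hypothetical usage sketch: 'candset' (a dask DataFrame), 'ltable', 'rtable',
# 'feature_table', and the key/attribute names are assumptions.
feature_vectors = extract_feature_vecs(candset, ltable, rtable,
                                        key='_id',
                                        fk_ltable='l_id', fk_rtable='r_id',
                                        l_key='id', r_key='id',
                                        feature_table=feature_table,
                                        attrs_after=['label'],
                                        compute=True)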
Example #4
    def block_candset(self,
                      candset,
                      ltable,
                      rtable,
                      fk_ltable,
                      fk_rtable,
                      l_key,
                      r_key,
                      nchunks=1,
                      scheduler=threaded.get,
                      num_workers=None,
                      cache_size=1e9,
                      compute=False,
                      show_progress=True):
        # candset_splitted = delayed(candsplit_df)(candset, nchunks)
        # l_proj_attrs = (get_lattrs_to_project)(l_key, self.ltable_attrs)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, self.rtable_attrs)
        #
        # ltbl = (lproj_df)(ltable, l_proj_attrs)
        # rtbl = (rproj_df)(rtable, r_proj_attrs)

        results = []
        # for i in xrange(nchunks):
        for i in range(candset.npartitions):
            # candset_splitted is commented out above, so index into candset directly
            result = delayed(self._block_candset_part)(
                candset.get_partition(i), ltable, rtable, fk_ltable,
                fk_rtable, l_key, r_key)
            results.append(result)
        valid_candset = delayed(concat_df)(results)

        if compute:
            valid_candset = exec_dag(valid_candset, num_workers, cache_size,
                                     scheduler, show_progress)
        return valid_candset
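Note that this block_candset variant differs from Example #2 in how work is split: it iterates over the partitions of a dask DataFrame (candset.npartitions and get_partition) instead of splitting the candidate set into nchunks with candsplit_df, so the nchunks argument is not used inside the loop.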
Example #5
    def block_tables(self,
                     ltable,
                     rtable,
                     l_key,
                     r_key,
                     l_block_attr,
                     r_block_attr,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='l_',
                     r_output_prefix='r_',
                     nltable_chunks=1,
                     nrtable_chunks=1,
                     scheduler=threaded.get,
                     num_workers=None,
                     cache_size=1e9,
                     compute=False,
                     show_progress=True):
        # @todo validate inputs
        # @todo need to handle missing values.

        ltable_splitted = lsplit_df(ltable, nltable_chunks)
        rtable_splitted = rsplit_df(rtable, nrtable_chunks)

        # l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr, l_output_attrs)
        # r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr, r_output_attrs)

        # list to accommodate results
        results = []
        for i in range(nltable_chunks):
            # ltbl = (lproj_df)(ltable_splitted[i], l_proj_attrs)
            for j in range(nrtable_chunks):
                # rtbl = (rproj_df)(rtable_splitted[j], r_proj_attrs)
                res = delayed(self._block_table_part)(
                    ltable_splitted[i], rtable_splitted[j], l_key, r_key,
                    l_block_attr, r_block_attr, l_output_attrs, r_output_attrs,
                    l_output_prefix, r_output_prefix)
                results.append(res)
        candset = delayed(concat_df)(results)
        candset = delayed(add_id)(candset)

        if compute:
            candset = exec_dag(candset, num_workers, cache_size, scheduler,
                               show_progress)

        return candset
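A usage sketch for this block_tables, assuming two small pandas DataFrames and an instance of the containing class (called blocker here); the column names are placeholders.

# Hypothetical usage sketch: 'blocker' and the column names are assumptions.
import pandas as pd

ltable = pd.DataFrame({'id': [1, 2], 'name': ['Ann', 'Bob'],
                       'zipcode': ['53703', '53706']})
rtable = pd.DataFrame({'id': [7, 8], 'name': ['Anne', 'Rob'],
                       'zipcode': ['53703', '53711']})

candset = blocker.block_tables(ltable, rtable,
                               l_key='id', r_key='id',
                               l_block_attr='zipcode',   # block on 'zipcode' in both tables
                               r_block_attr='zipcode',
                               l_output_attrs=['name'], r_output_attrs=['name'],
                               nltable_chunks=2, nrtable_chunks=2,
                               compute=True)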
Example #6
    def block_tables(self,
                     ltable,
                     rtable,
                     l_key,
                     r_key,
                     l_output_attrs=None,
                     r_output_attrs=None,
                     l_output_prefix='l_',
                     r_output_prefix='r_',
                     nltable_chunks=1,
                     nrtable_chunks=1,
                     scheduler=threaded.get,
                     num_workers=None,
                     cache_size=1e9,
                     compute=False,
                     show_progress=True):
        ltable_splitted = lsplit_df(ltable, nltable_chunks)
        rtable_splitted = rsplit_df(rtable, nrtable_chunks)

        # l_proj_attrs = (get_lattrs_to_project)(l_key, self.ltable_attrs,
        #                                       l_output_attrs)
        # # needs to be modified as self.ltable_attrs can be None.
        # r_proj_attrs = (get_rattrs_to_project)(r_key, self.rtable_attrs,
        #                                       r_output_attrs)
        results = []
        # for i in xrange(nltable_chunks):
        for i in range(ltable.npartitions):
            # ltbl = (lproj_df)(ltable_splitted[i], l_proj_attrs)
            # for j in xrange(nrtable_chunks):
            for j in range(rtable.npartitions):
                # rtbl = (rproj_df)(rtable_splitted[j], r_proj_attrs)
                result = delayed(self._block_tables_part)(
                    ltable.get_partition(i), rtable.get_partition(j), l_key,
                    r_key, l_output_attrs, r_output_attrs, l_output_prefix,
                    r_output_prefix)
                results.append(result)
        candset = delayed(concat_df)(results)
        # candset = delayed(add_id)(candset)
        if compute:
            candset = exec_dag(candset, num_workers, cache_size, scheduler,
                               show_progress)
        return candset
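As in Example #4, this block_tables variant loops over the partitions of the input dask DataFrames (ltable.npartitions and rtable.npartitions) rather than over the nltable_chunks/nrtable_chunks splits, so the ltable_splitted/rtable_splitted results are never used in the loop, and the add_id step is left commented out.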
Example #7
def select_matcher(matchers,
                   x=None,
                   y=None,
                   table=None,
                   exclude_attrs=None,
                   target_attr=None,
                   metric='precision',
                   k=5,
                   random_state=None,
                   scheduler=threaded.get,
                   num_workers=None,
                   cache_size=1e9,
                   compute=False,
                   show_progress=True):
    x, y = _get_xy_data(x, y, table, exclude_attrs, target_attr)
    scores = []
    for m in matchers:
        score = cross_validation(m, x, y, metric, k, random_state)
        scores.append(score)
    res = delayed(process_scores)(matchers, scores, k)
    if compute:
        res = exec_dag(res, num_workers, cache_size, scheduler, show_progress)
    return res
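A usage sketch for select_matcher, assuming a list of trained matcher objects and a labeled feature-vector table; the matcher variables and column names are placeholders.

# Hypothetical usage sketch: the matcher objects, 'labeled_vectors', and the
# column names are assumptions.
result = select_matcher([dt_matcher, rf_matcher, svm_matcher],
                        table=labeled_vectors,
                        exclude_attrs=['_id', 'l_id', 'r_id'],
                        target_attr='label',
                        metric='precision',
                        k=5,
                        compute=True)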