예제 #1
0
def _filter_candset_split(candset, candset_l_key_attr, candset_r_key_attr,
                          ltable, rtable, l_key_attr, r_key_attr,
                          l_filter_attr, r_filter_attr, filter_object):
    """Apply a filter to one split of a candidate set.

    Every candidate pair is resolved to its ltable / rtable record through
    the key values stored in the candset row. A pair is kept only when both
    filter attribute values are non-null and
    ``filter_object.filter_pair`` returns a falsy value for them.

    Args:
        candset: DataFrame of candidate pairs.
        candset_l_key_attr, candset_r_key_attr: names of the candset columns
            holding the ltable and rtable key of each pair.
        ltable, rtable: the tables the candidate pairs refer to.
        l_key_attr, r_key_attr: key column names in ltable and rtable.
        l_filter_attr, r_filter_attr: names of the columns whose values are
            handed to the filter.
        filter_object: object exposing ``filter_pair(l_value, r_value)``;
            a truthy result means the pair is dropped.

    Returns:
        The rows of ``candset`` that survive the filter, with every column
        of ``candset`` preserved.

    Raises:
        ValueError: if one of the named columns is missing.
        KeyError: if a candset key is absent from the dictionary built on
            the corresponding table.
    """
    # Find column indices of key attr and filter attr in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)

    # Find column indices of key attr and filter attr in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)

    # Build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # Build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # Find indices of l_key_attr and r_key_attr in candset
    candset_columns = list(candset.columns.values)
    candset_l_key_attr_index = candset_columns.index(candset_l_key_attr)
    candset_r_key_attr_index = candset_columns.index(candset_r_key_attr)

    valid_rows = []

    for candset_row in candset.itertuples(index=False):
        l_row = ltable_dict[candset_row[candset_l_key_attr_index]]
        r_row = rtable_dict[candset_row[candset_r_key_attr_index]]

        l_value = l_row[l_filter_attr_index]
        r_value = r_row[r_filter_attr_index]

        # A pair with a missing value on either side is always dropped.
        if pd.isnull(l_value) or pd.isnull(r_value):
            valid_rows.append(False)
        else:
            valid_rows.append(not filter_object.filter_pair(l_value, r_value))

    # Index with an explicitly boolean array. A plain empty list (produced
    # by an empty candset split) is interpreted by DataFrame.__getitem__ as
    # "select no columns", which would strip every column from the result.
    keep_mask = pd.Series(valid_rows, dtype=bool).values
    return candset[keep_mask]
예제 #2
0
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr,
                         r_filter_attr, size_filter, l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    """Find candidate pairs for one split of the tables with a size filter.

    A ``SizeIndex`` is built over the ltable records. Each rtable record is
    tokenized, size bounds are derived from its token count together with
    the filter's similarity measure and threshold, the bounds are clamped
    to the range covered by the index, and the index is probed for
    candidates.

    Args:
        ltable, rtable: input tables (DataFrames).
        l_key_attr, r_key_attr: key column names in ltable and rtable.
        l_filter_attr, r_filter_attr: names of the columns being filtered.
        size_filter: object providing ``tokenizer``, ``sim_measure_type``
            and ``threshold``.
        l_out_attrs, r_out_attrs: column names to copy into the output, or
            None.
        l_out_prefix, r_out_prefix: prefixes passed on to the output header
            builder.

    Returns:
        pandas.DataFrame with one row per candidate pair; its header is
        produced by ``get_output_header_from_tables``.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # Build size index over ltable
    size_index = SizeIndex(ltable_dict.values(), l_key_attr_index,
                           l_filter_attr_index, size_filter.tokenizer)
    size_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)
    # The loop below runs once per dictionary entry, so the bar is sized by
    # the dictionary rather than by the raw table.
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string; the skipped row still counts as progress
        if not r_string:
            prog_bar.update()
            continue
        r_num_tokens = len(
            tokenize(r_string, size_filter.tokenizer,
                     size_filter.sim_measure_type))

        size_lower_bound = get_size_lower_bound(r_num_tokens,
                                                size_filter.sim_measure_type,
                                                size_filter.threshold)
        size_upper_bound = get_size_upper_bound(r_num_tokens,
                                                size_filter.sim_measure_type,
                                                size_filter.threshold)

        # clamp the probe range to the sizes actually present in the index
        size_lower_bound = max(size_lower_bound, size_index.min_length)
        size_upper_bound = min(size_upper_bound, size_index.max_length)

        # probe size index and find candidates
        candidates = _find_candidates(size_lower_bound, size_upper_bound,
                                      size_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    ltable_dict[cand], r_row, cand, r_id, l_out_attrs_indices,
                    r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #3
0
def _filter_tables_split(ltable, rtable,
                         l_key_attr, r_key_attr,
                         l_filter_attr, r_filter_attr,
                         prefix_filter,
                         l_out_attrs, r_out_attrs,
                         l_out_prefix, r_out_prefix):
    """Find candidate pairs for one split of the tables with a prefix filter.

    A token ordering is generated from the filter attributes of both
    tables, a ``PrefixIndex`` is built over the ltable records, and the
    index is probed with the ordered tokens of each rtable record.

    Args:
        ltable, rtable: input tables (DataFrames).
        l_key_attr, r_key_attr: key column names in ltable and rtable.
        l_filter_attr, r_filter_attr: names of the columns being filtered.
        prefix_filter: object providing ``tokenizer``, ``sim_measure_type``
            and ``threshold``; it is also handed to ``_find_candidates``.
        l_out_attrs, r_out_attrs: column names to copy into the output, or
            None.
        l_out_prefix, r_out_prefix: prefixes passed on to the output header
            builder.

    Returns:
        pandas.DataFrame with one row per candidate pair; its header is
        produced by ``get_output_header_from_tables``.
    """
    # find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # generate token ordering using tokens in l_filter_attr and r_filter_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_dict.values(), rtable_dict.values()],
        [l_filter_attr_index, r_filter_attr_index],
        prefix_filter.tokenizer,
        prefix_filter.sim_measure_type)

    # Build prefix index on l_filter_attr
    prefix_index = PrefixIndex(ltable_dict.values(), l_key_attr_index,
                               l_filter_attr_index, prefix_filter.tokenizer,
                               prefix_filter.sim_measure_type,
                               prefix_filter.threshold, token_ordering)
    prefix_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    # The loop below runs once per dictionary entry, so the bar is sized by
    # the dictionary rather than by the raw table.
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string; the skipped row still counts as progress
        if not r_string:
            prog_bar.update()
            continue
        r_filter_attr_tokens = tokenize(r_string,
                                        prefix_filter.tokenizer,
                                        prefix_filter.sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_filter_attr_tokens,
                                                      token_ordering)

        # probe prefix index and find candidates
        candidates = _find_candidates(r_ordered_tokens, len(r_ordered_tokens),
                                      prefix_filter, prefix_index)

        for cand in candidates:
            if has_output_attributes:
                output_row = get_output_row_from_tables(
                    ltable_dict[cand], r_row,
                    cand, r_id,
                    l_out_attrs_indices, r_out_attrs_indices)
                output_rows.append(output_row)
            else:
                output_rows.append([cand, r_id])

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #4
0
def _set_sim_join_split(ltable, rtable,
                        l_key_attr, r_key_attr,
                        l_join_attr, r_join_attr,
                        tokenizer,
                        sim_measure_type,
                        threshold,
                        l_out_attrs, r_out_attrs,
                        l_out_prefix, r_out_prefix,
                        out_sim_score):
    """Perform set similarity join for a split of ltable and rtable.

    For every rtable record the position index built on ltable is probed
    for candidates. Candidates with a positive overlap are passed through
    the suffix filter, and pairs that are not filtered out are scored with
    the similarity function for ``sim_measure_type``. Pairs scoring at
    least ``threshold`` are written to the output.

    Args:
        ltable, rtable: input tables (DataFrames).
        l_key_attr, r_key_attr: key column names in ltable and rtable.
        l_join_attr, r_join_attr: names of the columns being joined.
        tokenizer: tokenizer handed to ``tokenize`` and to the index and
            filter objects.
        sim_measure_type: similarity measure identifier understood by
            ``get_sim_function`` and the helper functions.
        threshold: minimum similarity score for a pair to be output.
        l_out_attrs, r_out_attrs: column names to copy into the output, or
            None.
        l_out_prefix, r_out_prefix: prefixes passed on to the output header
            builder.
        out_sim_score: when truthy, the score is appended to each output
            row under the column name ``_sim_score``.

    Returns:
        pandas.DataFrame with one row per matching pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_dict.values(), rtable_dict.values()],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # build a dictionary of tokenized l_join_attr
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = order_using_token_ordering(
            tokenize(str(row[l_join_attr_index]), tokenizer, sim_measure_type),
            token_ordering)

    # Build position index on l_join_attr
    position_index = PositionIndex(ltable_dict.values(),
                                   l_key_attr_index, l_join_attr_index,
                                   tokenizer, sim_measure_type,
                                   threshold, token_ordering)
    position_index.build()

    pos_filter = PositionFilter(tokenizer, sim_measure_type, threshold)
    suffix_filter = SuffixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string; the skipped row still counts as progress
        # so the bar can reach its total
        if not r_string:
            prog_bar.update()
            continue
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        r_num_tokens = len(r_ordered_tokens)
        r_prefix_length = get_prefix_length(r_num_tokens,
                                            sim_measure_type,
                                            threshold, tokenizer)

        candidate_overlap = find_candidates_position_filter(
            r_ordered_tokens, r_num_tokens, r_prefix_length,
            pos_filter, position_index)
        for cand, overlap in iteritems(candidate_overlap):
            if overlap > 0:
                l_ordered_tokens = l_join_attr_dict[cand]
                l_num_tokens = position_index.get_size(cand)
                l_prefix_length = get_prefix_length(
                    l_num_tokens,
                    sim_measure_type,
                    threshold, tokenizer)
                # a truthy result from the suffix filter drops the pair
                if suffix_filter._filter_suffix(
                        l_ordered_tokens[l_prefix_length:],
                        r_ordered_tokens[r_prefix_length:],
                        l_prefix_length,
                        r_prefix_length,
                        l_num_tokens, r_num_tokens):
                    continue
                sim_score = sim_fn(l_ordered_tokens, r_ordered_tokens)
                if sim_score >= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_dict[cand], r_row,
                            cand, r_id,
                            l_out_attrs_indices,
                            r_out_attrs_indices)
                    else:
                        output_row = [cand, r_id]
                    if out_sim_score:
                        output_row.append(sim_score)
                    output_rows.append(output_row)
        prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr,
        l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #5
0
def _edit_dist_join_split(ltable, rtable,
                          l_key_attr, r_key_attr,
                          l_join_attr, r_join_attr,
                          tokenizer,
                          threshold,
                          l_out_attrs, r_out_attrs,
                          l_out_prefix, r_out_prefix,
                          out_sim_score):
    """Perform edit distance join for a split of ltable and rtable.

    A prefix index built on ltable is probed for each rtable record.
    Candidates whose join string length differs from the rtable string
    length by more than ``threshold`` are discarded; for the rest the
    function returned by ``get_sim_function('EDIT_DISTANCE')`` is evaluated
    and pairs with a result of at most ``threshold`` are output.

    Args:
        ltable, rtable: input tables (DataFrames).
        l_key_attr, r_key_attr: key column names in ltable and rtable.
        l_join_attr, r_join_attr: names of the columns being joined.
        tokenizer: tokenizer handed to ``tokenize`` and to the prefix
            index and prefix filter.
        threshold: maximum edit distance for a pair to be output.
        l_out_attrs, r_out_attrs: column names to copy into the output, or
            None.
        l_out_prefix, r_out_prefix: prefixes passed on to the output header
            builder.
        out_sim_score: when truthy, the edit distance is appended to each
            output row under the column name ``_sim_score``.

    Returns:
        pandas.DataFrame with one row per matching pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_join_attr_index)

    # build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_join_attr_index)

    sim_measure_type = 'EDIT_DISTANCE'
    # generate token ordering using tokens in l_join_attr
    # and r_join_attr
    token_ordering = gen_token_ordering_for_tables(
        [ltable_dict.values(), rtable_dict.values()],
        [l_join_attr_index, r_join_attr_index],
        tokenizer, sim_measure_type)

    # build a dictionary of l_join_attr lengths
    l_join_attr_dict = {}
    for row in ltable_dict.values():
        l_join_attr_dict[row[l_key_attr_index]] = len(
            str(row[l_join_attr_index]))

    # Build prefix index on l_join_attr
    prefix_index = PrefixIndex(ltable_dict.values(),
                               l_key_attr_index, l_join_attr_index,
                               tokenizer, sim_measure_type, threshold,
                               token_ordering)
    prefix_index.build()

    prefix_filter = PrefixFilter(tokenizer, sim_measure_type, threshold)
    sim_fn = get_sim_function(sim_measure_type)
    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_join_attr_index])
        # check for empty string; the skipped row still counts as progress
        # so the bar can reach its total
        if not r_string:
            prog_bar.update()
            continue
        r_len = len(r_string)
        r_join_attr_tokens = tokenize(r_string, tokenizer, sim_measure_type)
        r_ordered_tokens = order_using_token_ordering(r_join_attr_tokens,
                                                      token_ordering)
        candidates = find_candidates_prefix_filter(
            r_ordered_tokens, len(r_ordered_tokens),
            prefix_filter, prefix_index)
        for cand in candidates:
            # length check first: only pairs whose string lengths differ by
            # at most threshold reach the distance computation
            if r_len - threshold <= l_join_attr_dict[cand] <= r_len + threshold:
                edit_dist = sim_fn(str(ltable_dict[cand][l_join_attr_index]),
                                   r_string)
                if edit_dist <= threshold:
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_dict[cand], r_row,
                            cand, r_id,
                            l_out_attrs_indices,
                            r_out_attrs_indices)
                    else:
                        output_row = [cand, r_id]
                    if out_sim_score:
                        output_row.append(edit_dist)
                    output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(
        l_key_attr, r_key_attr,
        l_out_attrs, r_out_attrs,
        l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #6
0
def _filter_tables_split(ltable, rtable, l_key_attr, r_key_attr, l_filter_attr,
                         r_filter_attr, overlap_filter, l_out_attrs,
                         r_out_attrs, l_out_prefix, r_out_prefix,
                         out_sim_score):
    """Find candidate pairs for one split of the tables with an overlap filter.

    An ``InvertedIndex`` is built over the ltable records and probed with
    the tokens of each rtable record. Candidates whose overlap is at least
    ``overlap_filter.overlap_size`` are written to the output.

    Args:
        ltable, rtable: input tables (DataFrames).
        l_key_attr, r_key_attr: key column names in ltable and rtable.
        l_filter_attr, r_filter_attr: names of the columns being filtered.
        overlap_filter: object providing ``tokenizer`` and
            ``overlap_size``.
        l_out_attrs, r_out_attrs: column names to copy into the output, or
            None.
        l_out_prefix, r_out_prefix: prefixes passed on to the output header
            builder.
        out_sim_score: when truthy, the overlap is appended to each output
            row under the column name ``_sim_score``.

    Returns:
        pandas.DataFrame with one row per surviving pair.
    """
    # Find column indices of key attr, filter attr and output attrs in ltable
    l_columns = list(ltable.columns.values)
    l_key_attr_index = l_columns.index(l_key_attr)
    l_filter_attr_index = l_columns.index(l_filter_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # Find column indices of key attr, filter attr and output attrs in rtable
    r_columns = list(rtable.columns.values)
    r_key_attr_index = r_columns.index(r_key_attr)
    r_filter_attr_index = r_columns.index(r_filter_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build a dictionary on ltable
    ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                        l_filter_attr_index)

    # Build a dictionary on rtable
    rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                        r_filter_attr_index)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_dict.values(), l_key_attr_index,
                                   l_filter_attr_index,
                                   overlap_filter.tokenizer)
    inverted_index.build()

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)
    # The loop below runs once per dictionary entry, so the bar is sized by
    # the dictionary rather than by the raw table.
    prog_bar = pyprind.ProgBar(len(rtable_dict))

    for r_row in rtable_dict.values():
        r_id = r_row[r_key_attr_index]
        r_string = str(r_row[r_filter_attr_index])
        # check for empty string; the skipped row still counts as progress
        if not r_string:
            prog_bar.update()
            continue
        r_filter_attr_tokens = tokenize(r_string, overlap_filter.tokenizer)

        # probe inverted index and find overlap of candidates
        candidate_overlap = _find_candidates(r_filter_attr_tokens,
                                             inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            if overlap >= overlap_filter.overlap_size:
                if has_output_attributes:
                    output_row = get_output_row_from_tables(
                        ltable_dict[cand], r_row, cand, r_id,
                        l_out_attrs_indices, r_out_attrs_indices)
                else:
                    output_row = [cand, r_id]
                if out_sim_score:
                    output_row.append(overlap)
                output_rows.append(output_row)

        prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    # generate a dataframe from the list of output rows
    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #7
0
    def filter_tables(self,
                      ltable,
                      rtable,
                      l_key_attr,
                      r_key_attr,
                      l_filter_attr,
                      r_filter_attr,
                      l_out_attrs=None,
                      r_out_attrs=None,
                      l_out_prefix='l_',
                      r_out_prefix='r_'):
        """Filter tables with suffix filter.

        Every pair of an ltable record and an rtable record with non-empty
        filter strings is checked with ``self._filter_suffix``; pairs for
        which it returns a falsy value are written to the output.

        Args:
            ltable, rtable: Pandas data frames.
            l_key_attr, r_key_attr: String, key attribute from ltable and
                rtable.
            l_filter_attr, r_filter_attr: String, filter attribute from
                ltable and rtable.
            l_out_attrs, r_out_attrs: list of attributes to be included in
                the output table from ltable and rtable.
            l_out_prefix, r_out_prefix: String, prefix to be used in the
                attribute names of the output table.

        Returns:
            Pandas data frame with one row per surviving pair and an
            ``_id`` column (0 .. n-1) inserted as the first column.
        """
        # check if the input tables are dataframes
        validate_input_table(ltable, 'left table')
        validate_input_table(rtable, 'right table')

        # check if the key attributes and filter attributes exist
        validate_attr(l_key_attr, ltable.columns, 'key attribute',
                      'left table')
        validate_attr(r_key_attr, rtable.columns, 'key attribute',
                      'right table')
        validate_attr(l_filter_attr, ltable.columns, 'filter attribute',
                      'left table')
        validate_attr(r_filter_attr, rtable.columns, 'filter attribute',
                      'right table')

        # check if the output attributes exist
        validate_output_attrs(l_out_attrs, ltable.columns, r_out_attrs,
                              rtable.columns)

        # check if the key attributes are unique and do not contain
        # missing values
        validate_key_attr(l_key_attr, ltable, 'left table')
        validate_key_attr(r_key_attr, rtable, 'right table')

        # find column indices of key attr, filter attr and
        # output attrs in ltable
        l_columns = list(ltable.columns.values)
        l_key_attr_index = l_columns.index(l_key_attr)
        l_filter_attr_index = l_columns.index(l_filter_attr)
        l_out_attrs_indices = find_output_attribute_indices(
            l_columns, l_out_attrs)

        # find column indices of key attr, filter attr and
        # output attrs in rtable
        r_columns = list(rtable.columns.values)
        r_key_attr_index = r_columns.index(r_key_attr)
        r_filter_attr_index = r_columns.index(r_filter_attr)
        r_out_attrs_indices = find_output_attribute_indices(
            r_columns, r_out_attrs)

        # build a dictionary on ltable
        ltable_dict = build_dict_from_table(ltable, l_key_attr_index,
                                            l_filter_attr_index)

        # build a dictionary on rtable
        rtable_dict = build_dict_from_table(rtable, r_key_attr_index,
                                            r_filter_attr_index)

        # generate token ordering using tokens in l_filter_attr
        # and r_filter_attr
        token_ordering = gen_token_ordering_for_tables(
            [ltable_dict.values(), rtable_dict.values()],
            [l_filter_attr_index, r_filter_attr_index], self.tokenizer,
            self.sim_measure_type)

        # Tokenize and order every rtable record once up front. These values
        # do not depend on the ltable record, so recomputing them inside the
        # nested loop repeated the same work for each ltable row.
        r_entries = []
        for r_row in rtable_dict.values():
            r_string = str(r_row[r_filter_attr_index])
            # check for empty string
            if not r_string:
                continue
            rtokens = tokenize(r_string, self.tokenizer,
                               self.sim_measure_type)
            ordered_rtokens = order_using_token_ordering(
                rtokens, token_ordering)
            r_num_tokens = len(ordered_rtokens)
            r_prefix_length = get_prefix_length(r_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold,
                                                self.tokenizer)
            r_entries.append((r_row, r_row[r_key_attr_index],
                              ordered_rtokens, r_prefix_length,
                              r_num_tokens))

        output_rows = []
        has_output_attributes = (l_out_attrs is not None
                                 or r_out_attrs is not None)
        # The outer loop runs once per dictionary entry, so the bar is
        # sized by the dictionary rather than by the raw table.
        prog_bar = pyprind.ProgBar(len(ltable_dict))

        for l_row in ltable_dict.values():
            l_id = l_row[l_key_attr_index]
            l_string = str(l_row[l_filter_attr_index])
            # check for empty string; the skipped row still counts as
            # progress so the bar can reach its total
            if not l_string:
                prog_bar.update()
                continue
            ltokens = tokenize(l_string, self.tokenizer, self.sim_measure_type)
            ordered_ltokens = order_using_token_ordering(
                ltokens, token_ordering)
            l_num_tokens = len(ordered_ltokens)
            l_prefix_length = get_prefix_length(l_num_tokens,
                                                self.sim_measure_type,
                                                self.threshold, self.tokenizer)
            l_suffix = ordered_ltokens[l_prefix_length:]
            for (r_row, r_id, ordered_rtokens, r_prefix_length,
                 r_num_tokens) in r_entries:
                # the r suffix is sliced per call so _filter_suffix always
                # receives a fresh list, exactly as before
                if not self._filter_suffix(
                        l_suffix, ordered_rtokens[r_prefix_length:],
                        l_prefix_length, r_prefix_length, l_num_tokens,
                        r_num_tokens):
                    if has_output_attributes:
                        output_row = get_output_row_from_tables(
                            ltable_dict[l_id], r_row, l_id, r_id,
                            l_out_attrs_indices, r_out_attrs_indices)
                        output_rows.append(output_row)
                    else:
                        output_rows.append([l_id, r_id])

            prog_bar.update()

        output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                      l_out_attrs, r_out_attrs,
                                                      l_out_prefix,
                                                      r_out_prefix)

        # generate a dataframe from the list of output rows
        output_table = pd.DataFrame(output_rows, columns=output_header)
        output_table.insert(0, '_id', range(0, len(output_table)))
        return output_table