예제 #1
0
def _sortedneighbourhood(df_a, df_b, column, window=3, sorting_key_values=None, block_on=[], block_left_on=[], block_right_on=[]):
    """Return candidate record pairs found with a sorted-neighbourhood window.

    Every value of ``column`` is replaced by its integer position in
    ``sorting_key_values``. A record of ``df_a`` is paired with a record of
    ``df_b`` when their positions differ by at most ``(window - 1) / 2`` and,
    if blocking keys are given, the blocking key values are equal as well.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The two frames to pair. Rows with a null value in ``column`` are
        left out.
    column : label
        Name of the sorting key column; it must exist in both frames.
    window : int
        Window length, a positive odd integer (default 3).
    sorting_key_values : array-like, optional
        Ordered, unique sorting key values. When omitted, the sorted unique
        values of ``column`` over both frames are used. Rows whose value is
        not listed here cannot be ranked and are left out.
    block_on : label or list of labels
        Blocking key(s) present in both frames. When given, it takes
        precedence over ``block_left_on`` and ``block_right_on``.
    block_left_on, block_right_on : label or list of labels
        Blocking key(s) of ``df_a`` and ``df_b`` respectively; both lists
        must have the same length.

    Returns
    -------
    pandas.MultiIndex
        Two-level index holding the index values of ``df_a`` and ``df_b``
        for each candidate pair; the levels are named after the two
        frame indexes.

    Raises
    ------
    ValueError
        If ``window`` is not a positive odd integer, or if the left and
        right blocking keys differ in number.
    """
    import numbers

    # Check if window is a positive, odd integer
    if not isinstance(window, numbers.Integral) or window < 0 or window % 2 == 0:
        raise ValueError('The given window length is not a positive and odd integer.')

    def _as_list(keys):
        # A single label becomes a one-element list; None means "no keys".
        if keys is None:
            return []
        return keys if isinstance(keys, list) else [keys]

    # The list defaults in the signature are only read, never mutated.
    block_on = _as_list(block_on)
    if block_on:
        block_left_on = block_right_on = block_on
    else:
        block_left_on = _as_list(block_left_on)
        block_right_on = _as_list(block_right_on)

    if len(block_left_on) != len(block_right_on):
        raise ValueError('block_left_on and block_right_on must contain the same number of keys.')

    keys_left = [column] + block_left_on
    keys_right = [column] + block_right_on

    df_a = df_a[df_a[column].notnull()]
    df_b = df_b[df_b[column].notnull()]

    # sorting_key_values is the terminology in Data Matching [Christen, 2012]
    if sorting_key_values is None:
        # numpy.unique already returns the values in sorted order
        sorting_key_values = numpy.unique(
            numpy.concatenate([df_a[column].values, df_b[column].values])
        )

    # Lookup table: sorting key value -> position in the sorted order
    sorting_rank = pandas.Series(
        numpy.arange(len(sorting_key_values)), index=sorting_key_values
    )
    rank_a = df_a[column].map(sorting_rank)
    rank_b = df_b[column].map(sorting_rank)

    # Values missing from sorting_key_values map to NaN. They must be
    # removed: merge() matches NaN keys with each other, which would pair
    # every unranked record of df_a with every unranked record of df_b.
    known_a = rank_a.notnull().values
    known_b = rank_b.notnull().values
    df_a, rank_a = df_a[known_a], rank_a[known_a].astype('int64')
    df_b, rank_b = df_b[known_b], rank_b[known_b].astype('int64')

    # Private column names for the record identifiers, so that unnamed or
    # identically named indexes cannot clash with each other or the keys.
    taken = set(keys_left) | set(keys_right)
    id_a, id_b = '__index_a__', '__index_b__'
    while id_a in taken:
        id_a += '_'
    while id_b in taken:
        id_b += '_'

    left_data = {key: df_a[key].values for key in block_left_on}
    left_data[column] = rank_a.values
    left_data[id_a] = df_a.index.values
    left = pandas.DataFrame(left_data)

    right_data = {key: df_b[key].values for key in block_right_on}
    right_data[id_b] = df_b.index.values
    ranks_b = rank_b.values

    # Internal window size: number of neighbours on each side
    half_window = int(window - 1) // 2

    ids_a, ids_b = [], []
    for offset in range(-half_window, half_window + 1):
        # Shifting the right-hand ranks turns "ranks differ by offset"
        # into an exact-match join.
        right_data[column] = ranks_b + offset
        right = pandas.DataFrame(right_data)

        pairs = left.merge(right, left_on=keys_left, right_on=keys_right, how='inner')
        ids_a.append(pairs[id_a].values)
        ids_b.append(pairs[id_b].values)

    # Chunks are joined in reverse offset order (largest offset first).
    return pandas.MultiIndex.from_arrays(
        [numpy.concatenate(ids_a[::-1]), numpy.concatenate(ids_b[::-1])],
        names=[df_a.index.name, df_b.index.name]
    )
예제 #2
0
def _sortedneighbourhood(
        df_a, df_b, column, window=3, sorting_key_values=None,
        block_on=[], block_left_on=[], block_right_on=[]):
    """Return candidate record pairs found with a sorted-neighbourhood window.

    Every value of ``column`` is replaced by its integer position in
    ``sorting_key_values``. A record of ``df_a`` is paired with a record of
    ``df_b`` when their positions differ by at most ``(window - 1) / 2`` and,
    if blocking keys are given, the blocking key values are equal as well.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        The two frames to pair. Rows with a null value in ``column`` are
        left out.
    column : label
        Name of the sorting key column; it must exist in both frames.
    window : int
        Window length, a positive odd integer (default 3).
    sorting_key_values : array-like, optional
        Ordered, unique sorting key values. When omitted, the sorted unique
        values of ``column`` over both frames are used. Rows whose value is
        not listed here cannot be ranked and are left out.
    block_on : label or list of labels
        Blocking key(s) present in both frames. When given, it takes
        precedence over ``block_left_on`` and ``block_right_on``.
    block_left_on, block_right_on : label or list of labels
        Blocking key(s) of ``df_a`` and ``df_b`` respectively; both lists
        must have the same length.

    Returns
    -------
    pandas.MultiIndex
        Two-level index holding the index values of ``df_a`` and ``df_b``
        for each candidate pair; the levels are named after the two
        frame indexes.

    Raises
    ------
    ValueError
        If ``window`` is not a positive odd integer, or if the left and
        right blocking keys differ in number.
    """
    import numbers

    # Check if window is a positive, odd integer (numpy integers included)
    if (not isinstance(window, numbers.Integral)
            or window < 0 or window % 2 == 0):
        raise ValueError('The given window length is not a positive and odd integer.')

    def _as_list(keys):
        # A single label becomes a one-element list; None means "no keys".
        if keys is None:
            return []
        return keys if isinstance(keys, list) else [keys]

    # The list defaults in the signature are only read, never mutated.
    block_on = _as_list(block_on)
    if block_on:
        block_left_on = block_right_on = block_on
    else:
        block_left_on = _as_list(block_left_on)
        block_right_on = _as_list(block_right_on)

    if len(block_left_on) != len(block_right_on):
        raise ValueError(
            'block_left_on and block_right_on must contain the same '
            'number of keys.')

    keys_left = [column] + block_left_on
    keys_right = [column] + block_right_on

    df_a = df_a[df_a[column].notnull()]
    df_b = df_b[df_b[column].notnull()]

    # sorting_key_values is the terminology in Data Matching [Christen, 2012]
    if sorting_key_values is None:
        # numpy.unique already returns the values in sorted order
        sorting_key_values = numpy.unique(
            numpy.concatenate([df_a[column].values, df_b[column].values])
        )

    # Lookup table: sorting key value -> position in the sorted order
    sorting_rank = pandas.Series(
        numpy.arange(len(sorting_key_values)), index=sorting_key_values
    )
    rank_a = df_a[column].map(sorting_rank)
    rank_b = df_b[column].map(sorting_rank)

    # Values missing from sorting_key_values map to NaN. They must be
    # removed: merge() matches NaN keys with each other, which would pair
    # every unranked record of df_a with every unranked record of df_b
    # once per window offset.
    known_a = rank_a.notnull().values
    known_b = rank_b.notnull().values
    df_a, rank_a = df_a[known_a], rank_a[known_a].astype('int64')
    df_b, rank_b = df_b[known_b], rank_b[known_b].astype('int64')

    # Private column names for the record identifiers, so that unnamed or
    # identically named indexes cannot clash with each other or the keys.
    taken = set(keys_left) | set(keys_right)
    id_a, id_b = '__index_a__', '__index_b__'
    while id_a in taken:
        id_a += '_'
    while id_b in taken:
        id_b += '_'

    left_data = {key: df_a[key].values for key in block_left_on}
    left_data[column] = rank_a.values
    left_data[id_a] = df_a.index.values
    left = pandas.DataFrame(left_data)

    right_data = {key: df_b[key].values for key in block_right_on}
    right_data[id_b] = df_b.index.values
    ranks_b = rank_b.values

    # Internal window size: number of neighbours on each side
    half_window = int(window - 1) // 2

    ids_a, ids_b = [], []
    for offset in range(-half_window, half_window + 1):
        # Shifting the right-hand ranks turns "ranks differ by offset"
        # into an exact-match join.
        right_data[column] = ranks_b + offset
        right = pandas.DataFrame(right_data)

        pairs = left.merge(
            right, left_on=keys_left, right_on=keys_right, how='inner'
        )
        ids_a.append(pairs[id_a].values)
        ids_b.append(pairs[id_b].values)

    # Chunks are joined in reverse offset order (largest offset first).
    return pandas.MultiIndex.from_arrays(
        [numpy.concatenate(ids_a[::-1]), numpy.concatenate(ids_b[::-1])],
        names=[df_a.index.name, df_b.index.name]
    )