示例#1
0
  def testIsDictionary(self):  # - - - - - - - - - - - - - - - - - - - - - - -
    """Test 'check_is_dictionary' function.

       The check function is expected to pass silently (return None) when
       given a proper dictionary argument.
    """

    # Compare against None with 'is' (identity), not '==' (PEP 8)
    #
    assert (auxiliary.check_is_dictionary('TestArgument', {}) is None)
    assert (auxiliary.check_is_dictionary('TestArgument', {1:2,6:0}) is None)
    assert (auxiliary.check_is_dictionary('TestArgument', \
            {'a':4,'t':1,(1,4,6):'tr'}) is None)
示例#2
0
  def testIsDictionary(self):  # - - - - - - - - - - - - - - - - - - - - - - -
    """Test 'check_is_dictionary' function.

       The check function is expected to pass silently (return None) when
       given a proper dictionary argument.
    """

    # Compare against None with 'is' (identity), not '==' (PEP 8)
    #
    assert (auxiliary.check_is_dictionary('TestArgument', {}) is None)
    assert (auxiliary.check_is_dictionary('TestArgument', {1:2,6:0}) is None)
    assert (auxiliary.check_is_dictionary('TestArgument', \
            {'a':4,'t':1,(1,4,6):'tr'}) is None)
示例#3
0
def pairs_quality(weight_vec_dict, match_check_funct):
    """Pairs quality is the ratio of true matches divided by the total number
     of matches of the compared record pairs returned after blocking. It is
     measured as:

       pq = |TP| / all_matches

     with TP being the true positives, and all matches being the number of
     weight vectors given.

     The arguments that have to be set when this method is called are:
       weight_vec_dict    A dictionary containing weight vectors.
       match_check_funct  This has to be a function (or method), assumed to
                          have as arguments the two record identifiers of a
                          record pair and its weight vector, and returns True
                          if the record pair is from a true match, or False
                          otherwise. Thus, 'match_check_funct' is of the form:

                            match_flag = match_check_funct(rec_id1, rec_id2,
                                                           weight_vec)

     Returns 0.0 (with a warning logged) if the weight vector dictionary is
     empty, as no pairs quality can be calculated in that case.
    """

    auxiliary.check_is_dictionary('weight_vec_dict', weight_vec_dict)
    auxiliary.check_is_function_or_method('match_check_funct',
                                          match_check_funct)

    total_num_rec_pairs = len(weight_vec_dict)

    logging.info('')
    logging.info('Calculate pairs quality:')
    logging.info('  Number of record pairs in weight vector dictionary: %d' % \
                 (total_num_rec_pairs))

    # Guard against an empty weight vector dictionary, which would otherwise
    # raise a ZeroDivisionError when the ratio is calculated below
    #
    if (total_num_rec_pairs == 0):
        logging.warn('Empty weight vector dictionary given - cannot ' + \
                     'calculate pairs quality')
        return 0.0

    # Get number of true matches in weight vector dictionary - - - - - - - - - -
    #
    num_true_matches = 0
    num_false_matches = 0

    for (rec_id_tuple, this_vec) in weight_vec_dict.iteritems():

        if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1],
                              this_vec) == True):
            num_true_matches += 1
        else:
            num_false_matches += 1

    # Every weight vector must have been classified one way or the other
    #
    assert total_num_rec_pairs == (num_true_matches + num_false_matches)

    logging.info('  Number of true and false matches in weight vector ' + \
                 'dictionary: %d / %d' % (num_true_matches, num_false_matches))

    pq = float(num_true_matches) / total_num_rec_pairs

    logging.info('  Pairs quality: %.4f%%' % (100.0 * pq))  # As percentage

    assert pq <= 1.0

    return pq
示例#4
0
def pairs_quality(weight_vec_dict, match_check_funct):
  """Pairs quality is the ratio of true matches divided by the total number of
     matches of the compared record pairs returned after blocking. It is
     measured as:

       pq = |TP| / all_matches

     with TP being the true positives, and all matches being the number of
     weight vectors given.

     The arguments that have to be set when this method is called are:
       weight_vec_dict    A dictionary containing weight vectors.
       match_check_funct  This has to be a function (or method), assumed to
                          have as arguments the two record identifiers of a
                          record pair and its weight vector, and returns True
                          if the record pair is from a true match, or False
                          otherwise. Thus, 'match_check_funct' is of the form:

                            match_flag = match_check_funct(rec_id1, rec_id2,
                                                           weight_vec)

     Returns 0.0 (with a warning logged) if the weight vector dictionary is
     empty, as no pairs quality can be calculated in that case.
  """

  auxiliary.check_is_dictionary('weight_vec_dict', weight_vec_dict)
  auxiliary.check_is_function_or_method('match_check_funct', match_check_funct)

  total_num_rec_pairs = len(weight_vec_dict)

  logging.info('')
  logging.info('Calculate pairs quality:')
  logging.info('  Number of record pairs in weight vector dictionary: %d' % \
               (total_num_rec_pairs))

  # Guard against an empty weight vector dictionary, which would otherwise
  # raise a ZeroDivisionError when the ratio is calculated below
  #
  if (total_num_rec_pairs == 0):
    logging.warn('Empty weight vector dictionary given - cannot ' + \
                 'calculate pairs quality')
    return 0.0

  # Get number of true matches in weight vector dictionary - - - - - - - - - -
  #
  num_true_matches =  0
  num_false_matches = 0

  for (rec_id_tuple, this_vec) in weight_vec_dict.iteritems():

    if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1], this_vec) == True):
      num_true_matches += 1
    else:
      num_false_matches += 1

  # Every weight vector must have been classified one way or the other
  #
  assert total_num_rec_pairs == (num_true_matches + num_false_matches)

  logging.info('  Number of true and false matches in weight vector ' + \
               'dictionary: %d / %d' % (num_true_matches, num_false_matches))

  pq = float(num_true_matches) / total_num_rec_pairs

  logging.info('  Pairs quality: %.4f%%' % (100.0*pq)) # As percentage

  assert pq <= 1.0

  return pq
示例#5
0
    def testIsDictionary(self):  # - - - - - - - - - - - - - - - - - - - - - -
        """Check that 'check_is_dictionary' accepts valid dictionaries."""

        # Exercise the check with an empty dict, a simple int-keyed dict, and
        # a dict with mixed (string and tuple) keys, in that order
        #
        for test_dict in ({}, {1: 2, 6: 0}, {"a": 4, "t": 1, (1, 4, 6): "tr"}):
            assert auxiliary.check_is_dictionary("TestArgument", test_dict)
def SaveMatchStatusFile(w_vec_dict, match_set, file_name):
    """Save the matched record identifiers into a CSV file.

       This function saves the record identifiers of all record pairs that are
       in the given match set into a CSV file with four columns:
       - First record identifier
       - Second record identifier
       - Summed matching weight from the corresponding weight vector
       - A unique match identifier (generated in the same way as the ones in
         the function SaveMatchDataSet below).

       Raises IOError if the output file cannot be opened for writing.
    """

    auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict)
    auxiliary.check_is_set('match_set', match_set)
    auxiliary.check_is_string('file_name', file_name)

    match_rec_id_list = list(match_set)  # Make a list so it can be sorted
    match_rec_id_list.sort()

    # Number of digits needed for zero-padded match identifiers
    #
    if (len(match_set) > 0):
        num_digit = max(1, int(math.ceil(math.log(len(match_set), 10))))
    else:
        num_digit = 1
    mid_count = 1  # Counter for match identifiers

    # Try to open the file for writing. Only catch IOError so programming
    # errors are not masked, and re-raise the original exception so the
    # caller sees the real failure reason.
    #
    try:
        f = open(file_name, 'w')
    except IOError:
        logging.exception('Cannot open file "%s" for writing' %
                          (str(file_name)))
        raise

    for rec_id_tuple in match_rec_id_list:
        w_vec = w_vec_dict[rec_id_tuple]
        w_sum = sum(w_vec)  # Summed matching weight of this record pair

        mid_count_str = '%s' % (mid_count)
        this_mid = 'mid%s' % (mid_count_str.zfill(num_digit))

        rec_id1 = rec_id_tuple[0]
        rec_id2 = rec_id_tuple[1]

        f.write('%s,%s,%f,%s' % (rec_id1, rec_id2, w_sum, this_mid) +
                os.linesep)

        mid_count += 1

    f.close()
示例#7
0
def SaveMatchStatusFile(w_vec_dict, match_set, file_name):
  """Save the matched record identifiers into a CSV file.

     This function saves the record identifiers of all record pairs that are in
     the given match set into a CSV file with four columns:
     - First record identifier
     - Second record identifier
     - Summed matching weight from the corresponding weight vector
     - A unique match identifier (generated in the same way as the ones in the
       function SaveMatchDataSet below).

     Raises IOError if the output file cannot be opened for writing.
  """

  auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict)
  auxiliary.check_is_set('match_set', match_set)
  auxiliary.check_is_string('file_name', file_name)

  match_rec_id_list = list(match_set)  # Make a list so it can be sorted
  match_rec_id_list.sort()

  # Number of digits needed for zero-padded match identifiers
  #
  if (len(match_set) > 0):
    num_digit = max(1,int(math.ceil(math.log(len(match_set), 10))))
  else:
    num_digit = 1
  mid_count = 1  # Counter for match identifiers

  # Try to open the file for writing. Only catch IOError so programming
  # errors are not masked, and re-raise the original exception so the caller
  # sees the real failure reason.
  #
  try:
    f = open(file_name, 'w')
  except IOError:
    logging.exception('Cannot open file "%s" for writing' % (str(file_name)))
    raise

  for rec_id_tuple in match_rec_id_list:
    w_vec = w_vec_dict[rec_id_tuple]
    w_sum = sum(w_vec)  # Summed matching weight of this record pair

    mid_count_str = '%s' % (mid_count)
    this_mid = 'mid%s' % (mid_count_str.zfill(num_digit))

    rec_id1 = rec_id_tuple[0]
    rec_id2 = rec_id_tuple[1]

    f.write('%s,%s,%f,%s' % (rec_id1, rec_id2, w_sum, this_mid) + os.linesep)

    mid_count += 1

  f.close()
示例#8
0
def GenerateHistogram(w_vec_dict, bin_width, file_name=None, match_sets=None):
  """Print and/or save a histogram of the weight vectors stored in the given
     dictionary, and according to the match sets (if given).

     The histogram is rotated 90 degrees clockwise, i.e. up to down instead of
     left to right.

     This function sums up the number of weight vectors with a matching weight
     in a given bin (according to the given bin width).

     If given, the match sets must be a tuple containing three sets, the first
     being a set with matches, the second with non-matches, and the third with
     possible matches, as generated by classifiers in the classification.py
     Febrl module.

     For each bin, the number of weight vectors in this bin is printed as well,
     and if the match sets are given the number of matches, non-matches and
     possible matches in this bin.

     If a file name is given, the output will be written into this text file.

     This function returns a list containing the histogram as text strings.
  """

  MAX_HISTO_WIDTH = 80  # maximum width in characters

  auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict)
  auxiliary.check_is_number('bin_width', bin_width)
  auxiliary.check_is_positive('bin_width', bin_width)
  if (file_name != None):
    auxiliary.check_is_string('file_name', file_name)
  if (match_sets != None):
    auxiliary.check_is_tuple('match_sets', match_sets)
    if (len(match_sets) != 3):
      logging.exception('Match sets must be a tuple containing three sets.')
      raise Exception
    auxiliary.check_is_set('match_sets[0]', match_sets[0])
    auxiliary.check_is_set('match_sets[1]', match_sets[1])
    auxiliary.check_is_set('match_sets[2]', match_sets[2])
    # Each weight vector must appear in exactly one of the three match sets
    #
    if (len(w_vec_dict) != (len(match_sets[0]) + len(match_sets[1]) + \
                            len(match_sets[2]))):
      logging.exception('Lengths of weight vector dictionary differs from' + \
                        'summed lengths of match sets.')
      raise Exception

  # Check if weight vector dictionary is empty, if so return empty list
  #
  if (w_vec_dict == {}):
    logging.warn('Empty weight vector dictionary given for histogram ' + \
                 'generation')
    return []

  # Get a random vector dictionary element to get dimensionality of vectors
  #
  (rec_id_tuple, w_vec) = w_vec_dict.popitem()
  v_dim = len(w_vec)  # NOTE(review): v_dim is not used further down - confirm
  w_vec_dict[rec_id_tuple] = w_vec  # Put back in

  histo_dict = {}  # A combined histogram dictionary

  if (match_sets != None):  #  Also matches, non-matches and possible matches
    match_histo_dict =      {}
    non_match_histo_dict =  {}
    poss_match_histo_dict = {}

  max_bin_w_count = -1 # Maximal count for one binned weight entry

  # Loop over weight vectors - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  for (rec_id_tuple, w_vec) in w_vec_dict.iteritems():

    w_sum = sum(w_vec)  # Sum all weight vector elements
    binned_w = w_sum - (w_sum % bin_width)  # Lower boundary of this bin

    binned_w_count = histo_dict.get(binned_w,0) + 1  # Increase count by one
    histo_dict[binned_w] = binned_w_count

    if (binned_w_count > max_bin_w_count): # Check if this is new maximum count
      max_bin_w_count = binned_w_count

    if (match_sets != None):
      if (rec_id_tuple in match_sets[0]):
        binned_w_count = match_histo_dict.get(binned_w,0) + 1
        match_histo_dict[binned_w] = binned_w_count
      elif (rec_id_tuple in match_sets[1]):
        binned_w_count = non_match_histo_dict.get(binned_w,0) + 1
        non_match_histo_dict[binned_w] = binned_w_count
      else: # A possible match
        binned_w_count = poss_match_histo_dict.get(binned_w,0) + 1
        poss_match_histo_dict[binned_w] = binned_w_count

  # Sort histogram according to X axis values - - - - - - - - - - - - - - - - -
  #
  x_vals = histo_dict.keys()  # (Python 2: keys() returns a sortable list)
  x_vals.sort()

  assert sum(histo_dict.values()) == len(w_vec_dict)

  # Scale bar lengths so the longest bar fits into MAX_HISTO_WIDTH columns
  # (minus the width of the numeric columns printed in front of each bar)
  #
  if (match_sets == None):  # Can use 68 characters for histogram
    scale_factor_y = float(MAX_HISTO_WIDTH-19) / max_bin_w_count
  elif (len(poss_match_histo_dict) == 0):  # No possible matches
    scale_factor_y = float(MAX_HISTO_WIDTH-30) / max_bin_w_count
  else:  # All three sets non-empty
    scale_factor_y = float(MAX_HISTO_WIDTH-41) / max_bin_w_count

  # Generate the histogram as a list of strings - - - - - - - - - - - - - - - -
  #
  histo_list = []
  histo_list.append('Weight histogram:')
  histo_list.append('-----------------')

  if (match_sets == None):
    histo_list.append('  Counts  | w_sum |')
    histo_list.append('-------------------')
  elif (len(poss_match_histo_dict) == 0):  # No possible matches
    histo_list.append('       Counts        |')
    histo_list.append('  Match   | Non-Match| w_sum |')
    histo_list.append('------------------------------')
  else:
    histo_list.append('              Counts            |')
    histo_list.append('  Match   | Non-Match|Poss-Match| w_sum |')
    histo_list.append('-----------------------------------------')
  for x_val in x_vals:
    this_count = histo_dict[x_val]

    if (match_sets == None):
      line_str = '%9d | %5.2f |' % (this_count, x_val)
    elif (len(poss_match_histo_dict) == 0):  # No possible matches
      this_match_count =     match_histo_dict.get(x_val, 0)
      this_non_match_count = non_match_histo_dict.get(x_val, 0)

      line_str = '%9d |%9d | %5.2f |' % (this_match_count,
                                          this_non_match_count, x_val)
    else:
      this_match_count =      match_histo_dict.get(x_val, 0)
      this_non_match_count =  non_match_histo_dict.get(x_val, 0)
      this_poss_match_count = poss_match_histo_dict.get(x_val, 0)

      line_str = '%9d |%9d |%9d | %5.2f |' % (this_match_count,
                                                this_non_match_count,
                                                this_poss_match_count, x_val)

    line_str += '*'*int(this_count*scale_factor_y)
    histo_list.append(line_str)

  histo_list.append('')

  # If a file name is given open it for writing - - - - - - - - - - - - - - - -
  #
  if (file_name != None):
    try:
      f = open(file_name, 'w')
    except:
      logging.exception('Cannot open file "%s" for writing' % (str(file_name)))
      raise IOError

    for line in histo_list:
      f.write(line + os.linesep)

    f.close()
    logging.info('Histogram written to file: %s' % (file_name))

  if (match_sets != None):
    # NOTE(review): these look like leftover debug prints to stdout - confirm
    # whether they should be removed or turned into logging calls
    #
    print match_histo_dict.items()
    print non_match_histo_dict.items()

  return histo_list
def GenerateHistogram(w_vec_dict, bin_width, file_name=None, match_sets=None):
    """Print and/or save a histogram of the weight vectors stored in the given
     dictionary, and according to the match sets (if given).

     The histogram is rotated 90 degrees clockwise, i.e. up to down instead of
     left to right.

     This function sums up the number of weight vectors with a matching weight
     in a given bin (according to the given bin width).

     If given, the match sets must be a tuple containing three sets, the first
     being a set with matches, the second with non-matches, and the third with
     possible matches, as generated by classifiers in the classification.py
     Febrl module.

     For each bin, the number of weight vectors in this bin is printed as well,
     and if the match sets are given the number of matches, non-matches and
     possible matches in this bin.

     If a file name is given, the output will be written into this text file.

     This function returns a list containing the histogram as text strings.
  """

    MAX_HISTO_WIDTH = 80  # maximum width in characters

    auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict)
    auxiliary.check_is_number('bin_width', bin_width)
    auxiliary.check_is_positive('bin_width', bin_width)
    if (file_name != None):
        auxiliary.check_is_string('file_name', file_name)
    if (match_sets != None):
        auxiliary.check_is_tuple('match_sets', match_sets)
        if (len(match_sets) != 3):
            logging.exception(
                'Match sets must be a tuple containing three sets.')
            raise Exception
        auxiliary.check_is_set('match_sets[0]', match_sets[0])
        auxiliary.check_is_set('match_sets[1]', match_sets[1])
        auxiliary.check_is_set('match_sets[2]', match_sets[2])
        # Each weight vector must appear in exactly one of the three match
        # sets
        #
        if (len(w_vec_dict) != (len(match_sets[0]) + len(match_sets[1]) + \
                                len(match_sets[2]))):
            logging.exception('Lengths of weight vector dictionary differs from' + \
                              'summed lengths of match sets.')
            raise Exception

    # Check if weight vector dictionary is empty, if so return empty list
    #
    if (w_vec_dict == {}):
        logging.warn('Empty weight vector dictionary given for histogram ' + \
                     'generation')
        return []

    # Get a random vector dictionary element to get dimensionality of vectors
    #
    (rec_id_tuple, w_vec) = w_vec_dict.popitem()
    v_dim = len(w_vec)  # NOTE(review): v_dim is not used further down
    w_vec_dict[rec_id_tuple] = w_vec  # Put back in

    histo_dict = {}  # A combined histogram dictionary

    if (match_sets != None):  #  Also matches, non-matches and possible matches
        match_histo_dict = {}
        non_match_histo_dict = {}
        poss_match_histo_dict = {}

    max_bin_w_count = -1  # Maximal count for one binned weight entry

    # Loop over weight vectors - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    for (rec_id_tuple, w_vec) in w_vec_dict.iteritems():

        w_sum = sum(w_vec)  # Sum all weight vector elements
        binned_w = w_sum - (w_sum % bin_width)  # Lower boundary of this bin

        binned_w_count = histo_dict.get(binned_w,
                                        0) + 1  # Increase count by one
        histo_dict[binned_w] = binned_w_count

        if (binned_w_count >
                max_bin_w_count):  # Check if this is new maximum count
            max_bin_w_count = binned_w_count

        if (match_sets != None):
            if (rec_id_tuple in match_sets[0]):
                binned_w_count = match_histo_dict.get(binned_w, 0) + 1
                match_histo_dict[binned_w] = binned_w_count
            elif (rec_id_tuple in match_sets[1]):
                binned_w_count = non_match_histo_dict.get(binned_w, 0) + 1
                non_match_histo_dict[binned_w] = binned_w_count
            else:  # A possible match
                binned_w_count = poss_match_histo_dict.get(binned_w, 0) + 1
                poss_match_histo_dict[binned_w] = binned_w_count

    # Sort histogram according to X axis values - - - - - - - - - - - - - - - - -
    #
    x_vals = histo_dict.keys()  # (Python 2: keys() returns a sortable list)
    x_vals.sort()

    assert sum(histo_dict.values()) == len(w_vec_dict)

    # Scale bar lengths so the longest bar fits into MAX_HISTO_WIDTH columns
    # (minus the width of the numeric columns printed in front of each bar)
    #
    if (match_sets == None):  # Can use 68 characters for histogram
        scale_factor_y = float(MAX_HISTO_WIDTH - 19) / max_bin_w_count
    elif (len(poss_match_histo_dict) == 0):  # No possible matches
        scale_factor_y = float(MAX_HISTO_WIDTH - 30) / max_bin_w_count
    else:  # All three sets non-empty
        scale_factor_y = float(MAX_HISTO_WIDTH - 41) / max_bin_w_count

    # Generate the histogram as a list of strings - - - - - - - - - - - - - - - -
    #
    histo_list = []
    histo_list.append('Weight histogram:')
    histo_list.append('-----------------')

    if (match_sets == None):
        histo_list.append('  Counts  | w_sum |')
        histo_list.append('-------------------')
    elif (len(poss_match_histo_dict) == 0):  # No possible matches
        histo_list.append('       Counts        |')
        histo_list.append('  Match   | Non-Match| w_sum |')
        histo_list.append('------------------------------')
    else:
        histo_list.append('              Counts            |')
        histo_list.append('  Match   | Non-Match|Poss-Match| w_sum |')
        histo_list.append('-----------------------------------------')
    for x_val in x_vals:
        this_count = histo_dict[x_val]

        if (match_sets == None):
            line_str = '%9d | %5.2f |' % (this_count, x_val)
        elif (len(poss_match_histo_dict) == 0):  # No possible matches
            this_match_count = match_histo_dict.get(x_val, 0)
            this_non_match_count = non_match_histo_dict.get(x_val, 0)

            line_str = '%9d |%9d | %5.2f |' % (this_match_count,
                                               this_non_match_count, x_val)
        else:
            this_match_count = match_histo_dict.get(x_val, 0)
            this_non_match_count = non_match_histo_dict.get(x_val, 0)
            this_poss_match_count = poss_match_histo_dict.get(x_val, 0)

            line_str = '%9d |%9d |%9d | %5.2f |' % (
                this_match_count, this_non_match_count, this_poss_match_count,
                x_val)

        line_str += '*' * int(this_count * scale_factor_y)
        histo_list.append(line_str)

    histo_list.append('')

    # If a file name is given open it for writing - - - - - - - - - - - - - - - -
    #
    if (file_name != None):
        try:
            f = open(file_name, 'w')
        except:
            logging.exception('Cannot open file "%s" for writing' %
                              (str(file_name)))
            raise IOError

        for line in histo_list:
            f.write(line + os.linesep)

        f.close()
        logging.info('Histogram written to file: %s' % (file_name))

    if (match_sets != None):
        # NOTE(review): these look like leftover debug prints to stdout -
        # confirm whether they should be removed or turned into logging calls
        #
        print match_histo_dict.items()
        print non_match_histo_dict.items()

    return histo_list
示例#10
0
def pairs_completeness(weight_vec_dict, dataset1, dataset2, get_id_funct,
                       match_check_funct):
    """Pairs completeness is measured as

       pc = Nm / M

     with Nm (<= M) being the number of correctly classified truly matched
     record pairs in the blocked comparison space, and M the total number of
     true matches.

     If both data sets are the same a deduplication is assumed, otherwise a
     linkage.

     The arguments that have to be set when this method is called are:
       weight_vec_dict    A dictionary containing weight vectors.
       dataset1           The initialised first data set object.
       dataset2           The initialised second data set object.
       get_id_funct       This has to be a function (or method), assumed to
                          have argument a record (assumed to be a list of field
                          values), and returns the record identifier from that
                          record.
       match_check_funct  This has to be a function (or method), assumed to
                          have as arguments the two record identifiers of a
                          record pair and its weight vector, and returns True
                          if the record pair is from a true match, or False
                          otherwise. Thus, 'match_check_funct' is of the form:

                            match_flag = match_check_funct(rec_id1, rec_id2,
                                                           weight_vec)
  """

    auxiliary.check_is_dictionary('weight_vec_dict', weight_vec_dict)
    auxiliary.check_is_not_none('dataset1', dataset1)
    auxiliary.check_is_not_none('dataset2', dataset2)
    auxiliary.check_is_function_or_method('get_id_funct', get_id_funct)
    auxiliary.check_is_function_or_method('match_check_funct',
                                          match_check_funct)

    # Check if a deduplication will be done or a linkage - - - - - - - - - - - -
    #
    if (dataset1 == dataset2):
        do_dedup = True
    else:
        do_dedup = False

    logging.info('')
    logging.info('Calculate pairs completeness:')
    logging.info('  Data set 1: %s (containing %d records)' % \
                 (dataset1.description, dataset1.num_records))
    if (do_dedup == True):
        logging.info('  Data sets are the same: Deduplication')
    else:
        logging.info('  Data set 2: %s (containing %d records)' % \
                     (dataset2.description, dataset2.num_records))
        logging.info('  Data sets differ:       Linkage')
    logging.info('  Number of record pairs in weight vector dictionary: %d' % \
                 (len(weight_vec_dict)))

    num_all_true_matches = 0  # Count the total number of all true matches

    # For a deduplication only process data set 1 - - - - - - - - - - - - - - - -
    #
    if (do_dedup == True):

        # Build a dictionary with entity identifiers as keys and a list of their
        # record identifier (rec_ident) as values
        #
        entity_ident_dict = {}

        for (rec_ident, rec) in dataset1.readall():
            ent_id = get_id_funct(rec)

            this_rec_list = entity_ident_dict.get(ent_id, [])
            this_rec_list.append(rec_ident)
            entity_ident_dict[ent_id] = this_rec_list

        logging.info('  Number of unique entity identifiers in data set 1: %d' % \
                     (len(entity_ident_dict)))

        # An entity with n records contributes n*(n-1)/2 true matching pairs
        #
        for (ent_id, rec_list) in entity_ident_dict.iteritems():
            num_this_rec = len(rec_list)

            if (num_this_rec > 1):
                num_all_true_matches += num_this_rec * (num_this_rec - 1) / 2

        # More efficient version: only count the number of matches per record,
        # don't store them (used as a consistency self-check via the asserts
        # below)
        #
        entity_ident_dict2 = {}

        for (rec_ident, rec) in dataset1.readall():
            ent_id = get_id_funct(rec)
            ent_id_count = entity_ident_dict2.get(ent_id, 0) + 1
            entity_ident_dict2[ent_id] = ent_id_count

        assert sum(entity_ident_dict2.values()) == dataset1.num_records

        tm = 0  # Total number of true matches (without indexing)

        for (ent_id, ent_count) in entity_ident_dict2.iteritems():
            tm += ent_count * (ent_count - 1) / 2

        assert num_all_true_matches == tm

    else:  # For a linkage - - - - - - - - - - - - - - - - - - - - - - - - - - -

        # Build two dictionaries with entity identifiers as keys and a list of
        # their record identifier (rec_ident) as values
        #
        entity_ident_dict1 = {}
        entity_ident_dict2 = {}

        for (rec_ident, rec) in dataset1.readall():
            ent_id = get_id_funct(rec)

            this_rec_list = entity_ident_dict1.get(ent_id, [])
            this_rec_list.append(rec_ident)
            entity_ident_dict1[ent_id] = this_rec_list

        logging.info('  Number of unique entity identifiers in data set 1: %d' % \
                     (len(entity_ident_dict1)))

        for (rec_ident, rec) in dataset2.readall():
            ent_id = get_id_funct(rec)

            this_rec_list = entity_ident_dict2.get(ent_id, [])
            this_rec_list.append(rec_ident)
            entity_ident_dict2[ent_id] = this_rec_list

        logging.info('  Number of unique entity identifiers in data set 2: %d' % \
                     (len(entity_ident_dict2)))

        # Now calculate total true match number (loop over smaller dict)
        #
        if (len(entity_ident_dict1) < len(entity_ident_dict2)):
            for (ent_id1, rec_list1) in entity_ident_dict1.iteritems():

                if (ent_id1 in entity_ident_dict2):
                    rec_list2 = entity_ident_dict2[ent_id1]

                    num_all_true_matches += len(rec_list1) * len(rec_list2)
        else:
            for (ent_id2, rec_list2) in entity_ident_dict2.iteritems():

                if (ent_id2 in entity_ident_dict1):
                    rec_list1 = entity_ident_dict1[ent_id2]

                    num_all_true_matches += len(rec_list1) * len(rec_list2)

        # More efficient version: only count the number of matches per record,
        # don't store them (used as a consistency self-check via the asserts
        # below)
        #
        entity_ident_dict3 = {}
        entity_ident_dict4 = {}

        for (rec_ident, rec) in dataset1.readall():
            ent_id = get_id_funct(rec)
            ent_id_count = entity_ident_dict3.get(ent_id, 0) + 1
            entity_ident_dict3[ent_id] = ent_id_count

        for (rec_ident, rec) in dataset2.readall():
            ent_id = get_id_funct(rec)
            ent_id_count = entity_ident_dict4.get(ent_id, 0) + 1
            entity_ident_dict4[ent_id] = ent_id_count

        assert sum(entity_ident_dict3.values()) == dataset1.num_records
        assert sum(entity_ident_dict4.values()) == dataset2.num_records

        tm = 0  # Total number of true matches (without indexing)

        if (len(entity_ident_dict3) < len(entity_ident_dict4)):
            for (ent_id, ent_count) in entity_ident_dict3.iteritems():
                if ent_id in entity_ident_dict4:
                    tm += ent_count * entity_ident_dict4[ent_id]
        else:
            for (ent_id, ent_count) in entity_ident_dict4.iteritems():
                if ent_id in entity_ident_dict3:
                    tm += ent_count * entity_ident_dict3[ent_id]

        assert num_all_true_matches == tm

    logging.info('  Number of all true matches: %d' % (num_all_true_matches))

    # Get number of true matches in weight vector dictionary - - - - - - - - - -
    #
    num_true_matches = 0
    num_false_matches = 0

    for (rec_id_tuple, this_vec) in weight_vec_dict.iteritems():

        if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1],
                              this_vec) == True):
            num_true_matches += 1
        else:
            num_false_matches += 1

    assert len(weight_vec_dict) == num_true_matches + num_false_matches

    logging.info('  Number of true and false matches in weight vector ' + \
                 'dictionary: %d / %d' % (num_true_matches,num_false_matches))

    if (num_all_true_matches > 0):

        pc = float(num_true_matches) / float(num_all_true_matches)

        logging.info('  Pairs completeness: %.4f%%' %
                     (100.0 * pc))  # As percentage

    else:

        pc = 0.0

        logging.info('  No true matches - cannot calculate pairs completeness')

    assert pc <= 1.0, pc

    return pc
示例#11
0
def quality_measures(weight_vec_dict, match_set, non_match_set,
                     match_check_funct):
    """Calculate several quality measures based on the number of true positives,
     true negatives, false positives and false negatives in the given match
     and non-match sets and weight vector dictionary using the given match
     check function.

     The function calculates and returns:

     - Accuracy:       (|TP|+|TN|)
                  ---------------------
                  (|TP|+|TN|+|FP|+|FN|)

     - Precision:    |TP|
                  -----------
                  (|TP|+|FP|)

     - Recall:       |TP|
                  -----------
                  (|TP|+|FN|)

     - F-Measure:   2 * (Precision * Recall)
                  --------------------------
                     (Precision + Recall)

     With TP the True Positives, TN the True negatives, FP the False Positives
     and FN the False Negatives.

     Returns the tuple (accuracy, precision, recall, f-measure); each measure
     is 0.0 when its denominator would be zero.

     Raises an Exception if the sizes of the match and non-match sets do not
     add up to the size of the weight vector dictionary.

     For a discussion about measuring data linkage and deduplication quality
     please refer to:

       Quality and Complexity Measures for Data Linkage and Deduplication
       Peter Christen and Karl Goiser

       Book chapter in "Quality Measures in Data Mining"
                       Studies in Computational Intelligence, Vol. 43
                       F. Guillet and H. Hamilton (eds), Springer
                       March 2007.
  """

    auxiliary.check_is_dictionary('weight_vec_dict', weight_vec_dict)
    auxiliary.check_is_set('match set', match_set)
    auxiliary.check_is_set('non match set', non_match_set)
    auxiliary.check_is_function_or_method('match_check_funct',
                                          match_check_funct)

    # Every weight vector must be classified as either match or non-match.
    #
    if ((len(match_set) + len(non_match_set)) != len(weight_vec_dict)):
        msg = 'Match and non-match set are not of same length as ' + \
              'weight vector dictionary: %d, %d / %d' % \
              (len(match_set), len(non_match_set), len(weight_vec_dict))
        # logging.exception() is only meant for use inside an except block;
        # log as an error and attach the message to the raised exception so
        # callers can see what went wrong.
        logging.error(msg)
        raise Exception(msg)

    # Counters are floats so the ratio calculations below use true division
    # (relevant on Python 2).
    #
    tp = 0.0  # True positives:  classified matches that are true matches
    fp = 0.0  # False positives: classified matches that are not true matches
    tn = 0.0  # True negatives:  classified non-matches that are no true match
    fn = 0.0  # False negatives: classified non-matches that are true matches

    for rec_id_tuple in match_set:
        w_vec = weight_vec_dict[rec_id_tuple]

        if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1],
                              w_vec) == True):
            tp += 1
        else:
            fp += 1

    for rec_id_tuple in non_match_set:
        w_vec = weight_vec_dict[rec_id_tuple]

        if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1],
                              w_vec) == False):
            tn += 1
        else:
            fn += 1

    logging.info('')
    logging.info('Classification results: TP=%d, FP=%d / TN=%d, FN=%d' % \
                 (tp, fp, tn, fn))

    # Guard each measure against division by zero.
    #
    if ((tp != 0) or (fp != 0) or (tn != 0) or (fn != 0)):
        acc = (tp + tn) / (tp + fp + tn + fn)
    else:
        acc = 0.0

    if ((tp != 0) or (fp != 0)):
        prec = tp / (tp + fp)
    else:
        prec = 0.0

    if ((tp != 0) or (fn != 0)):
        reca = tp / (tp + fn)
    else:
        reca = 0.0

    if ((prec != 0.0) or (reca != 0.0)):
        fmeas = 2 * (prec * reca) / (prec + reca)
    else:
        fmeas = 0.0

    logging.info('Quality measures:')
    logging.info('  Accuracy: %.6f  Precision:%.4f  Recall: %.4f  ' % \
                 (acc, prec, reca)+'F-measure: %.4f' % (fmeas))

    return acc, prec, reca, fmeas
示例#12
0
    def testIsDictionary(self):  # - - - - - - - - - - - - - - - - - - - - - - -
        """Test that 'check_is_dictionary' accepts dictionary arguments.

        The checker is expected to return None (i.e. not raise) for any
        dictionary, regardless of key and value types.
        """

        # Use 'is None' rather than '== None': identity comparison with the
        # None singleton is the idiomatic (PEP 8) check and does not depend
        # on a type's __eq__ implementation.
        assert auxiliary.check_is_dictionary("TestArgument", {}) is None
        assert auxiliary.check_is_dictionary("TestArgument", {1: 2, 6: 0}) is None
        assert auxiliary.check_is_dictionary(
            "TestArgument", {"a": 4, "t": 1, (1, 4, 6): "tr"}) is None
示例#13
0
def pairs_completeness(weight_vec_dict, dataset1, dataset2, get_id_funct,
                       match_check_funct):
  """Pairs completeness is measured as

       pc = Nm / M

     with Nm (<= M) being the number of correctly classified truly matched
     record pairs in the blocked comparison space, and M the total number of
     true matches.

     If both data sets are the same a deduplication is assumed, otherwise a
     linkage.

     The arguments that have to be set when this method is called are:
       weight_vec_dict    A dictionary containing weight vectors.
       dataset1           The initialised first data set object.
       dataset2           The initialised second data set object.
       get_id_funct       This has to be a function (or method), assumed to
                          have as argument a record (assumed to be a list of
                          field values), and returns the record identifier
                          from that record.
       match_check_funct  This has to be a function (or method), assumed to
                          have as arguments the two record identifiers of a
                          record pair and its weight vector, and returns True
                          if the record pair is from a true match, or False
                          otherwise. Thus, 'match_check_funct' is of the form:

                            match_flag = match_check_funct(rec_id1, rec_id2,
                                                           weight_vec)
  """

  auxiliary.check_is_dictionary('weight_vec_dict', weight_vec_dict)
  auxiliary.check_is_not_none('dataset1', dataset1)
  auxiliary.check_is_not_none('dataset2', dataset2)
  auxiliary.check_is_function_or_method('get_id_funct', get_id_funct)
  auxiliary.check_is_function_or_method('match_check_funct', match_check_funct)

  # Check if a deduplication will be done or a linkage - - - - - - - - - - - -
  #
  if (dataset1 == dataset2):
    do_dedup = True
  else:
    do_dedup = False

  logging.info('')
  logging.info('Calculate pairs completeness:')
  logging.info('  Data set 1: %s (containing %d records)' % \
               (dataset1.description, dataset1.num_records))
  if (do_dedup == True):
    logging.info('  Data sets are the same: Deduplication')
  else:
    logging.info('  Data set 2: %s (containing %d records)' % \
                 (dataset2.description, dataset2.num_records))
    logging.info('  Data sets differ:       Linkage')
  logging.info('  Number of record pairs in weight vector dictionary: %d' % \
               (len(weight_vec_dict)))

  num_all_true_matches = 0  # Count the total number of all true matches

  # For a deduplication only process data set 1 - - - - - - - - - - - - - - - -
  #
  if (do_dedup == True):

    # Build a dictionary with entity identifiers as keys and a list of their
    # record identifiers (rec_ident) as values
    #
    entity_ident_dict = {}

    for (rec_ident, rec) in dataset1.readall():
      ent_id = get_id_funct(rec)

      this_rec_list = entity_ident_dict.get(ent_id, [])
      this_rec_list.append(rec_ident)
      entity_ident_dict[ent_id] = this_rec_list

    logging.info('  Number of unique entity identifiers in data set 1: %d' % \
                 (len(entity_ident_dict)))

    # An entity with k records contributes k*(k-1)/2 true matching pairs.
    # Floor division (//) keeps the count an integer on Python 3 (k*(k-1) is
    # always even, so no value is lost).
    #
    for (ent_id, rec_list) in entity_ident_dict.items():
      num_this_rec = len(rec_list)

      if (num_this_rec > 1):
        num_all_true_matches += num_this_rec*(num_this_rec-1)//2

    # More efficient version: Only count the number of matches per record,
    # don't store the record identifiers themselves.
    #
    entity_ident_dict2 = {}

    for (rec_ident, rec) in dataset1.readall():
      ent_id = get_id_funct(rec)
      ent_id_count = entity_ident_dict2.get(ent_id, 0) + 1
      entity_ident_dict2[ent_id] = ent_id_count

    assert sum(entity_ident_dict2.values()) == dataset1.num_records

    tm = 0  # Total number of true matches (without indexing)

    for (ent_id, ent_count) in entity_ident_dict2.items():
      tm += ent_count*(ent_count-1)//2

    # Sanity check: both counting methods must agree
    assert num_all_true_matches == tm

  else:  # For a linkage - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # Build two dictionaries with entity identifiers as keys and a list of
    # their record identifiers (rec_ident) as values
    #
    entity_ident_dict1 = {}
    entity_ident_dict2 = {}

    for (rec_ident, rec) in dataset1.readall():
      ent_id = get_id_funct(rec)

      this_rec_list = entity_ident_dict1.get(ent_id, [])
      this_rec_list.append(rec_ident)
      entity_ident_dict1[ent_id] = this_rec_list

    logging.info('  Number of unique entity identifiers in data set 1: %d' % \
                 (len(entity_ident_dict1)))

    for (rec_ident, rec) in dataset2.readall():
      ent_id = get_id_funct(rec)

      this_rec_list = entity_ident_dict2.get(ent_id, [])
      this_rec_list.append(rec_ident)
      entity_ident_dict2[ent_id] = this_rec_list

    logging.info('  Number of unique entity identifiers in data set 2: %d' % \
                 (len(entity_ident_dict2)))

    # Now calculate total true match number (loop over smaller dict)
    #
    if (len(entity_ident_dict1) < len(entity_ident_dict2)):
      for (ent_id1, rec_list1) in entity_ident_dict1.items():

        if (ent_id1 in entity_ident_dict2):
          rec_list2 = entity_ident_dict2[ent_id1]

          num_all_true_matches += len(rec_list1) * len(rec_list2)
    else:
      for (ent_id2, rec_list2) in entity_ident_dict2.items():

        if (ent_id2 in entity_ident_dict1):
          rec_list1 = entity_ident_dict1[ent_id2]

          num_all_true_matches += len(rec_list1) * len(rec_list2)

    # More efficient version: Only count the number of matches per record,
    # don't store the record identifiers themselves.
    #
    entity_ident_dict3 = {}
    entity_ident_dict4 = {}

    for (rec_ident, rec) in dataset1.readall():
      ent_id = get_id_funct(rec)
      ent_id_count = entity_ident_dict3.get(ent_id, 0) + 1
      entity_ident_dict3[ent_id] = ent_id_count

    for (rec_ident, rec) in dataset2.readall():
      ent_id = get_id_funct(rec)
      ent_id_count = entity_ident_dict4.get(ent_id, 0) + 1
      entity_ident_dict4[ent_id] = ent_id_count

    assert sum(entity_ident_dict3.values()) == dataset1.num_records
    assert sum(entity_ident_dict4.values()) == dataset2.num_records

    tm = 0  # Total number of true matches (without indexing)

    if (len(entity_ident_dict3) < len(entity_ident_dict4)):
      for (ent_id, ent_count) in entity_ident_dict3.items():
        if ent_id in entity_ident_dict4:
          tm += ent_count*entity_ident_dict4[ent_id]
    else:
      for (ent_id, ent_count) in entity_ident_dict4.items():
        if ent_id in entity_ident_dict3:
          tm += ent_count*entity_ident_dict3[ent_id]

    # Sanity check: both counting methods must agree
    assert num_all_true_matches == tm

  logging.info('  Number of all true matches: %d' % (num_all_true_matches))

  # Get number of true matches in weight vector dictionary - - - - - - - - - -
  #
  num_true_matches = 0
  num_false_matches = 0

  for (rec_id_tuple, this_vec) in weight_vec_dict.items():

    if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1], this_vec) == True):
      num_true_matches += 1
    else:
      num_false_matches += 1

  assert len(weight_vec_dict) == num_true_matches+num_false_matches

  logging.info('  Number of true and false matches in weight vector ' + \
               'dictionary: %d / %d' % (num_true_matches,num_false_matches))

  if (num_all_true_matches > 0):

    pc = float(num_true_matches) / float(num_all_true_matches)

    logging.info('  Pairs completeness: %.4f%%' % (100.0*pc)) # As percentage

  else:

    pc = 0.0

    logging.info('  No true matches - cannot calculate pairs completeness')

  assert pc <= 1.0, pc

  return pc
示例#14
0
def quality_measures(weight_vec_dict, match_set, non_match_set,
                     match_check_funct):
  """Calculate several quality measures based on the number of true positives,
     true negatives, false positives and false negatives in the given match
     and non-match sets and weight vector dictionary using the given match
     check function.

     The function calculates and returns:

     - Accuracy:       (|TP|+|TN|)
                  ---------------------
                  (|TP|+|TN|+|FP|+|FN|)

     - Precision:    |TP|
                  -----------
                  (|TP|+|FP|)

     - Recall:       |TP|
                  -----------
                  (|TP|+|FN|)

     - F-Measure:   2 * (Precision * Recall)
                  --------------------------
                     (Precision + Recall)

     With TP the True Positives, TN the True negatives, FP the False Positives
     and FN the False Negatives.

     Returns the tuple (accuracy, precision, recall, f-measure); each measure
     is 0.0 when its denominator would be zero.

     Raises an Exception if the sizes of the match and non-match sets do not
     add up to the size of the weight vector dictionary.

     For a discussion about measuring data linkage and deduplication quality
     please refer to:

       Quality and Complexity Measures for Data Linkage and Deduplication
       Peter Christen and Karl Goiser

       Book chapter in "Quality Measures in Data Mining"
                       Studies in Computational Intelligence, Vol. 43
                       F. Guillet and H. Hamilton (eds), Springer
                       March 2007.
  """

  auxiliary.check_is_dictionary('weight_vec_dict', weight_vec_dict)
  auxiliary.check_is_set('match set', match_set)
  auxiliary.check_is_set('non match set', non_match_set)
  auxiliary.check_is_function_or_method('match_check_funct', match_check_funct)

  # Every weight vector must be classified as either match or non-match.
  #
  if ((len(match_set) + len(non_match_set)) != len(weight_vec_dict)):
    msg = 'Match and non-match set are not of same length as ' + \
          'weight vector dictionary: %d, %d / %d' % \
          (len(match_set), len(non_match_set), len(weight_vec_dict))
    # logging.exception() is only meant for use inside an except block; log
    # as an error and attach the message to the raised exception so callers
    # can see what went wrong.
    logging.error(msg)
    raise Exception(msg)

  # Counters are floats so the ratio calculations below use true division
  # (relevant on Python 2).
  #
  tp = 0.0  # True positives:  classified matches that are true matches
  fp = 0.0  # False positives: classified matches that are not true matches
  tn = 0.0  # True negatives:  classified non-matches that are no true match
  fn = 0.0  # False negatives: classified non-matches that are true matches

  for rec_id_tuple in match_set:
    w_vec = weight_vec_dict[rec_id_tuple]

    if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1], w_vec) == True):
      tp += 1
    else:
      fp += 1

  for rec_id_tuple in non_match_set:
    w_vec = weight_vec_dict[rec_id_tuple]

    if (match_check_funct(rec_id_tuple[0], rec_id_tuple[1], w_vec) == False):
      tn += 1
    else:
      fn += 1

  logging.info('')
  logging.info('Classification results: TP=%d, FP=%d / TN=%d, FN=%d' % \
               (tp, fp, tn, fn))

  # Guard each measure against division by zero.
  #
  if ((tp != 0) or (fp != 0) or (tn != 0) or (fn != 0)):
    acc = (tp + tn) / (tp + fp + tn + fn)
  else:
    acc = 0.0

  if ((tp != 0) or (fp != 0)):
    prec = tp / (tp + fp)
  else:
    prec = 0.0

  if ((tp != 0) or (fn != 0)):
    reca = tp / (tp + fn)
  else:
    reca = 0.0

  if ((prec != 0.0) or (reca != 0.0)):
    fmeas = 2*(prec*reca) / (prec+reca)
  else:
    fmeas = 0.0

  logging.info('Quality measures:')
  logging.info('  Accuracy: %.6f  Precision:%.4f  Recall: %.4f  ' % \
               (acc, prec, reca)+'F-measure: %.4f' % (fmeas))

  return acc, prec, reca, fmeas