예제 #1
0
def empirical_cost_edit_distance(r,q,uniform_cost=0.1,p_r_qr=0.95,mu=1.0):
  """
  Score candidate q as the intended spelling of the observed string r,
  using an empirical noisy channel model for the first edit.

  The returned value is a log-score of the form
      log(channel term) + mu * log P(q)
  where log P(q) comes from calculate_log_prob(q) and the channel term is:

    * p_r_qr                                   if edit_distance(r,q) == 0
                                               or findEditOperation(r,q) is empty
    * P_empirical(r|q) * uniform_cost^(d - 1)  otherwise, with d = edit_distance(r,q)

  P_empirical(r|q) is estimated from the module-level edit counters with
  add-1 smoothing:
      (count(edit) + 1) / (count(context) + alphabet size)
  where the alphabet size is len(edits_char_counter).

  Parameters:
    r            -- observed (possibly misspelled) string
    q            -- candidate intended string
    uniform_cost -- probability charged for every edit beyond the first
    p_r_qr       -- channel probability used when no edit is needed
    mu           -- weight applied to the language model term log P(q)

  Raises:
    ValueError if the edit code returned by findEditOperation is not one of
    the four handled codes (0 deletion, 1 substitution, 2 transposition,
    3 insertion).
  """

  log_prob_q    = calculate_log_prob(q)
  d             = edit_distance(r,q)
  editOperation = findEditOperation(r,q)

  # No edit needed (or none could be identified): use the fixed "typed correctly" probability
  if d==0 or len(editOperation)==0:
    return log(p_r_qr) + mu*log_prob_q

  # Order matters: the edit code is used as an index into this list
  confusion_matrices = [edits_del_counter,edits_sub_counter,edits_tra_counter,edits_ins_counter]

  # editOperation e.g. [0, ('#','s')]  from: actual = un; intended = sun
  editName      = editOperation[0]
  editArguments = editOperation[1]

  # How many such edits were found on the training file for the noisy model
  numerator = confusion_matrices[editName][editArguments]

  # How often the context of that edit was seen; which counter and which
  # part of editArguments is used depends on the edit type
  if editName == 0: # deletion
    denominator = edits_bichar_counter[editArguments]
  elif editName == 1: # substitution
    denominator = edits_char_counter[editArguments[1]]
  elif editName == 2: # transposition
    denominator = edits_bichar_counter[editArguments]
  elif editName == 3: # insertion
    denominator = edits_char_counter[editArguments[0]]
  else:
    # Previously this fell through and failed later with an unbound 'denominator'
    raise ValueError("unknown edit operation code: %r" % (editName,))

  # Add-1 smoothing
  # NOTE(review): presumably the counters return 0 for unseen keys
  # (collections.Counter-like) -- confirm against their definition.
  numberOfCharsInAlphabet = len(edits_char_counter)
  prob_r_q = float(numerator + 1) / float(denominator + numberOfCharsInAlphabet)

  # First edit is charged the empirical probability, each remaining edit the
  # uniform cost. The language model term is weighted by mu exactly as in the
  # no-edit branch above, so both branches stay comparable when mu != 1.
  log_prob_q_r = log(prob_r_q) + (d-1)*log(uniform_cost) + mu*log_prob_q

  return log_prob_q_r
예제 #2
0
def is_good_candidate(candidate,word,jaccard_cutoff = 0.2, edit_cutoff = 3):
  '''
  Test with a few heuristics whether a candidate is close enough to a word.

  The checks run in order and the first failing one rejects the candidate:
    1. both strings must be non-empty
    2. both must start with the same character
    3. their lengths must differ by less than edit_cutoff
    4. jaccard_coeff(candidate, word) must be greater than jaccard_cutoff
       (the cutoff is raised to at least 0.5 when word has more than 10 characters)
    5. edit_distance(candidate, word) must be less than edit_cutoff

  Returns True when every check passes, False otherwise.
  '''

  # An empty string has no first letter to compare; reject instead of raising IndexError
  if not word or not candidate: return False

  # Candidate should start with same letter
  if word[0] != candidate[0]: return False

  # Candidate should have length within edit_cutoff of word
  if abs(len(candidate) - len(word)) >= edit_cutoff: return False

  # Jaccard overlap; long words get a stricter minimum cutoff
  if len(word) > 10: jaccard_cutoff = max(jaccard_cutoff,0.5)
  if jaccard_coeff(candidate,word) <= jaccard_cutoff: return False

  # Edit distance should be strictly below edit_cutoff (i.e. <= 2 with the default)
  if edit_distance(candidate,word) >= edit_cutoff: return False

  return True
예제 #3
0
def uniform_cost_edit_distance(r,q,cost=0.001,p_r_qr=0.95,mu=1.0):
  """
  Score candidate q for the observed string r with a uniform edit cost.

  Every single edit counted by edit_distance(r,q) is charged the same
  probability 'cost'. The language model term calculate_log_prob(q) is
  weighted by mu.

  Returns:
    log(p_r_qr) + mu * log P(q)                    when r == q
    edit_distance(r,q) * log(cost) + mu * log P(q) otherwise
  """

  prior_log_prob = calculate_log_prob(q)

  # Identical strings: no edit distance needed, use the fixed probability
  if r == q:
    return log(p_r_qr) + mu * prior_log_prob

  num_edits = edit_distance(r, q)
  return num_edits * log(cost) + mu * prior_log_prob