def _tagged_name_variants(words, word_lookup):
    """Yield ``(tag, variant)`` for each single-edit variant of a name.

    ``words`` is the name already split into a list of words.  Variants are
    produced in a fixed order:

    * ``'I'`` -- a copy of word ``i`` inserted in front of position ``j``
      (every ``i``/``j`` pair, so the same variant may be yielded twice);
    * ``'D'`` -- one word removed;
    * ``'S'`` -- one word replaced by a different word stored under the same
      ``loader.sort_word`` key in ``word_lookup``.
    """
    positions = range(len(words))

    for word in words:
        for j in positions:
            yield 'I', ' '.join(words[:j] + [word] + words[j:])

    for i in positions:
        yield 'D', ' '.join(words[:i] + words[i + 1:])

    for i, word in enumerate(words):
        # presumably an order-independent signature so scrambled spellings
        # share a key -- confirm in loader.sort_word
        key = loader.sort_word(word)
        # Entries are 2-item sequences; only the first item (the candidate
        # word) is used here.
        for candidate, _ in word_lookup[key]:
            if candidate != word:
                yield 'S', ' '.join(words[:i] + [candidate] + words[i + 1:])


def get_name_found_verbose(names, word_lookup):
    """Map variant names onto the higher-count name they derive from.

    Same algorithm as ``get_name_found`` but prints a progress line for each
    name examined and for each variant claimed.

    Args:
        names: mapping of name string -> count.  Names are processed in
            descending order of that count.
        word_lookup: mapping indexed by ``loader.sort_word(word)`` whose
            values are iterables of 2-item entries; the first item of each
            entry is an alternative word.

    Returns:
        dict mapping each claimed variant (a key of ``names``) to the name
        whose edit produced it.  A variant is claimed only once, by the first
        name in processing order that reaches it.

    Raises:
        KeyError: if ``word_lookup`` has no entry for a word's key.
    """
    # NOTE(review): a name that already served as a target is not itself
    # protected; a later, lower-count name may still claim it as a variant,
    # producing chains (variant -> name -> other name).  Confirm whether
    # that is intended.
    name_found = {}
    ordered_names = sorted(names.items(), key=lambda x: x[1], reverse=True)
    for name, count in ordered_names:
        if name in name_found:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('checking %s (%d)' % (name, count))
        for tag, variant in _tagged_name_variants(name.split(), word_lookup):
            if variant in names and variant not in name_found:
                print(' %s: %s (%d)' % (tag, variant, names[variant]))
                name_found[variant] = name
    return name_found
def _name_variants(words, word_lookup):
    """Yield every single-edit variant of a name given as a list of words.

    Order of generation: duplicated-word insertions (a copy of word ``i``
    placed in front of position ``j``), then single-word deletions, then
    single-word substitutions using the alternatives stored in
    ``word_lookup`` under the word's ``loader.sort_word`` key.  The same
    variant string may be yielded more than once.
    """
    positions = range(len(words))

    for word in words:
        for j in positions:
            yield ' '.join(words[:j] + [word] + words[j:])

    for i in positions:
        yield ' '.join(words[:i] + words[i + 1:])

    for i, word in enumerate(words):
        # presumably an order-independent signature so scrambled spellings
        # share a key -- confirm in loader.sort_word
        key = loader.sort_word(word)
        # Entries are 2-item sequences; only the first item (the candidate
        # word) is used here.
        for candidate, _ in word_lookup[key]:
            if candidate != word:
                yield ' '.join(words[:i] + [candidate] + words[i + 1:])


def get_name_found(names, word_lookup):
    """Map variant names onto the higher-count name they derive from.

    Args:
        names: mapping of name string -> count.  Names are processed in
            descending order of that count.
        word_lookup: mapping indexed by ``loader.sort_word(word)`` whose
            values are iterables of 2-item entries; the first item of each
            entry is an alternative word.

    Returns:
        dict mapping each claimed variant (a key of ``names``) to the name
        whose edit produced it.  A variant is claimed only once, by the first
        name in processing order that reaches it.

    Raises:
        KeyError: if ``word_lookup`` has no entry for a word's key.
    """
    # NOTE(review): a name that already served as a target is not itself
    # protected; a later, lower-count name may still claim it as a variant,
    # producing chains (variant -> name -> other name).  Confirm whether
    # that is intended.
    name_found = {}
    ordered_names = sorted(names.items(), key=lambda x: x[1], reverse=True)
    for name, _ in ordered_names:
        if name in name_found:
            # Already explained as a variant of a name processed earlier.
            continue
        for variant in _name_variants(name.split(), word_lookup):
            if variant in names and variant not in name_found:
                name_found[variant] = name
    return name_found
import loader


if __name__ == '__main__':
    # Read one name per line, stripping surrounding whitespace.
    with open('names_short.txt') as infile:
        names = [line.strip() for line in infile]
    ninstances = len(names)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('read %d instances' % ninstances)

    word_lookup = loader.make_word_lookup(names)
    print(word_lookup)

    # Rewrite every word as the first element of the first entry stored
    # under its loader.sort_word key (presumably the preferred spelling --
    # confirm against loader.make_word_lookup).
    correct = []
    for name in names:
        fixed_words = [
            word_lookup[loader.sort_word(word)][0][0]
            for word in name.split()
        ]
        correct.append(' '.join(fixed_words))

    # Output: original name, a tab, then the corrected name, one per line.
    with open('corrected.txt', 'w') as outfile:
        for original, corrected in zip(names, correct):
            outfile.write('%s\t%s\n' % (original, corrected))

# Design notes:
#
# Process each name in order of descending frequency.
#
# Hypothesis: there is a correct version of the name somewhere in the data.
# For each name in the dataset, especially the ones that have only one or
# two instances, I should be able to remake the name by applying the edits.
# For instance, one of the words is probably scrambled, or maybe deleted or
# doubled.  Maybe two of the words have been swapped.