Пример #1
0
def _tagged_variants(words, word_lookup, sort_word):
    """Yield ``(tag, candidate_name)`` pairs one edit away from ``words``.

    Tags are ``'I'`` (a word of the name inserted again), ``'D'`` (one word
    dropped) and ``'S'`` (one word swapped for another entry stored under the
    same lookup key). Candidates are produced lazily, in the order I, D, S.
    """
    positions = range(len(words))

    # I: insert a copy of each word before every existing position.
    # NOTE(review): positions stop at len(words) - 1, so a copy is never
    # appended after the last word -- confirm this is intended.
    for word in words:
        for at in positions:
            yield 'I', ' '.join(words[:at] + [word] + words[at:])

    # D: drop one word at a time.
    for at in positions:
        yield 'D', ' '.join(words[:at] + words[at + 1:])

    # S: replace one word with a different word filed under the same key.
    for at, word in enumerate(words):
        # Resolve the default only when it is actually needed, so inputs with
        # no words never touch the module-level ``loader``.
        make_key = loader.sort_word if sort_word is None else sort_word
        for other, _ in word_lookup[make_key(word)]:
            if other != word:
                yield 'S', ' '.join(words[:at] + [other] + words[at + 1:])


def get_name_found_verbose(names, word_lookup, sort_word=None):
    """Link variant names to the name they were derived from, printing progress.

    Names are visited from the highest to the lowest value in ``names``. A
    name already recorded as a variant is skipped; every other name is
    announced with a ``checking`` line and its one-edit variants are probed.
    Each variant that is a key of ``names`` and not yet recorded is printed
    with its tag and stored.

    Args:
        names: dict mapping a name string to its count.
        word_lookup: mapping from ``sort_word(word)`` to an iterable of
            2-item entries whose first item is a word.
        sort_word: optional callable turning a word into its ``word_lookup``
            key. Defaults to ``loader.sort_word``, which must then be
            available in this module's globals.

    Returns:
        dict mapping each matched variant name to the name that was being
        checked when the variant was first matched.

    Raises:
        KeyError: if a word's key is missing from ``word_lookup``.
    """
    name_found = {}
    by_frequency = sorted(names.items(), key=lambda item: item[1], reverse=True)
    for name, count in by_frequency:
        if name in name_found:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('checking %s (%d)' % (name, count))
        for tag, variant in _tagged_variants(name.split(), word_lookup, sort_word):
            # NOTE(review): only ``name_found`` is consulted here, so a name
            # that was already checked as a base name can still be recorded
            # as a variant of a less frequent name -- confirm this is wanted.
            if variant in names and variant not in name_found:
                print('  %s: %s (%d)' % (tag, variant, names[variant]))
                name_found[variant] = name
    return name_found
Пример #2
0
def _name_variants(words, word_lookup, sort_word):
    """Yield candidate names that are one edit away from ``words``.

    Candidates come lazily in three groups, in this order: a word of the name
    inserted again, one word dropped, one word replaced by another entry
    stored under the same lookup key.
    """
    positions = range(len(words))

    # Insert a copy of each word before every existing position.
    # NOTE(review): a copy is never appended after the last word because the
    # positions stop at len(words) - 1 -- confirm this is intended.
    for word in words:
        for at in positions:
            yield ' '.join(words[:at] + [word] + words[at:])

    # Drop one word at a time.
    for at in positions:
        yield ' '.join(words[:at] + words[at + 1:])

    # Replace one word with a different word filed under the same key.
    for at, word in enumerate(words):
        # The default is looked up only when needed, so names without any
        # words never require the module-level ``loader``.
        make_key = loader.sort_word if sort_word is None else sort_word
        for other, _ in word_lookup[make_key(word)]:
            if other != word:
                yield ' '.join(words[:at] + [other] + words[at + 1:])


def get_name_found(names, word_lookup, sort_word=None):
    """Link variant names to the name they were derived from.

    Names are visited from the highest to the lowest value in ``names``. A
    name already recorded as a variant is skipped; for every other name its
    one-edit variants are probed, and each variant that is a key of ``names``
    and not yet recorded is stored.

    Args:
        names: dict mapping a name string to its count.
        word_lookup: mapping from ``sort_word(word)`` to an iterable of
            2-item entries whose first item is a word.
        sort_word: optional callable turning a word into its ``word_lookup``
            key. Defaults to ``loader.sort_word``, which must then be
            available in this module's globals.

    Returns:
        dict mapping each matched variant name to the name that was being
        processed when the variant was first matched.

    Raises:
        KeyError: if a word's key is missing from ``word_lookup``.
    """
    name_found = {}
    by_frequency = sorted(names.items(), key=lambda item: item[1], reverse=True)
    for name, _ in by_frequency:
        if name in name_found:
            continue
        for variant in _name_variants(name.split(), word_lookup, sort_word):
            # NOTE(review): only ``name_found`` is consulted, so a name that
            # was already processed as a base name can later be recorded as a
            # variant of a less frequent name -- confirm this is wanted.
            if variant in names and variant not in name_found:
                name_found[variant] = name
    return name_found
Пример #3
0
import loader

if __name__ == '__main__':
    # Read one name per line, stripping surrounding whitespace.
    with open('names_short.txt') as infile:
        names = [line.strip() for line in infile]
    ninstances = len(names)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('read %d instances' % ninstances)

    word_lookup = loader.make_word_lookup(names)
    print(word_lookup)

    # Rebuild every name word by word, taking the first item of the first
    # entry stored under the word's lookup key.
    # NOTE(review): presumably the first entry is the preferred spelling;
    # verify against loader.make_word_lookup.
    correct = []
    for name in names:
        fixed_words = [
            word_lookup[loader.sort_word(word)][0][0] for word in name.split()
        ]
        correct.append(' '.join(fixed_words))

    # Write "<original>\t<corrected>" pairs, one per line, in input order.
    with open('corrected.txt', 'w') as outfile:
        for original, fixed in zip(names, correct):
            outfile.write('%s\t%s\n' % (original, fixed))

    # for each name in order of descending frequency
    
    # Hypothesis: there is a correct version of the name somewhere in
    # the data. For each name in the dataset, especially the ones that
    # have only one or two instances, I should be able to remake the
    # name by applying the edits. For instance, one of the words is
    # probably scrambled, or maybe deleted or doubled. Maybe two of
    # the words have been swapped.