/
main.py
45 lines (36 loc) · 1.38 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
__author__ = 'okhaz'
import re
import sys
from collections import OrderedDict
# Name layouts that get rewritten to "First [Middle] Last".  Raw strings keep
# the \w escapes valid on every Python version.
_LAST_FIRST = re.compile(r'\w+,\w+')
_LAST_FIRST_MIDDLE = re.compile(r'\w+,\w+ \w+')
_NAME_TOKEN = re.compile(r"[\w']+")


def _normalize_name(name):
    """Return *name* as "First [Middle] Last" if it starts with "Last,First"."""
    # Test the longer layout first: anything matching "Last,First Middle"
    # also matches the shorter "Last,First" pattern.
    if _LAST_FIRST_MIDDLE.match(name):
        last, first, middle = _NAME_TOKEN.findall(name)[:3]
        return first + ' ' + middle + ' ' + last
    if _LAST_FIRST.match(name):
        last, first = _NAME_TOKEN.findall(name)[:2]
        return first + ' ' + last
    # Any other layout is passed through untouched.
    return name


def delete_duplicates(input_file, my_file):
    """Collapse records sharing an SSN and write one record per SSN.

    Every line of ``input_file`` is expected to look like ``name:SSN``.
    Lines without a ``:`` are ignored.  When several names share one SSN the
    longest name wins (ties are broken alphabetically); the winner is
    rewritten from "Last,First" / "Last,First Middle" to
    "First Last" / "First Middle Last" when it has that layout.

    The result is written to ``my_file`` as ``name:SSN`` lines, in the order
    each SSN was first seen, separated by newlines with no trailing newline.

    Args:
        input_file: Path of the text file to read.
        my_file: Path of the text file to create or overwrite.

    Returns:
        None. The only effect is the content written to ``my_file``.
    """
    # OrderedDict keeps first-seen SSN order on Python 2 as well as Python 3.
    names_by_ssn = OrderedDict()
    with open(input_file) as source:
        for line in source:
            # Split on the last colon so a stray ':' inside the name cannot
            # break the record; an empty separator means "no colon at all".
            name, colon, ssn = line.rpartition(':')
            if not colon:
                continue
            names_by_ssn.setdefault(ssn.strip('\n'), []).append(name)

    records = []
    for ssn, names in names_by_ssn.items():
        # Longest spelling first, alphabetical order as the tie-breaker.
        preferred = min(names, key=lambda item: (-len(item), item))
        records.append(_normalize_name(preferred) + ':' + ssn)

    # The output is opened only after the input has been read completely, so
    # passing the same path for both arguments no longer wipes the input.
    with open(my_file, 'w') as target:
        target.write('\n'.join(records))