-
Notifications
You must be signed in to change notification settings - Fork 1
/
Formatter2.py
81 lines (68 loc) · 2.33 KB
/
Formatter2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re
from bs4 import BeautifulSoup as bs
from Utilities import Utils
class Formatter:
    """Turn a movie script's HTML into a list of speaker/text records.

    On construction the script HTML is fetched with ``Utils.script(name)``
    (project helper; presumably returns the HTML markup of the script -
    confirm against ``Utilities``), parsed with BeautifulSoup, and reduced to
    ``script_map``: a list of ``{'person': ..., 'text': ...}`` dicts, one per
    spoken line, in document order.
    """

    # Fields (class-level defaults; every instance overwrites them in __init__):
    movie_name = None
    soup = None
    script_map = None

    # Static:
    # A speaker prefix: any run of letters, digits, whitespace and dots at the
    # start of the line, terminated by a colon.
    name_format = r"^([A-Z]|[a-z]|\s|\.|[0-9])*:"

    def __init__(self, name):
        """Load, parse and index the script identified by *name*."""
        self.movie_name = name
        self.soup = bs(Utils.script(name), features='html.parser')
        self.script_map = Formatter.get_script_map(self.soup)

    @staticmethod
    def get_person(line):
        """Return the speaker prefix of *line* without its trailing colon.

        Raises:
            AttributeError: if *line* does not start with a speaker prefix
                (``re.match`` returns ``None`` in that case).
        """
        return re.match(Formatter.name_format, line).group(0)[:-1]

    @staticmethod
    def get_text(line):
        """Return the spoken part of *line*, left-stripped and lower-cased.

        If *line* has no speaker prefix the whole line is treated as text.
        """
        prefix = re.match(Formatter.name_format, line)
        spoken = line[prefix.end():] if prefix else line
        return spoken.lstrip().lower()

    @staticmethod
    def get_script_list(lst):
        """Convert prefixed lines into ``{'person', 'text'}`` dicts.

        Every element of *lst* must start with a speaker prefix; trailing
        spaces are removed from the speaker name.
        """
        return [
            {
                'person': Formatter.get_person(line).rstrip(' '),
                'text': Formatter.get_text(line),
            }
            for line in lst
        ]

    @staticmethod
    def get_script_map(soup):
        """Build the speaker/text records from the parsed document.

        Reads every ``<td class="line-content">`` cell in order. A cell that
        starts with a speaker prefix opens a new record; any other cell is a
        continuation and is appended to the record before it. Parenthesised
        spans are removed from the text.
        """
        merged = []
        for td in soup.find_all('td', {'class': 'line-content'}):
            line = td.string
            if line is None:
                # bs4 yields None for `.string` when the cell is empty or
                # holds more than one node; fall back to its combined text
                # instead of handing None to re.match.
                line = td.get_text()
            if not re.match(Formatter.name_format, line):
                if not merged:
                    # Text before the first speaker has no record to extend.
                    continue
                line = merged.pop() + line
            # Non-greedy, so each parenthetical is removed on its own and the
            # dialogue between two of them on one line is kept.
            line = re.sub(r"\(.*?\)", "", line)
            # The trailing space keeps a following continuation from fusing
            # with the last word of this line.
            merged.append(line + ' ')
        return Formatter.get_script_list(merged)

    def get_char_tokens(self, person):
        """Return all tokens spoken by *person* (case-insensitive match).

        Tokens come from ``Utils.get_token`` (project helper; presumably a
        tokenizer returning an iterable of tokens - confirm).
        """
        wanted = person.upper()
        tokens = []
        for entry in self.script_map:
            if entry['person'].upper() == wanted:
                tokens += Utils.get_token(entry['text'])
        return tokens

    def get_characters(self):
        """Return the distinct speaker names in order of first appearance."""
        # dict.fromkeys drops duplicates while preserving insertion order.
        return list(dict.fromkeys(entry['person'] for entry in self.script_map))

    def get_tokens(self):
        """Return the tokens of every line of the script, in order."""
        tokens = []
        for entry in self.script_map:
            tokens += Utils.get_token(entry['text'])
        return tokens

    def get_chars_tuples(self):
        """Return ``(speaker, token_count)`` pairs, highest count first.

        Speakers with equal counts appear in reverse order of their first
        appearance, because a stable ascending sort is reversed afterwards.
        """
        ranked = sorted(self.get_words_num().items(), key=lambda pair: pair[1])
        ranked.reverse()
        return ranked

    def get_words_num(self):
        """Return a dict mapping each speaker to their total token count."""
        counts = {}
        for entry in self.script_map:
            speaker = entry['person']
            spoken = len(Utils.get_token(entry['text']))
            counts[speaker] = counts.get(speaker, 0) + spoken
        return counts