def __init__(self, json_folder, lemmatized=True, num_filter=0, no_below=1,
             no_above=1, word_freq_file=None, pos_tag_file=None,
             sense_freq_file=None, word_hypernyms_file=None,
             abstraction_only=False, get_sense=False, sense_filter=None):
    """Load every character file in *json_folder* and build per-mode corpora.

    Each directory entry is parsed as JSON into a ``Character``; the words
    listed under each mode ('agent', 'patient', 'mod', 'poss') are filtered
    and collected into ``self.documents``, from which one ``Dictionary``
    per key is built and stored in ``self.corpora``.

    Args:
        json_folder: Directory whose entries are read as character files.
        lemmatized: Accepted for interface compatibility; not used here.
        num_filter: Minimum ``self.word_freq`` value a word needs to be
            considered at all.
        no_below: Stored on the instance and forwarded to
            ``Dictionary.filter_extremes``.
        no_above: Stored on the instance and forwarded to
            ``Dictionary.filter_extremes``.
        word_freq_file: Forwarded to ``self.initialize_pos_freq``.
        pos_tag_file: Forwarded to ``self.initialize_pos_freq``.
        sense_freq_file: Forwarded to ``self.initialize_pos_freq``.
        word_hypernyms_file: Forwarded to ``self.initialize_pos_freq``.
        abstraction_only: When true, a noun is kept in its mode only if
            ``self.get_hypernyms`` returns a non-empty result that does
            not contain 'physical_entity_sense'.
        get_sense: When true, hypernym features are collected under the
            'a', 'n' and 'v' keys and the sense corpora are built.
        sense_filter: Mapping with keys 'a', 'n', 'v' giving the minimum
            ``self.sense_freq`` value a hypernym needs to be kept. A
            missing key is treated as 0. ``None`` means an empty mapping.

    A file that fails to load or process is reported with ``print`` and
    skipped entirely; none of its words reach the documents.
    """
    # A fresh dict per call: a shared ``{}`` default would be aliased by
    # every instance through ``self.sense_filter``.
    if sense_filter is None:
        sense_filter = {}

    self.json_folder = json_folder
    self.corpora = {
        'agent': None, 'patient': None, 'mod': None, 'poss': None,
        'all': None, 'a': None, 'n': None, 'v': None, 'sense_all': None,
    }
    self.documents = {
        'agent': [], 'patient': [], 'mod': [], 'poss': [],
        'all': [], 'a': [], 'n': [], 'v': [], 'sense_all': [],
    }
    self.sense_filter = sense_filter
    self.get_sense = get_sense
    self.no_above = no_above
    self.no_below = no_below
    self.character_list = {}
    self.modes = ['agent', 'patient', 'mod', 'poss']
    self.pos_tag = {}
    self.word_freq = {}
    # Total sense frequency in all documents.
    self.sense_freq = {'a': {}, 'n': {}, 'v': {}}
    self.word_hypernyms = {'a': {}, 'n': {}, 'v': {}}

    # One list of word lists per key; each loaded character contributes
    # exactly one entry to every list.
    agent = []
    patient = []
    mod = []
    poss = []
    sense_v = []
    sense_n = []
    sense_a = []

    self.initialize_pos_freq(json_folder, word_freq_file, pos_tag_file,
                             sense_freq_file, word_hypernyms_file)
    print(len(self.sense_freq['v']))

    # Character attribute -> JSON key, grouped by the conversion applied.
    float_fields = (
        ('E', 'extroversion'), ('A', 'agreeableness'),
        ('C', 'conscientiousness'), ('S', 'stability'),
        ('O', 'openness'),
        ('zE', 'z_extroversion'), ('zA', 'z_agreeableness'),
        ('zC', 'z_conscientiousness'), ('zS', 'z_stability'),
        ('zO', 'z_openness'),
    )
    int_fields = (
        ('gender', 'gender'), ('salience', 'salience'),
        ('valence', 'valence'),
    )

    for afile in os.listdir(json_folder):
        # Drops the last five characters of the entry name
        # (presumably the ".json" extension).
        name = afile[:-5]
        c = Character(name)
        path = os.path.join(json_folder, afile)
        try:
            with open(path, 'r') as f:
                data = json.load(f)

            for attr, key in float_fields:
                setattr(c, attr, float(data[key]))
            for attr, key in int_fields:
                setattr(c, attr, int(data[key]))

            for m in self.modes:
                for word in data[m]:
                    w = word[1].lower()
                    pos = self.pos_tag[w]
                    if self.word_freq[w] < num_filter:
                        continue

                    # -----------------------------
                    # if we use the sense features
                    # -----------------------------
                    if get_sense:
                        if 'NN' in pos:
                            sense_pos = 'n'
                        elif 'JJ' in pos:
                            sense_pos = 'a'
                        elif 'VB' in pos:
                            sense_pos = 'v'
                        else:
                            sense_pos = None

                        if sense_pos is not None:
                            threshold = sense_filter.get(sense_pos, 0)
                            # Every hypernym meeting the threshold is
                            # kept, not only the most frequent one.
                            for h in self.word_hypernyms[sense_pos][w]:
                                if not (self.sense_freq[sense_pos][h]
                                        >= threshold):
                                    continue
                                if sense_pos == 'n':
                                    # h[:-6] drops the last six characters
                                    # (presumably a "_sense" suffix).
                                    # Nouns whose higher hypernyms include
                                    # 'person_sense' are excluded.
                                    sense_sense = self.get_hypernyms(
                                        h[:-6], 'n', higher=True)
                                    if 'person_sense' in sense_sense:
                                        continue
                                c.persona[sense_pos].append(h)

                    # -----------------------------
                    # if we filter out physical entity
                    # -----------------------------
                    if abstraction_only and 'NN' in pos:
                        hypernyms = self.get_hypernyms(w, 'n', higher=True)
                        if (len(hypernyms) > 0
                                and 'physical_entity_sense' not in hypernyms):
                            c.persona[m].append(w)
                    else:
                        c.persona[m].append(w)

            # Only reached when the whole file was processed, so a failing
            # file never leaves a partial entry behind.
            agent.append(c.persona['agent'])
            patient.append(c.persona['patient'])
            mod.append(c.persona['mod'])
            poss.append(c.persona['poss'])
            sense_a.append(c.persona['a'])
            sense_n.append(c.persona['n'])
            sense_v.append(c.persona['v'])
            self.character_list[name] = c
        except Exception as e:
            # Deliberate best-effort: report the bad file and carry on.
            print(path, e)

    aall = agent + patient + mod + poss
    self.documents['agent'] = agent
    self.documents['patient'] = patient
    self.documents['mod'] = mod
    self.documents['poss'] = poss
    # Previously this stored the builtin ``all`` instead of the combined
    # document list.
    self.documents['all'] = aall
    self.documents['a'] = sense_a
    self.documents['v'] = sense_v
    self.documents['n'] = sense_n
    self.documents['sense_all'] = sense_v + sense_a + sense_n

    if get_sense:
        # NOTE: the 'sense_all' dictionary is built in v, n, a order while
        # documents['sense_all'] is v, a, n; both orders are kept as they
        # were.
        sense_docs = (
            ('a', sense_a),
            ('n', sense_n),
            ('v', sense_v),
            ('sense_all', sense_v + sense_n + sense_a),
        )
        for key, docs in sense_docs:
            self.corpora[key] = Dictionary(docs)
            self.corpora[key].filter_extremes(no_below=no_below,
                                              no_above=no_above)

    for key in ('agent', 'patient', 'mod', 'poss', 'all'):
        self.corpora[key] = Dictionary(self.documents[key])
        self.corpora[key].filter_extremes(no_below=no_below,
                                          no_above=no_above)