def run(self):
    """Parse the input file and yield one op per edge to be written.

    The input file, parser and sequence name come from ``self.system``.
    For each paragraph, every parse with a truthy ``resolved_corefs`` edge
    yields an op carrying the parse text and its position in the sequence,
    followed by one op per entry in ``extra_edges``.  The paragraph's
    ``inferred_edges`` are then printed and yielded with ``count=True``.
    ``self.edges`` is incremented once per main edge yielded.
    """
    infile = self.system.get_infile(self)
    parser = self.system.get_parser(self)
    sequence = self.system.get_sequence(self)

    position = 0
    for paragraph in paragraphs(infile):
        results = parser.parse(paragraph)

        for parse in results['parses']:
            main_edge = parse['resolved_corefs']
            if not main_edge:
                # without a main edge the extra edges are skipped as well
                continue

            # the source text travels with the main edge as an attribute
            attributes = {'text': parse['text']}
            yield create_op(main_edge,
                            sequence=sequence,
                            position=position,
                            attributes=attributes)
            self.edges += 1
            position += 1

            for extra_edge in parse['extra_edges']:
                yield create_op(extra_edge)

        for inferred_edge in results['inferred_edges']:
            # NOTE(review): this writes to stdout on every inferred edge;
            # looks like leftover debug output -- confirm it is wanted.
            print('inferred edge: {}'.format(inferred_edge.to_str()))
            yield create_op(inferred_edge, count=True)
def process_edge(self, edge, depth):
    """Yield conflict ops for an edge that expresses a conflict.

    An edge qualifies when all of the following hold:

    * it is not an atom and its connector type starts with ``'Pd'``;
    * it has more than two elements and the root of its predicate's deep
      lemma is in ``CONFLICT_PRED_LEMMAS``;
    * it has exactly one ``'s'`` argument and one ``'o'`` argument, both of
      which survive ``strip_concept`` and pass ``has_proper_concept``;
    * the main coreferences of both arguments pass ``is_actor``.

    For a qualifying edge, a ``conflict/P/.`` op is yielded, followed by
    the ops produced by ``self._topics``; ``self.conflicts`` is then
    incremented.  ``depth`` is not used.
    """
    hg = self.system.get_hg(self)

    if edge.is_atom():
        return
    if edge.connector_type()[:2] != 'Pd':
        return

    predicate = edge[0]
    if len(edge) <= 2:
        return
    if deep_lemma(hg, predicate).root() not in CONFLICT_PRED_LEMMAS:
        return

    subjects = edge.edges_with_argrole('s')
    objects = edge.edges_with_argrole('o')
    if len(subjects) != 1 or len(objects) != 1:
        return

    subject = strip_concept(subjects[0])
    obj = strip_concept(objects[0])
    if not (subject and obj):
        return
    if not (has_proper_concept(subject) and has_proper_concept(obj)):
        return

    actor_orig = main_coref(hg, subject)
    actor_targ = main_coref(hg, obj)
    conflict_edge = hedge(('conflict/P/.', actor_orig, actor_targ, edge))
    if not (is_actor(hg, actor_orig) and is_actor(hg, actor_targ)):
        return

    yield create_op(conflict_edge)
    yield from self._topics(hg, actor_orig, actor_targ, edge)
    self.conflicts += 1
def _parse_title(self, text, author):
    """Parse a title and yield the ops needed to store it.

    ``text`` is split with ``title_parts`` and each part is parsed.  All
    ops produced by ``self.system.parse_results2ops`` for a part are
    yielded first.  The main edge of every parse (``resolved_corefs`` when
    that key is present, ``main_edge`` otherwise) is collected, and if at
    least one was found a ``title/P/.reddit`` edge linking ``author`` to
    the collected edges is yielded and ``self.titles_added`` incremented.
    ``self.titles_parsed`` is incremented once the whole title is done.
    """
    parser = self.system.get_parser(self)

    title_edge = ['title/P/.reddit', author]
    for part in title_parts(text):
        results = parser.parse(part)
        yield from self.system.parse_results2ops(results)

        for parse in results['parses']:
            key = 'resolved_corefs' if 'resolved_corefs' in parse \
                else 'main_edge'
            main_edge = parse[key]
            if main_edge:
                title_edge.append(main_edge)

    # only emit the title edge if some main edge was appended to the
    # initial [predicate, author] pair
    if len(title_edge) > 2:
        yield create_op(title_edge)
        self.titles_added += 1

    self.titles_parsed += 1
def process_edge(self, edge, depth):
    """Yield one ``claim-actor/P/.`` op per actor found in the claim.

    ``edge`` is unpacked into exactly four elements: the first one is
    ignored, the others are the main actor, the claim and the main edge.
    ``depth`` is not used.
    """
    # NOTE(review): other agents in this file obtain the hypergraph with
    # self.system.get_hg(self); confirm self.hg.get_hg(self) is intended.
    hg = self.hg.get_hg(self)

    _, main_actor, claim, main_edge = edge
    for actor in find_actors(hg, claim):
        link = ('claim-actor/P/.', main_actor, actor, claim, main_edge)
        yield create_op(link)
def _topics(self, hg, actor_orig, actor_targ, edge):
    """Yield ``conflict-topic/P/.`` ops for the topics of a conflict edge.

    Every element of ``edge`` after the first is inspected.  An element
    is used when its type starts with ``'s'`` and the string form of its
    first item is in ``CONFLICT_TOPIC_TRIGGERS``.  For each concept that
    ``all_concepts`` extracts from its second item and whose degree in
    ``hg`` is greater than 1, one op is yielded and
    ``self.conflict_topics`` is incremented.
    """
    for item in edge[1:]:
        if item.type()[0] != 's':
            continue
        if item[0].to_str() not in CONFLICT_TOPIC_TRIGGERS:
            continue

        for concept in all_concepts(item[1]):
            if hg.degree(concept) <= 1:
                continue
            topic_edge = ('conflict-topic/P/.',
                          actor_orig, actor_targ, concept, edge)
            yield create_op(topic_edge)
            self.conflict_topics += 1
def _parse_row(self, row):
    """Parse the text field of ``row`` and yield the resulting ops.

    The text is read from ``row[self.text]`` and split with
    ``text_parts``.  For every parse with a truthy ``main_edge``, an op
    with the parse text attached is yielded, followed by one op per entry
    in ``extra_edges``.
    """
    parser = self.system.get_parser(self)

    for part in text_parts(row[self.text]):
        for parse in parser.parse(part)['parses']:
            main_edge = parse['main_edge']
            if not main_edge:
                # without a main edge the extra edges are skipped as well
                continue

            # the source text travels with the main edge as an attribute
            yield create_op(main_edge, attributes={'text': parse['text']})

            for extra_edge in parse['extra_edges']:
                yield create_op(extra_edge)
def _parse_title(self, text, author):
    """Parse a title and yield ops for its edges, title edge and tags.

    ``text`` is split with ``title_parts`` and each part is parsed.  For
    every parse with a truthy ``resolved_corefs`` edge, an op with the
    parse text attached is yielded, followed by one op per extra edge.
    Main edges whose type starts with ``'R'`` become part of the title
    edge; all other main edges are kept as tags.  Inferred edges of each
    part are yielded with ``count=True``.

    If the title edge received at least one main edge it is yielded and
    ``self.titles_added`` is incremented; in that case a
    ``tags/P/.reddit`` edge is also yielded when there are tags.
    ``self.titles_parsed`` is incremented once the whole title is done.
    """
    parser = self.system.get_parser(self)

    title_edge = ['title/P/.reddit', author]
    tags = []
    for part in title_parts(text):
        results = parser.parse(part)

        for parse in results['parses']:
            main_edge = parse['resolved_corefs']
            if not main_edge:
                continue

            # the source text travels with the main edge as an attribute
            yield create_op(main_edge, attributes={'text': parse['text']})

            for extra_edge in parse['extra_edges']:
                yield create_op(extra_edge)

            target = title_edge if main_edge.type()[0] == 'R' else tags
            target.append(main_edge)

        for inferred_edge in results['inferred_edges']:
            yield create_op(inferred_edge, count=True)

    # only emit the title edge if some main edge was appended to the
    # initial [predicate, author] pair
    if len(title_edge) > 2:
        yield create_op(title_edge)
        self.titles_added += 1

        # tags are only written when there is a title edge to hang them on
        if tags:
            yield create_op(['tags/P/.reddit', title_edge] + tags)

    self.titles_parsed += 1
def _update_main_coref_ops(hg, edge):
    """Yield the ops that make the main-coref edge of ``edge`` current.

    The member of ``coref_set(hg, edge)`` with the highest degree in
    ``hg`` is selected (the first one encountered wins ties; ``None`` if
    the set is empty).  If the corresponding main-coref edge does not
    exist yet, a ``remove`` op is yielded for every existing edge
    matching ``(main_coref_pred <coref id> *)``, followed by an op adding
    the new edge with ``primary=False``.
    """
    cref_id = coref_id(hg, edge)

    # NOTE(review): assumes hg.degree never returns a negative number.
    best_coref = max(coref_set(hg, edge), key=hg.degree, default=None)

    coref_edge = hedge((main_coref_pred, cref_id, best_coref))
    if hg.exists(coref_edge):
        return

    # materialise the search results before yielding anything
    pattern = '({} {} *)'.format(main_coref_pred, cref_id)
    stale_edges = set(hg.search(pattern))
    for stale_edge in stale_edges:
        yield create_op(stale_edge, optype='remove')

    yield create_op(coref_edge, primary=False)
def parse_text(self, infile, parser, sequence):
    """Parse ``infile`` paragraph by paragraph and yield hypergraph ops.

    All paragraphs are read up front with ``read_paragraphs`` so that
    their number is known.  For every parse with a truthy
    ``resolved_corefs`` edge, an op carrying the parse text and its
    position in ``sequence`` is yielded, followed by one op per extra
    edge.  Inferred edges of each paragraph are yielded with
    ``count=True``.  When ``self.progress_bar`` is truthy a progress bar
    is started, updated after each paragraph and finished at the end.
    """
    all_paragraphs = list(read_paragraphs(infile))

    pbar = None
    if self.progress_bar:
        pbar = progressbar.ProgressBar(
            max_value=len(all_paragraphs)).start()

    position = 0
    for index, paragraph in enumerate(all_paragraphs):
        results = parser.parse(paragraph)

        for parse in results['parses']:
            main_edge = parse['resolved_corefs']
            if not main_edge:
                # without a main edge the extra edges are skipped as well
                continue

            # the source text travels with the main edge as an attribute
            attributes = {'text': parse['text']}
            yield create_op(main_edge,
                            sequence=sequence,
                            position=position,
                            attributes=attributes)
            position += 1

            for extra_edge in parse['extra_edges']:
                yield create_op(extra_edge)

        for inferred_edge in results['inferred_edges']:
            yield create_op(inferred_edge, count=True)

        if self.progress_bar:
            pbar.update(index)

    if self.progress_bar:
        pbar.finish()
def process_edge(self, edge, depth):
    """Yield a type-of op linking a concept edge to its parent concept.

    Only non-atomic edges whose type starts with ``'C'`` are considered.
    The parent depends on the first letter of the connector type of the
    edge's first element:

    * ``'B'``: the single main concept, if there is exactly one;
    * ``'M'``: the second element, if the edge has exactly two elements.

    When a parent is found, an op for
    ``(const.type_of_pred, edge, parent)`` is yielded with
    ``primary=False``.  ``depth`` is not used.
    """
    if edge.is_atom():
        return
    if edge.type()[0] != 'C':
        return

    connector_kind = edge[0].connector_type()[0]

    parent = None
    if connector_kind == 'B':
        main_concepts = edge.main_concepts()
        if len(main_concepts) == 1:
            parent = main_concepts[0]
    elif connector_kind == 'M' and len(edge) == 2:
        parent = edge[1]

    if parent:
        yield create_op((const.type_of_pred, edge, parent), primary=False)
def run(self):
    """Parse a Wikipedia article and yield one op per edge to be written.

    The URL, parser and sequence name come from ``self.system``.  The
    article text is fetched with ``read_wikipedia`` and processed line by
    line; lines that are empty after stripping are skipped.  For every
    parse with a truthy ``resolved_corefs`` edge, an op carrying the
    parse text and its position in the sequence is yielded, followed by
    one op per extra edge.  Inferred edges of each line are yielded with
    ``count=True``.  ``self.edges`` is incremented once per main edge.
    """
    url = self.system.get_url(self)
    parser = self.system.get_parser(self)
    sequence = self.system.get_sequence(self)

    title, lang = url2title_and_lang(url)
    article = read_wikipedia(title, lang)

    position = 0
    for line in article.split('\n'):
        paragraph = line.strip()
        if not paragraph:
            continue

        results = parser.parse(paragraph)

        for parse in results['parses']:
            main_edge = parse['resolved_corefs']
            if not main_edge:
                # without a main edge the extra edges are skipped as well
                continue

            # the source text travels with the main edge as an attribute
            attributes = {'text': parse['text']}
            yield create_op(main_edge,
                            sequence=sequence,
                            position=position,
                            attributes=attributes)
            self.edges += 1
            position += 1

            for extra_edge in parse['extra_edges']:
                yield create_op(extra_edge)

        for inferred_edge in results['inferred_edges']:
            yield create_op(inferred_edge, count=True)
def make_corefs_ops(hg, edge1, edge2):
    """Yield the ops that make ``edge1`` and ``edge2`` coreferences.

    The coreference id to keep is chosen as follows:

    * neither edge has an id: a new one from ``_new_coref_id``;
    * only one edge has an id: that id;
    * both have ids: the id of the edge with the larger coref set, with
      ``edge1`` winning ties.

    Ops changing the id are yielded for each edge whose current id
    differs from the chosen one, then an op adding
    ``(coref_pred, edge1, edge2)`` with ``primary=False``.  If any id was
    changed, the ops from ``_update_main_coref_ops(hg, edge1)`` follow.
    """
    cref_id_1 = coref_id(hg, edge1)
    cref_id_2 = coref_id(hg, edge2)

    if cref_id_1 is None and cref_id_2 is None:
        new_cref_id = _new_coref_id()
    elif cref_id_1 is None:
        new_cref_id = cref_id_2
    elif cref_id_2 is None:
        new_cref_id = cref_id_1
    else:
        size_1 = len(coref_set(hg, edge1))
        size_2 = len(coref_set(hg, edge2))
        new_cref_id = cref_id_2 if size_2 > size_1 else cref_id_1

    ids_changed = False
    for member, current_id in ((edge1, cref_id_1), (edge2, cref_id_2)):
        if current_id != new_cref_id:
            yield from _change_coref_id_ops(hg, member, new_cref_id)
            ids_changed = True

    yield create_op((coref_pred, edge1, edge2), primary=False)

    if ids_changed:
        yield from _update_main_coref_ops(hg, edge1)
def parse_results2ops(self, parse_results, sequence=None, pos=-1):
    """Convert parser output into a stream of hypergraph ops.

    Which edge of each parse is treated as the main edge depends on
    ``self.corefs``:

    * ``'replace'``: the ``resolved_corefs`` edge;
    * ``'resolve'``: the ``main_edge``; in addition the
      ``resolved_corefs`` edge and a ``const.coref_res_pred`` edge
      linking the two are yielded;
    * anything else: the ``main_edge`` only.

    For every parse with a truthy main edge, an op with the parse text
    attached is yielded (with ``sequence`` and the current position when
    ``sequence`` is truthy), then the ``'resolve'`` extras if applicable,
    then one op per extra edge.  Finally every inferred edge is yielded
    with ``count=True``.

    :param parse_results: dict with ``'parses'`` and ``'inferred_edges'``.
    :param sequence: optional sequence to attach main edges to.
    :param pos: position given to the first main edge; incremented after
        each main edge.
    """
    for parse in parse_results['parses']:
        if self.corefs == 'replace':
            main_edge = parse['resolved_corefs']
            resolved_edge = None
        else:
            main_edge = parse['main_edge']
            resolved_edge = (parse['resolved_corefs']
                             if self.corefs == 'resolve' else None)

        if not main_edge:
            # without a main edge nothing is emitted for this parse
            continue

        # the source text travels with the edge as an attribute
        attr = {'text': parse['text']}

        op_kwargs = {'attributes': attr}
        if sequence:
            op_kwargs.update(sequence=sequence, position=pos)
        yield create_op(main_edge, **op_kwargs)
        pos += 1

        if self.corefs == 'resolve':
            yield create_op(resolved_edge, attributes=attr)
            yield create_op(
                hedge((const.coref_res_pred, main_edge, resolved_edge)))

        for extra_edge in parse['extra_edges']:
            yield create_op(extra_edge)

    for inferred_edge in parse_results['inferred_edges']:
        yield create_op(inferred_edge, count=True)
def _set_coref_id_op(hg, edge, coref_id):
    """Return a ``set_attributes`` op storing ``coref_id`` on ``edge``.

    The id is stored under the ``coref_set_id_key`` attribute.  ``hg`` is
    not used.
    """
    return create_op(edge,
                     optype='set_attributes',
                     attributes={coref_set_id_key: coref_id})
def on_end(self):
    """Yield an ``actor/P/.`` op for every actor with a positive count.

    Counts are read from ``self.actor_counter``.
    """
    for actor, count in self.actor_counter.items():
        if count > 0:
            yield create_op(('actor/P/.', actor))
def make_singular_plural_ops(hg, single, plural):
    """Yield the op relating a singular form to its plural.

    A single op for ``(singular_plural_pred, single, plural)`` is yielded
    with ``primary=False``.  ``hg`` is not used.
    """
    relation = (singular_plural_pred, single, plural)
    yield create_op(relation, primary=False)
def process_edge(self, edge, depth):
    """Yield a single op for ``edge`` unchanged; ``depth`` is not used."""
    op = create_op(edge)
    yield op
def process_edge(self, edge, depth):
    """Yield one op per edge from ``conjunctions_decomposition(edge)``.

    ``depth`` is not used.
    """
    yield from (create_op(part)
                for part in conjunctions_decomposition(edge))
def process_edge(self, edge, depth):
    """Yield one op per edge from ``conjunctions_resolution(edge)``.

    ``depth`` is not used.
    """
    yield from (create_op(resolved)
                for resolved in conjunctions_resolution(edge))
def on_end(self):
    """Assign a gender to every actor, then yield all collected claims.

    First pass: ``self._gender`` is called for each actor in
    ``self.actors``.  Actors are sorted into ``self.female``,
    ``self.group``, ``self.male`` and ``self.non_human`` according to the
    result, and a ``<gender>/P/.`` op is yielded for every truthy gender.

    Second pass: for each entry in ``self.claims``, the value returned by
    ``_subject_preposition`` for the claim is compared with ``'she'``,
    ``'they'``, ``'he'`` and ``'it'``.  If the claim's actor belongs to
    the matching set, the claim's subject is replaced by the actor via
    ``replace_subject`` and ``self.anaphoras`` is incremented.  A
    ``claim/P/.`` op is then yielded for every claim.

    Each pass reports its progress through a progress bar.
    """
    self.female = set()
    self.group = set()
    self.male = set()
    self.non_human = set()

    # gender label -> set collecting the actors with that gender
    sets_by_gender = {
        'female': self.female,
        'group': self.group,
        'male': self.male,
        'non-human': self.non_human,
    }
    # subject word -> set an actor must belong to for it to be resolved
    sets_by_pronoun = {
        'she': self.female,
        'they': self.group,
        'he': self.male,
        'it': self.non_human,
    }

    self.logger.debug('assigning genders')
    with progressbar.ProgressBar(max_value=len(self.actors)) as bar:
        for done, actor in enumerate(self.actors, start=1):
            gender = self._gender(actor)

            members = sets_by_gender.get(gender)
            if members is not None:
                members.add(actor)

            # any truthy gender is written, even one without a set above
            if gender:
                yield create_op(('{}/P/.'.format(gender), actor))

            bar.update(done)

    self.logger.debug('writing claims')
    with progressbar.ProgressBar(max_value=len(self.claims)) as bar:
        for done, claim_data in enumerate(self.claims, start=1):
            actor = claim_data['actor']
            claim = claim_data['claim']
            edge = claim_data['edge']

            # anaphora resolution
            pronoun = _subject_preposition(claim)
            if pronoun:
                candidates = sets_by_pronoun.get(pronoun)
                if candidates is not None and actor in candidates:
                    self.logger.debug('ANAPHORA')
                    self.logger.debug('actor: {}'.format(actor))
                    self.logger.debug('before: {}'.format(claim))
                    claim = replace_subject(claim, actor)
                    self.logger.debug('after: {}'.format(claim))
                    self.anaphoras += 1

            yield create_op(('claim/P/.', actor, claim, edge))

            bar.update(done)