import itertools
from itertools import chain


def get_basic_info(parsed_text):
    """Collect basic information for parsed texts, such as all tokens and lemmas."""
    # Info
    n_words = 0
    all_words = []
    all_lemmas = []
    all_tokens = []
    # For each sentence
    for sentence in parsed_text:
        # Transform tokens into Token class instances
        tokens = [easy_parse.Token(k) for k in sentence]
        # Get words and lemmas, add them to all words and lemmas
        words = [t.word for t in tokens]
        lemmas = [t.lemma for t in tokens]
        all_words += words
        all_lemmas += lemmas
        # Get number of words
        n_words += len(words)
        all_tokens += tokens
    full_text = get_text(parsed_text)
    return all_tokens, all_lemmas, full_text
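# Minimal usage sketch for get_basic_info(). Each sentence is a list of
# per-token parse rows in the format
# [position, word, lemma, coarse POS, fine POS, head position, dependency label],
# as inferred from get_tokens_and_sent() below; the example rows use
# ParZu/STTS-style tags and 1-based positions, and all values are invented
# for illustration:
#
#   parsed_text = [[[1, 'Das', 'die', 'ART', 'ART', 2, 'det'],
#                   [2, 'Haus', 'Haus', 'N', 'NN', 0, 'root'],
#                   [3, '.', '.', '$.', '$.', 0, 'root']]]
#   all_tokens, all_lemmas, full_text = get_basic_info(parsed_text)
#   # all_lemmas == ['die', 'Haus', '.']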
def get_tokens_and_sent(sent):
    """Return tokens and the sentence as class objects."""
    # Set counters
    word_pos_counter = sent[0].i
    sentence = []
    space_chars = 0
    # For each word in the sentence
    for word in sent:
        # If the word is simply a space, skip it
        if word.pos_ == 'SPACE':
            space_chars += 1
            continue
        # Collect parsing info
        parse_info = [
            word.i - word_pos_counter - space_chars,  # adapt position if there are whitespace characters
            word.text,
            word.lemma_,
            word.pos_,
            word.tag_,
            word.head.i - word_pos_counter - space_chars,  # adapt position if there are whitespace characters
            word.dep_]
        sentence.append(parse_info)
    # Transform into Token and Sentence objects
    tokens = [easy_parse.Token(k) for k in sentence]
    sent = easy_parse.Sentence(tokens)
    return tokens, sent
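# Usage sketch for get_tokens_and_sent(): `sent` is expected to be a spaCy
# Span (the function reads spaCy's .i, .pos_, .lemma_, .tag_, .head and
# .dep_ attributes). The model name is a hypothetical choice for illustration:
#
#   import spacy
#   nlp = spacy.load('en_core_web_sm')  # hypothetical model choice
#   doc = nlp('The house is old.')
#   for spacy_sent in doc.sents:
#       tokens, sent = get_tokens_and_sent(spacy_sent)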
def rewrite_coref_tags(sentences):
    """Adjust the coreference tags given by CorZu,
    e.g. remove brackets, resolve nested/multiple coreference tags."""
    parsed_text = get_sentence_token_information(sentences)
    new_sentences = []
    for sentence in parsed_text:
        # Transform tokens into Token class instances
        tokens = [parse_info.Token(k) for k in sentence]
        # Adjust the coreference information for easier processing
        parse_info.adjust_coref(tokens)
        sentence_string = parse_info.sentenceToString(tokens)
        new_sentences.append(sentence_string)
    return new_sentences
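# Sketch of the intended effect of parse_info.adjust_coref(); the bracketed
# input format is an assumption based on CoNLL-style coreference columns
# such as those emitted by CorZu:
#
#   coref column before:  '(12'   '12)'   '(12)|(3'
#   coref column after:    '12'    '12'    '12'
#   (brackets removed; nested/multiple tags resolved to a single id)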
def get_text(parsed_text):
    """Return the complete text as a string without parsing information."""
    full_text = ''
    # For each sentence, get all tokens and join them correctly
    for sent in parsed_text:
        tokens = [easy_parse.Token(t) for t in sent]
        for token in tokens:
            full_text += token.word
            # Get the next token
            next_token = easy_parse.get_next_token(tokens, token)
            # Insert a white space unless the next token is a punctuation mark
            if ((not (token.position != len(sent)
                      and next_token.sim_pos.startswith('$')
                      and next_token.word != '('))
                    and token.word != '('):
                full_text += ' '
    return full_text
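# Illustrative behaviour of get_text(): STTS punctuation tags start with '$'
# (e.g. '$,' and '$.'), so punctuation is attached without a preceding space.
# The rows below are invented for illustration and assume 1-based,
# CoNLL-style token positions:
#
#   parsed_text = [[[1, 'Das', 'die', 'ART', 'ART', 2, 'det'],
#                   [2, 'Haus', 'Haus', 'N', 'NN', 0, 'root'],
#                   [3, '.', '.', '$.', '$.', 0, 'root']]]
#   get_text(parsed_text)  # -> roughly 'Das Haus. ' (no space before the full stop)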
def get_entities_with_roles(parsed_text):
    """Get entities with syntactic role S, O, X, P or G.
    If required, reduce weights for embedded entities."""
    # Initializing
    entities_with_roles = []
    passive_counter = 0
    for sentence in parsed_text:
        # Transform tokens into Token class instances
        tokens = [parse_info.Token(k) for k in sentence]
        # Resolve and count passive constructions if necessary
        if settings.passive_on:
            passives = parse_info.adjust_passive(tokens)
            if passives:
                passive_counter += 1
        # Transform tokens into a Sentence class instance
        sent = parse_info.Sentence(tokens)
        # Get all subjects and objects of the sentence;
        # keep the subject/object lemma if it is a noun or is marked as a coreferent entity
        subjs = sent.subj()
        subjs_lemma = [t for t in subjs if (t.sim_pos == 'N' or t.coref != '_')]
        objs = sent.obj()
        objs_lemma = [t for t in objs if (t.sim_pos == 'N' or t.coref != '_')]
        # Get all words from the full subject and object noun phrases
        # (for excluding words later in the 'other' category)
        full_subjs = [
            t for t in list(
                chain.from_iterable(
                    [parse_info.get_full_phrase(tokens, subj) for subj in subjs]))
            if (t.sim_pos == 'N' or t.coref != '_')
        ]
        full_objs = [
            t for t in list(
                chain.from_iterable(
                    [parse_info.get_full_phrase(tokens, obj) for obj in objs]))
            if (t.sim_pos == 'N' or t.coref != '_')
        ]
        # Get all possessive pronouns (category 'P')
        poss_pronouns = [
            t for t in tokens if (t.coref != '_' and t.full_pos == 'PPOSAT')
        ]
        # Get all genitive modifiers (category 'G')
        genitive_mods = [
            t for t in tokens
            if ((t.coref != '_' or t.sim_pos == 'N') and t.function == 'gmod')
        ]
        # Get all nouns that are not subjects or objects
        # (genitive modifiers are removed below if the genitive category is on)
        others = [
            t for t in tokens
            if (t.sim_pos == 'N' and t not in subjs_lemma + objs_lemma)
        ]
        # Get prepositions
        preps = [t for t in tokens if t.function == 'pp']
        # If the genitive category is on, remove genitives from 'others'
        if settings.cat_g_on:
            others = [t for t in others if t.function != 'gmod']
            # Assign category G to genitive modifiers, or merge with category P into X
            for g in genitive_mods:
                if not settings.merge_p_and_g:
                    g.tag = 'G'
                # If categories P and G are merged into one (X)
                else:
                    if g in full_subjs:
                        subjs_lemma.append(g)
                    elif g in full_objs:
                        objs_lemma.append(g)
                    else:
                        others.append(g)
                    g.tag = 'X'
        # Assign tag X to "other" category tokens
        for x in others:
            x.tag = 'X'
        # If the possessive category is on,
        if settings.cat_p_on:
            # assign category P to possessive pronouns, or merge with category G into X
            for p in poss_pronouns:
                if not settings.merge_p_and_g:
                    p.tag = 'P'
                # If categories P and G are merged into one (X)
                else:
                    if p in full_subjs:
                        subjs_lemma.append(p)
                    elif p in full_objs:
                        objs_lemma.append(p)
                    else:
                        others.append(p)
                    p.tag = 'X'
        # Assign tag O to objects
        for o in objs_lemma:
            o.tag = 'O'
        # Assign tag S to subjects
        for s in subjs_lemma:
            s.tag = 'S'
        # Get prepositional phrases
        prep_phrase = [
            p_ent for (p_ent, prep, ent) in itertools.product(tokens, preps, tokens)
            if p_ent.function == 'pn'
            and p_ent in subjs_lemma + objs_lemma + others + poss_pronouns
            + genitive_mods + full_subjs + full_objs
            and p_ent.dependency == prep.position
            and prep.dependency == ent.position
            and ent.function == 'pn'
        ]
        # Get relative pronouns
        rel_prons = [t for t in tokens if t.full_pos == 'PRELS']
        # Get relative clauses
        rel_clauses = [
            (k, j) for (k, j) in itertools.product(rel_prons, tokens)
            if j.function in ['rel', 'cj', 'objc']
            and j.full_pos.endswith('FIN')
            and j.position > k.position
        ]
        # Mark tokens within the relative clause
        for (rel_pron, rel_pred) in rel_clauses:
            for token in tokens:
                if rel_pron.position <= token.position <= rel_pred.position:
                    token.rel = True
        # Get conjunction candidates
        conjunctions = [
            t for t in tokens if t.full_pos == 'KOUS' and t.function == 'konj'
        ]
        # Get conjunctions and predicates
        conj_pred = [
            (k, j) for (k, j) in itertools.product(conjunctions, tokens)
            if j.full_pos.startswith('V') and j.full_pos.endswith('FIN')
            and j.function in ['root', 'neb']
            and j.position == k.dependency
        ]
        # Mark all tokens within the subjunctional clause
        for k, j in conj_pred:
            for t in tokens:
                if k.position <= t.position <= j.position:
                    t.subj = True
        # Get present and past participles
        part_pres = [
            t for t in tokens
            if t.full_pos == 'ADJD' and t.morph.part == '<PPRES'
            and t.function in ['root', 'pn']
        ]
        part_praet = [
            t for t in tokens if t.full_pos == 'VVPP' and t.function == 'neb'
        ]
        # For each participle
        for part in part_pres + part_praet:
            # Get the full participle construction
            part_con = parse_info.get_dependent_tokens(tokens, part) + [part]
            part_con = parse_info.get_all_tokens(part_con, tokens)
            # Set initial comma positions
            first_comma_position = None
            sec_comma_position = None
            # Find comma positions
            for comma in [t for t in part_con if t.lemma == ',']:
                if comma.position < part.position:
                    first_comma_position = comma.position
                if comma.position > part.position:
                    sec_comma_position = comma.position
            # Cut the participle construction at the commas (keep only in-between tokens)
            part_con = [
                k for k in part_con
                if (first_comma_position is None or first_comma_position < k.position)
                and (sec_comma_position is None or sec_comma_position > k.position)
            ]
            # Mark tokens in the participle construction
            for token in part_con:
                token.part = True
        # Reduce weights for tokens in prepositional phrases, relative and
        # subjunctive clauses, and participle constructions
        if settings.reduce_weights:
            for p in prep_phrase:
                if p.tag != '':
                    p.reduce_tag()
            for t in tokens:
                if t.rel and t.tag != '':
                    t.reduce_tag()
                if t.part and t.tag != '':
                    t.reduce_tag()
                if t.subj and t.tag != '':
                    t.reduce_tag()
        # List of all entities
        all_entities = subjs_lemma + objs_lemma + others
        if not settings.merge_p_and_g:
            # Append category P and G entities
            if settings.cat_p_on:
                all_entities = all_entities + poss_pronouns
            if settings.cat_g_on:
                all_entities = all_entities + genitive_mods
        entities_with_roles.append(all_entities)
    return entities_with_roles
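# Illustrative summary of the role tags assigned above (invented example,
# German labels as used by the code):
#
#   'Der Hund des Nachbarn sieht seine Katze.'
#   -> Hund: 'S' (subject), Katze: 'O' (object), Nachbarn: 'G' (genitive
#      modifier, if settings.cat_g_on), seine: 'P' (possessive pronoun, if
#      settings.cat_p_on and marked as coreferent); remaining nouns fall
#      into 'X'. With settings.merge_p_and_g, the P and G entities are
#      merged into the S, O or X categories instead.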
import parse_information as easy_parse

# ----------------------------------------
### SETTINGS
# ----------------------------------------

include_wordnet = True

# ----------------------------------------
### DICTIONARIES
# ----------------------------------------

# Constant Token Dictionary
token_dict = {
    'NOT_': easy_parse.Token([-1, 'not', 'not', 'ADV', 'RB', 1, 'neg']),
    'NO_': easy_parse.Token([-1, 'No', 'no', 'DET', 'DT', 1, 'det']),
    'DO_': easy_parse.Token([-1, 'do', 'do', 'VERB', 'VBX', 1, 'aux']),
    'A_': easy_parse.Token([-1, 'a', 'a', 'DET', 'DT', 1, 'det']),
    'THE_': easy_parse.Token([-1, 'The', 'the', 'DET', 'DT', 1, 'det']),
    'THERE_': easy_parse.Token([-1, 'There', 'there', 'DET', 'DT', 1, 'det']),
    'BE_': easy_parse.Token([-1, 'be', 'be', 'VERB', 'AUX', 1, 'verb']),
    'THAT_': easy_parse.Token([-1, 'that', 'that', 'ADJ', 'WDT', 1, 'nsubj']),
    'WHO_':