Example #1
def get_basic_info(parsed_text):
    """Collect basic information for parsed texts, such as all tokens and lemmas."""

    # Collected info
    all_lemmas = []
    all_tokens = []

    # For each sentence
    for sentence in parsed_text:

        # Transform tokens into Token class instances
        tokens = [easy_parse.Token(k) for k in sentence]

        # Get lemmas and add them, together with the tokens, to the running lists
        lemmas = [t.lemma for t in tokens]

        all_lemmas += lemmas
        all_tokens += tokens

    full_text = get_text(parsed_text)

    return all_tokens, all_lemmas, full_text
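
A minimal usage sketch, assuming parsed_text is a list of sentences whose rows follow the [position, word, lemma, sim_pos, full_pos, head, dep] layout suggested by Example #6 (the exact row format expected by easy_parse.Token is project-specific):

# Hypothetical parse rows for a one-sentence text
parsed_text = [
    [[0, 'The', 'the', 'DET', 'DT', 1, 'det'],
     [1, 'cat', 'cat', 'N', 'NN', 2, 'subj'],
     [2, 'sleeps', 'sleep', 'V', 'VVFIN', 2, 'root']],
]

tokens, lemmas, text = get_basic_info(parsed_text)
print(lemmas)  # expected: ['the', 'cat', 'sleep']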
Example #2
def get_tokens_and_sent(sent):
    """Return token and sent as class objects."""

    # Offset of the first token and counter for skipped whitespace tokens
    word_pos_counter = sent[0].i
    sentence = []
    space_chars = 0

    # For each word in the sentence
    for word in sent:

        # If the token is only whitespace, skip it
        if word.pos_ == 'SPACE':
            space_chars += 1
            continue

        # Collect parsing info
        parse_info = [
            word.i - word_pos_counter - space_chars,  # adjust position for skipped whitespace
            word.text,
            word.lemma_,
            word.pos_,
            word.tag_,
            word.head.i - word_pos_counter - space_chars,  # adjust position for skipped whitespace
            word.dep_]

        sentence.append(parse_info)

    # Transform into Token and Sentence objects
    tokens = [easy_parse.Token(k) for k in sentence]
    sent = easy_parse.Sentence(tokens)

    return tokens, sent
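
A usage sketch, assuming sent is a spaCy Span: the attributes i, pos_, lemma_, tag_, dep_, and head.i used above match spaCy's token API. The model name below is an assumption:

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed model name
doc = nlp('The cat sleeps soundly.')

for spacy_sent in doc.sents:
    tokens, sentence = get_tokens_and_sent(spacy_sent)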
Example #3
def rewrite_coref_tags(sentences):
    """Adjust the coreference tags given by CorZu. 
    E.g. remove brackets, resolved nested/multile coreference tags."""

    parsed_text = get_sentence_token_information(sentences)
    new_sentences = []

    for sentence in parsed_text:

        # transform tokens into Token class instances
        tokens = [parse_info.Token(k) for k in sentence]

        # adjust the coreference information for easier processing
        parse_info.adjust_coref(tokens)

        sentence_string = parse_info.sentenceToString(tokens)
        new_sentences.append(sentence_string)

    return new_sentences
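
A usage sketch; the reader below is hypothetical and stands in for whatever produces the CorZu-annotated sentences that get_sentence_token_information expects:

corzu_sentences = read_corzu_output('document.coref')  # hypothetical helper
cleaned = rewrite_coref_tags(corzu_sentences)

for line in cleaned:
    print(line)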
Example #4
def get_text(parsed_text):
    """Return complete sentence as string without parsing information."""

    full_text = ''

    # For each sentence, get all tokens and join them correctly
    for sent in parsed_text:
        tokens = [easy_parse.Token(t) for t in sent]

        for token in tokens:
            full_text += token.word

            # Get the next token
            next_token = easy_parse.get_next_token(tokens, token)

            # Insert a space unless the next token is a punctuation mark
            # (simplified POS starts with '$', except before '(') or the
            # current token is an opening parenthesis
            if (token.word != '('
                    and not (token.position != len(sent)
                             and next_token.sim_pos.startswith('$')
                             and next_token.word != '(')):
                full_text += ' '

    return full_text
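
A usage sketch with the same hypothetical row format as in Example #1; the comma carries a '$'-prefixed simplified POS tag, so no space is inserted before it:

parsed_text = [
    [[1, 'Hello', 'Hello', 'ITJ', 'ITJ', 0, 'root'],
     [2, ',', ',', '$,', '$,', 1, 'punct'],
     [3, 'world', 'world', 'N', 'NN', 1, 'app']],
]

print(get_text(parsed_text))  # punctuation should attach without a leading space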
Example #5
def get_entities_with_roles(parsed_text):
    """Get entities with syntactic role S,O,X,P or G.
    If required, reduce weights for embedded entities."""

    # Initializing
    entities_with_roles = []
    passive_counter = 0

    for sentence in parsed_text:

        # transform tokens into Token class instances
        tokens = [parse_info.Token(k) for k in sentence]

        # resolve and count passive constructions if necessary
        if settings.passive_on:
            passives = parse_info.adjust_passive(tokens)
            if passives:
                passive_counter += 1

        # transform tokens into Sentence class instance
        sent = parse_info.Sentence(tokens)

        # get all subjects and objects of Sentence
        # get subject and object lemma if subj/obj is a noun or is marked as coreferent entity

        subjs = sent.subj()
        subjs_lemma = [
            t for t in subjs if (t.sim_pos == 'N' or t.coref != '_')
        ]

        objs = sent.obj()
        objs_lemma = [t for t in objs if (t.sim_pos == 'N' or t.coref != '_')]

        # get all words from full subj and obj noun phrases (for excluding words later in the 'other' category)
        full_subjs = [
            t for t in list(
                chain.from_iterable([
                    parse_info.get_full_phrase(tokens, subj) for subj in subjs
                ])) if (t.sim_pos == 'N' or t.coref != '_')
        ]
        full_objs = [
            t for t in list(
                chain.from_iterable(
                    [parse_info.get_full_phrase(tokens, obj) for obj in objs]))
            if (t.sim_pos == 'N' or t.coref != '_')
        ]

        # get all possessive pronouns (category 'P')
        poss_pronouns = [
            t for t in tokens if (t.coref != '_' and (t.full_pos == 'PPOSAT'))
        ]

        # get all genitive modifiers (category 'G')
        genitive_mods = [
            t for t in tokens
            if ((t.coref != '_' or t.sim_pos == 'N') and t.function == 'gmod')
        ]

        # get all nouns that are not among the subject or object entities
        others = [
            t for t in tokens
            if ((t.sim_pos == 'N') and t not in subjs_lemma + objs_lemma)
        ]

        # get prepositions
        preps = [t for t in tokens if t.function == 'pp']

        # if genitive cat is on, remove genitives from 'others'
        if settings.cat_g_on:
            others = [t for t in others if t.function != 'gmod']

            # assign cat G to genitive modifiers; or merge with category P into X
            for g in genitive_mods:
                if not settings.merge_p_and_g:
                    g.tag = 'G'

                # if category P and G are merged into one (X)
                else:

                    if g in full_subjs:
                        subjs_lemma.append(g)
                    elif g in full_objs:
                        objs_lemma.append(g)
                    else:
                        others.append(g)
                        g.tag = 'X'

        # Assign tag X to "other" category tokens
        for x in others:
            x.tag = 'X'

        # if possessive category is on,
        if settings.cat_p_on:

            # assign cat P to possessive pronouns, or merge with category G
            for p in poss_pronouns:
                if not settings.merge_p_and_g:
                    p.tag = 'P'

                # if category P and G are merged into one (X)
                else:
                    if p in full_subjs:
                        subjs_lemma.append(p)
                    elif p in full_objs:
                        objs_lemma.append(p)
                    else:
                        others.append(p)
                        p.tag = 'X'

        # Assign tag O to objects
        for o in objs_lemma:
            o.tag = 'O'

        # Assign tag S to subjects
        for s in subjs_lemma:
            s.tag = 'S'

        # get prepositional phrases
        prep_phrase = [
            p_ent
            for (p_ent, prep, ent) in itertools.product(tokens, preps, tokens)
            if p_ent.function == 'pn' and p_ent in subjs_lemma + objs_lemma +
            others + poss_pronouns + genitive_mods + full_subjs + full_objs
            and p_ent.dependency == prep.position
            and prep.dependency == ent.position and ent.function == 'pn'
        ]

        # get relative pronouns
        rel_prons = [t for t in tokens if t.full_pos == 'PRELS']

        # get relative clauses
        rel_clauses = [
            (k, j) for (k, j) in itertools.product(rel_prons, tokens)
            if j.function in ['rel', 'cj', 'objc']
            and j.full_pos.endswith('FIN') and j.position > k.position
        ]

        # mark relative clause tokens
        for (rel_pron, rel_pred) in rel_clauses:
            for token in tokens:
                if token.position >= rel_pron.position and token.position <= rel_pred.position:
                    token.rel = True

        # get conjunction candidates
        conjunctions = [
            t for t in tokens if t.full_pos == 'KOUS' and t.function == 'konj'
        ]

        # get conjunctions and predicates
        conj_pred = [
            (k, j) for (k, j) in itertools.product(conjunctions, tokens)
            if j.full_pos.startswith('V') and j.full_pos.endswith('FIN')
            and j.function in ['root', 'neb'] and j.position == k.dependency
        ]

        # Mark all tokens within the subordinate clause
        for k, j in conj_pred:

            for t in tokens:
                if t.position >= k.position and t.position <= j.position:
                    t.subj = True

        # get present and past participles
        part_pres = [
            t for t in tokens if t.full_pos == 'ADJD'
            and t.morph.part == '<PPRES' and t.function in ['root', 'pn']
        ]
        part_praet = [
            t for t in tokens if t.full_pos == 'VVPP' and t.function == 'neb'
        ]

        # for each participle
        for part in part_pres + part_praet:

            # get full participle construction
            part_con = parse_info.get_dependent_tokens(tokens, part) + [part]
            part_con = parse_info.get_all_tokens(part_con, tokens)

            # set initial comma positions
            first_comma_position = None
            sec_comma_position = None

            # find comma positions
            for comma in [t for t in part_con if t.lemma == ',']:
                if comma.position < part.position:
                    first_comma_position = comma.position
                if comma.position > part.position:
                    sec_comma_position = comma.position

            # cut participle construction at commas (only in-between)
            part_con = [
                k for k in part_con
                if (first_comma_position is None
                    or first_comma_position < k.position) and
                (sec_comma_position is None or sec_comma_position > k.position)
            ]

            # mark token in participle construction
            for token in part_con:
                token.part = True

        # Reduce weights for tokens in prepositional phrases, relative and
        # subordinate clauses, and participle constructions
        if settings.reduce_weights:

            for p in prep_phrase:
                if p.tag != '':
                    p.reduce_tag()

            for t in tokens:
                if t.rel and t.tag != '':
                    t.reduce_tag()
                if t.part and t.tag != '':
                    t.reduce_tag()
                if t.subj and t.tag != '':
                    t.reduce_tag()

        # list of all entities
        all_entities = subjs_lemma + objs_lemma + others

        if not settings.merge_p_and_g:
            # append cat p and g entities
            if settings.cat_p_on:
                all_entities = all_entities + poss_pronouns
            if settings.cat_g_on:
                all_entities = all_entities + genitive_mods

        entities_with_roles.append(all_entities)

    return entities_with_roles
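
A usage sketch, assuming parsed_text is built as in Example #3 and that the settings module defines the flags used above (passive_on, cat_g_on, cat_p_on, merge_p_and_g, reduce_weights):

parsed_text = get_sentence_token_information(sentences)  # as in Example #3
entities = get_entities_with_roles(parsed_text)

for sentence_entities in entities:
    for entity in sentence_entities:
        print(entity.lemma, entity.tag)  # tag is 'S', 'O', 'X', 'P' or 'G' (possibly reduced)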
Example #6
import parse_information as easy_parse

# ----------------------------------------
### SETTINGS
# ----------------------------------------

include_wordnet = True

# ----------------------------------------
### DICTIONARIES
# ----------------------------------------

# Constant Token Dictionary
token_dict = {
    'NOT_': easy_parse.Token([-1, 'not', 'not', 'ADV', 'RB', 1, 'neg']),
    'NO_': easy_parse.Token([-1, 'No', 'no', 'DET', 'DT', 1, 'det']),
    'DO_': easy_parse.Token([-1, 'do', 'do', 'VERB', 'VBX', 1, 'aux']),
    'A_': easy_parse.Token([-1, 'a', 'a', 'DET', 'DT', 1, 'det']),
    'THE_': easy_parse.Token([-1, 'The', 'the', 'DET', 'DT', 1, 'det']),
    'THERE_': easy_parse.Token([-1, 'There', 'there', 'DET', 'DT', 1, 'det']),
    'BE_': easy_parse.Token([-1, 'be', 'be', 'VERB', 'AUX', 1, 'verb']),
    'THAT_': easy_parse.Token([-1, 'that', 'that', 'ADJ', 'WDT', 1, 'nsubj']),
    'WHO_':