Example No. 1
def select(arguments, premise, verbose=True):
    """Select token or phrase."""
    
    # Get the token to be selected and the selection type
    select_token, select_type = arguments
    select_type = select_type.data

    # Start with the anchor token itself
    select_tokens = [select_token]

    # If the full phrase should be selected, include all dependent tokens
    if select_type == 'full':
        select_tokens += easy_parse.get_dependent_tokens(premise.tokens, select_token)
        
    # Sort selected tokens by ID
    select_tokens_sorted = sorted(select_tokens, key=lambda token: token.id)
             
    # Set the selected tokens as the new premise tokens
    premise.tokens = list(select_tokens_sorted)
        
    # Reparse and update premise
    premise = reparse(premise)
    premise.update(premise)
    
    if verbose:
        print('New premise:', " ".join(premise.words))   

    # Get projectivity
    projectivity = rel_pol.projectivity_dict[select_token.polarity]
    
    # Return new premise and projectivity
    return premise, projectivity  
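
A minimal stand-in sketch of the selection step, assuming only that tokens carry an ID and a surface form; the Token dataclass below is hypothetical and replaces the project's easy_parse token type:

from dataclasses import dataclass

@dataclass
class Token:
    id: int      # position in the sentence
    word: str    # surface form

# Suppose the dependency parse yields this anchor and its dependents
anchor = Token(3, "dog")
dependents = [Token(2, "black"), Token(1, "the")]

# Same logic as select(): anchor plus dependents, re-ordered by token ID
selected = sorted([anchor] + dependents, key=lambda t: t.id)
print(" ".join(t.word for t in selected))  # -> "the black dog"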
Example No. 2
def get_entities_with_roles(parsed_text):
    """Get entities with syntactic role S,O,X,P or G.
    If required, reduce weights for embedded entities."""

    # Initializing
    entities_with_roles = []
    passive_counter = 0

    for sentence in parsed_text:

        # transform tokens into Token class instances
        tokens = [parse_info.Token(k) for k in sentence]

        # resolve and count passive constructions if necessary
        if settings.passive_on:
            passives = parse_info.adjust_passive(tokens)
            if passives is True:
                passive_counter += 1

        # transform tokens into Sentence class instance
        sent = parse_info.Sentence(tokens)

        # get all subjects and objects of the Sentence; keep each subj/obj
        # if it is a noun or is marked as a coreferent entity

        subjs = sent.subj()
        subjs_lemma = [
            t for t in subjs if (t.sim_pos == 'N' or t.coref != '_')
        ]

        objs = sent.obj()
        objs_lemma = [t for t in objs if (t.sim_pos == 'N' or t.coref != '_')]

        # get all words from full subj and obj noun phrases (for excluding words later in the 'other' category)
        full_subjs = [
            t for t in list(
                chain.from_iterable([
                    parse_info.get_full_phrase(tokens, subj) for subj in subjs
                ])) if (t.sim_pos == 'N' or t.coref != '_')
        ]
        full_objs = [
            t for t in list(
                chain.from_iterable(
                    [parse_info.get_full_phrase(tokens, obj) for obj in objs]))
            if (t.sim_pos == 'N' or t.coref != '_')
        ]

        # get all possessive pronouns (category 'P')
        poss_pronouns = [
            t for t in tokens if (t.coref != '_' and (t.full_pos == 'PPOSAT'))
        ]

        # get all genitive modifiers (category 'G')
        genitive_mods = [
            t for t in tokens
            if ((t.coref != '_' or t.sim_pos == 'N') and t.function == 'gmod')
        ]

        # get all nouns that are not subject or object lemmas (genitive
        # modifiers are removed below when category G is on)
        others = [
            t for t in tokens
            if ((t.sim_pos == 'N') and t not in subjs_lemma + objs_lemma)
        ]

        # get prepositions
        preps = [t for t in tokens if t.function == 'pp']

        # if genitive cat is on, remove genitives from 'others'
        if settings.cat_g_on:
            others = [t for t in others if t.function != 'gmod']

            # assign cat G to genitive modifiers; or merge with category P into X
            for g in genitive_mods:
                if not settings.merge_p_and_g:
                    g.tag = 'G'

                # if category P and G are merged into one (X)
                else:

                    if g in full_subjs:
                        subjs_lemma.append(g)
                    elif g in full_objs:
                        objs_lemma.append(g)
                    else:
                        others.append(g)
                        g.tag = 'X'

        # Assign tag X to "other" category tokens
        for x in others:
            x.tag = 'X'

        # if possessive category is on
        if settings.cat_p_on:

            # assign cat P to possessive pronouns, or merge with category G
            for p in poss_pronouns:
                if not settings.merge_p_and_g:
                    p.tag = 'P'

                # if category P and G are merged into one (X)
                else:
                    if p in full_subjs:
                        subjs_lemma.append(p)
                    elif p in full_objs:
                        objs_lemma.append(p)
                    else:
                        others.append(p)
                        p.tag = 'X'

        # Assign tag O to objects
        for o in objs_lemma:
            o.tag = 'O'

        # Assign tag S to subjects
        for s in subjs_lemma:
            s.tag = 'S'

        # get prepositional phrases
        prep_phrase = [
            p_ent
            for (p_ent, prep, ent) in itertools.product(tokens, preps, tokens)
            if p_ent.function == 'pn'
            and p_ent in (subjs_lemma + objs_lemma + others + poss_pronouns
                          + genitive_mods + full_subjs + full_objs)
            and p_ent.dependency == prep.position
            and prep.dependency == ent.position
            and ent.function == 'pn'
        ]

        # get rel pronouns
        rel_prons = [t for t in tokens if t.full_pos == 'PRELS']

        # get rel clauses
        rel_clauses = [
            (k, j) for (k, j) in itertools.product(rel_prons, tokens)
            if j.function in ['rel', 'cj', 'objc']
            and j.full_pos.endswith('FIN') and j.position > k.position
        ]

        # mark relative clause tokens
        for (rel_pron, rel_pred) in rel_clauses:
            for token in tokens:
                if token.position >= rel_pron.position and token.position <= rel_pred.position:
                    token.rel = True

        # get conjunction candidates
        conjunctions = [
            t for t in tokens if t.full_pos == 'KOUS' and t.function == 'konj'
        ]

        # get conjunctions and predicates
        conj_pred = [
            (k, j) for (k, j) in itertools.product(conjunctions, tokens)
            if j.full_pos.startswith('V') and j.full_pos.endswith('FIN')
            and j.function in ['root', 'neb'] and j.position == k.dependency
        ]

        # Mark all tokens within subjunctional clause
        for k, j in conj_pred:

            for t in tokens:
                if t.position >= k.position and t.position <= j.position:
                    t.subj = True

        # get present and past participles
        part_pres = [
            t for t in tokens if t.full_pos == 'ADJD'
            and t.morph.part == '<PPRES' and t.function in ['root', 'pn']
        ]
        part_praet = [
            t for t in tokens if t.full_pos == 'VVPP' and t.function == 'neb'
        ]

        # for each participle
        for part in part_pres + part_praet:

            # get full participle construction
            part_con = parse_info.get_dependent_tokens(tokens, part) + [part]
            part_con = parse_info.get_all_tokens(part_con, tokens)

            # set initial comma positions
            first_comma_position = None
            sec_comma_position = None

            # find comma positions
            for comma in [t for t in part_con if t.lemma == ',']:
                if comma.position < part.position:
                    first_comma_position = comma.position
                if comma.position > part.position:
                    sec_comma_position = comma.position

            # cut the participle construction at the commas, keeping only
            # the tokens in between
            part_con = [
                k for k in part_con
                if (first_comma_position is None
                    or first_comma_position < k.position)
                and (sec_comma_position is None
                     or sec_comma_position > k.position)
            ]

            # mark tokens in the participle construction
            for token in part_con:
                token.part = True

        # Reduce weights for tokens in prepositional phrases, relative and
        # subjunctional clauses, and participle constructions
        if settings.reduce_weights:

            for p in prep_phrase:
                if p.tag != '':
                    p.reduce_tag()

            for t in tokens:
                if t.rel and t.tag != '':
                    t.reduce_tag()
                if t.part and t.tag != '':
                    t.reduce_tag()
                if t.subj and t.tag != '':
                    t.reduce_tag()

        # list of all entities
        all_entities = subjs_lemma + objs_lemma + others

        if not settings.merge_p_and_g:
            # append cat p and g entities
            if settings.cat_p_on:
                all_entities = all_entities + poss_pronouns
            if settings.cat_g_on:
                all_entities = all_entities + genitive_mods

        entities_with_roles.append(all_entities)

    return entities_with_roles
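
A minimal sketch of the role-tagging idea above, using a toy token type; the function labels ('subj', 'obja', 'gmod') are illustrative stand-ins for whatever labels the parse_info parser actually emits:

from dataclasses import dataclass

@dataclass
class Token:
    lemma: str
    sim_pos: str   # simplified POS, 'N' for nouns
    function: str  # syntactic function label (hypothetical tag set)
    tag: str = ''

tokens = [
    Token("Hund", "N", "subj"),
    Token("beißt", "V", "root"),
    Token("Mann", "N", "obja"),
    Token("Nachbar", "N", "gmod"),
]

# Tag nouns by syntactic function: S = subject, O = object, G = genitive
# modifier, X = everything else, mirroring the categories used above
for t in (t for t in tokens if t.sim_pos == "N"):
    t.tag = {"subj": "S", "obja": "O", "gmod": "G"}.get(t.function, "X")

print([(t.lemma, t.tag) for t in tokens if t.tag])
# -> [('Hund', 'S'), ('Mann', 'O'), ('Nachbar', 'G')]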
Example No. 3
    def set_polarity_scope(self):
        "Set polarity scope of quantifiers."
        
        # Set all token polarities to "up" by default
        for token in self.tokens:
            token.polarity = 'up'
            token.specific_projectivity = None
        
        # For each token, get its polarity
        for token in self.tokens:
                                 
            # If the token is a quantifier
            if token.lemma in rel_pol.monotonicity_dict \
                              and token.deprel == 'det':
                           
                # Get first argument (the noun phrase the quantifier modifies)
                arg_1 = next(t for t in self.tokens if token.head == t.id)
                arg_1 = easy_parse.get_dependent_tokens(
                    self.tokens, arg_1) + [arg_1]
                
                # Get second argument (VP)
                arg_2 = next(t for t in self.tokens if t.deprel == 'ROOT')
                arg_2 = [t for t in easy_parse.get_dependent_tokens(
                                        self.tokens, arg_2) if t not in arg_1]
            
                # Set polarity (up/down/non) for tokens in scope 
                for t in arg_1:
                    t.polarity = get_new_polarity(t.polarity, 
                                                  rel_pol.monotonicity_dict[token.lemma][0])

                for t in arg_2:   
                    t.polarity = get_new_polarity(t.polarity, 
                                                  rel_pol.monotonicity_dict[token.lemma][1])
                
                # If there is a specific projectivity for this quantifier, set it
                if token.lemma in rel_pol.quantifier_projection_dict:
                    
                    # First argument of operator
                    for t in arg_1:
                        t.specific_projectivity = \
                        rel_pol.quantifier_projection_dict[token.lemma][0]
                        
                    # Second argument of operator
                    for t in arg_2:
                        t.specific_projectivity = \
                        rel_pol.quantifier_projection_dict[token.lemma][1]
                        
            # If negation
            if token.lemma == 'not':
                
                # Get root
                try:
                    root = next(t for t in self.tokens if t.deprel == 'ROOT'
                                and t.id == token.head)
                # Exception
                except StopIteration:
                    warnings.warn('Root not found')
                    return False
                    
                # Get subj
                try:
                    subj = next(t for t in self.tokens if t.deprel 
                                in ['nsubj','nsubjpass'] and t.head == root.id)
                # Exception
                except StopIteration:
                    warnings.warn('Subject not found')
                    return False
                
                # Get the full subject and the VP
                full_subj = easy_parse.get_dependent_tokens(self.tokens, subj)
                VP = [t for t in easy_parse.get_dependent_tokens(self.tokens, root)
                      if t not in full_subj]
                
                # Set downward polarity to tokens in VP
                for t in VP:
                    t.specific_projectivity = rel_pol.negation_projectivity_dict 
                    t.polarity = 'down'
              
        return True                
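
A minimal sketch of the monotonicity composition this method relies on; the table and compose helper below are illustrative stand-ins for rel_pol.monotonicity_dict and get_new_polarity, filled with standard entries for three English quantifiers:

# Hypothetical monotonicity table: (first argument, second argument)
monotonicity = {
    "every": ("down", "up"),   # downward in its restrictor, upward in its VP
    "some":  ("up", "up"),
    "no":    ("down", "down"),
}

def compose(polarity, direction):
    """Flip a token's polarity when it sits under a downward operator."""
    if direction == "down":
        return {"up": "down", "down": "up"}.get(polarity, "non")
    return polarity

# Tokens in the restrictor of "every" become downward monotone,
# while tokens in its VP keep their upward polarity:
print(compose("up", monotonicity["every"][0]))  # -> "down"
print(compose("up", monotonicity["every"][1]))  # -> "up"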
Example No. 4
def same_phrase(args, premise, verbose=True):
    """Check whether two phrases are the equal, or equal but negated."""
  
    negated = False
    
    # Get other premise
    other_premise = premise.other_premise   
    
    # Get anchor tokens and, if present, the is_negated value
    if len(args) == 2:
        current_prem_anchor_token, other_prem_anchor_token = args
    elif len(args) == 3:
        current_prem_anchor_token, other_prem_anchor_token, is_negated = args

        # if "negated"
        if is_negated.data.strip() == 'neg':
            negated = True
            
    # Get tokens dependent on the anchor token, for the current and the other premise
    current_prem_dep_tokens = easy_parse.get_dependent_tokens(premise.tokens, 
                                                              current_prem_anchor_token)
    other_prem_dep_tokens = easy_parse.get_dependent_tokens(other_premise.tokens, 
                                                            other_prem_anchor_token)

    # If current premise anchor is ROOT, remove all adverbial clause dependent tokens
    if current_prem_anchor_token.deprel == 'ROOT':
        
        # Get adverbial clause modifier
        advcls = [t for t in current_prem_dep_tokens if t.deprel == 'advcl']
        advcl_dep_tokens = []
        
        # Get dependent tokens on advcl
        for adv in advcls:
            advcl_dep_tokens += easy_parse.get_dependent_tokens(premise.tokens, adv)
            advcl_dep_tokens.append(adv)
            
        # Deduplicate the list of dependent tokens
        advcl_dep_tokens = list(set(advcl_dep_tokens))
        
        # Exclude all these tokens for current premise
        current_prem_dep_tokens = [t for t in current_prem_dep_tokens if t not in advcl_dep_tokens] 
        
    # If other premise anchor is ROOT, remove all adverbial clause dependent tokens
    if other_prem_anchor_token.deprel == 'ROOT':
        
        # Get adverbial clause modifier
        advcls = [t for t in other_prem_dep_tokens if t.deprel == 'advcl']
        advcl_dep_tokens = []
        
        # Get dependent tokens on advcl
        for adv in advcls:
            advcl_dep_tokens += easy_parse.get_dependent_tokens(other_premise.tokens, adv)
            advcl_dep_tokens.append(adv)
            
        # Deduplicate the list of dependent tokens
        advcl_dep_tokens = list(set(advcl_dep_tokens))
        
        # Exclude all these tokens for other premise
        other_prem_dep_tokens = [t for t in other_prem_dep_tokens if t not in advcl_dep_tokens] 
    
    # Append anchor token
    current_prem_dep_tokens.append(current_prem_anchor_token)
    other_prem_dep_tokens.append(other_prem_anchor_token)
        
    # Deduplicate, drop markers and punctuation, and keep lemmas only
    # (both lists are sorted alphabetically below, so no ID sort is needed)
    sorted_current_tokens = list({t.lemma for t in current_prem_dep_tokens
                                  if t.deprel not in ['mark', 'punct']})
    sorted_other_tokens = list({t.lemma for t in other_prem_dep_tokens
                                if t.deprel not in ['mark', 'punct']})
    
    # If negated version, add a negation to non-negated sentence (for easy comparison)
    if negated:
        
        # Add "not" or "do not" to set that does not contain negation
        if 'not' not in sorted_current_tokens:
            if current_prem_anchor_token.lemma in ['be','can','must']:
                sorted_current_tokens.append('not')
            else:
                sorted_current_tokens += ['do','not']
                
        if 'not' not in sorted_other_tokens:
            if other_prem_anchor_token.lemma in ['be','can','must']:
                sorted_other_tokens.append('not')
            else:
                sorted_other_tokens += ['do','not']          
            
    # Sort lists
    sorted_current_tokens = sorted(sorted_current_tokens)
    sorted_other_tokens = sorted(sorted_other_tokens)
            
    # Return whether two token lists are identical
    return (sorted_current_tokens == sorted_other_tokens)
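
A minimal sketch of the final comparison: each phrase is reduced to a sorted bag of lemmas, and a negation is added to the non-negated phrase so that "equal but negated" pairs compare as identical (the lemma lists here are made up for illustration):

current = ["the", "dog", "sleep"]
other = ["the", "dog", "do", "not", "sleep"]

# The other phrase is negated, so normalize the current one the same way
current += ["do", "not"]

print(sorted(current) == sorted(other))  # -> True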
Example No. 5
def delete(arguments, premise, verbose=True):
    """Delete token from sentence."""
      
    # Get the token to be deleted and, if available, the delete type
    if len(arguments) == 2:
        del_token, del_type = arguments
        del_type = del_type.data
    else:
        del_token = arguments[0]
        del_type = None
        
    # Resolve the token to be deleted within the premise
    if isinstance(del_token, str):
        for tok in premise.tokens:
            if tok.lemma == del_token:
                del_token = tok
                break
    else:
        for tok in premise.tokens:
            if tok.same_token(del_token):
                del_token = tok

    
    # Printing               
    if verbose:
        print('\nDELETE', del_token.lemma)
        print('Old premise:', " ".join(premise.words))

    # Get all tokens that have to be deleted
    del_tokens = [del_token]
    
    # If full phrase should be deleted                
    if del_type == 'full':    
        del_phrase = easy_parse.get_dependent_tokens(premise.tokens, del_token)
        del_tokens += del_phrase
            
    # Sort tokens to be deleted by ID
    del_tokens_sorted = sorted(del_tokens, key=lambda token: token.id)
                
    # All positions of tokens to be deleted
    del_tokens_positions = []
    
    # Save positions of tokens to be deleted
    for i,k in enumerate(premise.tokens):
        if k in del_tokens_sorted:
            del_tokens_positions.append(i)
                
    # Get positions of the first and last tokens to be deleted
    last_del_token = max(del_tokens_positions)
    first_del_token = min(del_tokens_positions)
    
    # If relative clause, adjust positions (because of commas)
    if 'relcl' in [t.deprel for t in del_tokens_sorted]:    
        first_del_token -= 1
        last_del_token += 1
        
        del_tokens_positions.append(first_del_token)
        del_tokens_positions.append(last_del_token)
        
    # If the token right after the deleted span is punctuation, delete it too
    if (len(premise.tokens) > last_del_token + 1
            and premise.tokens[last_del_token + 1].lemma in [',', ':', ';']):
        del_tokens_positions.append(last_del_token + 1)
        
    # Number of deleted tokens
    n_del_tokens = len(del_tokens_positions)
            
    # Get all tokens that are not supposed to be deleted
    new_premise_tokens = [t for i,t in enumerate(premise.tokens) if i not in del_tokens_positions]
    
    # Adjust token IDs and head indices after the deleted span
    for i, token in enumerate(new_premise_tokens):
        if i >= last_del_token:
            token.id = i - n_del_tokens
        if token.head >= last_del_token:
            token.head = token.head - n_del_tokens
                    
    # Save new premise tokens
    premise.tokens = new_premise_tokens
        
    # Reparse and update premise
    premise = reparse(premise)
    premise.update(premise)
    
    if verbose:
        print('New premise:', " ".join(premise.words))   

    # Get projectivity
    projectivity = rel_pol.projectivity_dict[del_token.polarity]
    
    # Return new premise and projectivity
    return premise, projectivity
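
A minimal sketch of the deletion step: a span of token positions is dropped and the IDs of the surviving tokens are shifted so they are contiguous again, much as delete() does before reparsing; plain (id, word) pairs stand in for the project's token objects:

tokens = list(enumerate(["the", "dog", ",", "which", "barks", ",", "sleeps"]))

# Delete the relative clause plus its surrounding commas (positions 2-5)
del_positions = {2, 3, 4, 5}
kept = [(i, w) for i, w in tokens if i not in del_positions]

# Renumber the surviving tokens so IDs are contiguous again
kept = [(new_id, w) for new_id, (_, w) in enumerate(kept)]
print(" ".join(w for _, w in kept))  # -> "the dog sleeps"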