Example #1
File: analyse.py  Project: anorth/wwc-nltk
import argparse
import sys

from nltk.parse import FeatureChartParser


def main(argv):
    """
    Loads grammar files from command-line args, then parses lines from standard input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(dest='grammars',
                        nargs='+',
                        help='Grammar file path(s)')
    parser.add_argument('--draw',
                        dest='draw',
                        action='store_true',
                        help='Draw trees')
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='Be verbose')
    args = parser.parse_args(argv)

    grammar = load_grammars(args.grammars)
    parser = FeatureChartParser(grammar,
                                trace=args.verbose,
                                trace_chart_width=80)

    for line in sys.stdin:
        if line[0] == '#':
            continue
        tokens = line.lower().strip().split()
        if len(tokens) == 0:
            continue

        trees = list(parser.parse(tokens))
        print('*** {} ***'.format(tokens))
        if trees:
            for tree in trees:
                print(tree.pformat(margin=80))
                if args.draw:
                    tree.draw()
                # print(TreePrettyPrinter(tree).text())
        else:
            print('Could not parse {}'.format(tokens))
        print('\n')
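The 'load_grammars' helper comes from the project itself and is not shown. A minimal sketch of it, plus the usual entry-point wiring, assuming the grammar files are plain NLTK feature-grammar text that can simply be concatenated (the real helper in anorth/wwc-nltk may differ):

from nltk.grammar import FeatureGrammar

def load_grammars(paths):
    # Assumption: concatenate all grammar files into one FeatureGrammar.
    parts = []
    for p in paths:
        with open(p) as f:
            parts.append(f.read())
    return FeatureGrammar.fromstring('\n'.join(parts))

if __name__ == '__main__':
    main(sys.argv[1:])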
Example #2
text = """\
whom does Homer serve salad
all the morning trains from Edinburgh to London leave before 10
most flights that serve breakfast leave at 9
some flights leave before 8
these people who live in the house are friendly
Lisa claims that Bart always leaves before 8
what airlines fly from Edinburgh to London

Bart laugh
when do Homer drinks milk
Bart laughs the kitchen

does the trains leave
Lisa likes drink milk
Lisa and Bart likes drinking milk
the morning flights from Edinburgh leave milk

many flights that serves breakfast leave after 10

Bart laughs in the kitchen
Bart serves
milk are healthy
"""
# 'uparser' is assumed to be a FeatureChartParser built from a feature
# grammar (Example #3 below shows one way to construct it).
sents = text.splitlines()
for sent in sents:
    parses = uparser.parse(sent.split())
    print(sent + ":")
    for tree in parses:
        print(tree)
    print("")
Example #3
from nltk.grammar import FeatureGrammar
from nltk.parse import FeatureChartParser


def unification_grammar():
    ugrammar = FeatureGrammar.fromstring("""\
    ################### RULES #################
    S -> NP[NUM=?n] VP[NUM=?n]
    S -> PREP_P S
    S -> Wh_P AUX[NUM=?n] NP[NUM=?n] VP
    
    NP[NUM=?n] -> ProperNoun[NUM=?n] 
    NP[NUM=?n] -> N[NUM=?n] | ADJ_P NP[NUM=?n] | DET[NUM=?n] NP[NUM=?n] | N[NUM=?n] PREP_P | ADJ_P
    NP[NUM=?n] -> ProperNoun[NUM=?n] GER_P | GER_P
    NP[NUM=pl] -> NP[NUM=?n] CC NP[NUM=?n]
     
    VP[SUBCAT=?rest, NUM=?n] -> V[NUM=?n, SUBCAT=?rest] | VP[NUM=?n, TENSE=?t, SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
    VP[SUBCAT=?rest, NUM=?n] -> ADV_P V[NUM=?n, SUBCAT=?rest] | V[NUM=?n, SUBCAT=?rest] ADV_P
    VP[SUBCAT=?rest, NUM=?n] -> MOD_P VP[TENSE=?t, SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
    VP[SUBCAT=?rest, NUM=?n] -> VTB[NUM=?n, SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
    VP[SUBCAT=?rest, NUM=?n] -> VTB VP[SUBCAT=?rest]
    
    GER_P -> GER NP
    
    ADJ_P -> ADJ | ADJ ADJ_P
    ADV_P -> ADV | ADV ADV_P

    PREP_P -> PREP NP | PREP S
    MOD_P -> MOD AUX[NUM=pl] |  MOD ADV AUX[NUM=pl]
    Wh_P -> Wh | Wh ARG[CAT=?arg] 
    
    ARG[CAT=np] -> NP
    ARG[CAT=pp] -> PREP_P
    ARG[CAT=s] -> S
    
    ################# Lexicons #################
    ################## NOUN ###################
    ###########################################
    ProperNoun[NUM=sg] -> 'Homer' | 'Bart' | 'Lisa'
    N[NUM=sg] -> 'milk' | 'salad' | 'midnight' | 'kitchen' | 'table' 
    N[NUM=pl] -> 'shoes' | 'tables'
    
    ################# VERB ####################
    ###########################################
    
    ############### PRESENT ###################
    #########----- Intransitive -----##########
    V[TENSE=pres, NUM=sg, SUBCAT=nil]-> 'laughs' | 'smiles' | 'walks' | 'serves' | 'drinks'
    V[TENSE=pres, NUM=pl, SUBCAT=nil] -> 'laugh' | 'smile' | 'walk' | 'serve' |'drink'
    
    #########----- Transitive ------###########
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=s,TAIL=nil]] -> 'thinks' | 'believes'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=s,TAIL=nil]] -> 'think' | 'believe'
    
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=np,TAIL=nil]] ->'serves' | 'drinks' | 'wears' | 'likes' 
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=np,TAIL=nil]] ->'serve' | 'drink' | 'wear' | 'like'
    
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=pp,TAIL=nil]] ->'walks' | 'teaches' 
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=pp,TAIL=nil]] ->'walk' | 'teach' 
    
    ######### primary & secondary ########
    V[TENSE=pres, NUM=sg, SUBCAT=[HEAD=np, TAIL=[HEAD=np,TAIL=nil]]] -> 'serves'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=np, TAIL=[HEAD=np,TAIL=nil]]] -> 'serve'
    V[TENSE=pres, NUM=pl, SUBCAT=[HEAD=s, TAIL=[HEAD=np,TAIL=nil]]] -> 'think' | 'believe'
    
    ################# Past ####################
    #########----- Intransitive -----##########
    V[TENSE=past, SUBCAT=nil] -> 'laughed' | 'smiled' | 'walked'
    
    #########----- Transitive ------###########
    V[TENSE=past, SUBCAT=[HEAD=np,TAIL=nil]] -> 'drank' | 'wore' | 'served'
    V[TENSE=pastpart, SUBCAT=[HEAD=np,TAIL=nil]] ->'drunk' | 'worn' | 'served' | 'seen'
    
    ############### PRESENT CONT. #############
    V[TENSE=prescon, FORM=prespart , SUBCAT=[HEAD=np,TAIL=nil]] -> 'drinking' | 'wearing' 
    V[TENSE=prescon, FORM=prespart , SUBCAT=[HEAD=pp,TAIL=nil]] -> 'drinking'
    
    ################ Determiner ###############
    DET[NUM=sg] -> 'a' | 'the' | 'that'
    DET[NUM=pl] -> 'the' | 'these' | 'those'
    
    ################ Conjunction ##############
    CC -> 'and'
    
    ################## Modal ##################
    MOD -> 'may'
    
    ################# Gerund #################
    GER -> 'drinking'
    
    ############ Adverb & Adjective ############
    ADJ -> 'blue' | 'healthy' | 'green' | 'same'
    ADV -> 'always' | 'never' | 'not' | 'yesterday'
    
    ############## Preposition ##################
    PREP -> 'in' | 'before' | 'when' | 'on'  
    
    AUX[NUM=sg] -> 'does' | 'has'
    AUX[NUM=pl] -> 'do' | 'have'
    VTB[NUM=sg] -> 'is'
    VTB[NUM=pl] -> 'are'
    
    Wh -> 'when' | 'what' | 'where' | 'whom'
    """)
    uparser = FeatureChartParser(ugrammar)
    # 'text_extended' is assumed to hold the test sentences, one per line,
    # in the same format as 'text' in Example #2.
    sents = text_extended.splitlines()
    for sent in sents:
        parses = uparser.parse(sent.split())
        print(sent)
        for tree in parses:
            print(tree)
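The SUBCAT feature in this grammar encodes a verb's remaining arguments as a cons-style list: the rule VP[SUBCAT=?rest] -> VP[SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg] pops one argument per application, so a ditransitive like 'serves' must combine with two arguments before its SUBCAT reaches nil. A stripped-down sketch of just that mechanism (a toy grammar, not the one above):

toy = FeatureGrammar.fromstring("""
S -> NP VP[SUBCAT=nil]
VP[SUBCAT=?rest] -> VP[SUBCAT=[HEAD=?arg, TAIL=?rest]] ARG[CAT=?arg]
VP[SUBCAT=?sc] -> V[SUBCAT=?sc]
ARG[CAT=np] -> NP
NP -> 'Homer' | 'Bart' | 'salad'
V[SUBCAT=[HEAD=np, TAIL=[HEAD=np, TAIL=nil]]] -> 'serves'
""")
# 'serves' demands two NP arguments; each VP expansion consumes one.
for tree in FeatureChartParser(toy).parse('Homer serves Bart salad'.split()):
    print(tree)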
Example #4
import logging
import time
from collections import OrderedDict, defaultdict

from nltk.grammar import FeatureGrammar
from nltk.parse import FeatureChartParser

# Project-level names used below (path, get_parser_config, generate_grammar,
# MAX_PARSE_TIME, ParserTimeout, TYPE_FEATURE, START_SYMBOLS, Span,
# _EntityNode) are assumed to be imported from the surrounding package.

logger = logging.getLogger(__name__)


class Parser:
    """
    A language parser which is used to extract relations between entities in a
    given query and group related entities together.

    The parser uses a context free grammar based on a configuration to generate
    candidate entity groupings. Heuristics are then used to rank and select a
    grouping.

    This rule based parser will be helpful in many situations, but if you have
    a sufficiently sophisticated entity hierarchy, you may benefit from using a
    statistical approach.

    Attributes:
        config (dict): The parser config.
    """
    def __init__(
        self,
        resource_loader=None,
        config=None,
        allow_relaxed=True,
        domain=None,
        intent=None,
    ):
        """Initializes the parser

        Args:
            resource_loader (ResourceLoader): An object which can load
                resources for the parser.
            config (dict, optional): The configuration for the parser. If none
                is provided the app config will be loaded.
        """
        if not resource_loader and not config:
            raise ValueError(
                "Parser requires either a configuration or a resource loader")
        app_path = resource_loader.app_path if resource_loader else None
        try:
            entity_types = path.get_entity_types(app_path) + ["unk"]
        except TypeError:
            entity_types = {"unk"}
        self._resource_loader = resource_loader
        self.config = get_parser_config(app_path, config, domain, intent) or {}
        configured_entities = set()
        for entity_type, entity_config in self.config.items():
            configured_entities.add(entity_type)
            configured_entities.update(entity_config.keys())

        self._configured_entities = configured_entities
        rules = generate_grammar(self.config, entity_types)
        self._grammar = FeatureGrammar.fromstring(rules)
        self._parser = FeatureChartParser(self._grammar)
        if allow_relaxed:
            relaxed_rules = generate_grammar(self.config,
                                             entity_types,
                                             relaxed=True)
            self._relaxed_grammar = FeatureGrammar.fromstring(relaxed_rules)
            self._relaxed_parser = FeatureChartParser(self._relaxed_grammar)
        else:
            self._relaxed_grammar = None
            self._relaxed_parser = None

    def parse_entities(
        self,
        query,
        entities,
        all_candidates=False,
        handle_timeout=True,
        timeout=MAX_PARSE_TIME,
    ):
        """Determines groupings of entities for the given query.

        Args:
            query (Query): The query being parsed.
            entities (list[QueryEntity]): The entities to find groupings for.
            all_candidates (bool, optional): Whether to return all the entity candidates.
            handle_timeout (bool, optional): False if an exception should be raised
                when parsing times out. Defaults to True.
            timeout (float, optional): The amount of time to wait for the parsing to complete.
                By default this is set to MAX_PARSE_TIME. If None is passed, parsing will
                never time out.

        Returns:
            (tuple[QueryEntity]): An updated version of the entities collection passed in with \
                their parent and children attributes set appropriately.
        """
        if not self._configured_entities:
            return entities

        if not handle_timeout:
            return self._parse(query,
                               entities,
                               all_candidates=all_candidates,
                               timeout=timeout)

        try:
            return self._parse(query,
                               entities,
                               all_candidates=all_candidates,
                               timeout=timeout)
        except ParserTimeout:
            logger.warning("Parser timed out parsing query %r", query.text)
            return entities

    def _parse(self, query, entities, all_candidates, timeout):
        entity_type_count = defaultdict(int)
        entity_dict = {}
        tokens = []  # tokens to be parsed

        # generate sentential form (assumes entities are sorted)
        for entity in entities:
            entity_type = entity.entity.type
            role_type = entity.entity.role
            if role_type:
                # Append role type to entity type with '--' separator
                entity_with_role_type = entity_type + "--" + role_type
                if entity_with_role_type in self._configured_entities:
                    entity_type = entity_with_role_type
            if entity_type not in self._configured_entities:
                entity_type = "unk"
            entity_id = "{}{}".format(entity_type,
                                      entity_type_count[entity_type])
            entity_type_count[entity_type] += 1
            entity_dict[entity_id] = entity
            tokens.append(entity_id)
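        # e.g. three entities typed [dish, option, dish] produce the
        # sentential form ['dish0', 'option0', 'dish1'] (types illustrative)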

        logger.debug("Parsing sentential form: %r", " ".join(tokens))
        start_time = time.time()
        parses = []
        for parse in self._parser.parse(tokens):
            parses.append(parse)
            if timeout is not None and (time.time() - start_time) > timeout:
                raise ParserTimeout("Parsing took too long")

        if not parses and self._relaxed_parser:
            for parse in self._relaxed_parser.parse(tokens):
                parses.append(parse)
                if timeout is not None and (time.time() - start_time) > timeout:
                    raise ParserTimeout("Parsing took too long")

        if not parses:
            if all_candidates:
                return []
            return entities

        ranked_parses = self._rank_parses(query, entity_dict, parses, timeout,
                                          start_time)
        if all_candidates:
            return ranked_parses

        # if we still have more than one, choose the first
        entities = self._get_flat_entities(ranked_parses[0], entities,
                                           entity_dict)
        return tuple(sorted(entities, key=lambda e: e.span.start))

    def _rank_parses(self,
                     query,
                     entity_dict,
                     parses,
                     timeout,
                     start_time=None):
        start_time = start_time or time.time()
        resolved = OrderedDict()

        for parse in parses:
            if timeout is not None and time.time() - start_time > timeout:
                raise ParserTimeout("Parsing took too long")
            resolved[self._resolve_parse(parse)] = None
        filtered = (p for p in resolved.keys())

        # Prefer parses with fewer groups
        parses = list(sorted(filtered, key=len))
        filtered = (p for p in parses if len(p) <= len(parses[0]))

        # Prefer parses with minimal distance from dependents to heads
        parses = list(
            sorted(filtered,
                   key=lambda p: self._parse_distance(p, query, entity_dict)))
        min_parse_dist = self._parse_distance(parses[0], query, entity_dict)
        filtered = (
            p for p in parses
            if self._parse_distance(p, query, entity_dict) <= min_parse_dist)

        # TODO: apply precedence

        return list(filtered)

    def _parse_distance(self, parse, query, entity_dict):
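        # Score a parse by summing, over every head-dependent link, the number
        # of tokens lying between the two entity spans; tokens configured as
        # 'linking_words' for that pair subtract 0.5 instead of adding 1, so
        # parses joined by expected connectives score better (lower).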
        total_link_distance = 0
        stack = list(parse)
        while stack:
            node = stack.pop()
            head = entity_dict[node.id]
            for dep in node.dependents or set():
                if dep.dependents:
                    stack.append(dep)
                    continue
                child = entity_dict[dep.id]
                if child.token_span.start > head.token_span.start:
                    intra_entity_span = Span(head.token_span.end,
                                             child.token_span.start)
                else:
                    intra_entity_span = Span(child.token_span.end,
                                             head.token_span.start)
                link_distance = 0
                for token in intra_entity_span.slice(query.text.split(" ")):
                    if token in self.config[node.type][
                            dep.type]["linking_words"]:
                        link_distance -= 0.5
                    else:
                        link_distance += 1
                total_link_distance += link_distance

        return total_link_distance

    @staticmethod
    def _get_flat_entities(parse, entities, entity_dict):
        stack = [g.to_query_entity(entity_dict) for g in parse]
        new_dict = {}
        while stack:
            entity = stack.pop()
            new_dict[(entity.entity.type, entity.span.start)] = entity

            for child in entity.children or ():
                stack.append(child)

        return [
            new_dict.get((e.entity.type, e.span.start), e) for e in entities
        ]

    @classmethod
    def _resolve_parse(cls, node):
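        # Collapse a parse tree into a frozenset of frozen _EntityNode groups.
        # Freezing makes the result hashable, so duplicate analyses collapse
        # when _rank_parses keys an OrderedDict on it.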
        groups = set()
        for child in node:
            child_symbol = child.label()[TYPE_FEATURE]
            if child_symbol in START_SYMBOLS:
                groups.update(cls._resolve_parse(child))
            else:
                group = cls._resolve_group(child).freeze()
                groups.add(group)
        return frozenset(groups)

    @classmethod
    def _resolve_group(cls, node):
        symbol = node.label()[TYPE_FEATURE]
        if not symbol[0].isupper():
            # this node is a generic entity of type {symbol}, its child is the terminal
            return _EntityNode(symbol, node[0], None)

        # if first char is capitalized, this is a group!
        group_type = symbol.lower()
        dependents = set()
        for child in node:
            child_symbol = child.label()[TYPE_FEATURE]
            if child_symbol == symbol:
                # this is the ancestor of this group
                group = cls._resolve_group(child)
            elif child_symbol == group_type:
                # this is the root ancestor of this group
                group = cls._resolve_group(child)
                group = _EntityNode(group.type, group.id, set())
            else:
                dependents.add(cls._resolve_group(child).freeze())

        group.dependents.update(dependents)
        return group
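A hedged usage sketch: the config shape is inferred from _parse_distance above, which reads self.config[head_type][dep_type]["linking_words"]; the entity type names and the query objects here are illustrative, not from the source.

config = {
    "dish": {
        "option": {"linking_words": {"with"}},  # e.g. "fries with ketchup"
        "size": {"linking_words": set()},
    }
}
parser = Parser(config=config)
# 'query' and 'query_entities' are assumed to come from the surrounding
# framework's entity recognition step.
grouped = parser.parse_entities(query, query_entities)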