def analyze(self):
        """Parse ``self.sentence`` and encode its parts as role-tagged nodes.

        The sentence is run through ``model.predict`` and wrapped in a
        ``ParseTree``. Nodes of the tree are then tagged with one-character
        roles: ``n`` toponym, ``d`` date, ``t`` place type, ``o`` object,
        ``a`` activity, ``s`` situation, ``r`` relation, ``q`` place quality,
        ``p`` object quality; intent (wh-) nodes get whatever
        ``intent_encoding`` returns.

        Returns:
            dict: with keys ``source``, ``q_type``, ``pnames``, ``dates``,
            ``intents``, ``i_objects``, ``i_ptypes``, ``objects``, ``ptypes``,
            ``situations``, ``activities``, ``relations``, ``pqualities``,
            ``oqualities``, ``intent_code``, ``intent_info``, ``desc_code``,
            ``desc_info``, ``all_code``, ``all_info`` and, only when the
            intent was judged ambiguous, ``resolved_intent``.
        """

        def tag_phrases(phrases, log_format, role):
            # Locate every phrase in the parse tree, assign `role` to the
            # matching nodes and return them as a set.
            found = set()
            for phrase in phrases:
                logging.info(log_format.format(phrase))
                nodes = parse_tree.find(phrase)
                if nodes is None:
                    logging.info('An error in finding nodes')
                else:
                    for node in nodes:
                        node.role = role
                        found.add(node)
            for node in found:
                logging.info('\t**Found Node: {} and index {}'.format(
                    node.word, node.index))
            return found

        def node_info(node):
            # Fuzzy-matched nodes report the matched word instead of the
            # literal text of the node.
            value = node.fuzzy_word if node.is_fuzzy_matched else node.word
            return {'tag': node.role, 'value': value}

        logging.info('*******************************************************')
        result_dict = {}
        result_dict['source'] = self.source.strip().lower()
        result_dict['q_type'] = self.s_type.strip().lower()
        res = model.predict(sentence=self.sentence)
        root_dict = res['hierplane_tree']['root']
        logging.info('sentence {} parsed as {}'.format(self.sentence,
                                                       root_dict))

        # The embedding rows follow the whitespace tokens of the sentence,
        # so the same token list is reused later to look up verb positions.
        tokens = self.sentence.split()
        emb = elmo(batch_to_ids([tokens
                                 ]))['elmo_representations'][0].detach().numpy()

        parse_tree = ParseTree(root_dict, self.sentence)
        logging.info(
            'Now it\'s time to check the string representation \n{}'.format(
                str(parse_tree.root)))
        logging.info('extracting information')
        all_nodes = set()
        all_intent_nodes = set()
        all_desc_nodes = set()

        # --- toponyms -----------------------------------------------------
        toponyms = NER.extract_place_names(self.sentence)
        result_dict['pnames'] = toponyms
        topo_nodes = tag_phrases(toponyms, '\ttoponym:\t{}', 'n')
        all_nodes.update(topo_nodes)
        all_desc_nodes.update(topo_nodes)

        # --- dates --------------------------------------------------------
        dates = NER.extract_dates(self.sentence)
        result_dict['dates'] = dates
        dates_nodes = tag_phrases(dates, '\tdate:\t{}', 'd')
        all_nodes.update(dates_nodes)
        all_desc_nodes.update(dates_nodes)

        # --- intent (wh-) nodes -------------------------------------------
        whs_nodes = parse_tree.get_intent()
        whs = []
        for wh_node in whs_nodes:
            wh_node.role = intent_encoding(wh_node, PRONOUN)
            whs.append(wh_node.word)
        for w in whs:
            logging.info('intent is: {}'.format(w))
        all_nodes.update(whs_nodes)
        all_intent_nodes.update(whs_nodes)
        result_dict['intents'] = whs

        # --- nouns inside the intent phrase: place types and objects ------
        a_entities_set = set()
        a_entities_nodes = set()
        a_types = []
        a_types_nodes = set()
        for whs_node in whs_nodes:
            wh_nouns = whs_node.iterate_nouns()
            wh_nouns.sort(key=sort_function, reverse=True)
            for n in wh_nouns:
                if (is_inside(n.word, toponyms) or is_inside(n.word, dates)
                        or is_left_inside(n.word, a_types)
                        or not is_a_new_one(a_types_nodes, n)):
                    continue
                normalized = n.word.lower().strip()
                if is_left_inside(normalized, pt_set) or is_left_inside(
                        normalized, pt_dict.keys()):
                    a_types.append(n.word)
                    n.role = 't'
                    a_types_nodes.add(n)
                elif ' ' not in n.word.strip() and len(n.word) > 2:
                    a_entities_set.add(n.word)
                    n.role = 'o'
                    a_entities_nodes.add(n)
        for t in a_types:
            logging.info('\ttype in intent:\t{}'.format(t))
        a_entities = list(a_entities_set)
        for e in a_entities:
            logging.info('\tentity in intent:\t{}'.format(e))
        all_nodes.update(a_types_nodes)
        all_intent_nodes.update(a_types_nodes)
        all_nodes.update(a_entities_nodes)
        all_intent_nodes.update(a_entities_nodes)
        result_dict['i_objects'] = a_entities
        result_dict['i_ptypes'] = a_types

        # --- remaining nouns: place types and objects ---------------------
        nouns = parse_tree.get_nouns()
        nouns.sort(key=sort_function, reverse=True)
        types = []
        types_nodes = set()
        entities_set = set()
        entities_nodes = set()
        for n in nouns:
            if (is_inside(n.word, toponyms) or is_inside(n.word, dates)
                    or is_inside(n.word, whs)
                    or is_left_inside(n.word, types)
                    or not is_a_new_one(types_nodes, n)):
                continue
            normalized = n.word.lower().strip()
            if is_left_inside(normalized, pt_set) or is_left_inside(
                    normalized, pt_dict.keys()):
                types.append(n.word)
                n.role = 't'
                types_nodes.add(n)
            elif ' ' not in n.word.strip() and len(n.word) > 2:
                entities_set.add(n.word)
                n.role = 'o'
                entities_nodes.add(n)
        for t in types:
            logging.info('\ttype:\t{}'.format(t))
        entities = list(entities_set)
        for e in entities:
            logging.info('\tentity:\t{}'.format(e))
        all_nodes.update(types_nodes)
        all_desc_nodes.update(types_nodes)
        all_nodes.update(entities_nodes)
        all_desc_nodes.update(entities_nodes)
        result_dict['objects'] = entities
        result_dict['ptypes'] = types

        # --- verbs: activity / situation / unknown ------------------------
        verbs = parse_tree.get_verbs()
        situations = []
        situations_nodes = set()
        activities = []
        activities_nodes = set()
        unknowns = []
        unknowns_nodes = set()
        for v in verbs:
            try:
                v_index = tokens.index(v.word)
            except ValueError:
                # The parser's verb text is not one of the whitespace
                # tokens, so there is no embedding row to classify it with.
                logging.info(
                    '\tverb {} not found among sentence tokens'.format(
                        v.word))
                unknowns.append(v.word)
                unknowns_nodes.add(v)
                continue
            v_emb = [emb[0][v_index]]
            logging.debug('verb is {} and len of emb is {}'.format(
                v.word, len(v_emb)))
            decision = verb_encoding(v_emb, actv_emb, stav_emb)
            if decision == "a":
                activities.append(v.word)
                v.role = 'a'
                activities_nodes.add(v)
            elif decision == "s":
                situations.append(v.word)
                v.role = 's'
                situations_nodes.add(v)
            else:
                unknowns.append(v.word)
                unknowns_nodes.add(v)
        for s in situations:
            logging.info('\tsituation: {}'.format(s))
        for a in activities:
            logging.info('\tactivities: {}'.format(a))
        for u in unknowns:
            logging.info('\tunknown: {}'.format(u))
        # Unknown verbs are only logged; they are not added to any node set.
        all_nodes.update(activities_nodes)
        all_desc_nodes.update(activities_nodes)
        all_nodes.update(situations_nodes)
        all_desc_nodes.update(situations_nodes)
        result_dict['situations'] = situations
        result_dict['activities'] = activities

        # --- prepositional phrases: spatial relations ---------------------
        pps = parse_tree.get_pps()
        relations = []
        relation_nodes = set()
        for pp in pps:
            for n in toponyms:
                if 'with' in pp.word.lower():
                    is_within = is_within_phrase(pp.word)
                    if is_within is not None:
                        in_pp = pp.get_in_in_pp()
                        if in_pp is not None:
                            relations.append(in_pp.word)
                            in_pp.role = 'r'
                            relation_nodes.add(in_pp)
                if n in pp.word and not is_inside_right(
                        pp.word, entities) and not is_inside_right(
                            pp.word, a_entities):
                    in_pp = pp.get_in_in_pp()
                    if in_pp is not None:
                        relations.append(in_pp.word)
                        in_pp.role = 'r'
                        relation_nodes.add(in_pp)
                        break
            for t in types:
                if t in pp.word:
                    in_pp = pp.get_in_in_pp()
                    if in_pp is not None:
                        relations.append(in_pp.word)
                        in_pp.role = 'r'
                        relation_nodes.add(in_pp)
                        break
        all_nodes.update(relation_nodes)
        all_desc_nodes.update(relation_nodes)
        for relation in relations:
            logging.info('\trelation: {}'.format(relation))
        result_dict['relations'] = relations

        # --- adjectives: place qualities and object qualities -------------
        adjs = parse_tree.get_adjectives()
        qualities = []
        qualities_nodes = set()
        object_qualities = []
        object_qualities_nodes = set()
        for adj in adjs:
            siblings = adj.get_siblings()
            for sibling in siblings:
                if is_inside(sibling.word, toponyms) or is_inside(
                        sibling.word, types) or is_inside(
                            sibling.word, a_types):
                    if not is_inside(adj.word, types) and not is_inside(
                            adj.word, a_types):
                        qualities.append(adj.word)
                        adj.role = 'q'
                        qualities_nodes.add(adj)
                        break
                elif is_inside(sibling.word, entities) or is_inside(
                        sibling.word, a_entities):
                    object_qualities.append(adj.word)
                    adj.role = 'p'
                    object_qualities_nodes.add(adj)
                    break
        all_nodes.update(qualities_nodes)
        all_desc_nodes.update(qualities_nodes)
        all_nodes.update(object_qualities_nodes)
        all_desc_nodes.update(object_qualities_nodes)
        for q in qualities:
            logging.info('\tquality: {}'.format(q))
        for oq in object_qualities:
            logging.info('\tobject quality: {}'.format(oq))
        result_dict['pqualities'] = qualities
        result_dict['oqualities'] = object_qualities

        # coding schema: where: 1, what: 2, which: 3, why: 4, how: 5,
        # how+adj: 6 etc., other: 0 ...
        # ... activity: a, situation: s, quality: q, object_quality: p,
        # relation: r, toponym: n, type: t, date: d
        ignored_nodes = [
            leaf for leaf in parse_tree.get_leaves() if leaf.is_unknown()
        ]

        # Add each unknown leaf to the overall node set unless an already
        # collected node covers it. `covered` is initialised per leaf so an
        # empty `all_nodes` no longer leaves the flag unbound or stale.
        for leaf in ignored_nodes:
            covered = False
            for n in all_nodes:
                if n.is_fuzzy_matched:
                    if leaf.word in n.word:
                        covered = True
                        break
                elif n.is_your_child(leaf):
                    covered = True
                    break
            if not covered:
                all_nodes.add(leaf)

        # Codes are built by concatenating roles in sentence order.
        all_list = sorted(all_nodes, key=lambda x: x.index)
        intent_list = sorted(all_intent_nodes, key=lambda x: x.index)
        description_list = sorted(all_desc_nodes, key=lambda x: x.index)

        intent_code = ''.join(node.role for node in intent_list)
        intent_info = [node_info(node) for node in intent_list]

        desc_code = ''.join(node.role for node in description_list)
        desc_info = [node_info(node) for node in description_list]

        if Sentence.is_ambiguous(intent_list, intent_code):
            logging.info(
                'the intention is ambiguous, code: {}'.format(intent_code))
            resolved = Sentence.resolving_intent(desc_info)
            result_dict['resolved_intent'] = resolved
            if resolved['code'] != '':
                # Move the resolved items from the description over to the
                # intent.
                intent_code += resolved['code']
                intent_info.extend(resolved['list'])
                desc_temp_list = []
                for d in desc_info:
                    if d not in resolved['list']:
                        desc_temp_list.append(d)
                    else:
                        logging.debug('found!')
                desc_code = desc_code.replace(resolved['code'], '', 1)
                desc_info = desc_temp_list
                logging.debug('updated...')

        result_dict['intent_code'] = intent_code
        result_dict['intent_info'] = intent_info
        result_dict['desc_code'] = desc_code
        result_dict['desc_info'] = desc_info
        all_code = ''.join(node.role for node in all_list)
        all_info = [node_info(node) for node in all_list]
        result_dict['all_code'] = all_code
        result_dict['all_info'] = all_info
        logging.info('\tintent code is: {}'.format(intent_code))
        logging.info('\tdesc code is: {}'.format(desc_code))
        logging.info('\tall code is: {}'.format(all_code))
        logging.info('*******************************************************')
        return result_dict
# Example no. 2
# 0
def find_toponyms(question):
    """Return what ``NER.extract_place_names`` extracts from *question*."""
    place_names = NER.extract_place_names(question)
    return place_names