def read_utterances_from_new_db():
    # Eagerly load children and utterances for every root node (no parent).
    root_nodes = (db_session.query(Node).options(
        joinedload(Node.children),
        joinedload(Node.utterances)).filter(Node.parent_id.is_(None)).all())
    global features
    global fe
    features = gen_feature_dict(sp.TOPIC, sp.SENTIMENT)
    fe = feature_extractor.FeatureExtractor(features)
    utterance_data = {}
    for root_node in root_nodes:
        utterance_data.update(count_utterances(root_node, True, False, None))
    return utterance_data, len(utterance_data)
Example #2
def __init__(self, feature):
    self.feature = feature
    self.extractor = FeatureExtractor(
        gen_feature_dict(feature, cobot=True),
        start_extractor_servers=True)
    self.required_context = self.feature["cobot-input"]
    # Fall back gracefully when no redis server is reachable.
    try:
        self.redis_pool = redis.ConnectionPool(host=REDIS_HOST,
                                               port=REDIS_PORT,
                                               db=0)
        r = redis.StrictRedis(connection_pool=self.redis_pool)
        r.ping()
        logger.info("using redis")
    except redis.exceptions.ConnectionError:
        logger.info("not using redis")
        self.redis_pool = None
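
A hedged sketch of how later methods in this class might use the pool, given that __init__ leaves self.redis_pool as None when the ping fails; the _cached_get name and the get call are illustrative and not taken from the source.

# Hypothetical helper (not in the original class): read from the cache only
# when a redis connection was established in __init__.
def _cached_get(self, cache_key):
    if self.redis_pool is None:
        return None
    r = redis.StrictRedis(connection_pool=self.redis_pool)
    return r.get(cache_key)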
Example #3
def generate_features(list_of_nodes, features=None):
    # Fall back to the full default feature set when none is supplied.
    if not features:
        features = gen_feature_dict(
            WORD_EMBEDDINGS,
            WORD_CLASS_SCORE,
            WORD_CLASS_VECTORS,
            WORD_CLASSES,
            SENTIMENT,
            TOPIC,
            LDA,
        )
    fe = FeatureExtractor(features)
    list_of_dicts = []
    print("nodes to extract features from:", len(list_of_nodes))
    # Extract a feature dict for every utterance of every node.
    for i, node_utterances in enumerate(list_of_nodes):
        list_of_dicts.append([])
        for utt in node_utterances:
            list_of_dicts[i].append(fe({"text": utt}))
        print(i)  # progress indicator
    return list_of_dicts
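
A minimal usage sketch for generate_features, assuming the default feature constants are importable and each node is simply a list of utterance strings; the sample texts are invented.

# Hypothetical input: two nodes with a few utterances each.
nodes = [
    ["hello there", "how are you doing"],
    ["let's talk about movies"],
]
node_features = generate_features(nodes)
# node_features[i][j] is the feature dict for utterance j of node i.
assert len(node_features) == len(nodes)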
Example #4
#        'steps': [
#            (WORD_EMBEDDINGS['steps'], WORD_CLASS_SCORE['steps']),
#            weighted_average_word_embeddings
#        ],
#        'cobot-steps': [
#            (WORD_EMBEDDINGS['cobot-name'], WORD_CLASS_SCORE['cobot-name']),
#            weighted_average_word_embeddings
#        ],
#        'cobot-input': [WORD_EMBEDDINGS, WORD_CLASS_SCORE]
# }

GRAPHSEARCH_MODEL_FEATURES = gen_feature_dict(
    WORD_EMBEDDINGS,
    WORD_CLASS_SCORE,
    SENTIMENT,
    TOPIC,
    WORD_CLASSES,
    WORD_CLASS_VECTORS,
    #        WEIGHTED_AVERAGE_WORD_EMBEDDINGS
)

GRAPHSEARCH_INFERENCE_FEATURES = gen_feature_dict(
    WORD_EMBEDDINGS,
    WORD_CLASS_SCORE,
    SENTIMENT,
    TOPIC,
    WORD_CLASSES,
    WORD_CLASS_VECTORS,
    NAMED_ENTITIES,
    TAGGED_TEXT,
    #        WEIGHTED_AVERAGE_WORD_EMBEDDINGS
)
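
A sketch of how these feature dictionaries are typically consumed, mirroring the FeatureExtractor calls in the other examples on this page; the utterance text is made up.

# Hypothetical usage: build extractors from the two feature sets above.
model_fe = FeatureExtractor(GRAPHSEARCH_MODEL_FEATURES)
inference_fe = FeatureExtractor(GRAPHSEARCH_INFERENCE_FEATURES)
model_features = model_fe({"text": "what is your favorite movie"})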
Example #5
    def train(self):
        # prepare data
        with NamedTemporaryFile(
            mode="w", dir=f"{DATA_DIR}/lda_model", delete=True
        ) as tmpf:
            from fantom_util.data_handler import DataHandler
            from fantom_util.misc import gen_feature_dict

            features = gen_feature_dict(CLEAN_TEXT)
            dh = DataHandler(features)
            utterances = []
            dialogs = {}
            # Seed one dialog per root node (nodes whose parent is None).
            for key in dh.node_lookup_table[None]:
                dialogs[key] = [key]
            # Traverse each tree in FIFO order, collecting all descendant ids.
            for key in dialogs.keys():
                stack = []
                if key in dh.node_lookup_table.keys():
                    stack = stack + dh.node_lookup_table[key]
                while stack:
                    dialogs[key] = dialogs[key] + [stack[0]]
                    if stack[0] in dh.node_lookup_table.keys():
                        stack = stack + dh.node_lookup_table[stack[0]]
                    del stack[0]
            # Map each root to the utterance ids of every node in its tree.
            dialogs_key = {}
            for key in dialogs.keys():
                dialogs_key[key] = []
                for uttID in dialogs[key]:
                    dialogs_key[key] = dialogs_key[key] + dh.node_utts[uttID]

            # Concatenate the cleaned text of each tree into one document.
            for key in dialogs_key.keys():
                temp_utt = ""
                for uttID in dialogs_key[key]:
                    temp_utt += dh.id_utt[uttID]["clean_text"] + " "
                utterances.append(temp_utt)

            # Write the training file: the number of non-empty documents on
            # the first line, then one document per line.
            count = 0
            for utt in utterances:
                if utt.replace(" ", "") != "":
                    count += 1
            tmpf.write(str(count))
            tmpf.flush()
            tmpf.write("\n")
            tmpf.flush()
            for utt in utterances:
                if utt.replace(" ", "") != "":
                    tmpf.write(utt)
                    tmpf.write("\n")
                    tmpf.flush()
            # Estimate the topic model by running the external LDA binary on
            # the prepared data file.
            lda_arguments = [
                LDA_BIN,
                "-est",
                "-alpha",
                "0.5",
                "-beta",
                "0.1",
                "-ntopics",
                str(LDA_SIZE),
                "-niters",
                "1000",
                "-savestep",
                "100",
                "-twords",
                "20",
                "-dfile",
                f"{tmpf.name}",
            ]
            p = subprocess.Popen(lda_arguments)
            p.wait()
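
For reference, a small sketch of the temp-file layout the writing loop above produces, using invented documents; the first line is the non-empty document count, followed by one concatenated dialog per line.

# Illustrative only: the contents tmpf would hold for two toy documents.
toy_docs = ["hello how are you ", "tell me a joke "]
expected_contents = "2\n" + "\n".join(toy_docs) + "\n"
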
def get_merge_nodes():
    # Build pairwise merge candidates between sibling nodes and score them
    # by the similarity of their utterances.
    merges = db_session.query(Merging).all()
    used_nodes = []
    vec_cache = {}
    node_pairs = []
    parent_ids = []

    # Pairs that already have a Merging record are skipped below.
    for merge in merges:
        used_nodes.append(f'{merge.left_node_id}--{merge.right_node_id}')

    nodes = db_session.query(Node).options(
        joinedload(Node.utterances),
        joinedload(Node.node_utterances)).filter(
            Node.parent_id.is_(None),
            Node.active.is_(True)).order_by(Node.parent_id.desc()).all()
    grouped_nodes = defaultdict(list)

    fe = FeatureExtractor(gen_feature_dict(WORD_CLASS_SCORE))

    for node in nodes:
        grouped_nodes[node.parent_id].append(node)
        if node.parent_id not in parent_ids:
            parent_ids.append(node.parent_id)

    for group, group_nodes in grouped_nodes.items():
        for i, left_node in enumerate(group_nodes):
            for j, right_node in enumerate(group_nodes):
                pair_key = f'{left_node.id}--{right_node.id}'
                reverse_key = f'{right_node.id}--{left_node.id}'
                if i != j and pair_key not in used_nodes and reverse_key not in used_nodes:
                    used_nodes.append(pair_key)
                    node_info = {
                        'score': 0,
                        'left_node_id': left_node.id,
                        'right_node_id': right_node.id,
                        'parent_utterances': [
                            x.utterance_text
                            for x in left_node.parent.utterances
                        ] if left_node.parent else [],
                        'left_node_utterances': [],
                        'right_node_utterances': []
                    }
                    # Score every utterance pair, caching feature vectors so
                    # each utterance is only extracted once.
                    for left_utterance in left_node.utterances:
                        for right_utterance in right_node.utterances:
                            vec_1 = vec_cache.get(left_utterance)
                            vec_2 = vec_cache.get(right_utterance)

                            if vec_1 is None:
                                vec_1 = fe.extract_features(
                                    {'text': left_utterance.utterance_text})
                                vec_cache[left_utterance] = vec_1

                            if vec_2 is None:
                                vec_2 = fe.extract_features(
                                    {'text': right_utterance.utterance_text})
                                vec_cache[right_utterance] = vec_2

                            gss = graph_search_score(vec_1, vec_2)
                            score = gss if gss > 0 else 0

                            # Keep the best pairwise score for this node pair.
                            node_info['score'] = max(node_info['score'], score)
                            node_info['left_node_utterances'].append(
                                left_utterance.utterance_text)
                            node_info['right_node_utterances'].append(
                                right_utterance.utterance_text)
                    node_info['left_node_utterances'] = list(
                        set(node_info['left_node_utterances']))
                    node_info['right_node_utterances'] = list(
                        set(node_info['right_node_utterances']))
                    node_pairs.append(node_info)

    return sorted(node_pairs, key=lambda x: x['score'], reverse=True)
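
A small sketch of consuming the result, assuming the caller only cares about the highest-scoring candidates; the slice size is an arbitrary illustration.

# Hypothetical consumer: inspect the top-scoring merge candidates.
candidates = get_merge_nodes()
for pair in candidates[:10]:
    if pair['score'] <= 0:  # stop once no remaining pair scored positively
        break
    print(pair['score'], pair['left_node_id'], pair['right_node_id'])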