def read_utterances_from_new_db():
    """Collect utterance data for every root node in the database.

    Also (re)binds the module-level ``features`` / ``fe`` globals used by
    other helpers in this module.

    :return: tuple of (utterance_data dict, number of entries in it)
    """
    global features
    global fe

    # Eager-load children and utterances so count_utterances can walk the
    # tree without issuing extra queries per node.
    root_nodes = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )

    features = gen_feature_dict(sp.TOPIC, sp.SENTIMENT)
    fe = feature_extractor.FeatureExtractor(features)

    utterance_data = {}
    for node in root_nodes:
        utterance_data.update(count_utterances(node, True, False, None))
    return utterance_data, len(utterance_data)
def __init__(self, feature):
    """Initialize the feature extractor and probe for a redis cache.

    :param feature: feature config dict; its "cobot-input" entry lists
        the context keys this feature requires.
    """
    self.feature = feature
    self.extractor = FeatureExtractor(
        gen_feature_dict(feature, cobot=True),
        start_extractor_servers=True,
    )
    self.required_context = self.feature["cobot-input"]

    # Best-effort redis probe at construction time: if the server is not
    # reachable, fall back to running without a connection pool.
    try:
        self.redis_pool = redis.ConnectionPool(
            host=REDIS_HOST, port=REDIS_PORT, db=0
        )
        client = redis.StrictRedis(connection_pool=self.redis_pool)
        client.ping()
        logger.info("using redis")
    except redis.exceptions.ConnectionError:
        logger.info("not using redis")
        self.redis_pool = None
def generate_features(list_of_nodes, features=None):
    """Extract feature dicts for every utterance of every node.

    :param list_of_nodes: iterable where each element is an iterable of
        utterance strings belonging to one node.
    :param features: optional feature dict; defaults to the full set below.
    :return: list (one entry per node) of lists of extracted feature dicts,
        in the same order as the input.
    """
    if not features:
        features = gen_feature_dict(
            WORD_EMBEDDINGS,
            WORD_CLASS_SCORE,
            WORD_CLASS_VECTORS,
            WORD_CLASSES,
            SENTIMENT,
            TOPIC,
            LDA,
        )
    fe = FeatureExtractor(features)
    print("nodes to extract features from:", len(list_of_nodes))
    list_of_dicts = []
    # enumerate + comprehension instead of range(len(...)) index juggling;
    # the per-node print is kept as progress output.
    for i, node_utterances in enumerate(list_of_nodes):
        list_of_dicts.append([fe({"text": utt}) for utt in node_utterances])
        print(i)
    return list_of_dicts
# 'steps': [ # (WORD_EMBEDDINGS['steps'], WORD_CLASS_SCORE['steps']), # weighted_average_word_embeddings # ], # 'cobot-steps': [ # (WORD_EMBEDDINGS['cobot-name'], WORD_CLASS_SCORE['cobot-name']), # weighted_average_word_embeddings # ], # 'cobot-input': [WORD_EMBEDDINGS, WORD_CLASS_SCORE] # } GRAPHSEARCH_MODEL_FEATURES = gen_feature_dict( WORD_EMBEDDINGS, WORD_CLASS_SCORE, SENTIMENT, TOPIC, WORD_CLASSES, WORD_CLASS_VECTORS, # WEIGHTED_AVERAGE_WORD_EMBEDDINGS ) GRAPHSEARCH_INFERENCE_FEATURES = gen_feature_dict( WORD_EMBEDDINGS, WORD_CLASS_SCORE, SENTIMENT, TOPIC, WORD_CLASSES, WORD_CLASS_VECTORS, NAMED_ENTITIES, TAGGED_TEXT, # WEIGHTED_AVERAGE_WORD_EMBEDDINGS
def train(self):
    """Train the LDA topic model on per-root-node "documents".

    For every root node, gathers the clean_text of all utterances in the
    subtree below it (breadth-first), writes those documents to a
    temporary file in GibbsLDA++ input format (document count on the
    first line, then one document per line), and runs the LDA binary
    on that file.
    """
    from collections import deque

    from fantom_util.data_handler import DataHandler
    from fantom_util.misc import gen_feature_dict

    dh = DataHandler(gen_feature_dict(CLEAN_TEXT))

    # Breadth-first walk from every root node: dialogs[root] holds the
    # root id followed by every descendant node id, in BFS order (same
    # order the original stack-based walk produced). deque.popleft() is
    # O(1) where the previous `del stack[0]` was O(n).
    dialogs = {}
    for root in dh.node_lookup_table[None]:
        node_ids = [root]
        queue = deque(dh.node_lookup_table.get(root, []))
        while queue:
            node_id = queue.popleft()
            node_ids.append(node_id)
            queue.extend(dh.node_lookup_table.get(node_id, []))
        dialogs[root] = node_ids

    # One document per root: every clean_text in the subtree, each
    # followed by a single space (matches the original concatenation,
    # built with join instead of quadratic `+=`).
    documents = []
    for node_ids in dialogs.values():
        utt_ids = [u for nid in node_ids for u in dh.node_utts[nid]]
        documents.append(
            "".join(dh.id_utt[u]["clean_text"] + " " for u in utt_ids)
        )

    # Documents that are only spaces are skipped, both in the count and
    # in the body of the file.
    non_empty = [doc for doc in documents if doc.replace(" ", "") != ""]

    with NamedTemporaryFile(
        mode="w", dir=f"{DATA_DIR}/lda_model", delete=True
    ) as tmpf:
        tmpf.write(f"{len(non_empty)}\n")
        for doc in non_empty:
            tmpf.write(doc)
            tmpf.write("\n")
        # Flush before handing the file name to the subprocess so the
        # LDA binary sees the complete contents.
        tmpf.flush()

        lda_arguments = [
            LDA_BIN,
            "-est",
            "-alpha", "0.5",
            "-beta", "0.1",
            "-ntopics", str(LDA_SIZE),
            "-niters", "1000",
            "-savestep", "100",
            "-twords", "20",
            "-dfile", f"{tmpf.name}",
        ]
        p = subprocess.Popen(lda_arguments)
        p.wait()
def get_merge_nodes():
    """Find and score candidate node pairs for merging.

    Builds every pair of sibling nodes (same parent) not already recorded
    in the Merging table, scores each pair by the best graph-search
    similarity between any two of their utterances (clamped at 0), and
    returns the pair dicts sorted by score, highest first.
    """
    # Set instead of list for pair keys: O(1) membership tests instead of
    # O(n) list scans on every candidate pair.
    seen_pairs = {
        f'{merge.left_node_id}--{merge.right_node_id}'
        for merge in db_session.query(Merging).all()
    }

    # NOTE(review): this filters parent_id IS NULL and then groups by
    # parent_id, which yields a single group of root nodes -- confirm the
    # filter is intended (cf. the `.is_(None)` root query elsewhere).
    nodes = db_session.query(Node).options(
        joinedload(Node.utterances),
        joinedload(Node.node_utterances)
    ).filter(
        Node.parent_id == None,  # noqa: E711 (SQLAlchemy: SQL IS NULL)
        Node.active == True  # noqa: E712
    ).order_by(Node.parent_id.desc()).all()

    grouped_nodes = defaultdict(list)
    for node in nodes:
        grouped_nodes[node.parent_id].append(node)

    fe = FeatureExtractor(gen_feature_dict(WORD_CLASS_SCORE))
    vec_cache = {}
    node_pairs = []
    # `siblings` gets its own name so the loop no longer shadows the
    # grouped_nodes dict it iterates over.
    for siblings in grouped_nodes.values():
        for i, left_node in enumerate(siblings):
            for j, right_node in enumerate(siblings):
                if i == j:
                    continue
                pair_key = f'{left_node.id}--{right_node.id}'
                reverse_key = f'{right_node.id}--{left_node.id}'
                if pair_key in seen_pairs or reverse_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)
                node_info = {
                    'score': 0,
                    'left_node_id': left_node.id,
                    'right_node_id': right_node.id,
                    'parent_utterances': [
                        x.utterance_text for x in left_node.parent.utterances
                    ] if left_node.parent else [],
                    'left_node_utterances': [],
                    'right_node_utterances': []
                }
                for left_utterance in left_node.utterances:
                    for right_utterance in right_node.utterances:
                        # Cache feature vectors per utterance object.
                        # Compare with `is None` so a falsy/array-like
                        # vector is not recomputed (or mis-evaluated)
                        # on every pass.
                        vec_1 = vec_cache.get(left_utterance)
                        if vec_1 is None:
                            vec_1 = fe.extract_features(
                                {'text': left_utterance.utterance_text})
                            vec_cache[left_utterance] = vec_1
                        vec_2 = vec_cache.get(right_utterance)
                        if vec_2 is None:
                            vec_2 = fe.extract_features(
                                {'text': right_utterance.utterance_text})
                            vec_cache[right_utterance] = vec_2
                        # score starts at 0, so max() already clamps
                        # negative similarities to zero.
                        node_info['score'] = max(
                            node_info['score'],
                            graph_search_score(vec_1, vec_2))
                        node_info['left_node_utterances'].append(
                            left_utterance.utterance_text)
                        node_info['right_node_utterances'].append(
                            right_utterance.utterance_text)
                # De-duplicate the collected texts (set round-trip, as in
                # the original -- ordering is not preserved).
                node_info['left_node_utterances'] = list(
                    set(node_info['left_node_utterances']))
                node_info['right_node_utterances'] = list(
                    set(node_info['right_node_utterances']))
                node_pairs.append(node_info)
    return sorted(node_pairs, key=lambda x: x['score'], reverse=True)