def get_parents(node_id):
    """Return the active ancestors of *node_id*, ordered root-first.

    A falsy node_id yields an empty list.
    """
    if not node_id:
        return []
    target = db_session.query(Node).get(node_id)
    ancestors = (
        db_session.query(Node)
        .filter(Node.id.in_(target.path), Node.active.is_(True))
        .order_by(Node.path_length)
        .all()
    )
    return ancestors
def incoherent_nodes():
    """Interactively review node utterances flagged as incoherent.

    Lists unhandled 'incoherent' NodeUtteranceStatus rows, most-flagged
    first, prints the conversation history for each one, and asks the
    operator whether to inactivate the offending node.  The flags are
    marked handled either way; answering 'q' exits the process.
    """
    # Unhandled incoherent flags grouped per node utterance, ordered by how
    # often each utterance was flagged (most flags first).
    node_utterance_ids = (db_session.query(
        NodeUtteranceStatus.node_utterance_id,
        func.count(NodeUtteranceStatus.node_utterance_id),
    ).filter(
        NodeUtteranceStatus.status == "incoherent",
        NodeUtteranceStatus.handled.is_(False),
    ).group_by(NodeUtteranceStatus.node_utterance_id).order_by(
        func.count(NodeUtteranceStatus.node_utterance_id).desc()).all())
    for node_utterance_id, count in node_utterance_ids:
        node_utterance = db_session.query(NodeUtterance).get(node_utterance_id)
        # Full ancestry of the node, root first (ascending path_length).
        history = (db_session.query(Node).filter(
            Node.id.in_(node_utterance.node.path)).order_by(
                Node.path_length.asc()).all())
        print("+----------------------------------+")
        print("node utterance id:", node_utterance_id)
        print("node id:", node_utterance.node.id)
        print("count:", count)
        print("\n")
        # Show every ancestor's utterances, then the flagged utterance itself.
        for h in history[:-1]:
            print([x.utterance_text for x in h.utterances])
        print(node_utterance.utterance.utterance_text)
        print("\n")
        user_input = input("Inactivate? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input == "" or user_input.lower() == "y":
            inactivate_node(node_utterance.node.id)
        # Mark every incoherent flag on this node utterance as handled,
        # regardless of the operator's decision, so it is not shown again.
        db_session.query(NodeUtteranceStatus).filter(
            NodeUtteranceStatus.node_utterance_id == node_utterance_id,
            NodeUtteranceStatus.status == "incoherent",
        ).update({"handled": True})
        db_session.commit()
def submit(external_worker_id, task_id):
    """Record that a worker completed training task *task_id*.

    Creates the Worker and Training rows on first contact.  The task is
    only appended when it is the lowest-id task the worker has not yet
    completed, which forces tasks to be done in order.

    Args:
        external_worker_id: the worker's external (e.g. MTurk) id.
        task_id: id of the training task being submitted.

    Returns:
        True when the worker has now finished every task in TASK_IDS,
        False otherwise.

    Raises:
        KeyError: if task_id is not one of TASK_IDS.
    """
    if task_id not in TASK_IDS:
        raise KeyError('Task id not recognized')

    worker = db_session.query(Worker) \
        .filter_by(external_worker_id=external_worker_id) \
        .first()
    if not worker:
        worker = Worker(external_worker_id=external_worker_id)
        db_session.add(worker)
        db_session.commit()

    training = db_session.query(Training).filter_by(worker=worker).first()
    if not training:
        training = Training(worker=worker)
        db_session.add(training)
        db_session.commit()

    remaining = set(TASK_IDS) - set(training.tasks)
    # Fix: min() on an empty set raises ValueError -- the original crashed
    # whenever a worker re-submitted after finishing every task.
    if remaining and task_id == min(remaining):
        training.tasks = training.tasks + [task_id]
        db_session.commit()

    # Done when nothing remains after this submission.
    return not (set(TASK_IDS) - set(training.tasks))
def set_parent(node_id, parent_node_id=None, commit=False):
    """Move *node_id* under *parent_node_id* (or detach it to a root).

    Recomputes the node's materialized path and, when the node previously
    had a parent and has children, the paths of its whole subtree.

    Args:
        node_id: id of the node to re-parent.
        parent_node_id: id of the new parent; None detaches the node.
        commit: when True the session is committed at the end.

    Raises:
        Exception: if node_id cannot be found.
    """
    node = db_session.query(Node).get(node_id)
    if not node:
        raise Exception("Could not find node")
    if parent_node_id:
        parent_node = db_session.query(Node).get(parent_node_id)
    else:
        parent_node = None
    # Remember whether the node had a parent before the move: only then do
    # descendant paths need to be rewritten below.
    old_parent_id = None
    if node.parent:
        old_parent_id = node.parent.id
    node.parent = parent_node
    node.path = node.recalculate_path()

    def update_children(child_node):
        # Depth-first recomputation of materialized paths for the subtree.
        db_session.add(child_node)
        child_node.path = child_node.recalculate_path()
        # Iterate over a copy: recalculate_path may touch the relationship.
        for child in child_node.children[:]:
            logger.debug("found child %s", child.id)
            logger.debug("parent has children %s", child_node.children)
            update_children(child)

    if old_parent_id and node.children:
        update_children(node)
    if commit:
        db_session.commit()
        logger.info("committing!")
def prepare_from_db():
    """Load the relevant active nodes into plain lookup structures.

    Returns a 5-tuple:
        lookup_table:      parent node id -> list of child node ids
        node_utts:         node id -> list of utterance ids
        id_utt:            utterance id -> {"text": utterance text}
        node_visit_counts: node id -> visit count
        linked_nodes:      linked-from node id -> linked-to node id
    """
    relevant = (
        db_session.query(Node)
        .options(joinedload(Node.utterances))
        .filter(
            Node.active.is_(True),
            or_(
                Node.visited_count > 1,
                Node.child_count > 0,
                Node.is_user.is_(False),
            ),
        )
        .all()
    )

    lookup_table = defaultdict(list)
    node_utts = defaultdict(list)
    node_visit_counts = {}
    id_utt = {}
    for current in relevant:
        lookup_table[current.parent_id].append(current.id)
        node_visit_counts[current.id] = current.visited_count
        for utt in current.utterances:
            node_utts[current.id].append(utt.id)
            id_utt[utt.id] = {"text": utt.utterance_text}

    link_pairs = db_session.query(
        LinkedNodes.linked_from_node_id,
        LinkedNodes.linked_to_node_id).all()
    linked_nodes = dict(link_pairs)

    return lookup_table, node_utts, id_utt, node_visit_counts, linked_nodes
def get_next_training_for_worker(external_worker_id):
    """Return the next training task for a worker.

    Returns None when the worker has no training record (or no completed
    tasks) yet, '__DONE__' when every task in TASK_IDS is finished, and
    otherwise a dict with the task id, reply options, description and a
    transient NodeUtterance history built from the training data.
    """
    worker = (db_session.query(Worker)
              .filter_by(external_worker_id=external_worker_id)
              .first())
    training = db_session.query(Training).filter_by(worker=worker).first()
    if not training or not training.tasks:
        return None

    remaining = set(TASK_IDS) - set(training.tasks)
    if not remaining:
        return '__DONE__'

    next_task_id = min(remaining)
    task = [entry for entry in TRAINING_DATA
            if entry['id'] == next_task_id][0]
    history = [
        NodeUtterance(utterance=Utterance(utterance_text=item['text']))
        for item in task['history']
    ]
    return {
        'id': task['id'],
        'history': history,
        'replies': task['replies'],
        'description': task['description'],
    }
def get_node_utterances(node_id):
    """Yield the text of every utterance attached to node *node_id*."""
    associations = db_session.query(NodeUtterance).filter(
        NodeUtterance.node_id == node_id).all()
    for association in associations:
        matches = db_session.query(Utterance).filter(
            Utterance.id == association.utterance_id).all()
        for match in matches:
            yield match.utterance_text
def activate_node(node_id):
    """Activate *node_id* and every node in its subtree, then commit."""
    target = db_session.query(Node).get(node_id)
    target.active = True
    # ltree descendant query: everything at or below the target's path.
    descendants = db_session.query(Node).filter(
        Node._path.descendant_of(target._path)).all()
    for descendant in descendants:
        descendant.active = True
    db_session.commit()
def add_yes_no_nodes():
    """Add 'yes'/'no' child nodes to the system nodes listed in
    nodeIDs_final.txt.

    Nodes that already have a yes-like or no-like child are skipped.  The
    ids of the created nodes (with the parent's first utterance) are
    written to added_node_ids.txt for future reference.
    """
    # Fix: context manager so the id file is always closed.
    with open("nodeIDs_final.txt", "r") as text_file:
        node_ids = text_file.read().split("\n")[:-1]
    nodes = (db_session.query(Node).filter(
        Node.is_user == False,
        Node.id.in_(map(int, node_ids))).all())

    yes_pattern = re.compile(
        r".*\b(ye(s\b|ah\b|a\b|p\b)|of course|I do\b|absolutely|definitely"
        r"|ok\b|ok(ay|ey)\b|sounds (good|great)|fine|sure|why not"
        r"|don't mind|let's do that).*")
    # Fix: the original pattern contained 'n(o\bo\b||ot\b|...)' -- the empty
    # alternative made the group match a bare 'n', so nearly any utterance
    # containing an 'n' word counted as a "no".  Rewritten as the intended
    # no/not/nope/na/nah alternation.
    no_pattern = re.compile(
        r".*\b(n(o\b|ot\b|ope\b|a\b|ah\b)|sounds bad|I'm good).*")

    # Filter out nodes that already have yes/no children
    nodes_add_yes = []
    nodes_add_no = []
    for node in nodes:
        add_yes, add_no = True, True
        children = db_session.query(Node).filter(
            Node.parent_id == node.id).all()
        for child in children:
            for utterance in get_node_utterances(child.id):
                if yes_pattern.match(utterance):
                    add_yes = False
                if no_pattern.match(utterance):
                    add_no = False
        if add_yes:
            nodes_add_yes.append(node)
        if add_no:
            nodes_add_no.append(node)

    # Add yes/no nodes and write file with new ids for future reference
    added_node_ids = ""
    for node in nodes_add_yes:
        new_node = create_new_node(
            "yes",
            source="yes_no_population",
            # Fix: pass the id, not the Node object, consistent with the
            # create_new_node(..., parent_id=last_node.id, ...) call site.
            parent_id=node.id,
            species="yes",
            commit=False,
        )
        added_node_ids += (str(new_node.id) + "\t" +
                           list(get_node_utterances(node.id))[0] + "\tyes\n")
    for node in nodes_add_no:
        new_node = create_new_node(
            "no",
            source="yes_no_population",
            parent_id=node.id,  # Fix: id, not Node object (see above)
            species="no",
            commit=False,
        )
        added_node_ids += (str(new_node.id) + "\t" +
                           list(get_node_utterances(node.id))[0] + "\tno\n")
    with open("added_node_ids.txt", "w") as the_file:
        the_file.write(added_node_ids)
def merge_by_score():
    """Interactively review node-merge candidates, best score first.

    Iterates PotentialNodeMerge pairs that have no recorded Merging
    decision yet (in either orientation), shows both nodes' utterances and
    asks the operator whether to merge.  Merges fold the node with fewer
    children into the one with more.  'n' records a rejection so the pair
    is not offered again; 'q' exits immediately.
    """
    # Candidate pairs left-joined against Merging in both orientations;
    # Merging.id IS NULL keeps only pairs with no decision yet.
    nodes = (db_session.query(
        PotentialNodeMerge.left_node_id,
        PotentialNodeMerge.right_node_id,
        PotentialNodeMerge.score,
        Merging,
    ).outerjoin(
        Merging,
        ((PotentialNodeMerge.left_node_id == Merging.left_node_id) &
         (PotentialNodeMerge.right_node_id == Merging.right_node_id) |
         (PotentialNodeMerge.left_node_id == Merging.right_node_id) &
         (PotentialNodeMerge.right_node_id == Merging.left_node_id)),
    ).filter(Merging.id.is_(None)).order_by(
        PotentialNodeMerge.score.desc()).all())
    used_ids = []            # "left-right" keys already offered, both orders
    merged_right_nodes = []  # nodes merged away; skip any later pair of theirs
    for left_node_id, right_node_id, score, _ in nodes:
        if (f"{left_node_id}-{right_node_id}" not in used_ids
                and left_node_id not in merged_right_nodes
                and right_node_id not in merged_right_nodes):
            used_ids.append(f"{left_node_id}-{right_node_id}")
            used_ids.append(f"{right_node_id}-{left_node_id}")
            print("+------------------+")
            left_node = db_session.query(Node).get(left_node_id)
            right_node = db_session.query(Node).get(right_node_id)
            # Only review pairs where both nodes are alive and have text.
            if (left_node.active and right_node.active
                    and left_node.utterances and right_node.utterances):
                print(left_node_id,
                      [x.utterance_text for x in left_node.utterances])
                print("---------------- VS ----------------")
                print(right_node_id,
                      [x.utterance_text for x in right_node.utterances])
                print("\nscore", score, "\n")
                user_input = input("Merge? Y/n/q ")
                if user_input.lower() == "q":
                    exit()
                elif user_input.lower() == "n":
                    # Record the rejection so this pair is never re-offered.
                    merge_nodes(left_node_id, right_node_id, merged=False)
                    print("nope!")
                elif user_input == "" or user_input.lower() == "y":
                    # Merge the smaller subtree into the larger one.
                    if right_node.child_count > left_node.child_count:
                        print(right_node.id, "<-", left_node.id)
                        merge_nodes(right_node.id, left_node.id, merged=True)
                        merged_right_nodes.append(left_node.id)
                    else:
                        print(left_node.id, "<-", right_node.id)
                        merge_nodes(left_node.id, right_node.id, merged=True)
                        merged_right_nodes.append(right_node.id)
                # Persist each decision immediately so 'q' loses nothing.
                db_session.commit()
def inactivate_node(node_id):
    """Deactivate *node_id* and its whole subtree, drop any pending merge
    suggestions referencing the node, and commit."""
    target = db_session.query(Node).get(node_id)
    target.active = False
    # ltree descendant query: everything at or below the target's path.
    descendants = db_session.query(Node).filter(
        Node._path.descendant_of(target._path)).all()
    for descendant in descendants:
        descendant.active = False
    db_session.query(PotentialNodeMerge).filter(
        (PotentialNodeMerge.left_node_id == node_id)
        | (PotentialNodeMerge.right_node_id == node_id)).delete()
    db_session.commit()
def finish_job(ext_job_id, external_worker_id, answer, corrections, extra_questions, with_audio, used_text_input, assignment_id, hit_id):
    """Persist a completed crowd-sourcing job.

    Creates a new node for the worker's *answer* under the job's last node,
    applies any text *corrections* to earlier node utterances, records the
    worker's judgements on *extra_questions*, and commits once at the end.

    Args:
        ext_job_id: external id of the job being finished.
        external_worker_id: worker's external id; created if unseen.
        answer: the worker's reply text.
        corrections: mapping of node_utterance id -> corrected text.
        extra_questions: dicts with 'type', 'id' and boolean judgement
            flags ('suitable', 'equivalent', 'needs_correction').
        with_audio / used_text_input: how the answer was entered.
        assignment_id / hit_id: MTurk provenance identifiers.
    """
    job = get_job(ext_job_id)
    nodes = [x.node for x in job.node_utterances]
    # NOTE(review): last_node is None for a job with no node utterances and
    # the .id access below would then raise AttributeError -- presumably
    # every job has at least one utterance.  TODO confirm.
    last_node = nodes[-1] if nodes else None
    worker = _create_or_get_worker(external_worker_id)
    node = create_new_node([answer], parent_id=last_node.id, source='typed')
    node_utterance = node.node_utterances[0]
    node_utterance.with_audio = with_audio
    node_utterance.used_text_input = used_text_input
    # Provenance: which worker produced this utterance, on which job/HIT.
    node_utterance_worker_job = NodeUtteranceWorkerJob(
        node_utterance_id=node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    )
    db_session.add(node_utterance_worker_job)
    # Each correction becomes a new 'correction' utterance on the original
    # node, and the old node utterance is flagged as corrected.
    for old_node_utterance_id, corrected_text in corrections.items():
        old_node_utterance = db_session.query(NodeUtterance).get(old_node_utterance_id)
        add_utterance_to_node(corrected_text, old_node_utterance.node, 'correction')
        node_utterance_status = NodeUtteranceStatus(
            node_utterance_id=old_node_utterance.id,
            status='corrected'
        )
        db_session.add(node_utterance_status)
    # Store the worker's judgement flags for each non-api extra question,
    # referencing the freshly created answer utterance.
    for extra_question in extra_questions:
        if extra_question['type'] != 'api':
            extra_node_utterance = db_session.query(NodeUtterance).get(extra_question['id'])
            for status in ['suitable', 'equivalent', 'needs_correction']:
                if extra_question[status]:
                    db_session.add(NodeUtteranceStatus(
                        node_utterance_id=extra_node_utterance.id,
                        referenced_node_utterance_id=node_utterance.id,
                        status=status
                    ))
        else:
            # 'api'-type questions are currently ignored.
            # extra_node_utterance = add_utterance_to_node(
            #     extra_question['text'], node, extra_question['id']
            # )
            pass
    # TODO: set positive scoring for worker and node_utterances
    db_session.commit()
def get_ratings(start_time, end_time):
    """Return Rating rows within [start_time, end_time], newest first.

    A falsy end_time defaults to the latest known rating start time; a
    falsy start_time defaults to midnight of the day before end_time.
    """
    if not end_time:
        latest = (db_session.query(Rating.start_time)
                  .filter(Rating.start_time.isnot(None))
                  .order_by(Rating.start_time.desc())
                  .first())
        end_time, = latest
    if not start_time:
        start_time = (end_time - datetime.timedelta(days=1)).replace(
            hour=0, minute=0)
    query = (db_session.query(Rating)
             .filter(Rating.start_time >= start_time,
                     Rating.start_time <= end_time)
             .order_by(Rating.start_time.desc()))
    return query.all()
def linked_nodes(linked_to_node_id, linked_from_node_id):
    """Create a LinkedNodes row between two existing nodes.

    No-op when either node does not exist or the link is already present;
    commits after inserting a new link.
    """
    linked_to_node = db_session.query(Node).get(linked_to_node_id)
    linked_from_node = db_session.query(Node).get(linked_from_node_id)
    # Fix: bail out before dereferencing .id -- the original queried for the
    # existing link first and raised AttributeError whenever either node
    # was missing, despite having an existence check further down.
    if not (linked_to_node and linked_from_node):
        return
    link = (db_session.query(LinkedNodes).filter(
        LinkedNodes.linked_to_node_id == linked_to_node.id,
        LinkedNodes.linked_from_node_id == linked_from_node.id,
    ).first())
    if link:
        return
    db_session.add(
        LinkedNodes(
            linked_to_node_id=linked_to_node_id,
            linked_from_node_id=linked_from_node_id,
        ))
    db_session.commit()
def classify_root_nodes():
    """Interactively label candidate root nodes with the MLP classifier.

    Shows each unlabeled active parentless node together with per-utterance
    classifier scores and asks the operator whether it is a real root node.
    'n' deactivates the node and stores negative labels, ''/'y' stores
    positive labels, 'q' exits.
    """
    from fantom_util.models.rnc_mlp_model import RootNodeClassifierMLP
    # Active parentless nodes with no RootNode label yet, most visited first.
    nodes = (db_session.query(Node).outerjoin(
        RootNode, RootNode.node_id == Node.id).filter(
            Node.active.is_(True), Node.parent_id.is_(None),
            RootNode.id.is_(None)).order_by(Node.visited_count.desc()).all())
    rnc_mlp = RootNodeClassifierMLP()
    for node in nodes:
        print("+----------------------+")
        utterances = [x.utterance_text for x in node.utterances]
        score_results = rnc_mlp.predict_list(utterances)
        for utterance, score in zip(utterances, score_results):
            print(f"{utterance}: {score[0]}")
        # NOTE(review): score entries appear to be 1-element sequences
        # (score[0] above) -- hence the [0] on the averaged value here.
        print("\navg:", (sum(score_results) / len(score_results))[0], "\n")
        user_input = input("Root node? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input.lower() == "n":
            node.active = False
            # One negative label per utterance of the rejected node.
            for utterance in utterances:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=utterance,
                             is_root_node=False))
        elif user_input == "" or user_input.lower() == "y":
            # One positive label per utterance of the accepted node.
            for utterance in utterances:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=utterance,
                             is_root_node=True))
        # Persist each decision immediately so quitting loses nothing.
        db_session.commit()
def nodes():
    """Collect utterance texts for up to 30 randomly ordered root trees.

    Returns a dict mapping 0..29 to the list of every utterance text found
    in the corresponding root node's subtree (breadth-first order).
    """
    root_query = db_session.query(Node).filter(Node.parent_id.is_(None),
                                               Node.active.is_(True))
    roots = [root for root in root_query]
    shuffle(roots)

    trees = {}
    tree_index = 0
    for root in roots:
        texts = []
        visited = []
        pending = [root]
        # Breadth-first walk of the subtree rooted at this node.
        while pending:
            current = pending[0]
            pending = pending[1:]
            visited.append(current)
            for child in current.children:
                pending.append(child)
        for subtree_node in visited:
            for utterance in subtree_node.utterances:
                texts.append(utterance.utterance_text)
        trees[tree_index] = texts
        tree_index += 1
        if tree_index == 30:
            break
    return trees
def named_entity_merge(node_id):
    """Collect leaf children of *node_id* whose utterances mention a known
    named entity (movie, musician or band, from the `nem` lookup).

    Returns the list of matching Node objects.

    NOTE(review): the final `if done: break` sits at the node-loop level,
    so the scan stops after the first node that matches -- at most one node
    is ever returned.  Verify this early exit is intended.
    """
    nodes = (db_session.query(Node).options(joinedload(
        Node.utterances), joinedload(
            Node.node_utterances)).filter(Node.parent_id == node_id).all())
    to_merge = []
    categories = ["movies", "musicians", "bands"]
    for node in nodes:
        done = False
        # Only consider leaf nodes (no children).
        if not node.children:
            for utterance in node.utterances:
                utterance_text = utterance.utterance_text
                # print(utterance_text)
                for category in categories:
                    for item in nem[category]:
                        # Pad both sides with spaces for whole-word matching.
                        if f" {item.lower()} " in f" {utterance_text} ":
                            print(f"found {item} in {utterance_text}")
                            to_merge.append(node)
                            done = True
                    if done:
                        break
                if done:
                    break
        if done:
            break
    return to_merge
def check_for_worker_eligibilitiy_for_qualification():
    """Grant the 'more than 20 jobs' MTurk qualification to workers who
    have earned it but do not yet hold it.

    (The misspellings in the function and column names are part of the
    existing public interface and are kept as-is.)
    """
    candidates = db_session.query(Worker).filter(
        Worker.has_more_than_20_qualifaction.is_(False),
        Worker.source == 'mturk').all()
    for candidate in candidates:
        if candidate.job_counts > 20:
            mturk.qualify_worker_for_has_more_than_20_qualification(
                candidate.external_worker_id)
            candidate.has_more_than_20_qualifaction = True
            db_session.commit()
def populate(conversation_id=None, automate=False):
    """Review conversations to populate possible root nodes.

    Downloads possible_root_nodes.txt from S3, processes either the given
    *conversation_id* or (when None) conversations from the database --
    every one when *automate* is True, otherwise random ones until the
    operator declines to continue -- then uploads the file back to S3.
    """
    root_node_utterances = _get_utterance_lookup_table(None)
    # file_name is shared with helpers (e.g. process_conversation) through
    # this module-level global.
    global file_name
    file_name = (
        FANTOM_WORKDIR
        + "/fantom_util/fantom_util/graph_tools/possible_root_nodes.txt"
    )
    os.system("aws s3 cp s3://SOME_AWS_BUCKET_URL/possible_root_nodes.txt " + file_name)
    if not conversation_id:
        conversation_ids = (
            db_session.query(Conversation.conversation_id)
            .filter(Conversation.conversation_id != None)
            .distinct(Conversation.conversation_id)
            .all()
        )
        if automate:
            for conversation_id in tqdm(conversation_ids):
                process_conversation(conversation_id, root_node_utterances, automate)
        else:
            stop = False
            while not stop:
                # Query rows are 1-tuples; unwrap the id with [0].
                conversation_id = random.choice(conversation_ids)[0]
                process_conversation(conversation_id, root_node_utterances, automate)
                response = input("Continue? Y/n\n")
                if response.lower() == "n":
                    stop = True
    else:
        process_conversation(conversation_id, root_node_utterances, False)
    os.system(
        "aws s3 cp " + file_name + " s3://SOME_AWS_BUCKET_URL/possible_root_nodes.txt"
    )
def set_incoherent(ext_job_id, external_worker_id,
                   incoherent_node_utterance_id, with_audio, assignment_id,
                   hit_id):
    """Record that a worker flagged a node utterance as incoherent.

    Stores both the worker/job provenance row and an 'incoherent' status
    row for the utterance, then commits.
    """
    job = get_job(ext_job_id)
    worker = _create_or_get_worker(external_worker_id)
    flagged = db_session.query(NodeUtterance).get(
        incoherent_node_utterance_id)

    db_session.add(IncoherentNodeUtteranceWorkerJob(
        node_utterance_id=flagged.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id,
    ))
    db_session.add(NodeUtteranceStatus(
        with_audio=with_audio,
        node_utterance_id=flagged.id,
        status='incoherent',
    ))
    # TODO: set negative scoring for worker and node_utterances
    db_session.commit()
def main(synonym_path):
    """Merge the synonym utterances read from *synonym_path* into every
    root node, then commit."""
    synonyms = get_synonym_objects(synonym_path)
    roots = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )
    merge_synonyms(roots, synonyms)
    db_session.commit()
def get_graph():
    """Serialize every active node into a Graphviz 'digraph' string.

    Each node's label shows its id, visit count and the text of each node
    utterance; edges connect nodes to their parents.  Line thickness
    scales with the node's visit count relative to the maximum.
    """
    nodes = db_session.query(Node).filter(Node.active.is_(True)).all()
    max_visits = 0
    for node in nodes:
        max_visits = max(max_visits, node.visited_count)
    # Fix: when no node has been visited yet, max_visits stayed 0 and the
    # thickness computation below raised ZeroDivisionError.
    max_visits = max(max_visits, 1)
    output = ['digraph {']
    # For each node
    for node in nodes:
        text = '<br/>'.join(['- {} (nu: {}, u: {})'.format(x.utterance.utterance_text.replace("'", "\\'").replace('"', '\\"'), x.id, x.utterance.id) for x in node.node_utterances])
        # black for user nodes, green otherwise (the original comment
        # claimed grey/black, which did not match the actual colors)
        color = '#000000' if node.is_user else '#00ff00'
        thickness = max(1, 4 * node.visited_count/max_visits)
        # print the node with its properties
        output.append(f'{node.id}[label=<<b>{node.id}</b> ({node.visited_count})<br/>{text}>,color="{color}",penwidth={thickness}];')
        # non-root nodes get an edge to their parent
        if node.parent_id:
            output.append(f'{node.parent_id} -> {node.id};')
    output.append('}')
    return ' '.join(output)
def remove_profane_utterances():
    """Report utterances matching EXCLUDED_UTTERANCES together with the ids
    of the active user nodes that use them.

    Despite the name, this only prints findings; nothing is deleted.
    """
    all_utterances = db_session.query(Utterance).all()
    for utterance in tqdm(all_utterances):
        text = normalize_text(utterance.utterance_text)
        if not (text and re.search(EXCLUDED_UTTERANCES, text)
                and utterance.node_utterances):
            continue
        node_ids = [nu.node.id for nu in utterance.node_utterances
                    if nu.node.is_user and nu.node.active]
        if node_ids:
            print(text, node_ids)
def read_utterances_from_new_db():
    """Gather utterances from every root-node tree.

    Returns a tuple of (utterances, total count).  NOTE: a second function
    with this same name exists in the module; Python keeps whichever is
    defined last.
    """
    roots = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )
    collected = []
    for root in roots:
        collected += count_utterances(root)
    return collected, len(collected)
def _create_or_get_worker(external_worker_id, source=None):
    """Fetch the Worker with *external_worker_id*, creating one if absent.

    New workers are added and flushed (not committed) so that worker.id is
    already populated for the caller.
    """
    existing = (db_session.query(Worker)
                .filter_by(external_worker_id=external_worker_id)
                .first())
    if existing:
        return existing
    created = Worker(external_worker_id=external_worker_id, source=source)
    db_session.add(created)
    db_session.flush()
    return created
def create_jobs(job_type, amount=1):
    """Create up to *amount* crowd-sourcing jobs of *job_type*.

    Walks eligible nodes by descending score, rebuilds each node's dialogue
    history (capped at MAX_DIALOGUE_HISTORY), picks one eligible node
    utterance per history step, and persists a Job plus its ordered
    JobNodeUtterance rows.  Nodes whose history cannot be fully resolved
    are skipped.

    Args:
        job_type: 'user', 'system' or SPECIES_TAG.
        amount: maximum number of jobs to create.

    Returns:
        The list of created Job objects (committed).

    Raises:
        Exception: for an unknown job_type.
    """
    if job_type not in ['user', 'system', SPECIES_TAG]:
        # Fix: the original raised the raw template -- the placeholder was
        # never formatted with the offending job_type.
        raise Exception(
            'work type: "{}" does not exist. Use either system_task or user_task'
            .format(job_type))
    job_filter = [Node.active_child_count == 0, Node.visited_count > 1]
    if job_type == SPECIES_TAG:
        job_filter = [Node.species == SPECIES_TAG, Node.active_child_count < 3]
    # NOTE(review): Node.is_user == (job_type != 'user') selects nodes of
    # the *opposite* speaker, presumably because the job collects the reply
    # to the node -- confirm against the job-rendering code.
    nodes = db_session.query(Node) \
        .filter(
            Node.score > 0,
            Node.is_user == (job_type != 'user'),
            Node.active.is_(True),
            *job_filter
        )\
        .order_by(Node.score.desc()) \
        .all()
    created_jobs = []
    for node in nodes:
        # Last MAX_DIALOGUE_HISTORY ancestors (including the node itself).
        history_ids = node.path[-MAX_DIALOGUE_HISTORY:]
        history = db_session\
            .query(Node)\
            .filter(Node.id.in_(history_ids), Node.active.is_(True))\
            .options(joinedload(Node.utterances), joinedload(Node.node_utterances))\
            .order_by(Node.path_length.asc())\
            .all()
        history_length = len(history)
        # An inactive ancestor makes the history incomplete -- skip.
        if len(history_ids) != history_length:
            logger.warning(f'history_ids != history, {history_ids} != {history}')
            continue
        job_node_utterances = []
        for index, history_node in enumerate(history):
            # Pick one random eligible node utterance for this history step;
            # the last step gets stricter eligibility (is_last=True).
            pool_of_node_utterances = []
            for node_utterance in history_node.node_utterances:
                if _check_node_utterance_eligibility(node_utterance, index == history_length - 1, job_type):
                    pool_of_node_utterances.append(node_utterance)
            if pool_of_node_utterances:
                job_node_utterances.append(random.choice(pool_of_node_utterances))
        # Only create the job when every history step found an utterance.
        if len(history_ids) == len(job_node_utterances):
            job = Job(job_type=job_type, persona_sample=get_persona_sample())
            db_session.add(job)
            db_session.flush()
            for i, node_utterance in enumerate(job_node_utterances):
                db_session.add(JobNodeUtterance(job_id=job.id,
                                                node_utterance_id=node_utterance.id,
                                                position=i))
            created_jobs.append(job)
        if len(created_jobs) == amount:
            break
    db_session.commit()
    print(f'created {len(created_jobs)} jobs')
    return created_jobs
def find_duplicate_utterances():
    """Report utterance texts stored more than once and delete redundant
    copies when at most one duplicate is actually referenced by a node."""
    duplicate_texts = (db_session.query(Utterance.utterance_text)
                       .group_by(Utterance.utterance_text)
                       .having(func.count(Utterance.utterance_text) > 1)
                       .all())
    for (text,) in duplicate_texts:
        copies = db_session.query(Utterance).filter(
            Utterance.utterance_text == text).all()
        print(text)
        print([(copy.id, len(copy.node_utterances)) for copy in copies])
        referenced = 0
        keeper = copies[0]
        for copy in copies:
            if copy.node_utterances:
                keeper = copy
                referenced += 1
        # Pruning is only safe when at most one copy is referenced anywhere.
        if referenced <= 1:
            print('deleting..')
            for copy in copies:
                if copy.id != keeper.id:
                    print('removing..', copy.id)
                    db_session.delete(copy)
        print('-------------------')
    db_session.commit()
def get_synonym_objects(synonym_path):
    """Load an Utterance row for every synonym line in *synonym_path*,
    creating (and flushing) any that do not yet exist."""
    collected = []
    with open(synonym_path, "r") as handle:
        for line in handle.readlines():
            text = line.strip()
            existing = (db_session.query(Utterance)
                        .filter_by(utterance_text=text)
                        .first())
            if existing is None:
                existing = Utterance(utterance_text=text)
                db_session.add(existing)
                db_session.flush()
            collected.append(existing)
    return collected
def add_utterance_to_node(utterance_text, node, source):
    """Attach *utterance_text* (a string or an Utterance) to *node*.

    Looks up or creates the Utterance row for a string input, links it to
    the node, stamps the resulting NodeUtterance with *source* and returns
    it.  Only flushes -- committing is the caller's responsibility.
    """
    if isinstance(utterance_text, Utterance):
        utterance = utterance_text
    else:
        utterance = (db_session.query(Utterance)
                     .filter_by(utterance_text=utterance_text)
                     .first())
        if utterance is None:
            utterance = Utterance(utterance_text=utterance_text)
            db_session.add(utterance)
            db_session.flush()

    node.utterances.append(utterance)
    db_session.flush()
    # The association row was just created by the append above; fetch it so
    # we can record where this utterance came from.
    node_utterance = (db_session.query(NodeUtterance)
                      .filter_by(node_id=node.id)
                      .filter_by(utterance_id=utterance.id)
                      .first())
    node_utterance.source = source
    db_session.flush()
    return node_utterance
def read_utterances_from_new_db():
    """Build a feature-annotated utterance dict across all root-node trees.

    Initializes the module-level `features` dict and `fe` feature extractor
    as a side effect.  Returns (utterance_data, number of entries).  NOTE:
    this shadows an earlier function of the same name in the module.
    """
    roots = (
        db_session.query(Node)
        .options(joinedload(Node.children), joinedload(Node.utterances))
        .filter(Node.parent_id.is_(None))
        .all()
    )

    global features
    global fe
    features = gen_feature_dict(sp.TOPIC, sp.SENTIMENT)
    fe = feature_extractor.FeatureExtractor(features)

    utterance_data = {}
    for root in roots:
        utterance_data.update(count_utterances(root, True, False, None))
    return utterance_data, len(utterance_data)