def classify_root_nodes():
    """Interactively label unclassified root-level nodes as root nodes or not.

    Fetches active, parentless nodes that have no RootNode record yet
    (most-visited first), shows each node's utterances with their MLP scores,
    and asks the operator to accept (Y/enter), reject (n) or quit (q).
    Both decisions write one RootNode row per utterance; rejection also
    deactivates the node.  Commits per node so quitting loses nothing.
    """
    # Imported lazily so the heavy classifier model only loads when this tool runs.
    from fantom_util.models.rnc_mlp_model import RootNodeClassifierMLP
    nodes = (db_session.query(Node).outerjoin(
        RootNode, RootNode.node_id == Node.id).filter(
            Node.active.is_(True), Node.parent_id.is_(None),
            RootNode.id.is_(None)).order_by(Node.visited_count.desc()).all())
    rnc_mlp = RootNodeClassifierMLP()
    for node in nodes:
        print("+----------------------+")
        utterances = [x.utterance_text for x in node.utterances]
        score_results = rnc_mlp.predict_list(utterances)
        for utterance, score in zip(utterances, score_results):
            # Score entries are indexable (array-like); [0] is the class score.
            print(f"{utterance}: {score[0]}")
        print("\navg:", (sum(score_results) / len(score_results))[0], "\n")
        user_input = input("Root node? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input.lower() == "n":
            node.active = False
            for utterance in utterances:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=utterance,
                             is_root_node=False))
        elif user_input == "" or user_input.lower() == "y":
            for utterance in utterances:
                db_session.add(
                    RootNode(node_id=node.id,
                             utterance=utterance,
                             is_root_node=True))
        db_session.commit()
def set_parent(node_id, parent_node_id=None, commit=False):
    """Re-parent *node_id* under *parent_node_id* (root if None) and refresh
    the materialized paths of the node and, where applicable, its subtree.

    Raises a plain Exception when the node cannot be found.  Only commits
    when *commit* is True; otherwise the caller owns the transaction.
    """
    node = db_session.query(Node).get(node_id)
    if not node:
        raise Exception("Could not find node")
    if parent_node_id:
        parent_node = db_session.query(Node).get(parent_node_id)
    else:
        parent_node = None
    old_parent_id = None
    if node.parent:
        old_parent_id = node.parent.id
    node.parent = parent_node
    node.path = node.recalculate_path()

    def update_children(child_node):
        # Depth-first path refresh; iterate over a copy because the
        # relationship collection may change while paths are rewritten.
        db_session.add(child_node)
        child_node.path = child_node.recalculate_path()
        for child in child_node.children[:]:
            logger.debug("found child %s", child.id)
            logger.debug("parent has children %s", child_node.children)
            update_children(child)

    # NOTE(review): the subtree is only refreshed when the node previously had
    # a parent — a former root that gains a parent keeps its children's old
    # paths.  Looks intentional for the callers in this file, but confirm.
    if old_parent_id and node.children:
        update_children(node)
    if commit:
        db_session.commit()
        logger.info("committing!")
def set_incoherent(ext_job_id, external_worker_id, incoherent_node_utterance_id, with_audio, assignment_id, hit_id):
    """Record that a worker flagged a node utterance as incoherent.

    Links the (worker, job, node utterance) triple through an
    IncoherentNodeUtteranceWorkerJob row, appends an 'incoherent'
    NodeUtteranceStatus, and commits.
    """
    job = get_job(ext_job_id)
    worker = _create_or_get_worker(external_worker_id)
    incoherent_node_utterance = db_session.query(NodeUtterance).get(incoherent_node_utterance_id)
    incoherent_node_utterance_worker_job = IncoherentNodeUtteranceWorkerJob(
        node_utterance_id=incoherent_node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    )
    db_session.add(incoherent_node_utterance_worker_job)
    node_utterance_status = NodeUtteranceStatus(
        with_audio=with_audio,
        node_utterance_id=incoherent_node_utterance.id,
        status='incoherent'
    )
    db_session.add(node_utterance_status)
    # TODO: set negative scoring for worker and node_utterances
    db_session.commit()
def main(synonym_path):
    """Load synonym groups from *synonym_path* and merge them into the roots."""
    synonyms = get_synonym_objects(synonym_path)
    query = db_session.query(Node).options(
        joinedload(Node.children),
        joinedload(Node.utterances),
    )
    roots = query.filter(Node.parent_id.is_(None)).all()
    merge_synonyms(roots, synonyms)
    db_session.commit()
def check_for_worker_eligibilitiy_for_qualification():
    """Grant the '>20 jobs' MTurk qualification to workers who have earned it."""
    candidates = (db_session.query(Worker)
                  .filter(Worker.has_more_than_20_qualifaction.is_(False),
                          Worker.source == 'mturk')
                  .all())
    for candidate in candidates:
        if candidate.job_counts <= 20:
            continue
        mturk.qualify_worker_for_has_more_than_20_qualification(
            candidate.external_worker_id)
        candidate.has_more_than_20_qualifaction = True
    db_session.commit()
def incoherent_nodes():
    """Interactively review node utterances flagged as incoherent.

    Lists unhandled 'incoherent' statuses grouped by node utterance (most
    flagged first), prints the dialogue history leading up to each one, and
    asks whether to inactivate the node.  Accepting (Y/enter) inactivates the
    node's subtree; in either case the statuses are marked handled and the
    session is committed per item, so quitting (q) loses nothing.
    """
    node_utterance_ids = (db_session.query(
        NodeUtteranceStatus.node_utterance_id,
        func.count(NodeUtteranceStatus.node_utterance_id),
    ).filter(
        NodeUtteranceStatus.status == "incoherent",
        NodeUtteranceStatus.handled.is_(False),
    ).group_by(NodeUtteranceStatus.node_utterance_id).order_by(
        func.count(NodeUtteranceStatus.node_utterance_id).desc()).all())
    for node_utterance_id, count in node_utterance_ids:
        node_utterance = db_session.query(NodeUtterance).get(node_utterance_id)
        # Root-to-node history, ordered root-first via path_length.
        history = (db_session.query(Node).filter(
            Node.id.in_(node_utterance.node.path)).order_by(
                Node.path_length.asc()).all())
        print("+----------------------------------+")
        print("node utterance id:", node_utterance_id)
        print("node id:", node_utterance.node.id)
        print("count:", count)
        print("\n")
        for h in history[:-1]:
            print([x.utterance_text for x in h.utterances])
        print(node_utterance.utterance.utterance_text)
        print("\n")
        user_input = input("Inactivate? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input == "" or user_input.lower() == "y":
            inactivate_node(node_utterance.node.id)
        db_session.query(NodeUtteranceStatus).filter(
            NodeUtteranceStatus.node_utterance_id == node_utterance_id,
            NodeUtteranceStatus.status == "incoherent",
        ).update({"handled": True})
        db_session.commit()
def activate_node(node_id):
    """Activate a node together with every descendant in its subtree."""
    root = db_session.query(Node).get(node_id)
    root.active = True
    descendants = (db_session.query(Node)
                   .filter(Node._path.descendant_of(root._path))
                   .all())
    for descendant in descendants:
        descendant.active = True
    db_session.commit()
def mturk_job(ext_job_id):
    """Flask view: entry point for a job page hosted inside MTurk.

    Registers the worker (unless MTurk sent the NO_WORKER_ID placeholder),
    builds the external-submit URL from the turkSubmitTo query parameter,
    and renders the task.
    """
    external_worker_id = request.args.get("workerId")
    if external_worker_id != "NO_WORKER_ID":
        job_controller._create_or_get_worker(external_worker_id, source="mturk")
        db_session.commit()
    submit_url = "{}/mturk/externalSubmit".format(
        request.args.get("turkSubmitTo", ""))
    return show_task(submit_url, ext_job_id, external_worker_id, mturk=True)
def merge_by_score():
    """Interactively review node-merge candidates ordered by similarity score.

    Fetches PotentialNodeMerge pairs with no Merging decision yet (in either
    orientation), highest score first, and asks the operator to merge
    (Y/enter), reject (n) or quit (q).  On merge, the node with more children
    absorbs the other; nodes absorbed earlier in this run are skipped.
    """
    nodes = (db_session.query(
        PotentialNodeMerge.left_node_id,
        PotentialNodeMerge.right_node_id,
        PotentialNodeMerge.score,
        Merging,
    ).outerjoin(
        Merging,
        # Match a decision in either orientation of the pair.
        ((PotentialNodeMerge.left_node_id == Merging.left_node_id) &
         (PotentialNodeMerge.right_node_id == Merging.right_node_id) |
         (PotentialNodeMerge.left_node_id == Merging.right_node_id) &
         (PotentialNodeMerge.right_node_id == Merging.left_node_id)),
    ).filter(Merging.id.is_(None)).order_by(
        PotentialNodeMerge.score.desc()).all())
    used_ids = []            # pair keys already shown this run (both orders)
    merged_right_nodes = []  # nodes absorbed into another node this run
    for left_node_id, right_node_id, score, _ in nodes:
        if (f"{left_node_id}-{right_node_id}" not in used_ids
                and left_node_id not in merged_right_nodes
                and right_node_id not in merged_right_nodes):
            used_ids.append(f"{left_node_id}-{right_node_id}")
            used_ids.append(f"{right_node_id}-{left_node_id}")
            print("+------------------+")
            left_node = db_session.query(Node).get(left_node_id)
            right_node = db_session.query(Node).get(right_node_id)
            if (left_node.active and right_node.active and left_node.utterances
                    and right_node.utterances):
                print(left_node_id,
                      [x.utterance_text for x in left_node.utterances])
                print("---------------- VS ----------------")
                print(right_node_id,
                      [x.utterance_text for x in right_node.utterances])
                print("\nscore", score, "\n")
                user_input = input("Merge? Y/n/q ")
                if user_input.lower() == "q":
                    exit()
                elif user_input.lower() == "n":
                    merge_nodes(left_node_id, right_node_id, merged=False)
                    print("nope!")
                elif user_input == "" or user_input.lower() == "y":
                    # The node with more children keeps its identity.
                    if right_node.child_count > left_node.child_count:
                        print(right_node.id, "<-", left_node.id)
                        merge_nodes(right_node.id, left_node.id, merged=True)
                        merged_right_nodes.append(left_node.id)
                    else:
                        print(left_node.id, "<-", right_node.id)
                        merge_nodes(left_node.id, right_node.id, merged=True)
                        merged_right_nodes.append(right_node.id)
    db_session.commit()
def inactivate_node(node_id):
    """Deactivate a node plus its whole subtree and drop its pending merges."""
    root = db_session.query(Node).get(node_id)
    root.active = False
    descendants = (db_session.query(Node)
                   .filter(Node._path.descendant_of(root._path))
                   .all())
    for descendant in descendants:
        descendant.active = False
    # Pending merge suggestions referencing this node are now meaningless.
    db_session.query(PotentialNodeMerge).filter(
        (PotentialNodeMerge.left_node_id == node_id)
        | (PotentialNodeMerge.right_node_id == node_id)).delete()
    db_session.commit()
def create_jobs(job_type, amount=1):
    """Create up to *amount* crowd-sourcing jobs of the given *job_type*.

    Candidate nodes are picked by score: for 'user'/'system' jobs, nodes with
    no active children that were visited more than once; for SPECIES_TAG
    jobs, species-tagged nodes with fewer than three active children.  For
    each candidate the active dialogue history is loaded, one eligible node
    utterance is sampled per turn, and a Job plus its ordered
    JobNodeUtterance rows are stored.  Returns the created jobs (committed).

    Raises:
        Exception: if *job_type* is not one of 'user', 'system', SPECIES_TAG.
    """
    if job_type not in ['user', 'system', SPECIES_TAG]:
        # Bug fix: the message previously printed a literal "{}" because the
        # .format() call was missing.
        raise Exception(
            f'work type: "{job_type}" does not exist. Use either system_task or user_task')
    job_filter = [Node.active_child_count == 0, Node.visited_count > 1]
    if job_type == SPECIES_TAG:
        job_filter = [Node.species == SPECIES_TAG, Node.active_child_count < 3]
    nodes = db_session.query(Node) \
        .filter(
            Node.score > 0,
            # A 'user' job extends a system turn and vice versa.
            Node.is_user == (job_type != 'user'),
            Node.active.is_(True),
            *job_filter
        )\
        .order_by(Node.score.desc()) \
        .all()
    created_jobs = []
    for node in nodes:
        history_ids = node.path[-MAX_DIALOGUE_HISTORY:]
        history = db_session\
            .query(Node)\
            .filter(Node.id.in_(history_ids), Node.active.is_(True))\
            .options(joinedload(Node.utterances),
                     joinedload(Node.node_utterances))\
            .order_by(Node.path_length.asc())\
            .all()
        history_length = len(history)
        # An inactive ancestor leaves the history incomplete; skip this node.
        if len(history_ids) != history_length:
            logger.warning(f'history_ids != history, {history_ids} != {history}')
            continue
        job_node_utterances = []
        for index, history_node in enumerate(history):
            pool_of_node_utterances = []
            for node_utterance in history_node.node_utterances:
                if _check_node_utterance_eligibility(
                        node_utterance, index == history_length - 1, job_type):
                    pool_of_node_utterances.append(node_utterance)
            if pool_of_node_utterances:
                job_node_utterances.append(
                    random.choice(pool_of_node_utterances))
        # Only build the job if every turn produced an eligible utterance.
        if len(history_ids) == len(job_node_utterances):
            job = Job(job_type=job_type, persona_sample=get_persona_sample())
            db_session.add(job)
            db_session.flush()  # assigns job.id for the link rows below
            for i, node_utterance in enumerate(job_node_utterances):
                db_session.add(
                    JobNodeUtterance(job_id=job.id,
                                     node_utterance_id=node_utterance.id,
                                     position=i))
            created_jobs.append(job)
            if len(created_jobs) == amount:
                break
    db_session.commit()
    print(f'created {len(created_jobs)} jobs')
    return created_jobs
def split_nodes(node_id):
    """Interactively split one utterance (and optional children) off a node.

    Prompts for the node utterance to move onto a brand-new sibling node
    (created under the same parent), then for any children to re-parent onto
    that new node.  Pressing enter at the first prompt aborts with None.
    Commits at the end.
    """
    node = db_session.query(Node).get(node_id)
    print(
        "which utterance for this node would you like to split (currently only one at a time) (enter to quit)"
    )
    split_utterances = {}
    for i, node_utterance in enumerate(node.node_utterances):
        split_utterances[i] = node_utterance
        print(f"({i}) {node_utterance.utterance.utterance_text}")
    print("")
    print("----- CHILDREN -----")
    for child in node.children:
        print("*", ", ".join([x.utterance_text for x in child.utterances]))
    print("-----")
    print("")
    utterance_to_split = input(">")
    if not utterance_to_split:
        return None
    split_node_utterance = split_utterances[int(utterance_to_split)]
    parent = node.parent
    # create_new_node accepts a Node instance in parent_id (see its type check).
    new_node = create_new_node([], parent_id=parent, source="")
    split_node_utterance.node = new_node
    db_session.add(split_node_utterance)
    db_session.add(node)
    print("new node with utterance",
          [x.utterance_text for x in new_node.utterances])
    print(
        "Which children do you want to bring over to the new node (press enter for none, use comma for multiple)"
    )
    move_kids = {}
    for i, child in enumerate(node.children):
        move_kids[i] = child
        kids = ", ".join([x.utterance_text for x in child.utterances])
        print(f"({i}) {kids}")
    print("")
    kids_to_move = input(">").replace(" ", "")
    if kids_to_move:
        splited_kids = [int(x) for x in kids_to_move.split(",")]
        for kid in splited_kids:
            move_kids[kid].parent = new_node
            db_session.add(move_kids[kid])
            print("adding", move_kids[kid].id, "to", new_node.id,
                  move_kids[kid].parent.id)
    print("done")
    # db_session.rollback()
    db_session.commit()
def finish_job(ext_job_id, external_worker_id, answer, corrections, extra_questions, with_audio, used_text_input, assignment_id, hit_id):
    """Persist a worker's completed job: answer, corrections and judgements.

    Creates a new child node under the job's last node containing the typed
    *answer*, links it to the worker/job, stores text *corrections* as
    'correction' utterances plus a 'corrected' status on the original, and
    records suitable/equivalent/needs_correction judgements for the
    *extra_questions*.  Commits once at the end.

    NOTE(review): assumes the job has at least one node utterance — with an
    empty history, last_node is None and last_node.id raises; confirm that
    callers guarantee this.
    """
    job = get_job(ext_job_id)
    nodes = [x.node for x in job.node_utterances]
    last_node = nodes[-1] if nodes else None
    worker = _create_or_get_worker(external_worker_id)
    node = create_new_node([answer], parent_id=last_node.id, source='typed')
    node_utterance = node.node_utterances[0]
    node_utterance.with_audio = with_audio
    node_utterance.used_text_input = used_text_input
    node_utterance_worker_job = NodeUtteranceWorkerJob(
        node_utterance_id=node_utterance.id,
        worker_id=worker.id,
        job_id=job.id,
        assignment_id=assignment_id,
        hit_id=hit_id
    )
    db_session.add(node_utterance_worker_job)
    for old_node_utterance_id, corrected_text in corrections.items():
        old_node_utterance = db_session.query(NodeUtterance).get(old_node_utterance_id)
        add_utterance_to_node(corrected_text, old_node_utterance.node, 'correction')
        node_utterance_status = NodeUtteranceStatus(
            node_utterance_id=old_node_utterance.id,
            status='corrected'
        )
        db_session.add(node_utterance_status)
    for extra_question in extra_questions:
        if extra_question['type'] != 'api':
            extra_node_utterance = db_session.query(NodeUtterance).get(extra_question['id'])
            for status in ['suitable', 'equivalent', 'needs_correction']:
                if extra_question[status]:
                    db_session.add(NodeUtteranceStatus(
                        node_utterance_id=extra_node_utterance.id,
                        referenced_node_utterance_id=node_utterance.id,
                        status=status
                    ))
        else:
            # API-generated questions are currently ignored.
            # extra_node_utterance = add_utterance_to_node(
            #     extra_question['text'], node, extra_question['id']
            # )
            pass
    # TODO: set positive scoring for worker and node_utterances
    db_session.commit()
def linked_nodes(linked_to_node_id, linked_from_node_id):
    """Create a LinkedNodes edge between two existing nodes (idempotent).

    No-ops when either node does not exist or when the link is already
    present; otherwise inserts the link and commits.
    """
    linked_to_node = db_session.query(Node).get(linked_to_node_id)
    linked_from_node = db_session.query(Node).get(linked_from_node_id)
    # Bug fix: the existence check used to come *after* dereferencing
    # linked_to_node.id / linked_from_node.id, so a missing node raised
    # AttributeError instead of being skipped.
    if not (linked_to_node and linked_from_node):
        return
    link = (db_session.query(LinkedNodes).filter(
        LinkedNodes.linked_to_node_id == linked_to_node.id,
        LinkedNodes.linked_from_node_id == linked_from_node.id,
    ).first())
    if link:
        return
    db_session.add(
        LinkedNodes(
            linked_to_node_id=linked_to_node_id,
            linked_from_node_id=linked_from_node_id,
        ))
    db_session.commit()
def correct_spelling_submit() -> str:
    """Flask view: apply a spelling correction submitted from the admin UI.

    Updates the utterance text, marks it spellchecked, and removes the
    matching entry from the new_corrections_v2.json work queue.  Redirects
    back to the issue list when the 'corections' [sic] form flag is present.
    """
    utterance_id = int(request.form["utterance_id"])
    new_spelling = request.form["new_spelling"]
    corections = request.form.get("corections")  # [sic] actual form field name
    utterance = db_session.query(Utterance).get(utterance_id)
    utterance.utterance_text = new_spelling
    utterance.is_spellchecked = True
    db_session.commit()
    # Drop this utterance's pending entry from the on-disk correction queue;
    # entries are lists whose first element is the utterance id.
    with open("new_corrections_v2.json", "r") as f:
        new_corrections = json.dumps(
            [x for x in json.loads(f.read()) if x[0] != utterance_id])
    with open("new_corrections_v2.json", "w") as f:
        f.write(new_corrections)
    if corections:
        return redirect(url_for("admin.fix_spelling_issues"))
    return "ok, done! Please reload the page to see your spelling fix"
def update_amazon_anonymous():
    """Sync Amazon's anonymous-utterance lists from S3 into the database.

    Downloads each list file to a .tmp path, collects the lowercased lines
    (dropping excluded utterances), flags matching Utterance rows as
    amazon_anonymous, inserts brand-new ones, commits, and finally renames
    the .tmp files into place to mark them processed.
    """
    if not os.path.exists(ANONYMOUS_UTTERANCE_DIR):
        os.makedirs(ANONYMOUS_UTTERANCE_DIR)
    files_to_process = []
    for file in list_files_in_s3_bucket_dir(ALEXA_PRIZE_BUCKET_NAME,
                                            ANONYMOUS_UTTERANCE_DIR_ON_S3):
        file_name = file.key.rsplit("/", 1)[1]
        file_from_s3(
            ALEXA_PRIZE_BUCKET_NAME,
            file.key,
            f"{ANONYMOUS_UTTERANCE_DIR}/{file_name}.tmp",
        )
        files_to_process.append(f"{ANONYMOUS_UTTERANCE_DIR}/{file_name}.tmp")
    anonymous_utterances = []
    for file_path in files_to_process:
        with open(file_path, "r") as f:
            for line in f.readlines():
                if re.search(EXCLUDED_UTTERANCES, normalize_text(line)):
                    logger.info("removed utterance: %s", line.strip())
                    continue
                anonymous_utterances.append(line.strip().lower())
    anonymous_utterances = set(anonymous_utterances)
    logger.info("anonymous_utterances %d", len(anonymous_utterances))
    utterances = db_session.query(Utterance).all()
    for utterance in utterances:
        if utterance.utterance_text in anonymous_utterances:
            if not utterance.amazon_anonymous:
                logger.info("setting anonymous: %s", utterance.utterance_text)
                utterance.amazon_anonymous = True
            # Remove known texts so only genuinely new ones remain below.
            anonymous_utterances.remove(utterance.utterance_text)
    logger.info("anonymous_utterances left %d", len(anonymous_utterances))
    logger.info("to be added: %s", anonymous_utterances)
    for new_utterance in anonymous_utterances:
        db_session.add(
            Utterance(utterance_text=new_utterance, amazon_anonymous=True))
    db_session.commit()
    # Strip the ".tmp" suffix only after a successful commit.
    for file_path in files_to_process:
        os.rename(file_path, file_path[:-4])
def find_duplicate_utterances():
    """Deduplicate Utterance rows that share identical text.

    For each duplicated text, keeps one candidate (preferring an utterance
    with node_utterances attached) and deletes the rest — but only when at
    most one of the duplicates is actually referenced; groups where several
    duplicates are in use are just printed and left untouched.
    """
    duplicate_utterances = db_session.query(Utterance.utterance_text).group_by(Utterance.utterance_text).having(func.count(Utterance.utterance_text) > 1).all()
    for utterance_text, in duplicate_utterances:
        utterances = db_session.query(Utterance).filter(Utterance.utterance_text == utterance_text).all()
        print(utterance_text)
        print([(x.id, len(x.node_utterances)) for x in utterances])
        nu_count = 0
        candidate_utterance = utterances[0]
        for utterance in utterances:
            if utterance.node_utterances:
                candidate_utterance = utterance
                nu_count += 1
        # Safe to delete only when zero or one duplicates are referenced.
        if nu_count == 0 or nu_count == 1:
            print('deleting..')
            for utterance in utterances:
                if utterance.id != candidate_utterance.id:
                    print('removing..', utterance.id)
                    db_session.delete(utterance)
        print('-------------------')
    db_session.commit()
def delete_node(node_id):
    """Recursively delete a node, its descendants, and all dependent rows.

    Children are deleted first (depth-first).  For every node utterance the
    referencing job links and statuses are removed before the node utterance
    itself, keeping foreign keys satisfied.  Commits as it unwinds.
    """
    node = db_session.query(Node).get(node_id)
    for child in node.children:
        delete_node(child.id)
    for node_utterance in node.node_utterances:
        db_session.query(JobNodeUtterance).filter(
            JobNodeUtterance.node_utterance_id == node_utterance.id).delete()
        db_session.query(NodeUtteranceStatus).filter(
            NodeUtteranceStatus.node_utterance_id == node_utterance.id).delete()
        db_session.query(NodeUtteranceStatus).filter(
            NodeUtteranceStatus.referenced_node_utterance_id == node_utterance.id).delete()
        db_session.query(NodeUtteranceWorkerJob).filter(
            NodeUtteranceWorkerJob.node_utterance_id == node_utterance.id).delete()
        db_session.flush()
        db_session.delete(node_utterance)
    db_session.commit()
    db_session.delete(node)
    db_session.commit()
def submit(external_worker_id, task_id):
    """Record completion of training *task_id* for a worker.

    Creates the Worker and Training rows on first contact.  The task is only
    appended when it is the lowest-numbered task still outstanding (tasks
    must be completed in order).  Returns True when the worker has finished
    every task in TASK_IDS, else False.

    Raises:
        KeyError: if *task_id* is not in TASK_IDS.
    """
    if task_id not in TASK_IDS:
        raise KeyError('Task id not recognized')
    worker = db_session.query(Worker) \
        .filter_by(external_worker_id=external_worker_id) \
        .first()
    if not worker:
        worker = Worker(external_worker_id=external_worker_id)
        db_session.add(worker)
        db_session.commit()
    training = db_session.query(Training).filter_by(worker=worker).first()
    if not training:
        training = Training(worker=worker)
        db_session.add(training)
        db_session.commit()
    remaining = set(TASK_IDS) - set(training.tasks)
    # Bug fix: min() over an empty set raised ValueError when the worker had
    # already completed every task; now the guard short-circuits.
    if remaining and task_id == min(remaining):
        training.tasks = training.tasks + [task_id]
        db_session.commit()
    new_set = set(TASK_IDS) - set(training.tasks)
    if not new_set:
        return True
    else:
        return False
def create_new_node(utterances, source="manual", parent_id=None, commit=False, species=None):
    """Create a Node, compute its materialized path, and attach utterances.

    Args:
        utterances: a string or list of strings to attach to the new node.
        source: provenance tag passed through to add_utterance_to_node.
        parent_id: a Node instance, a node id, or None for a root node.
        commit: commit the session when True; otherwise the caller owns it.
        species: optional species tag for the new node.

    Returns:
        The newly created (flushed, possibly uncommitted) Node.
    """
    # Idiom fix: use isinstance instead of type() == comparisons.
    if isinstance(parent_id, Node):
        parent = parent_id
    elif parent_id is not None:
        parent = db_session.query(Node).get(parent_id)
    else:
        parent = None
    node = Node(parent=parent, species=species)
    db_session.add(node)
    db_session.flush()  # assigns node.id so the path can include it
    node.path = (parent.path if parent else []) + [node.id]
    if isinstance(utterances, str):
        utterances = [utterances]
    for utterance in utterances:
        add_utterance_to_node(utterance, node, source)
    if commit:
        db_session.commit()
    return node
def old_anonymize():
    """Interactively label the most frequent still-unlabeled user utterances.

    Repeatedly fetches the most common Conversation user utterance that has
    no AnonymousUtterance row yet and asks the operator whether it is
    appropriate (Y/enter), not (n), or to quit (q).  Commits per answer.
    Stops cleanly once every utterance has been labeled.
    """
    while True:
        row = (db_session.query(
            Conversation.user_utterance,
            func.count(Conversation.user_utterance)).group_by(
                Conversation.user_utterance).order_by(
                    func.count(Conversation.user_utterance).desc()).filter(
                        ~exists().where(Conversation.user_utterance ==
                                        AnonymousUtterance.text)).first())
        # Bug fix: first() returns None when everything is labeled; the old
        # tuple unpacking crashed with TypeError instead of stopping.
        if row is None:
            break
        utterance, count = row
        print(count, utterance)
        user_input = input("Appropriate? Y/n/q ")
        if user_input.lower() == "q":
            exit()
        elif user_input.lower() == "n":
            print("-")
            db_session.add(
                AnonymousUtterance(text=utterance, appropriate=False))
        elif user_input == "" or user_input.lower() == "y":
            print("+")
            db_session.add(AnonymousUtterance(text=utterance, appropriate=True))
        db_session.commit()
def fix_utterances_starting_with_alexa():
    """Re-point node utterances whose text begins with a wake word.

    For every in-use utterance starting with 'alexa ', 'amazon ', 'echo ' or
    'computer ', looks up an existing utterance matching the normalized
    (wake-word-free) text and moves the node_utterances over to it;
    utterances without a normalized counterpart are only reported.
    Commits once at the end.
    """
    utterances = db_session.query(Utterance)\
        .filter(
            Utterance.node_utterances.any(),
            or_(Utterance.utterance_text.like('alexa %'),
                Utterance.utterance_text.like('amazon %'),
                Utterance.utterance_text.like('echo %'),
                Utterance.utterance_text.like('computer %')
                )
        ).all()
    for utterance in utterances:
        nul = len(utterance.node_utterances)
        alternative_utterance = db_session.query(Utterance).filter(
            Utterance.utterance_text == normalize_text(utterance.utterance_text)).first()
        if alternative_utterance:
            for node_utterance in utterance.node_utterances:
                print('==', node_utterance.id)
                node_utterance.utterance = alternative_utterance
            print(utterance.utterance_text, '->',
                  alternative_utterance.utterance_text, f'({nul})',
                  utterance.id, alternative_utterance.id)
        else:
            print(normalize_text(utterance.utterance_text),
                  utterance.utterance_text, 'HAS NO', f'({nul})')
    db_session.commit()
def exact_match():
    """Merge sibling nodes whose utterances match exactly (case-insensitive).

    Groups active nodes by parent, compares every sibling pair that has no
    Merging record yet, cleans up empty-string utterances (and nodes left
    without any utterances) along the way, and merges pairs with an exact
    case-insensitive utterance match.  Commits once at the end (merge_nodes
    also commits internally).
    """
    merges = db_session.query(Merging).all()
    used_nodes = []
    for merge in merges:
        used_nodes.append(f"{merge.left_node_id}--{merge.right_node_id}")
    nodes = (db_session.query(Node).filter(Node.active == True).order_by(
        Node.parent_id.desc()).all())
    grouped_nodes = defaultdict(list)
    for node in nodes:
        grouped_nodes[node.parent_id].append(node)
    bar = progressbar.ProgressBar()
    # Bug fix: the loop variable used to shadow `grouped_nodes` itself.
    for _parent_id, siblings in bar(grouped_nodes.items()):
        for i, left_node in enumerate(siblings):
            for j, right_node in enumerate(siblings):
                if (i != j
                        and f"{left_node.id}--{right_node.id}" not in used_nodes
                        and f"{right_node.id}--{left_node.id}" not in used_nodes):
                    used_nodes.append(f"{left_node.id}--{right_node.id}")
                    for left_utterance in left_node.utterances:
                        for right_utterance in right_node.utterances:
                            do_continue = True
                            if (left_utterance.utterance_text == ""
                                    or left_utterance.utterance_text == " "):
                                print(
                                    "removing empty utterance",
                                    left_utterance.utterance_text,
                                    left_utterance.id,
                                )
                                if left_node.children:
                                    raise Exception(
                                        "empty string has children. WAT?! :S")
                                # Bug fix: scoped_session.remove() takes no
                                # argument and disposes the session; the
                                # intended API is Session.delete(), as used
                                # elsewhere in this file.
                                db_session.delete(left_utterance)
                                db_session.flush()
                                if not left_node.utterances:
                                    print("removing node", left_node.id)
                                    db_session.delete(left_node)
                                    do_continue = False
                            if (right_utterance.utterance_text == ""
                                    or right_utterance.utterance_text == " "):
                                print(
                                    "removing empty utterance",
                                    right_utterance.utterance_text,
                                    right_utterance.id,
                                )
                                if right_node.children:
                                    raise Exception(
                                        "empty string has children. WAT?! :S")
                                db_session.delete(right_utterance)
                                db_session.flush()
                                if not right_node.utterances:
                                    print("removing node", right_node.id)
                                    db_session.delete(right_node)
                                    do_continue = False
                            if (do_continue
                                    and left_utterance.utterance_text.lower()
                                    == right_utterance.utterance_text.lower()):
                                merge_nodes(left_node.id, right_node.id, True)
    db_session.commit()
def fix_visited_count():
    """Recompute visited counts across the whole tree, starting at the roots."""
    root_query = db_session.query(Node).options(
        joinedload(Node.children), joinedload(Node.utterances))
    roots = root_query.filter(Node.parent_id.is_(None)).all()
    _check_kids(roots, '', 0)
    db_session.commit()
def fix_root_visited_count():
    """Set each root node's visited_count to the sum over its children (min 1)."""
    roots = (db_session.query(Node)
             .options(joinedload(Node.children))
             .filter(Node.parent_id.is_(None))
             .all())
    for root in tqdm(roots):
        total = sum([child.visited_count for child in root.children])
        root.visited_count = total or 1
    db_session.commit()
def process_conversation(conversation_id, root_node_utterances, automate):
    """Replay one logged conversation against the dialogue tree.

    Walks the conversation's turns in timestamp order, matching each user
    utterance to a known node (via *root_node_utterances* at the start of a
    chunk, then the current node's children).  Matched nodes get their
    visited_count bumped (once per processing run, guarded by a processing
    timestamp); unmatched utterances either create a new child node or, at
    root level, are appended to a candidate file for later review.  With
    *automate* the changes are committed directly, otherwise the operator is
    asked first.  Returns None early when every turn was already processed.

    NOTE(review): `file_name` is read from enclosing/module scope — confirm
    it is defined wherever this runs.
    """
    conversations = (
        db_session.query(Conversation)
        .filter(Conversation.conversation_id == conversation_id)
        .order_by(Conversation.interaction_timestamp)
        .all()
    )
    logger.debug("processing conversation_id: %s", conversation_id)
    conversation_chunks = []
    processed = 0
    for conversation in conversations:
        if conversation.intent == "LaunchRequestIntent":
            processed += 1
            continue
        # A turn matching a known root utterance starts a new chunk.
        if root_node_utterances.get(normalize_text(conversation.user_utterance)):
            conversation_chunks.append([])
        if not conversation_chunks:
            conversation_chunks = [[]]
        # NOTE(review): this appends the turn to *every* chunk so far, not
        # just the newest — verify this is the intended chunking behavior.
        for conversation_chunk in conversation_chunks:
            conversation_chunk.append(conversation)
        if conversation.processed:
            processed += 1
    if processed >= len(conversations):
        logger.debug("skipping due to all being processed")
        return None
    logger.debug([[y.user_utterance for y in x] for x in conversation_chunks])
    # One timestamp per run lets re-visited turns be recognized as "ours".
    processed_time = datetime.datetime.now()
    for conversations in conversation_chunks:
        parent = None
        child_nodes = root_node_utterances
        for idx, conversation in enumerate(conversations):
            text = normalize_text(conversation.user_utterance)
            if conversation.intent == "LaunchRequestIntent":
                logger.debug(
                    "skipping: LaunchRequestIntent: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue
            if not text:
                logger.debug(
                    "skipping: user utterance is empty: %d %s",
                    idx,
                    conversation.user_utterance,
                )
                continue
            if re.search(EXCLUDED_UTTERANCES, text) or re.search(
                EXCLUDED_UTTERANCES, conversation.user_utterance
            ):
                logger.debug(
                    "breaking: Detected excluded utterance %s -> %s",
                    conversation.user_utterance,
                    text,
                )
                break
            logger.debug("- %d %s %s", idx, conversation.user_utterance, text)
            if parent:
                child_nodes = _get_utterance_lookup_table(parent)
            # Only dump the candidate set when it is small enough to read.
            show_kids = str(child_nodes.keys()) if len(child_nodes.keys()) < 4 else ""
            logger.debug(
                f"-- Searching among {len(child_nodes.keys())} nodes. %s",
                show_kids,
            )
            node_utterance = child_nodes.get(text)
            if node_utterance:
                node = node_utterance.node
                if not node.active:
                    logger.debug(f"This node ({node.id}) has been marked as inactive.")
                    break
                logger.debug(
                    "--- Found existing node node_id: %s, node_utterance_id: %s",
                    node.id,
                    node_utterance.id,
                )
                # Count the visit unless an earlier run already processed it.
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    node.visited_count += 1
                    logger.debug(
                        "---- Increase count for node %s (%d)",
                        node.id,
                        node.visited_count,
                    )
                    db_session.add(node)
                    conversation.processed = processed_time
                    db_session.add(conversation)
                # Find which child matches the system's reply, to descend into.
                parent = None
                for child in node.children:
                    for utterance in child.utterances:
                        if utterance.id == conversation.graphsearch_matched_utterance_id or normalize_text(
                            utterance.utterance_text
                        ) == normalize_text(
                            conversation.system_utterance
                        ):
                            logger.debug(
                                "----- Found system response: %s",
                                utterance.utterance_text,
                            )
                            if (
                                not conversation.processed
                                or conversation.processed == processed_time
                            ):
                                logger.debug(
                                    "----- Increase count for child node %s (%d)",
                                    child.id,
                                    child.visited_count,
                                )
                                child.visited_count += 1
                                db_session.add(child)
                            parent = child
                    if parent:
                        break
                if not parent:
                    logger.debug(
                        "---- No system response found: %s",
                        conversation.system_utterance,
                    )
                    break
            else:
                logger.debug("--- No existing node found")
                if (
                    not conversation.processed
                    or conversation.processed == processed_time
                ):
                    if parent:
                        logger.debug(
                            "--- Adding new node %s", conversation.user_utterance
                        )
                        node = create_new_node(
                            [conversation.user_utterance],
                            parent_id=parent.id if parent else None,
                            source="automatic_population",
                        )
                    else:
                        # Root-level miss: stash the utterance in a file of
                        # potential new root nodes instead of creating a node.
                        # root_node_utterances[text] = node.node_utterances[0]
                        try:
                            logger.debug(
                                "--- New potential root node: "
                                + conversation.user_utterance
                            )
                            with open(file_name, "a") as storage_file:
                                logger.debug(
                                    "---XXXXXXXXXXXXXXXXXXXXXXX: Adding first utterance to "
                                    + file_name
                                )
                                storage_file.write(conversation.user_utterance + "\n")
                        except:
                            logger.debug("--- File not found")
                        conversation.processed = processed_time
                        break
    if not automate:
        response = input("Populate? N/y\n")
        if response.lower() == "y":
            db_session.commit()
            logger.debug("committing!")
        else:
            db_session.rollback()
    else:
        db_session.commit()
file_from_s3(BUCKET_NAME, file.key, f'{PATH_TO_UTTERANCES}/{file_name}.tmp') files_to_process.append(f'{PATH_TO_UTTERANCES}/{file_name}.tmp') anonymous_utterances = [] for file_path in files_to_process: with open(file_path, 'r') as f: for line in f.readlines(): if re.search(EXCLUDED_UTTERANCES, normalize_text(line)): print('removed utterance', line.strip()) continue anonymous_utterances.append(line.strip().lower()) anonymous_utterances = set(anonymous_utterances) print('-----', len(anonymous_utterances)) utterances = db_session.query(Utterance).all() for utterance in utterances: if utterance.utterance_text in anonymous_utterances: if not utterance.amazon_anonymous: print('setting anonymous:', utterance.utterance_text) utterance.amazon_anonymous = True anonymous_utterances.remove(utterance.utterance_text) print('-----', len(anonymous_utterances), anonymous_utterances) for new_utterance in anonymous_utterances: db_session.add( Utterance(utterance_text=new_utterance, amazon_anonymous=True)) db_session.commit() for file_path in files_to_process: os.rename(file_path, file_path[:-4])
def merge_nodes(left_node_id, right_node_id, merged=True):
    """Merge *right_node_id* into *left_node_id*, or just record a non-merge.

    With merged=True: moves the right node's node_utterances across (tagging
    each with a 'merged' status), re-parents its children onto the left node,
    adds its visited_count, inactivates the right node's subtree, and records
    the decision as a Merging row.  With merged=False only the decision row
    is stored.  Either way the pair's PotentialNodeMerge suggestions are
    deleted.  No-ops if this pair was already recorded in either order.
    """
    if merged:
        left_node = db_session.query(Node).get(left_node_id)
        right_node = db_session.query(Node).get(right_node_id)
        merge_1 = (db_session.query(Merging).filter(
            Merging.left_node_id == left_node.id,
            Merging.right_node_id == right_node.id,
        ).first())
        merge_2 = (db_session.query(Merging).filter(
            Merging.left_node_id == right_node.id,
            Merging.right_node_id == left_node.id,
        ).first())
        if merge_1 or merge_2:  # or left_node.id == right_node.id:
            return
        # Iterate over a copy: reassigning .node mutates the collection.
        for node_utterance in right_node.node_utterances[:]:
            logger.debug(
                "node has utterance %s %s",
                node_utterance.id,
                node_utterance.utterance.utterance_text,
            )
            node_utterance.node = left_node
            node_utterance.node_id = left_node_id
            db_session.add(node_utterance)
            db_session.add(
                NodeUtteranceStatus(node_utterance_id=node_utterance.id,
                                    status="merged"))
        logger.debug("(before) left node children: %s", left_node.children)
        logger.debug("(before) right node children: %s", right_node.children)
        # Copy again: set_parent mutates right_node.children while we loop.
        for child in right_node.children[:]:
            logger.debug("found child %s", child.id)
            set_parent(child.id, left_node.id)
        logger.debug("left node children: %s", left_node.children)
        logger.debug("right node children: %s", right_node.children)
        left_node.visited_count += right_node.visited_count
        db_session.commit()
        # Capture the id before inactivation commits and expires the instance.
        the_right_node_id = right_node.id
        inactivate_node(right_node.id)
        db_session.commit()
        db_session.add(
            Merging(
                left_node_id=left_node_id,
                right_node_id=the_right_node_id,
                merged=merged,
            ))
    else:
        db_session.add(
            Merging(left_node_id=left_node_id,
                    right_node_id=right_node_id,
                    merged=merged))
    # The merge decision supersedes any remaining suggestions for this node.
    db_session.query(PotentialNodeMerge).filter(
        (PotentialNodeMerge.left_node_id == right_node_id)
        | (PotentialNodeMerge.right_node_id == right_node_id)).delete()
    db_session.commit()