def anagram(anag: str):
    """
    For all elements in all dictionaries, find words that contain any anagram
    of `anag` as a substring.

    For "non-consecutive anagrams" you just want a word bank: see wordbank.py.
    """
    found = []
    try:
        all_elems = utils.get_all_dicts()
        perms = perm_strs(anag)
        num_perms = len(perms)
        utils.print_progress_bar(0, num_perms)
        for i, perm in enumerate(perms):
            found.extend([elem for elem in all_elems if perm in elem])
            utils.print_progress_bar(i + 1, num_perms)
    finally:
        print('found {} elems after containing an anagram'.format(len(found)))
        if found:
            utils.list_to_file(fname_anagram(anag), found)
            if len(found) < 100:
                for elem in found:
                    print('\t-', elem)
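# Hedged sketch, not from the original repo: `perm_strs` is used above but not
# shown here. Judging by its usage, it likely returns all distinct strings
# formed by permuting the letters of `anag`; a minimal version could be:
import itertools

def perm_strs(anag: str):
    # A set removes duplicates that arise when `anag` has repeated letters.
    # Note the count grows factorially with len(anag).
    return sorted({''.join(p) for p in itertools.permutations(anag)})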
def map_genomes_on_tree(self):
    my_log = logging.getLogger('train:map_genomes')
    self.organism_tree_map = {}
    mappings = set()
    for organism in self.organisms:
        if organism in self.organism_tree_map:
            continue  # already exists
        mapped = self.get_mapped_organism(organism)
        if mapped is None:
            self.organisms_invalid.add(organism)
            continue
        if str(mapped) == "1":
            self.organisms_invalid.add(organism)
        self.organism_tree_map[organism] = mapped
        if mapped not in self.tree_organism_map:
            self.tree_organism_map[mapped] = set()
        self.tree_organism_map[mapped].add(organism)
        mappings.add(mapped)
    self.n_frags_per_node = int(math.ceil(self.config.settings["number_examples"] / len(mappings)))
    if len(self.organisms_invalid) >= 1:
        my_log.info("these {nr} organisms will not be processed due to lack of mapping:\n{orgs}"
                    .format(nr=str(len(self.organisms_invalid)),
                            orgs="\t".join(self.organisms_invalid)))
        utils.list_to_file(self.organisms_invalid,
                           os.path.join(self.config.settings["project_dir"], "organisms_not_used.txt"))
        if self.stat is not None:
            self.stat.add_written_file(os.path.join(self.config.settings["project_dir"], "organisms_not_used.txt"))
            self.stat.succesfully_written(os.path.join(self.config.settings["project_dir"], "organisms_not_used.txt"))
def prediction_set(students, student_fields, course_fields, course_list, target, outpath, exclusive):
    local_students = []
    for student in students:
        if "progress" in student.status and "status" in target:
            continue
        local_students.append(student)
    df = dp.create_predictor_data_frame(local_students, student_fields, course_fields,
                                        course_list, target, exclusive)
    outputs = dp.classify_target(df, target, course_fields, 5)
    pred_output = outputs[0]
    value_output = outputs[1]
    pred_output[0].extend([" "])
    pred_output[0].extend(student_fields)
    pred_output[1].extend([" "])
    pred_output[1].extend(course_fields)
    pred_output[2].extend([" "])
    pred_output[2].extend(course_list)
    suffix = "_inclusive.csv"
    if exclusive:
        suffix = "_exclusive.csv"
    utils.list_to_file(outpath + "_" + target + suffix, pred_output)
    utils.list_to_file(outpath + "_" + target + "_feat_weight" + suffix, value_output)
    return
def transform(fromstr: str, tostr: str):
    """
    For all elements in all dictionaries, find words that are still valid when
    the transformation s/fromstr/tostr is applied.
    """
    if tostr == '_':
        tostr = ''
    all_elems = utils.get_all_dicts()
    candidates = [
        elem for elem in all_elems if fromstr in elem and elem != fromstr
    ]
    print('found {} candidates containing substring: {}'.format(
        len(candidates), fromstr))
    valid = []
    for cand in candidates:
        transformed = cand.replace(fromstr, tostr)
        if transformed in all_elems:
            valid.append('{} -> {}'.format(cand, transformed))
    print('found {} valid elems after transformation'.format(len(valid)))
    if valid:
        utils.list_to_file(fname_transformed(fromstr, tostr), valid)
def wikisort_file(file: str):
    _, names = utils.file_to_list(file)
    scores = {}
    couldnt_find = []
    utils.print_progress_bar(0, len(names))
    for i, name in enumerate(names):
        try:
            scores[name] = views_per_month(name)
        except Exception:
            # should probably keep track of the exceptions
            # (so we can tell if it's rate limiting etc.)
            couldnt_find.append(name)
        finally:
            utils.print_progress_bar(i + 1, len(names))
    print()
    print('---FAILED TO FIND---')
    print(couldnt_find)
    print('------')
    print()
    sort_by_views = [
        '{}\t{}'.format(k, v)
        for k, v in sorted(scores.items(), key=lambda x: x[1], reverse=True)
    ]
    utils.list_to_file(fname_ranked(file), sort_by_views, do_dedupe=False)
def gen_train_val():
    mix_list = [
        x for x in os.listdir(config.backing_dir)
        if x.endswith('.hdf5') and x.startswith('med')
    ]
    train_list = mix_list[:int(len(mix_list) * config.split)]
    val_list = mix_list[int(len(mix_list) * config.split):]
    utils.list_to_file(val_list, config.log_dir + 'val_files.txt')
    utils.list_to_file(train_list, config.log_dir + 'train_files.txt')
def to_output(parsed_play, output_path, output_path_base):
    play_lines, graph = parsed_play
    a = [str(x) + "\n" for x in play_lines]
    print("writing to", output_path)
    list_to_file(a, output_path_base + '.out')
    dot_graph = nx.nx_agraph.to_agraph(graph)
    dot_graph.write(output_path_base + ".dot")
    prog = ['dot', 'circo']
    for p in prog:
        dot_graph.layout(p)
        dot_graph.draw(output_path_base + "_" + p + ".png")
def dedupe_from_file(file: str):
    """
    Read in the file $FILE as newline-separated list, dedupe list and write
    to $FILE.deduped
    """
    frontmatter, elems = utils.file_to_list(file, do_dedupe=False)
    print('Elems before dedupe: {}'.format(len(elems)))
    deduped = utils.dedupe(elems, verbose=True)
    print('Elems after dedupe: {}'.format(len(deduped)))
    # write the deduped list (not the original elems) to $FILE.deduped
    utils.list_to_file(fname_deduped(file), frontmatter + deduped)
def gen_train_val():
    voc_list = [
        x for x in os.listdir(config.voice_dir)
        if x.endswith('.hdf5') and x.startswith('yam')
    ]
    train_list = voc_list[:int(len(voc_list) * 0.9)]
    val_list = voc_list[int(len(voc_list) * 0.9):]
    utils.list_to_file(val_list, './val_files.txt')
    utils.list_to_file(train_list, './train_files.txt')
def gen_train_val():
    casas_list = [
        x for x in os.listdir(config.voice_dir)
        if x.endswith('.hdf5') and x.startswith('casas')
        and x not in config.do_not_use and not x.startswith('casasros')
    ]
    trn_list = casas_list[:int(len(casas_list) * 0.9)]
    val_list = casas_list[int(len(casas_list) * 0.9):]
    utils.list_to_file(val_list, config.log_dir + 'val_files.txt')
    utils.list_to_file(trn_list, config.log_dir + 'train_files.txt')
def iterative_impact_tests(students, filter, compare_type, compare_dict, score_type, **kwargs):
    suffix = ".csv"
    if filter:
        # if the filter flag is true, split students up into groupings;
        # if not, just run them as is.
        group_types = ga.get_grouping_types(kwargs['groupings'])
        for group_set in group_types:
            print("init")
            filtered_students = ga.filter_students(kwargs['groupings'], group_set, students, **kwargs)
            datas = run_impact_tests(filtered_students, compare_type, compare_dict, score_type)
            suffix = ga.translate_header(kwargs['groupings'], group_set)
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18"
                + compare_type + "_prereq_impact_stats" + suffix + "_score_" + score_type + ".csv",
                datas[0])
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18"
                + compare_type + "_elect_impact_stats" + suffix + "_score_" + score_type + ".csv",
                datas[1])
    else:
        datas = run_impact_tests(students, compare_type, compare_dict, score_type)
        if len(datas) == 1:
            return
        utils.list_to_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18"
            + compare_type + "_prereq_impact_stats" + suffix,
            datas[0])
        utils.list_to_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18"
            + compare_type + "_all_impact_stats" + suffix,
            datas[1])
def vet_file(basefile: str):
    """
    Read in the file $BASEFILE as newline-separated list, offer elements to
    the user one by one to vet via the cmd line.

    Approved elements are stored in $BASEFILE.vetted. Elements-to-vet are
    stored in $BASEFILE.in_prog_vet. If this file exists when `vet` is called,
    the user has the option of continuing the in-progress vet or starting a
    new one.
    """
    in_prog_file = fname_in_prog_vet(basefile)
    accepted = []
    use_existing_vet = False
    if os.path.isfile(in_prog_file):
        print('Vet in progress, continue existing? [Y/n]')
        answer = utils.ask_user_yn()
        if answer:
            frontmatter, elems = utils.file_to_list(in_prog_file)
            _, accepted = utils.file_to_list(fname_vetted(basefile))
            use_existing_vet = True
        else:
            os.remove(in_prog_file)
    # what if there is no in-prog file but a .vetted file exists?
    if not use_existing_vet:
        frontmatter, elems = utils.file_to_list(basefile)
    print('Elems to vet: {}'.format(len(elems)))
    finished = False
    i = 0
    try:
        for i, elem in enumerate(elems):
            print('{} -- approve? [y/N]'.format(elem))
            answer = utils.ask_user_yn(default=False)
            if answer:
                accepted.append(elem)
        print('hooray, finished vet!')
        finished = True
    finally:
        if len(accepted) > 0:
            utils.list_to_file(fname_vetted(basefile), frontmatter + accepted)
        if finished:
            print('Accepted {} candidates'.format(len(accepted)))
            if os.path.isfile(in_prog_file):
                os.remove(in_prog_file)
        else:
            utils.list_to_file(in_prog_file, frontmatter + elems[i:])
            print('Accepted {} candidates ({} remaining)'.format(
                len(accepted), len(elems) - i))
def combinate_file(file: str):
    """
    Read in the file of names $FILE as newline-separated list, and for every
    name, generate crossword candidates: [first last, first, last], etc.

    Stores results in $FILE.combinated.
    """
    frontmatter, names = utils.file_to_list(file)
    results = []
    for name in names:
        results.extend(combinate(name))
    print('{} names resulted in {} combinations'.format(
        len(names), len(results)))
    utils.list_to_file(fname_combinated(file), frontmatter + results)
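# Hedged sketch, not from the original repo: `combinate` is used above but not
# shown here. Based on the docstring ("[first last, first, last], etc."), a
# minimal version might look like this; the real helper may generate more variants.
def combinate(name: str):
    parts = name.split()
    if len(parts) < 2:
        return [name]
    first, last = parts[0], parts[-1]
    return ['{} {}'.format(first, last), first, last]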
def score_file(basefile: str):
    """
    Read in the file $BASEFILE as newline-separated list, offer elements to
    the user one by one via the cmd line to score.

    Scored elements are stored in $BASEFILE.scored. Elements-to-score are
    stored in $BASEFILE.in_prog_score. If this file exists when `score` is
    called, the user has the option of continuing the in-progress score or
    starting a new one.
    """
    in_prog_file = fname_in_prog_score(basefile)
    scored = []
    use_existing_score = False
    if os.path.isfile(in_prog_file):
        print('Score in progress, continue existing? [Y/n]')
        answer = utils.ask_user_yn()
        if answer:
            frontmatter, elems = utils.file_to_list(in_prog_file)
            _, scored = utils.file_to_list(fname_scored(basefile))
            use_existing_score = True
        else:
            os.remove(in_prog_file)
    # what if there is no in-prog file but a .scored file exists?
    if not use_existing_score:
        frontmatter, elems = utils.file_to_list(basefile)
    print('Elems to score: {}'.format(len(elems)))
    finished = False
    i = 0
    try:
        for i, elem in enumerate(elems):
            print(elem)
            score = ask_user_score()
            scored.append('{};{}'.format(elem, score))
        print('hooray, finished scoring!')
        finished = True
    finally:
        if len(scored) > 0:
            utils.list_to_file(fname_scored(basefile), frontmatter + scored)
        if finished:
            print('Scored {} candidates'.format(len(scored)))
            if os.path.isfile(in_prog_file):
                os.remove(in_prog_file)
        else:
            utils.list_to_file(in_prog_file, frontmatter + elems[i:])
            print('Scored {} candidates ({} remaining)'.format(
                len(scored), len(elems) - i))
def iterative_sequence_tests(students, filter, **kwargs):
    suffix = ".csv"
    if filter:
        group_types = ga.get_grouping_types(kwargs['groupings'])
        for group_set in group_types:
            filtered_students = ga.filter_students(kwargs['groupings'], group_set, students, kwargs)
            suffix = ga.translate_header(kwargs['groupings'], group_set)
            datas = sequence_analysis(filtered_students, kwargs['class_filter'], kwargs['support'])
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/series analysis/test_series_stats"
                + suffix + ".csv",
                datas)
    else:
        datas = sequence_analysis(students, kwargs['class_filter'], kwargs['support'])
        utils.list_to_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/series analysis/series_stats" + suffix,
            datas)
def feature_analysis_tests(students, socio_factors, course_factors, courses, target_course,
                           time_isolation, **kwargs):
    isolation_set = False
    if 'groupings' in kwargs:
        group_types = ga.get_grouping_types(kwargs['groupings'])
        for group_set in group_types:
            if 'isolation' in kwargs:
                isolation_set = kwargs['isolation']
            filtered_students = ga.filter_students(kwargs['groupings'], group_set, students, kwargs)
            if len(filtered_students) == 0:
                continue
            suffix = ga.translate_header(kwargs['groupings'], group_set)
            df = ca.create_corr_dataframe(filtered_students, [], course_factors,  # socio factors are disabled for lr
                                          courses)
            datas = ca.LinearRegression(df, target_course + "_grade")
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/linear_reg_extraction_"
                + suffix + "_" + target_course + ".csv", datas)
            df = ca.create_corr_dataframe(filtered_students, socio_factors, course_factors, courses)
            datas = ca.generic_impact_rf_feature_extract(df, target_course + "_grade")
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/random_forest_extraction_"
                + suffix + target_course + ".csv", datas)
            ca.correlation_analysis(
                df,
                "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/corr_grid_"
                + suffix + "_" + target_course)
    else:
        df = ca.create_corr_dataframe(students, course_factors, courses, time_isolation, target_course)
        # datas = ca.LinearRegression(df, target_course+"_grade")
        # utils.list_to_file(
        #     "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/linear_reg_extraction_allall"
        #     + target_course + ".csv", datas)
        # df = ca.create_corr_dataframe(students, socio_factors, course_factors,
        #                               courses,
        #                               )
        # datas = ca.generic_impact_rf_feature_extract(df, target_course+"_grade")
        # utils.list_to_file(
        #     "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/random_forest_extraction_allall"
        #     + target_course + ".csv", datas)
        if time_isolation:
            target_course += "_isolated"
        ca.correlation_analysis(
            df,
            "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/concurrent_7_29_corr_grid_allall_"
            + target_course)
def wordbank(wdbnk_counts: Dict[str, int], wdbnk: str):
    """
    For all elements in all dictionaries, find elements that contain all of
    the letters in the given wordbank
    """
    minchars = sum(wdbnk_counts.values())
    all_elems = utils.get_all_dicts()
    valid = []
    for elem in all_elems:
        if len(elem) < minchars:
            continue
        if matches_wordbank(elem, wdbnk_counts):
            valid.append(elem)
    print('found {} valid elems matching wordbank {}'.format(len(valid), wdbnk))
    if valid:
        utils.list_to_file(fname_wordbank(wdbnk), valid)
        if len(valid) < 100:
            for wd in valid:
                print('\t-', wd)
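# Hedged sketch, not from the original repo: `matches_wordbank` is used above
# but not shown here. Its usage suggests it checks that `elem` contains each
# wordbank letter at least as many times as `wdbnk_counts` requires.
from collections import Counter
from typing import Dict

def matches_wordbank(elem: str, wdbnk_counts: Dict[str, int]) -> bool:
    elem_counts = Counter(elem)
    return all(elem_counts[ch] >= n for ch, n in wdbnk_counts.items())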
def before_after(s: str):
    """
    For all elements in all dictionaries, find elements that, if split on
    string s, form two valid words
    """
    # minchars = sum(wdbnk_counts.values())
    all_elems = utils.get_all_dicts()
    valid = []
    for elem in all_elems:
        # if len(elem) < minchars:
        #     continue
        if matches_before_after(elem, s, all_elems):
            valid.append(pretty(elem, s))
    print('found {} valid elems matching before_after {}'.format(
        len(valid), s))
    if valid:
        utils.list_to_file(fname_before_after(s), valid)
        if len(valid) < 100:
            for wd in valid:
                print('\t-', wd)
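# Hedged sketch, not from the original repo: `matches_before_after` is used
# above but not shown here. Per the docstring, it presumably checks whether
# splitting `elem` once on `s` leaves a valid word on each side.
def matches_before_after(elem: str, s: str, all_elems) -> bool:
    if s not in elem:
        return False
    before, _, after = elem.partition(s)
    return bool(before) and bool(after) and before in all_elems and after in all_elems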
def precompute_sim(core_path, elective_path, request_type, vect_type, outpath):
    elective_data = utils.list_from_file(elective_path, "\n", ",", False)
    core_data = utils.list_from_file(core_path, "\n", ",", False)
    class_dict = build_class_key_vector(core_data, elective_data, request_type)
    student_list = utils.get_students_history()
    student_vects = {}
    for student in student_list:
        vect = build_student_vector(student, class_dict, request_type, vect_type)
        student_vects[student] = vect
    output = []
    for x in range(0, len(student_list)):
        student_a_vect = utils.grade_vect_to_bit(student_vects[student_list[x]])
        print(x)
        for y in range(x + 1, len(student_list)):
            student_b_vect = utils.grade_vect_to_bit(student_vects[student_list[y]])
            tani = 1.0 - jaccard_similarity_score(student_a_vect, student_b_vect)
            output.append(
                str(student_list[x].id_num) + "," + str(student_list[y].id_num) + "," + str(tani))
    utils.list_to_file(outpath, output)
def contains_in_order(wd: str):
    """
    For all elements in all dictionaries, find elements that contain the
    letters of the given word in order (though not necessarily consecutively).
    """
    minchars = len(wd)
    all_elems = utils.get_all_dicts()
    valid = []
    for elem in all_elems:
        if len(elem) < minchars:
            continue
        if wd in elem:
            # candidate just contains the word wholesale (not spread out)
            continue
        if _contains_in_order(elem, wd):
            valid.append(elem)
    print('found {} valid elems that contain the ordered characters {}'.format(
        len(valid), wd))
    if valid:
        utils.list_to_file(fname_contains_in_order(wd), valid)
        if len(valid) < 100:
            for elem in valid:
                print('\t-', elem)
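# Hedged sketch, not from the original repo: `_contains_in_order` is used above
# but not shown here. Its usage suggests a standard subsequence check: the
# letters of `wd` appear in `elem` in order, though not necessarily adjacently.
def _contains_in_order(elem: str, wd: str) -> bool:
    it = iter(elem)
    # `ch in it` advances the iterator until it finds `ch`, so order is enforced.
    return all(ch in it for ch in wd)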
def main_processing(self):
    """
    all steps of the training pipeline
        - processing ncbi sequences
        - tree processing (copying newick tree or building tree from a clade list)
        - mapping genomes on this tree
        - generating fragments from the sequences
        - dealing with sample specific data
        - generating kmer features
        - building models
    """
    if not self.config.settings["only_models"]:
        # if you do not want to build only models, check if the project directory is empty
        if len(os.listdir(self.config.settings["project_dir"])) != 0:
            self.log.warning("The project directory is not empty, this can result in unpredictable behavior.")
            if self.yes:
                answer = "y"
            else:
                answer = utils.get_answer_timeout("Remove? [Y/N]")
            if answer == "y":
                self.log.info("Deleting old project directory..")
                shutil.rmtree(self.config.settings["project_dir"])
                os.mkdir(self.config.settings["project_dir"])
            else:
                self.log.critical("Please provide an empty project directory. Quitting...")
                sys.exit(1)
        if self.backup:
            self.stat = Status(logged=True)
            self.backupdir = os.path.join(self.config.settings["project_dir"], "backup")
            os.mkdir(self.backupdir)
        self.log.info("creating folder structure...")
        self.create_folderstructure()
        self.log.info("checking database")
        self.check_db()
        if len(self.config.genomes_exclude) != 0:
            # write organisms that will not be considered into a file
            utils.list_to_file(self.config.genomes_exclude,
                               os.path.join(self.config.settings["project_dir"], "excluded.txt"))
            if self.stat is not None:
                self.stat.add_written_file(os.path.join(self.config.settings["project_dir"], "excluded.txt"))
                self.stat.write_backup(self.backupdir)
        self.log.info("Processing NCBI data...")
        self.process_ncbi()
        if self.stat is not None:
            self.stat.change_variable(1, "status")
            self.stat.change_variable(self.organism_file_map, "organism_file_map")
            self.stat.change_variable(self.genomes_excluded, "genomes_excluded")
            self.stat.change_variable(self.organisms, "organisms")
            self.stat.write_backup(self.backupdir)
        self.log.info("tree processing...")
        self.tree_process()
        if self.stat is not None:
            self.stat.change_variable(2, "status")
            self.stat.change_variable(self.nodes, "nodes")
            self.stat.change_variable(self.tree_file, "tree_file")
            self.stat.write_backup(self.backupdir)
        self.log.info("mapping genomes on the tree...")
        self.map_genomes_on_tree()
        if self.stat is not None:
            self.stat.change_variable(3, "status")
            self.stat.change_variable("n_frags_per_node", "status")
            self.stat.change_variable("tree_organism_map", "status")
            self.stat.change_variable("organisms_invalid", "status")
            self.stat.change_variable("organism_tree_map", "status")
            self.stat.write_backup(self.backupdir)
        self.log.info("generating sequence fragments...")
        self.generate_seq()
        if self.stat is not None:
            self.stat.change_variable(4, "status")
            self.stat.write_backup(self.backupdir)
        self.log.info("sample specific stuff...")
        self.sample_specific()
        if self.stat is not None:
            self.stat.change_variable(5, "status")
            self.stat.write_backup(self.backupdir)
        self.log.info("generating kmer features...")
        self.generate_kmer_features_concat()
        if self.stat is not None:
            self.stat.change_variable(6, "status")
            self.stat.write_backup(self.backupdir)
    else:
        self.log.info("reading tree string")
        self.config.settings["tree_file"] = os.path.join(self.config.settings["project_dir"], "tree.newick")
    self.log.info("building models")
    self.build_models()
    if self.config.settings["clean_up_train"]:
        self.log.info("Cleaning..")
        shutil.rmtree(os.path.join(self.config.settings["project_dir"], "train_data"))
        shutil.rmtree(os.path.join(self.config.settings["project_dir"], "sampled_fasta"))
    self.sqlite_taxonomy.close()
    self.log.info("Processing finished ...models are ready in {}".format(
        os.path.join(self.config.settings["project_dir"], "models")))
def tree_process(self):
    my_log = logging.getLogger('train:tree_processing')
    if self.config.settings["tree_file"] is None or self.config.settings["tree_file"] == "":
        # create clade list (own method?)
        descendants = {}
        for organism in self.organisms:
            for rank in self.config.settings["taxonomy_ranks"]:
                parent = self.sqlite_taxonomy.parent_at_rank(organism, rank)
                print('!!! Call: parent = self.sqlite_taxonomy.parent_at_rank(organism, rank) !!!')
                if parent in descendants:
                    descendants[parent] += 1
                else:
                    descendants[parent] = 1
        for parent in descendants.keys():
            if parent is None or parent == "":
                continue
            if descendants[parent] >= self.config.settings["n_min_genomes_generic"]:
                self.nodes.append(parent)
    else:
        # read the tree_string
        nodes_tmp = utils.get_lines(self.config.settings["tree_file"])
        for n in nodes_tmp:
            if n != "" and n is not None:
                self.nodes.append(n.rstrip("\n"))
    tree_string = self.nodes[0]
    # check if the tree is a newick string or a node list
    t_file = os.path.join(self.config.settings["project_dir"], "tree.newick")
    if ";" not in self.nodes[0] and len(self.nodes) >= 1:
        my_log.debug("Generating tree from the clades list ({} clades)....".format(str(len(self.nodes))))
        clades_file = os.path.join(self.config.settings["project_dir"], "clades.txt")
        utils.list_to_file(self.nodes, clades_file)
        if self.stat is not None:
            self.stat.add_written_file(clades_file)
            self.stat.succesfully_written(clades_file)
        # run script to create tree
        obj = ncbi2newick.Ncbi2Newick(self.config.ncbi_tax_db, logged=True)
        obj.tree_from_nodes(clades_file)
        obj.tree_to_file(t_file)
        obj.close()
    else:
        my_log.debug("Copying tree to the project directory...")
        fw = open(t_file, "w")
        if self.stat is not None:
            self.stat.add_written_file(t_file)
        fw.write(tree_string)
        if self.stat is not None:
            self.stat.succesfully_written(t_file)
        fw.close()
    # change tree_file to the tree in the project directory
    # get back the tree_string and nodes, this is necessary for further processing
    self.tree_file = os.path.join(self.config.settings["project_dir"], "tree.newick")
    fr = open(self.tree_file, "r")
    tree_string = fr.readline().rstrip()
    fr.close()
    if tree_string == "":
        my_log.critical("First line is empty in the newick file: {}".format(self.tree_file))
        sys.exit(1)
    self.nodes = ncbi2newick.get_nodes_from_newick(self.tree_file)
def histo_analysis():
    students = it.package_student_data()
    filtered_students = ga.filter_students(["admin_descript"], ["Transfer_Start"], students)
    datas = sa.course_semester_histogram(filtered_students, True)
    utils.list_to_file(
        "/Users/thomasolson/Documents/workspace/advising_revamp/series analysis/xfer_class_histo.csv",
        datas)
def eval_file():
    file_path = config.wav_dir
    # log_dir = './log_ikala_notrain/'
    log_dir = config.log_dir
    mode = 0

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_voc = np.array(stat_file["voc_stft_maximus"])
    min_voc = np.array(stat_file["voc_stft_minimus"])
    max_back = np.array(stat_file["back_stft_maximus"])
    min_back = np.array(stat_file["back_stft_minimus"])
    max_mix = np.array(max_voc) + np.array(max_back)

    with tf.Graph().as_default():
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size, config.max_phr_len, config.input_features),
                                           name='input_placeholder')
        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()
        sess.run(init_op)
        ckpt = tf.train.get_checkpoint_state(log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            # saver.restore(sess, ckpt.model_checkpoint_path)
            saver.restore(sess, './log/model.ckpt-59')
        # import pdb;pdb.set_trace()

        files = [
            x for x in os.listdir(config.wav_dir)
            if x.endswith('.wav') and not x.startswith('.')
        ]
        diffs = []
        count = 0
        for file_name in files:
            count += 1
            mix_stft = utils.file_to_stft(os.path.join(file_path, file_name), mode=mode)
            targs = utils.input_to_feats(os.path.join(file_path, file_name), mode=mode)
            # f0_sac = utils.file_to_sac(os.path.join(file_path, file_name))
            # f0_sac = (f0_sac - min_feat[-2]) / (max_feat[-2] - min_feat[-2])
            in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
            in_batches = in_batches / max_mix
            # in_batches = utils.normalize(in_batches, 'mix_stft', mode=config.norm_mode_in)
            val_outer = []
            first_pred = []
            cleaner = []
            gan_op = []
            for in_batch in in_batches:
                val_harm, val_ap, val_f0, val_vuv = sess.run(
                    [harm, ap, f0, vuv],
                    feed_dict={input_placeholder: in_batch})
                if config.use_gan:
                    val_op = sess.run(gen_op, feed_dict={input_placeholder: in_batch})
                    gan_op.append(val_op)
                # first_pred.append(harm1)
                # cleaner.append(val_harm)
                val_harm = val_harm
                val_outs = np.concatenate((val_harm, val_ap, val_f0, val_vuv), axis=-1)
                val_outer.append(val_outs)

            val_outer = np.array(val_outer)
            val_outer = utils.overlapadd(val_outer, nchunks_in)
            val_outer[:, -1] = np.round(val_outer[:, -1])
            val_outer = val_outer[:targs.shape[0], :]
            val_outer = np.clip(val_outer, 0.0, 1.0)

            # Test purposes only
            # first_pred = np.array(first_pred)
            # first_pred = utils.overlapadd(first_pred, nchunks_in)
            # cleaner = np.array(cleaner)
            # cleaner = utils.overlapadd(cleaner, nchunks_in)

            f0_output = val_outer[:, -2] * ((max_feat[-2] - min_feat[-2]) + min_feat[-2])
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output = utils.new_base_to_hertz(f0_output)
            f0_gt = targs[:, -2]
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt = utils.new_base_to_hertz(f0_gt)

            f0_outputs = []
            gt_outputs = []
            for i, f0_o in enumerate(f0_output):
                f0_outputs.append(str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o))
            for i, f0_o in enumerate(f0_gt):
                gt_outputs.append(str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o))

            utils.list_to_file(f0_outputs, './ikala_eval/net_out/' + file_name[:-4] + '.pv')
            utils.list_to_file(gt_outputs, './ikala_eval/sac_gt/' + file_name[:-4] + '.pv')

            # f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
            # f0_greater = np.where(f0_difference > config.f0_threshold)
            # diff_per = f0_greater[0].shape[0] / len(f0_output)
            # diffs.append(str(1 - diff_per))
            utils.progress(count, len(files))
def cross_comp():
    ikala_gt_dir = './ikala_eval/ikala_gt/'
    net_out_dir = './ikala_eval/net_out/'
    sac_gt_dir = './ikala_eval/sac_gt/'
    file_list = [
        x for x in os.listdir(net_out_dir)
        if x.endswith('.pv') and not x.startswith('.')
    ]
    output = []
    for file_name in file_list:
        out_time, out_freq = mir_eval.io.load_time_series(net_out_dir + file_name)
        for i, freq in enumerate(out_freq):
            if float(freq) == 0.0:
                out_freq[i] = 0
            else:
                out_freq[i] = utils.f0_to_hertz(float(freq))
        out_freq, out_vuv = mir_eval.melody.freq_to_voicing(out_freq)
        ref_time_o, ref_freq_o = mir_eval.io.load_time_series(ikala_gt_dir + file_name)
        for i, freq in enumerate(ref_freq_o):
            if float(freq) == 0.0:
                ref_freq_o[i] = 0
            else:
                ref_freq_o[i] = utils.f0_to_hertz(float(freq))
        plt.figure(1)
        plt.plot(out_freq)
        plt.plot(ref_freq_o)
        plt.show()
        # import pdb;pdb.set_trace()
        haha = mir_eval.melody.evaluate(ref_time_o, ref_freq_o, out_time, out_freq)
        out_string = file_name
        for key in haha.keys():
            out_string = out_string + ';' + str(haha[key])
        # import pdb;pdb.set_trace()
        # ref_freq_o, ref_vuv_o = mir_eval.melody.freq_to_voicing(ref_freq_o)
        # ref_freq, ref_vuv = mir_eval.melody.resample_melody_series(ref_time_o, ref_freq_o, ref_vuv_o, out_time)
        # out_freq_o, out_vuv_o = mir_eval.melody.resample_melody_series(out_time, out_freq, out_vuv, ref_time_o)
        # raw_pitch_accuracy_10_o = mir_eval.melody.raw_pitch_accuracy(ref_vuv_o, ref_freq_o, out_vuv_o, out_freq_o, cent_tolerance=10)
        # raw_pitch_accuracy_25_o = mir_eval.melody.raw_pitch_accuracy(ref_vuv_o, ref_freq_o, out_vuv_o, out_freq_o, cent_tolerance=25)
        # raw_pitch_accuracy_50_o = mir_eval.melody.raw_pitch_accuracy(ref_vuv_o, ref_freq_o, out_vuv_o, out_freq_o, cent_tolerance=50)
        # raw_chroma_accuracy_o = mir_eval.melody.raw_chroma_accuracy(ref_vuv_o, ref_freq_o, out_vuv_o, out_freq_o)
        # raw_pitch_accuracy_10 = mir_eval.melody.raw_pitch_accuracy(ref_vuv, ref_freq, out_vuv, out_freq, cent_tolerance=10)
        # raw_pitch_accuracy_25 = mir_eval.melody.raw_pitch_accuracy(ref_vuv, ref_freq, out_vuv, out_freq, cent_tolerance=25)
        # raw_pitch_accuracy_50 = mir_eval.melody.raw_pitch_accuracy(ref_vuv, ref_freq, out_vuv, out_freq, cent_tolerance=50)
        # raw_chroma_accuracy = mir_eval.melody.raw_chroma_accuracy(ref_vuv, ref_freq, out_vuv, out_freq)
        # import pdb;pdb.set_trace()
        output.append(out_string)
        # output.append(file_name+';'+str(raw_pitch_accuracy_10)+';'+str(raw_pitch_accuracy_25)+';'+str(raw_pitch_accuracy_50)+';'+str(raw_chroma_accuracy)+';'+str(raw_pitch_accuracy_10_o)+';'+str(raw_pitch_accuracy_25_o)+';'+str(raw_pitch_accuracy_50_o)+';'+str(raw_chroma_accuracy_o))
    utils.list_to_file(output, './ikala_eval/mir_eval_results.txt')
def cluster(student_vects, request_type, vect_type, base_dir, dissim_path):
    cluster_data = []
    student_list = []
    student_output = []
    for student in student_vects:
        cluster_data.append(student_vects[student])
        student_output.append(
            str(student.id_num) + "," + str(student.grade_adj) + "," + str(student.age))
        student_list.append(student)

    # ms = MeanShift().fit_predict(cluster_data)
    # ward = AgglomerativeClustering(n_clusters=5, linkage='ward').fit_predict(cluster_data)
    # utils.list_to_file(base_dir+"/test_labels_"+request_type+"_"+vect_type, student_output)
    # utils.list_to_file("test_clusters_ms", ms)
    n_set = [5]
    # dissim_list = utils.list_from_file(dissim_path, "\n", ",", False)
    # dissim_dict = ca.format_dissim_list(dissim_list)
    """
    for n in n_set:
        pred_clusters = KMeans(n_clusters=n).fit(cluster_data)
        analysis_set = ca.cluster_analysis(cluster_data, pred_clusters.labels_, student_list, dissim_dict)
        os.mkdir(base_dir+"/"+str(n)+"_run")
        for clust in analysis_set:
            ca.print_stats(analysis_set[clust], clust, base_dir+"/"+str(n)+"_run")
        utils.list_to_file("test_clusters_kmeans_"+str(n)+"_"+request_type+"_"+vect_type, pred_clusters.labels_)
    """
    utils.list_to_file(
        "test_labels_2_2" + vect_type + "_" + request_type + ".csv", student_output)

    af = AffinityPropagation().fit(cluster_data)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    # print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    # print("Adjusted Rand Index: %0.3f"
    #       % metrics.adjusted_rand_score(labels_true, labels))
    # print("Adjusted Mutual Information: %0.3f"
    #       % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(cluster_data, labels, metric='sqeuclidean'))
    utils.list_to_file("test_clusters_af_" + vect_type + "_" + request_type, af.labels_)
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, af.labels_))

    print("km 10")
    pred_clusters = KMeans(n_clusters=10).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_kmeans_10_" + vect_type + "_" + request_type, pred_clusters.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, pred_clusters.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, pred_clusters.labels_))

    print("km 15")
    pred_clusters = KMeans(n_clusters=15).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_kmeans_15_" + vect_type + "_" + request_type, pred_clusters.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, pred_clusters.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, pred_clusters.labels_))

    print("km 5")
    pred_clusters = KMeans(n_clusters=5).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_kmeans_5_" + vect_type + "_" + request_type, pred_clusters.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, pred_clusters.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, pred_clusters.labels_))

    # pred_clusters = KMeans(n_clusters=15).fit(cluster_data)
    # utils.list_to_file("test_clusters_kmeans_15_cls_core", pred_clusters.labels_)
    # utils.list_to_file("test_clusters_ward", ward)
    print("ward5")
    ward = AgglomerativeClustering(n_clusters=5).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_ward_5clust" + vect_type + "_" + request_type, ward.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ward.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, ward.labels_))

    print("ward10")
    ward = AgglomerativeClustering(n_clusters=10).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_ward_10clust" + vect_type + "_" + request_type, ward.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ward.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, ward.labels_))

    print("ward15")
    ward = AgglomerativeClustering(n_clusters=15).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_ward_15clust" + vect_type + "_" + request_type, ward.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ward.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, ward.labels_))

    print("ms")
    ms = MeanShift().fit(cluster_data)
    utils.list_to_file("test_clusters_ms" + vect_type + "_" + request_type, ms.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ms.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" % metrics.davies_bouldin_score(cluster_data, ms.labels_))
def train(_):
    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])

    with tf.Graph().as_default():
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size, config.max_phr_len, config.input_features),
                                           name='input_placeholder')
        tf.summary.histogram('inputs', input_placeholder)
        target_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size, config.max_phr_len, config.output_features),
                                            name='target_placeholder')
        tf.summary.histogram('targets', target_placeholder)

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        # tf.summary.histogram('initial_output', op)
        tf.summary.histogram('harm', harm)
        tf.summary.histogram('ap', ap)
        tf.summary.histogram('f0', f0)
        tf.summary.histogram('vuv', vuv)

        if config.use_gan:
            with tf.variable_scope('Generator') as scope:
                gen_op = modules.GAN_generator(harm)
            with tf.variable_scope('Discriminator') as scope:
                D_real = modules.GAN_discriminator(target_placeholder[:, :, :60], input_placeholder)
                scope.reuse_variables()
                D_fake = modules.GAN_discriminator(gen_op + harmy, input_placeholder)

            # Comment out these lines to train without GAN
            D_loss_real = -tf.reduce_mean(tf.log(D_real + 1e-12))
            D_loss_fake = -tf.reduce_mean(tf.log(1. - (D_fake + 1e-12)))
            D_loss = D_loss_real + D_loss_fake
            D_summary_real = tf.summary.scalar('Discriminator_Loss_Real', D_loss_real)
            D_summary_fake = tf.summary.scalar('Discriminator_Loss_Fake', D_loss_fake)

            G_loss_GAN = -tf.reduce_mean(tf.log(D_fake + 1e-12))
            G_loss_diff = tf.reduce_sum(
                tf.abs(gen_op + harmy - target_placeholder[:, :, :60])
                * (1 - target_placeholder[:, :, -1:])) * 0.5
            G_loss = G_loss_GAN + G_loss_diff
            G_summary_GAN = tf.summary.scalar('Generator_Loss_GAN', G_loss_GAN)
            G_summary_diff = tf.summary.scalar('Generator_Loss_diff', G_loss_diff)

            vars = tf.trainable_variables()
            # import pdb;pdb.set_trace()
            d_params = [v for v in vars if v.name.startswith('Discriminator/D')]
            g_params = [v for v in vars if v.name.startswith('Generator/G')]
            # import pdb;pdb.set_trace()
            # d_optimizer_grad = tf.train.GradientDescentOptimizer(learning_rate=config.gan_lr).minimize(D_loss, var_list=d_params)
            # g_optimizer = tf.train.GradientDescentOptimizer(learning_rate=config.gan_lr).minimize(G_loss, var_list=g_params)
            d_optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=config.gan_lr).minimize(D_loss, var_list=d_params)
            # g_optimizer_diff = tf.train.AdamOptimizer(learning_rate=config.gan_lr).minimize(G_loss_diff, var_list=g_params)
            g_optimizer = tf.train.AdamOptimizer(
                learning_rate=config.gan_lr).minimize(G_loss, var_list=g_params)

        # initial_loss = tf.reduce_sum(tf.abs(op - target_placeholder[:,:,:60])*np.linspace(1.0,0.7,60)*(1-target_placeholder[:,:,-1:]))
        harm_loss = tf.reduce_sum(
            tf.abs(harm - target_placeholder[:, :, :60])
            * np.linspace(1.0, 0.7, 60)
            * (1 - target_placeholder[:, :, -1:]))
        ap_loss = tf.reduce_sum(
            tf.abs(ap - target_placeholder[:, :, 60:-2])
            * (1 - target_placeholder[:, :, -1:]))
        f0_loss = tf.reduce_sum(
            tf.abs(f0 - target_placeholder[:, :, -2:-1])
            * (1 - target_placeholder[:, :, -1:]))
        # vuv_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=, logits=vuv))
        vuv_loss = tf.reduce_mean(
            tf.reduce_sum(binary_cross(target_placeholder[:, :, -1:], vuv)))
        loss = harm_loss + ap_loss + vuv_loss + f0_loss * config.f0_weight

        # initial_summary = tf.summary.scalar('initial_loss', initial_loss)
        harm_summary = tf.summary.scalar('harm_loss', harm_loss)
        ap_summary = tf.summary.scalar('ap_loss', ap_loss)
        f0_summary = tf.summary.scalar('f0_loss', f0_loss)
        vuv_summary = tf.summary.scalar('vuv_loss', vuv_loss)
        loss_summary = tf.summary.scalar('total_loss', loss)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate=config.init_lr)
        # optimizer_f0 = tf.train.AdamOptimizer(learning_rate=config.init_lr)
        train_function = optimizer.minimize(loss, global_step=global_step)
        # train_f0 = optimizer.minimize(f0_loss, global_step=global_step)
        # train_harm = optimizer.minimize(harm_loss, global_step=global_step)
        # train_ap = optimizer.minimize(ap_loss, global_step=global_step)
        # train_f0 = optimizer.minimize(f0_loss, global_step=global_step)
        # train_vuv = optimizer.minimize(vuv_loss, global_step=global_step)

        summary = tf.summary.merge_all()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        sess = tf.Session()
        sess.run(init_op)
        ckpt = tf.train.get_checkpoint_state(config.log_dir_m1)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        train_summary_writer = tf.summary.FileWriter(config.log_dir_m1 + 'train/', sess.graph)
        val_summary_writer = tf.summary.FileWriter(config.log_dir_m1 + 'val/', sess.graph)

        start_epoch = int(
            sess.run(tf.train.get_global_step()) / (config.batches_per_epoch_train))
        print("Start from: %d" % start_epoch)
        f0_accs = []
        for epoch in xrange(start_epoch, config.num_epochs):
            val_f0_accs = []
            data_generator = data_gen()
            start_time = time.time()

            epoch_loss_harm = 0
            epoch_loss_ap = 0
            epoch_loss_f0 = 0
            epoch_loss_vuv = 0
            epoch_total_loss = 0
            # epoch_initial_loss = 0

            epoch_loss_harm_val = 0
            epoch_loss_ap_val = 0
            epoch_loss_f0_val = 0
            epoch_loss_vuv_val = 0
            epoch_total_loss_val = 0
            # epoch_initial_loss_val = 0

            if config.use_gan:
                epoch_loss_generator_GAN = 0
                epoch_loss_generator_diff = 0
                epoch_loss_discriminator_real = 0
                epoch_loss_discriminator_fake = 0
                val_epoch_loss_generator_GAN = 0
                val_epoch_loss_generator_diff = 0
                val_epoch_loss_discriminator_real = 0
                val_epoch_loss_discriminator_fake = 0

            batch_num = 0
            batch_num_val = 0
            val_generator = data_gen(mode='val')
            # val_generator = get_batches(train_filename=config.h5py_file_val, batches_per_epoch=config.batches_per_epoch_val_m1)

            with tf.variable_scope('Training'):
                for voc, feat in data_generator:
                    voc = np.clip(
                        voc + np.random.rand(config.max_phr_len, config.input_features)
                        * np.clip(np.random.rand(1), 0.0, config.noise_threshold),
                        0.0, 1.0)

                    _, step_loss_harm, step_loss_ap, step_loss_f0, step_loss_vuv, step_total_loss = sess.run(
                        [train_function, harm_loss, ap_loss, f0_loss, vuv_loss, loss],
                        feed_dict={input_placeholder: voc, target_placeholder: feat})
                    # _, step_loss_f0 = sess.run([train_f0, f0_loss], feed_dict={input_placeholder: voc, target_placeholder: feat})
                    if config.use_gan:
                        _, step_dis_loss_real, step_dis_loss_fake = sess.run(
                            [d_optimizer, D_loss_real, D_loss_fake],
                            feed_dict={input_placeholder: voc, target_placeholder: feat})
                        _, step_gen_loss_GAN, step_gen_loss_diff = sess.run(
                            [g_optimizer, G_loss_GAN, G_loss_diff],
                            feed_dict={input_placeholder: voc, target_placeholder: feat})
                    # else:
                    #     _, step_dis_loss_real, step_dis_loss_fake = sess.run([d_optimizer_grad, D_loss_real, D_loss_fake], feed_dict={input_placeholder: voc, target_placeholder: feat})
                    #     _, step_gen_loss_diff = sess.run([g_optimizer_diff, G_loss_diff], feed_dict={input_placeholder: voc, target_placeholder: feat})
                    #     step_gen_loss_GAN = 0

                    # _, step_loss_harm = sess.run([train_harm, harm_loss], feed_dict={input_placeholder: voc, target_placeholder: feat})
                    # _, step_loss_ap = sess.run([train_ap, ap_loss], feed_dict={input_placeholder: voc, target_placeholder: feat})
                    # _, step_loss_f0 = sess.run([train_f0, f0_loss], feed_dict={input_placeholder: voc, target_placeholder: feat})
                    # _, step_loss_vuv = sess.run([train_vuv, vuv_loss], feed_dict={input_placeholder: voc, target_placeholder: feat})

                    # epoch_initial_loss += step_initial_loss
                    epoch_loss_harm += step_loss_harm
                    epoch_loss_ap += step_loss_ap
                    epoch_loss_f0 += step_loss_f0
                    epoch_loss_vuv += step_loss_vuv
                    epoch_total_loss += step_total_loss

                    if config.use_gan:
                        epoch_loss_generator_GAN += step_gen_loss_GAN
                        epoch_loss_generator_diff += step_gen_loss_diff
                        epoch_loss_discriminator_real += step_dis_loss_real
                        epoch_loss_discriminator_fake += step_dis_loss_fake

                    utils.progress(batch_num, config.batches_per_epoch_train, suffix='training done')
                    batch_num += 1

                # epoch_initial_loss = epoch_initial_loss/(config.batches_per_epoch_train*config.batch_size*config.max_phr_len*60)
                epoch_loss_harm = epoch_loss_harm / (
                    config.batches_per_epoch_train * config.batch_size * config.max_phr_len * 60)
                epoch_loss_ap = epoch_loss_ap / (
                    config.batches_per_epoch_train * config.batch_size * config.max_phr_len * 4)
                epoch_loss_f0 = epoch_loss_f0 / (
                    config.batches_per_epoch_train * config.batch_size * config.max_phr_len)
                epoch_loss_vuv = epoch_loss_vuv / (
                    config.batches_per_epoch_train * config.batch_size * config.max_phr_len)
                epoch_total_loss = epoch_total_loss / (
                    config.batches_per_epoch_train * config.batch_size * config.max_phr_len * 66)

                if config.use_gan:
                    epoch_loss_generator_GAN = epoch_loss_generator_GAN / (
                        config.batches_per_epoch_train * config.batch_size)
                    epoch_loss_generator_diff = epoch_loss_generator_diff / (
                        config.batches_per_epoch_train * config.batch_size * config.max_phr_len * 60)
                    epoch_loss_discriminator_real = epoch_loss_discriminator_real / (
                        config.batches_per_epoch_train * config.batch_size)
                    epoch_loss_discriminator_fake = epoch_loss_discriminator_fake / (
                        config.batches_per_epoch_train * config.batch_size)

                summary_str = sess.run(summary,
                                       feed_dict={input_placeholder: voc, target_placeholder: feat})
                train_summary_writer.add_summary(summary_str, epoch)
                # summary_writer.add_summary(summary_str_val, epoch)
                train_summary_writer.flush()

            with tf.variable_scope('Validation'):
                for voc, feat in val_generator:
                    step_loss_harm_val = sess.run(harm_loss,
                                                  feed_dict={input_placeholder: voc, target_placeholder: feat})
                    step_loss_ap_val = sess.run(ap_loss,
                                                feed_dict={input_placeholder: voc, target_placeholder: feat})
                    step_loss_f0_val = sess.run(f0_loss,
                                                feed_dict={input_placeholder: voc, target_placeholder: feat})
                    step_loss_vuv_val = sess.run(vuv_loss,
                                                 feed_dict={input_placeholder: voc, target_placeholder: feat})
                    step_total_loss_val = sess.run(loss,
                                                   feed_dict={input_placeholder: voc, target_placeholder: feat})

                    epoch_loss_harm_val += step_loss_harm_val
                    epoch_loss_ap_val += step_loss_ap_val
                    epoch_loss_f0_val += step_loss_f0_val
                    epoch_loss_vuv_val += step_loss_vuv_val
                    epoch_total_loss_val += step_total_loss_val

                    if config.use_gan:
                        val_epoch_loss_generator_GAN += step_gen_loss_GAN
                        val_epoch_loss_generator_diff += step_gen_loss_diff
                        val_epoch_loss_discriminator_real += step_dis_loss_real
                        val_epoch_loss_discriminator_fake += step_dis_loss_fake

                    utils.progress(batch_num_val, config.batches_per_epoch_val_m1, suffix='validation done')
                    batch_num_val += 1
                    # f0_accs.append(np.mean(val_f0_accs))

                # epoch_initial_loss_val = epoch_initial_loss_val/(config.batches_per_epoch_val_m1*config.batch_size*config.max_phr_len*60)
                epoch_loss_harm_val = epoch_loss_harm_val / (
                    batch_num_val * config.batch_size * config.max_phr_len * 60)
                epoch_loss_ap_val = epoch_loss_ap_val / (
                    batch_num_val * config.batch_size * config.max_phr_len * 4)
                epoch_loss_f0_val = epoch_loss_f0_val / (
                    batch_num_val * config.batch_size * config.max_phr_len)
                epoch_loss_vuv_val = epoch_loss_vuv_val / (
                    batch_num_val * config.batch_size * config.max_phr_len)
                epoch_total_loss_val = epoch_total_loss_val / (
                    batch_num_val * config.batch_size * config.max_phr_len * 66)

                if config.use_gan:
                    val_epoch_loss_generator_GAN = val_epoch_loss_generator_GAN / (
                        config.batches_per_epoch_val_m1 * config.batch_size)
                    val_epoch_loss_generator_diff = val_epoch_loss_generator_diff / (
                        config.batches_per_epoch_val_m1 * config.batch_size * config.max_phr_len * 60)
                    val_epoch_loss_discriminator_real = val_epoch_loss_discriminator_real / (
                        config.batches_per_epoch_val_m1 * config.batch_size)
                    val_epoch_loss_discriminator_fake = val_epoch_loss_discriminator_fake / (
                        config.batches_per_epoch_val_m1 * config.batch_size)

                summary_str = sess.run(summary,
                                       feed_dict={input_placeholder: voc, target_placeholder: feat})
                val_summary_writer.add_summary(summary_str, epoch)
                # summary_writer.add_summary(summary_str_val, epoch)
                val_summary_writer.flush()

            duration = time.time() - start_time
            # np.save('./ikala_eval/accuracies', f0_accs)

            if (epoch + 1) % config.print_every == 0:
                print('epoch %d: Harm Training Loss = %.10f (%.3f sec)'
                      % (epoch + 1, epoch_loss_harm, duration))
                print('        : Ap Training Loss = %.10f ' % (epoch_loss_ap))
                print('        : F0 Training Loss = %.10f ' % (epoch_loss_f0))
                print('        : VUV Training Loss = %.10f ' % (epoch_loss_vuv))
                # print('        : Initial Training Loss = %.10f ' % (epoch_initial_loss))
                if config.use_gan:
                    print('        : Gen GAN Training Loss = %.10f ' % (epoch_loss_generator_GAN))
                    print('        : Gen diff Training Loss = %.10f ' % (epoch_loss_generator_diff))
                    print('        : Discriminator Training Loss Real = %.10f ' % (epoch_loss_discriminator_real))
                    print('        : Discriminator Training Loss Fake = %.10f ' % (epoch_loss_discriminator_fake))
                print('        : Harm Validation Loss = %.10f ' % (epoch_loss_harm_val))
                print('        : Ap Validation Loss = %.10f ' % (epoch_loss_ap_val))
                print('        : F0 Validation Loss = %.10f ' % (epoch_loss_f0_val))
                print('        : VUV Validation Loss = %.10f ' % (epoch_loss_vuv_val))
                # if (epoch + 1) % config.save_every == 0 or (epoch + 1) == config.num_epochs:
                #     print('        : Mean F0 IKala Accuracy = %.10f ' % (np.mean(val_f0_accs)))
                #     print('        : Mean F0 IKala Accuracy = ' + '%{1:.{0}f}%'.format(np.mean(val_f0_accs)))
                # print('        : Initial Validation Loss = %.10f ' % (epoch_initial_loss_val))
                if config.use_gan:
                    print('        : Gen GAN Validation Loss = %.10f ' % (val_epoch_loss_generator_GAN))
                    print('        : Gen diff Validation Loss = %.10f ' % (val_epoch_loss_generator_diff))
                    print('        : Discriminator Validation Loss Real = %.10f ' % (val_epoch_loss_discriminator_real))
                    print('        : Discriminator Validation Loss Fake = %.10f ' % (val_epoch_loss_discriminator_fake))

            if (epoch + 1) % config.save_every == 0 or (epoch + 1) == config.num_epochs:
                utils.list_to_file(val_f0_accs,
                                   './ikala_eval/accuracies_' + str(epoch + 1) + '.txt')
                checkpoint_file = os.path.join(config.log_dir_m1, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=epoch)
def score_series_set(path, outpath, add_412, add_211, class_type):
    # score maps are built from sequence analysis and I have included examples of their format in git.
    if class_type.lower() == "transfer":
        seq_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/subset_transfer_sequence_score_map_25.csv",
            0, 1, "\n", ",", True)
        equiv_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/subset_transfer_concurrent_score_map_25.csv",
            0, 1, "\n", ",", True)
    elif class_type.lower() == "49_set":  # This was some testing work I did
        seq_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/49_cs_sequence_score_map_25.csv",
            0, 1, "\n", ",", True)
        equiv_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/49_cs_concurrent_score_map_25.csv",
            0, 1, "\n", ",", True)
    else:
        seq_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/combo_score_seq_mod_bonus.csv",
            0, 1, "\n", ",", True)
        equiv_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/combo_score_equiv_mod_bonus.csv",
            0, 1, "\n", ",", True)

    i = 0
    top_100 = []
    with open(path, "r") as x:
        data = x.readline()
        while data:
            if i % 10000 == 0:
                print(i)
            i += 1
            # if "10_" in data or "9_" in data:
            #     data = x.readline()
            #     continue
            line = data.strip().replace(" ", "").replace("[", "").replace("]", "").replace("'", "").split(",")
            if add_412 or add_211:
                for sem_x in range(0, len(line)):
                    if "CSC340" in line[sem_x] and add_412:
                        sem = line[sem_x].split("_")[0]
                        line.insert(sem_x + 1, sem + "_CSC412")
                        break
                    if "CSC210" in line[sem_x] and add_211:
                        sem = line[sem_x].split("_")[0]
                        line.insert(sem_x + 1, sem + "_CSC211")
                    # if "PHYS220" in line[sem_x]:  # Typically unneeded due to presence of PHYS230/222 scores that capture same info.
                    #     sem = line[sem_x].split("_")[0]
                    #     line.insert(sem_x + 1, sem + "_PHYS222")
                    # if "PHYS230" in line[sem_x]:
                    #     sem = line[sem_x].split("_")[0]
                    #     line.insert(sem_x + 1, sem + "_PHYS232")
            score_line = []
            for crs in line:
                if crs.startswith("0"):
                    continue
                score_line.append(crs)
            sem_dict = build_seq_sem_dict(score_line)
            score = score_seq(sem_dict, equiv_score_map, seq_score_map)
            top_100 = update_top_100([score, score_line], top_100)
            data = x.readline()
    print(i)
    if add_412:
        split_path = outpath.split(".")
        prefix = split_path[0]
        split_path[0] = prefix + "_412add"
        outpath = ".".join(split_path)
    if add_211:
        split_path = outpath.split(".")
        prefix = split_path[0]
        split_path[0] = prefix + "_211add"
        outpath = ".".join(split_path)
    utils.list_to_file(outpath, top_100)
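# Hedged sketch, not from the original repo: `update_top_100` is used above but
# not shown here. Its usage suggests it keeps only the 100 highest-scoring
# [score, sequence] entries seen so far.
def update_top_100(entry, top_100):
    top_100.append(entry)
    top_100.sort(key=lambda e: e[0], reverse=True)
    return top_100[:100]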