Example #1
def anagram(anag: str):
    """
    For all elements in all dictionaries, find words that contain any anagram of `anag` as a substring.

    For "non-consecutive anagrams" you just want a word bank: see wordbank.py.
    """
    found = []

    try:
        all_elems = utils.get_all_dicts()

        perms = perm_strs(anag)
        num_perms = len(perms)
        utils.print_progress_bar(0, num_perms)
        for i, perm in enumerate(perms):
            found.extend([elem for elem in all_elems if perm in elem])
            utils.print_progress_bar(i+1, num_perms)

    finally:
        print('found {} elems containing an anagram'.format(len(found)))
        if found:
            utils.list_to_file(fname_anagram(anag), found)
            if len(found) < 100:
                for elem in found:
                    print('\t-', elem)
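The helper `perm_strs` used above is not shown on this page; a minimal sketch of what it might look like, assuming it returns every distinct permutation of the letters of `anag` as a string:

from itertools import permutations

def perm_strs(anag: str):
    # Hypothetical sketch: all distinct orderings of the letters of `anag`,
    # joined back into strings (the set drops duplicates from repeated letters).
    return sorted({''.join(p) for p in permutations(anag)})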
Example #2
    def map_genomes_on_tree(self):
        my_log = logging.getLogger('train:map_genomes')

        self.organism_tree_map = {}
        mappings = set()

        for organism in self.organisms:
            if organism in self.organism_tree_map:
                continue    # already exists
            mapped = self.get_mapped_organism(organism)
            if mapped is None:
                self.organisms_invalid.add(organism)
                continue
            if str(mapped) == "1":
                self.organisms_invalid.add(organism)

            self.organism_tree_map[organism] = mapped
            if mapped not in self.tree_organism_map:
                self.tree_organism_map[mapped] = set()
            self.tree_organism_map[mapped].add(organism)
            mappings.add(mapped)

        self.n_frags_per_node = int(math.ceil(self.config.settings["number_examples"]/len(mappings)))

        if len(self.organisms_invalid) >= 1:
            my_log.info("these {nr} organisms will not be processed due to lack of mapping:\n{orgs}"
                             .format(nr=str(len(self.organisms_invalid)), orgs="\t".join(self.organisms_invalid)))
            utils.list_to_file(self.organisms_invalid,
                               os.path.join(self.config.settings["project_dir"], "organisms_not_used.txt"))
            if self.stat is not None:
                self.stat.add_written_file(os.path.join(self.config.settings["project_dir"], "organisms_not_used.txt"))
                self.stat.succesfully_written(os.path.join(self.config.settings["project_dir"], "organisms_not_used.txt"))
Example #3
def prediction_set(students, student_fields, course_fields, course_list, target, outpath, exclusive):

    local_students = []
    for student in students:
        if "progress" in student.status and "status" in target:
            continue
        local_students.append(student)


    df = dp.create_predictor_data_frame(local_students, student_fields, course_fields, course_list, target, exclusive)
    outputs = dp.classify_target(df, target, course_fields, 5)

    pred_output = outputs[0]
    value_output = outputs[1]
    pred_output[0].extend([" "])
    pred_output[0].extend(student_fields)
    pred_output[1].extend([" "])
    pred_output[1].extend(course_fields)
    pred_output[2].extend([" "])
    pred_output[2].extend(course_list)

    suffix = "_inclusive.csv"
    if exclusive:
        suffix = "_exclusive.csv"
    utils.list_to_file(outpath+"_"+target+suffix, pred_output)
    utils.list_to_file(outpath+"_"+target+"_feat_weight"+suffix, value_output)

    return
Example #4
def transform(fromstr: str, tostr: str):
    """
    For all elements in all dictionaries, find words that remain valid when the
    transformation s/fromstr/tostr is applied.
    """
    if tostr == '_':
        tostr = ''

    all_elems = utils.get_all_dicts()

    candidates = [
        elem for elem in all_elems if fromstr in elem and elem != fromstr
    ]
    print('found {} candidates containing substring: {}'.format(
        len(candidates), fromstr))

    valid = []
    for cand in candidates:
        transformed = cand.replace(fromstr, tostr)
        if transformed in all_elems:
            valid.append('{} -> {}'.format(cand, transformed))

    print('found {} valid elems after transformation'.format(len(valid)))
    if valid:
        utils.list_to_file(fname_transformed(fromstr, tostr), valid)
Example #5
File: wiki.py Project: maiamcc/xword_dicts
def wikisort_file(file: str):
    _, names = utils.file_to_list(file)
    scores = {}
    couldnt_find = []

    utils.print_progress_bar(0, len(names))
    for i, name in enumerate(names):
        try:
            scores[name] = views_per_month(name)
        except:
            # should probably keep track of the exceptions (so can tell if it's rate limiting etc.)
            couldnt_find.append(name)
        finally:
            utils.print_progress_bar(i + 1, len(names))

    print()
    print('---FAILED TO FIND---')
    print(couldnt_find)
    print('------')
    print()

    sort_by_views = [
        '{}\t{}'.format(k, v)
        for k, v in sorted(scores.items(), key=lambda x: x[1], reverse=True)
    ]

    utils.list_to_file(fname_ranked(file), sort_by_views, do_dedupe=False)
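`views_per_month` is defined elsewhere in wiki.py; a rough sketch of what such a helper could do, assuming it averages monthly article pageviews from the Wikimedia pageviews REST API (the endpoint parameters, date range, and User-Agent below are illustrative assumptions):

import requests

def views_per_month(title: str) -> float:
    # Hypothetical sketch: average monthly pageviews for an English Wikipedia
    # article over 2019. Any HTTP error raises, which the caller above catches.
    url = ('https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
           'en.wikipedia/all-access/all-agents/{}/monthly/20190101/20191231'
           .format(title.replace(' ', '_')))
    resp = requests.get(url, headers={'User-Agent': 'xword_dicts-example'})
    resp.raise_for_status()
    items = resp.json()['items']
    return sum(item['views'] for item in items) / max(len(items), 1)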
Example #6
def gen_train_val():
    mix_list = [x for x in os.listdir(config.backing_dir) if x.endswith('.hdf5') and x.startswith('med') ]

    train_list = mix_list[:int(len(mix_list)*config.split)]

    val_list = mix_list[int(len(mix_list)*config.split):]

    utils.list_to_file(val_list,config.log_dir+'val_files.txt')

    utils.list_to_file(train_list,config.log_dir+'train_files.txt')
Example #7
def to_output(parsed_play, output_path, output_path_base):
    play_lines, graph = parsed_play
    a = [ str(x) + "\n" for x in play_lines ]
    print("writing to", output_path)
    list_to_file(a, output_path_base + '.out')
    dot_graph = nx.nx_agraph.to_agraph(graph)
    dot_graph.write(output_path_base + ".dot")
    prog = ['dot', 'circo']
    for p in prog:
        dot_graph.layout(p)
        dot_graph.draw(output_path_base + "_" + p + ".png")
Example #8
def dedupe_from_file(file: str):
    """
    Read in the file $FILE as newline-separated list,
    dedupe list and write to $FILE.deduped
    """
    frontmatter, elems = utils.file_to_list(file, do_dedupe=False)
    print('Elems before dedupe: {}'.format(len(elems)))
    deduped = utils.dedupe(elems, verbose=True)
    print('Elems after dedupe: {}'.format(len(deduped)))

    utils.list_to_file(fname_deduped(file), frontmatter + deduped)
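`utils.dedupe` is not shown; a minimal order-preserving sketch, with the `verbose` flag assumed to report dropped duplicates:

def dedupe(elems, verbose=False):
    # Hypothetical sketch: keep the first occurrence of each element, preserving order.
    seen = set()
    result = []
    for elem in elems:
        if elem in seen:
            if verbose:
                print('dropping duplicate: {}'.format(elem))
            continue
        seen.add(elem)
        result.append(elem)
    return result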
Example #9
def gen_train_val():
    voc_list = [
        x for x in os.listdir(config.voice_dir)
        if x.endswith('.hdf5') and x.startswith('yam')
    ]

    train_list = voc_list[:int(len(voc_list) * 0.9)]
    val_list = voc_list[int(len(voc_list) * 0.9):]

    utils.list_to_file(val_list, './val_files.txt')

    utils.list_to_file(train_list, './train_files.txt')
Example #10
def gen_train_val():
    casas_list = [
        x for x in os.listdir(config.voice_dir)
        if x.endswith('.hdf5') and x.startswith('casas')
        and x not in config.do_not_use and not x.startswith('casasros')
    ]

    trn_list = casas_list[:int(len(casas_list) * 0.9)]

    val_list = casas_list[int(len(casas_list) * 0.9):]

    utils.list_to_file(val_list, config.log_dir + 'val_files.txt')

    utils.list_to_file(trn_list, config.log_dir + 'train_files.txt')
Example #11
def iterative_impact_tests(students, filter, compare_type, compare_dict, score_type, **kwargs):
    suffix = ".csv"
    if filter:  # if the filter flag is true, it splits students up into groupings; if not, it just runs them as is.
        group_types = ga.get_grouping_types(kwargs['groupings'])
        for group_set in group_types:
            print("init")
            filtered_students = ga.filter_students(kwargs['groupings'], group_set, students, **kwargs)
            datas = run_impact_tests(filtered_students, compare_type, compare_dict, score_type)

            suffix = ga.translate_header(kwargs['groupings'], group_set)

            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18" + compare_type +
                "_prereq_impact_stats" + suffix + "_score_" + score_type + ".csv", datas[0])
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18" + compare_type +
                "_elect_impact_stats" + suffix + "_score_" + score_type + ".csv", datas[1])
    else:
        datas = run_impact_tests(students, compare_type, compare_dict, score_type)
        if len(datas) == 1:
            return
        utils.list_to_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18" + compare_type +
            "_prereq_impact_stats" + suffix, datas[0])
        utils.list_to_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/3_18" + compare_type +
            "_all_impact_stats" + suffix, datas[1])
Example #12
File: vet.py Project: maiamcc/xword_dicts
def vet_file(basefile: str):
    """
    Read in the file $BASEFILE as a newline-separated list and
    offer elements to the user one by one to vet via the command line.
    Approved elements are stored in $BASEFILE.vetted.

    Elements-to-vet stored in $BASEFILE.in_prog_vet. If this file
    exists when `vet` is called, user has the option of continuing
    the in-progress vet or starting a new one.
    """
    in_prog_file = fname_in_prog_vet(basefile)
    accepted = []
    use_existing_vet = False
    if os.path.isfile(in_prog_file):
        print('Vet in progress, continue existing? [Y/n]')
        answer = utils.ask_user_yn()
        if answer:
            frontmatter, elems = utils.file_to_list(in_prog_file)
            _, accepted = utils.file_to_list(fname_vetted(basefile))
            use_existing_vet = True
        else:
            os.remove(in_prog_file)
    # if no in-prog file but there exists a .vetted file?
    if not use_existing_vet:
        frontmatter, elems = utils.file_to_list(basefile)

    print('Elems to vet: {}'.format(len(elems)))
    finished = False
    i = 0
    try:
        for i, elem in enumerate(elems):
            print('{} -- approve? [y/N]'.format(elem))
            answer = utils.ask_user_yn(default=False)
            if answer:
                accepted.append(elem)
        print('hooray, finished vet!')
        finished = True
    finally:
        if len(accepted) > 0:
            utils.list_to_file(fname_vetted(basefile), frontmatter + accepted)
        if finished:
            print('Accepted {} candidates'.format(len(accepted)))
            if os.path.isfile(in_prog_file):
                os.remove(in_prog_file)
        else:
            utils.list_to_file(in_prog_file, frontmatter + elems[i:])
            print('Accepted {} candidates ({} remaining)'.format(
                len(accepted),
                len(elems) - i))
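`utils.ask_user_yn` is defined elsewhere in the project; a minimal sketch, assuming an empty answer falls back to the default:

def ask_user_yn(default=True):
    # Hypothetical sketch: read y/n from stdin; empty input returns the default.
    answer = input().strip().lower()
    if not answer:
        return default
    return answer.startswith('y')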
Example #13
def combinate_file(file: str):
    """
    Read in the file of names $FILE as newline-separated list, and for every name,
    generate crossword candidates: [first last, first, last], etc.

    Stores results in $FILE.combinated.
    """
    frontmatter, names = utils.file_to_list(file)
    results = []
    for name in names:
        results.extend(combinate(name))

    print('{} names resulted in {} combinations'.format(
        len(names), len(results)))
    utils.list_to_file(fname_combinated(file), frontmatter + results)
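`combinate` is defined elsewhere in the project; a minimal sketch matching the docstring's description (the exact candidate set is an assumption):

def combinate(name: str):
    # Hypothetical sketch: derive crossword candidates from a "First Last" name.
    parts = name.split()
    if len(parts) < 2:
        return [name]
    first, last = parts[0], parts[-1]
    candidates = ['{} {}'.format(first, last), first, last]
    # Preserve order while dropping any duplicates.
    return list(dict.fromkeys(candidates))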
Example #14
def score_file(basefile: str):
    """
    Read in the file $BASEFILE as a newline-separated list and offer elements
    to the user one by one to score via the command line.
    Approved elements are stored in $BASEFILE.scored.

    Elements-to-score stored in $BASEFILE.in_prog_score. If this file
    exists when `score` is called, user has the option of continuing
    the in-progress score or starting a new one.
    """
    in_prog_file = fname_in_prog_score(basefile)
    scored = []
    use_existing_score = False
    if os.path.isfile(in_prog_file):
        print('Score in progress, continue existing? [Y/n]')
        answer = utils.ask_user_yn()
        if answer:
            frontmatter, elems = utils.file_to_list(in_prog_file)
            _, scored = utils.file_to_list(fname_scored(basefile))
            use_existing_score = True
        else:
            os.remove(in_prog_file)
    # if no in-prog file but there exists a .scored file?
    if not use_existing_score:
        frontmatter, elems = utils.file_to_list(basefile)

    print('Elems to score: {}'.format(len(elems)))
    finished = False
    i = 0
    try:
        for i, elem in enumerate(elems):
            print(elem)
            score = ask_user_score()
            scored.append('{};{}'.format(elem, score))
        print('hooray, finished scoring!')
        finished = True
    finally:
        if len(scored) > 0:
            utils.list_to_file(fname_scored(basefile), frontmatter + scored)
        if finished:
            print('Scored {} candidates'.format(len(scored)))
            if os.path.isfile(in_prog_file):
                os.remove(in_prog_file)
        else:
            utils.list_to_file(in_prog_file, frontmatter + elems[i:])
            print('Scored {} candidates ({} remaining)'.format(
                len(scored),
                len(elems) - i))
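`ask_user_score` is defined elsewhere; a minimal sketch, with the prompt text and accepted input as assumptions:

def ask_user_score():
    # Hypothetical sketch: prompt until the user enters an integer score.
    while True:
        raw = input('score: ').strip()
        try:
            return int(raw)
        except ValueError:
            print('please enter a whole number')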
Example #15
def iterative_sequence_tests(students, filter,  **kwargs):
    suffix = ".csv"
    if filter:
        group_types = ga.get_grouping_types(kwargs['groupings'])
        for group_set in group_types:
            filtered_students = ga.filter_students(kwargs['groupings'], group_set, students, kwargs)
            suffix = ga.translate_header(kwargs['groupings'], group_set)
            datas = sequence_analysis(filtered_students, kwargs['class_filter'], kwargs['support'])
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/series analysis/test_series_stats" + suffix+".csv",
                datas)
    else:
        datas = sequence_analysis(students, kwargs['class_filter'], kwargs['support'])
        utils.list_to_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/series analysis/series_stats" + suffix,
            datas)
Example #16
def feature_analysis_tests(students, socio_factors, course_factors, courses, target_course, time_isolation, **kwargs):
    isolation_set = False
    if 'groupings' in kwargs:

        group_types = ga.get_grouping_types(kwargs['groupings'])
        for group_set in group_types:
            if 'isolation' in kwargs:
                isolation_set = kwargs['isolation']

            filtered_students = ga.filter_students(kwargs['groupings'], group_set, students, kwargs)
            if len(filtered_students) == 0:
                continue

            suffix = ga.translate_header(kwargs['groupings'], group_set)
            df = ca.create_corr_dataframe(filtered_students, [], course_factors,  #socio factors are disabled for lr
                                          courses)
            datas = ca.LinearRegression(df, target_course+"_grade")
            utils.list_to_file("/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/linear_reg_extraction_"
                               + suffix +"_"+target_course +".csv", datas)
            df = ca.create_corr_dataframe(filtered_students, socio_factors, course_factors,
                                          courses)
            datas = ca.generic_impact_rf_feature_extract(df, target_course+"_grade")
            utils.list_to_file(
                "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/random_forest_extraction_"
                + suffix +target_course+".csv", datas)
            ca.correlation_analysis(df, "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/corr_grid_"
                                    + suffix +"_"+target_course)
    else:
        df = ca.create_corr_dataframe(students, course_factors,
                                      courses, time_isolation, target_course
                                      )
        #datas = ca.LinearRegression(df, target_course+"_grade")
        #utils.list_to_file(
        #    "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/linear_reg_extraction_allall"+target_course+".csv", datas)
        #df = ca.create_corr_dataframe(students, socio_factors, course_factors,
        #                                courses,
        #                                )
        #datas = ca.generic_impact_rf_feature_extract(df, target_course+"_grade")
        #utils.list_to_file(
        #    "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/random_forest_extraction_allall"+target_course+".csv", datas)
        if time_isolation:
            target_course += "_isolated"
        ca.correlation_analysis(df,
                                "/Users/thomasolson/Documents/workspace/advising_revamp/feature_extraction/concurrent_7_29_corr_grid_allall_"+target_course)
Example #17
def wordbank(wdbnk_counts: Dict[str, int], wdbnk: str):
    """
    For all elements in all dictionaries, find elements that contain all of the letters
    in the given wordbank
    """
    minchars = sum(wdbnk_counts.values())
    all_elems = utils.get_all_dicts()

    valid = []
    for elem in all_elems:
        if len(elem) < minchars:
            continue
        if matches_wordbank(elem, wdbnk_counts):
            valid.append(elem)

    print('found {} valid elems matching wordbank {}'.format(len(valid), wdbnk))
    if valid:
        utils.list_to_file(fname_wordbank(wdbnk), valid)
        if len(valid) < 100:
            for wd in valid:
                print('\t-', wd)
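`matches_wordbank` is assumed to check that an element contains at least the letter counts given in `wdbnk_counts`; a minimal sketch:

from collections import Counter

def matches_wordbank(elem: str, wdbnk_counts):
    # Hypothetical sketch: the element must contain every wordbank letter
    # at least as many times as the wordbank requires.
    elem_counts = Counter(elem)
    return all(elem_counts[ch] >= n for ch, n in wdbnk_counts.items())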
Example #18
def before_after(s: str):
    """
    For all elements in all dictionaries, find elements that, if split on string s, form two valid words
    """
    # minchars = sum(wdbnk_counts.values())
    all_elems = utils.get_all_dicts()

    valid = []
    for elem in all_elems:
        # if len(elem) < minchars:
        #     continue
        if matches_before_after(elem, s, all_elems):
            valid.append(pretty(elem, s))

    print('found {} valid elems matching before_after {}'.format(
        len(valid), s))
    if valid:
        utils.list_to_file(fname_before_after(s), valid)
        if len(valid) < 100:
            for wd in valid:
                print('\t-', wd)
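`matches_before_after` and `pretty` are defined elsewhere; a sketch of the check the docstring describes, assuming a single split on the first occurrence of `s`:

def matches_before_after(elem: str, s: str, all_elems) -> bool:
    # Hypothetical sketch: splitting on `s` must leave two non-empty halves,
    # both of which are themselves valid dictionary entries.
    if s not in elem or elem == s:
        return False
    before, _, after = elem.partition(s)
    return bool(before and after) and before in all_elems and after in all_elems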
Example #19
def precompute_sim(core_path, elective_path, request_type, vect_type, outpath):
    elective_data = utils.list_from_file(elective_path, "\n", ",", False)
    core_data = utils.list_from_file(core_path, "\n", ",", False)
    class_dict = build_class_key_vector(core_data, elective_data, request_type)
    student_list = utils.get_students_history()
    student_vects = {}
    for student in student_list:
        vect = build_student_vector(student, class_dict, request_type,
                                    vect_type)
        student_vects[student] = vect
    output = []
    for x in range(0, len(student_list)):
        student_a_vect = utils.grade_vect_to_bit(
            student_vects[student_list[x]])
        print(x)
        for y in range(x + 1, len(student_list)):
            student_b_vect = utils.grade_vect_to_bit(
                student_vects[student_list[y]])
            tani = 1.0 - jaccard_similarity_score(student_a_vect,
                                                  student_b_vect)
            output.append(
                str(student_list[x].id_num) + "," +
                str(student_list[y].id_num) + "," + str(tani))
    utils.list_to_file(outpath, output)
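The `tani` value above is meant to be a Jaccard/Tanimoto distance between the bit vectors produced by `utils.grade_vect_to_bit`; for orientation, a minimal sketch of that distance for two equal-length binary vectors (an assumption about the vector format):

def jaccard_distance(bits_a, bits_b):
    # Hypothetical sketch: 1 - |intersection| / |union| over positions set to 1.
    inter = sum(1 for a, b in zip(bits_a, bits_b) if a and b)
    union = sum(1 for a, b in zip(bits_a, bits_b) if a or b)
    return 1.0 if union == 0 else 1.0 - inter / union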
Example #20
def contains_in_order(wd: str):
    """
    For all elements in all dictionaries, find elements that contain the letters
    of the given word in order (though not necessarily consecutively).
    """
    minchars = len(wd)
    all_elems = utils.get_all_dicts()

    valid = []
    for elem in all_elems:
        if len(elem) < minchars:
            continue
        if wd in elem:  # candidate just contains the word wholesale (not spread out)
            continue
        if _contains_in_order(elem, wd):
            valid.append(elem)

    print('found {} valid elems that contain the ordered characters {}'.format(
        len(valid), wd))
    if valid:
        utils.list_to_file(fname_contains_in_order(wd), valid)
        if len(valid) < 100:
            for wd in valid:
                print('\t-', wd)
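`_contains_in_order` is not shown; a minimal subsequence-check sketch:

def _contains_in_order(elem: str, wd: str) -> bool:
    # Hypothetical sketch: True if the letters of `wd` appear in `elem`
    # in order, though not necessarily consecutively.
    it = iter(elem)
    return all(ch in it for ch in wd)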
Example #21
    def main_processing(self):
        """
            all steps of the training pipeline
                - processing ncbi sequences
                - tree processing (copying newick tree or building tree from a clade list)
                - mapping genomes on this tree
                - generating fragments from the sequences
                - dealing with sample specific data
                - generating kmer features
                - building models
        """

        if not self.config.settings["only_models"]:
            # if you do not want to build only models, check if the project directory is empty
            if len(os.listdir(self.config.settings["project_dir"])) != 0:
                self.log.warning("The project directory is not empty, this can result in unpredictable behavior.")
                if self.yes:
                    answer = "y"
                else:
                    answer = utils.get_answer_timeout("Remove? [Y/N]")
                if answer == "y":
                    self.log.info("Deleting old project directory..")
                    shutil.rmtree(self.config.settings["project_dir"])
                    os.mkdir(self.config.settings["project_dir"])
                else:
                    self.log.critical("Please provide an empty project directory. Quiting...")
                    sys.exit(1)

            if self.backup:
                self.stat = Status(logged=True)
                self.backupdir = os.path.join(self.config.settings["project_dir"], "backup")
                os.mkdir(self.backupdir)

            self.log.info("creating folder structure...")
            self.create_folderstructure()

            self.log.info("checking database")
            self.check_db()

            if len(self.config.genomes_exclude) != 0:
                # write organisms that will not be considered into a file
                utils.list_to_file(self.config.genomes_exclude,
                             os.path.join(self.config.settings["project_dir"], "excluded.txt"))

                if self.stat is not None:
                    self.stat.add_written_file(os.path.join(self.config.settings["project_dir"], "excluded.txt"))
                    self.stat.write_backup(self.backupdir)

            self.log.info("Processing NCBI data...")
            self.process_ncbi()
            if self.stat is not None:
                self.stat.change_variable(1, "status")
                self.stat.change_variable(self.organism_file_map, "organism_file_map")
                self.stat.change_variable(self.genomes_excluded, "genomes_excluded")
                self.stat.change_variable(self.organisms, "organisms")
                self.stat.write_backup(self.backupdir)

            self.log.info("tree processing...")
            self.tree_process()
            if self.stat is not None:
                self.stat.change_variable(2, "status")
                self.stat.change_variable(self.nodes, "nodes")
                self.stat.change_variable(self.tree_file, "tree_file")
                self.stat.write_backup(self.backupdir)

            self.log.info("mapping genomes on the tree...")
            self.map_genomes_on_tree()
            if self.stat is not None:
                self.stat.change_variable(3, "status")
                self.stat.change_variable("n_frags_per_node", "status")
                self.stat.change_variable("tree_organism_map", "status")
                self.stat.change_variable("organisms_invalid", "status")
                self.stat.change_variable("organism_tree_map", "status")
                self.stat.write_backup(self.backupdir)

            self.log.info("generating sequence fragments...")
            self.generate_seq()
            if self.stat is not None:
                self.stat.change_variable(4, "status")
                self.stat.write_backup(self.backupdir)

            self.log.info("sample specific stuff...")
            self.sample_specific()
            if self.stat is not None:
                self.stat.change_variable(5, "status")
                self.stat.write_backup(self.backupdir)

            self.log.info("generating kmer features...")
            self.generate_kmer_features_concat()
            if self.stat is not None:
                self.stat.change_variable(6, "status")
                self.stat.write_backup(self.backupdir)

        else:
            self.log.info("reading tree string")
            self.config.settings["tree_file"] = os.path.join(self.config.settings["project_dir"],
                                                             "tree.newick")

        self.log.info("building models")
        self.build_models()

        if self.config.settings["clean_up_train"]:
            self.log.info("Cleaning..")
            shutil.rmtree(os.path.join(self.config.settings["project_dir"], "train_data"))
            shutil.rmtree(os.path.join(self.config.settings["project_dir"], "sampled_fasta"))

        self.sqlite_taxonomy.close()
        self.log.info("Processing finished ...models are ready in {}".format(os.path.join(self.config.settings["project_dir"], "models")))
Example #22
    def tree_process(self):
        my_log = logging.getLogger('train:tree_processing')
        if self.config.settings["tree_file"] is None or self.config.settings["tree_file"] == "":
            # create clade list (own method?)
            descandents = {}
            for organism in self.organisms:
                for rank in self.config.settings["taxonomy_ranks"]:
                    parent = self.sqlite_taxonomy.parent_at_rank(organism, rank)
                    print('!!! Call: parent = self.sqlite_taxonomy.parent_at_rank(organism, rank) !!!')
                    if parent in descandents:
                        descandents[parent] += 1
                    else:
                        descandents[parent] = 1
            for parent in descandents.keys():
                if parent is None or parent == "":
                    continue
                if descandents[parent] >= self.config.settings["n_min_genomes_generic"]:
                    self.nodes.append(parent)
        else:
            # read the tree_string
            nodes_tmp = utils.get_lines(self.config.settings["tree_file"])
            for n in nodes_tmp:
                if n != "" and n is not None:
                    self.nodes.append(n.rstrip("\n"))
            tree_string = self.nodes[0]

        # check if the tree is a newick string or a node list
        t_file = os.path.join(self.config.settings["project_dir"], "tree.newick")
        if ";" not in self.nodes[0] and len(self.nodes) >= 1:
            my_log.debug("Generating tree from the clades list ({} clades)....".format(str(len(self.nodes))))
            clades_file = os.path.join(self.config.settings["project_dir"], "clades.txt")
            utils.list_to_file(self.nodes, clades_file)

            if self.stat is not None:
                self.stat.add_written_file(clades_file)
                self.stat.succesfully_written(clades_file)

            # run script to create tree
            obj = ncbi2newick.Ncbi2Newick(self.config.ncbi_tax_db, logged=True)
            obj.tree_from_nodes(clades_file)
            obj.tree_to_file(t_file)
            obj.close()

        else:
            my_log.debug("Copying tree to the project directory...")
            fw = open(t_file, "w")
            if self.stat is not None:
                self.stat.add_written_file(t_file)
            fw.write(tree_string)
            if self.stat is not None:
                self.stat.succesfully_written(t_file)
            fw.close()

            # change tree_file to the tree in the project directory
        # get back the tree_string  and nodes, this is necessary for further processing
        self.tree_file = os.path.join(self.config.settings["project_dir"], "tree.newick")
        fr = open(self.tree_file, "r")
        tree_string = fr.readline().rstrip()
        fr.close()
        if tree_string == "":
            my_log.critical("First line is empty in the newick file: {}".format(self.tree_file))
            sys.exit(1)
        self.nodes = ncbi2newick.get_nodes_from_newick(self.tree_file)
Example #23
def histo_analysis():
    students = it.package_student_data()
    filtered_students = ga.filter_students(["admin_descript"], ["Transfer_Start"], students)
    datas = sa.course_semester_histogram(filtered_students, True)
    utils.list_to_file("/Users/thomasolson/Documents/workspace/advising_revamp/series analysis/xfer_class_histo.csv", datas)
Example #24
def eval_file():
    file_path = config.wav_dir

    # log_dir = './log_ikala_notrain/'
    log_dir = config.log_dir

    mode = 0

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')

    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_voc = np.array(stat_file["voc_stft_maximus"])
    min_voc = np.array(stat_file["voc_stft_minimus"])
    max_back = np.array(stat_file["back_stft_maximus"])
    min_back = np.array(stat_file["back_stft_minimus"])
    max_mix = np.array(max_voc) + np.array(max_back)

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(log_dir)

        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            # saver.restore(sess, ckpt.model_checkpoint_path)
            saver.restore(sess, './log/model.ckpt-59')

        # import pdb;pdb.set_trace()

        files = [
            x for x in os.listdir(config.wav_dir)
            if x.endswith('.wav') and not x.startswith('.')
        ]
        diffs = []
        count = 0
        for file_name in files:

            count += 1

            mix_stft = utils.file_to_stft(os.path.join(file_path, file_name),
                                          mode=mode)

            targs = utils.input_to_feats(os.path.join(file_path, file_name),
                                         mode=mode)

            # f0_sac = utils.file_to_sac(os.path.join(file_path,file_name))
            # f0_sac = (f0_sac-min_feat[-2])/(max_feat[-2]-min_feat[-2])

            in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
            in_batches = in_batches / max_mix
            # in_batches = utils.normalize(in_batches, 'mix_stft', mode=config.norm_mode_in)
            val_outer = []

            first_pred = []

            cleaner = []

            gan_op = []

            for in_batch in in_batches:
                val_harm, val_ap, val_f0, val_vuv = sess.run(
                    [harm, ap, f0, vuv],
                    feed_dict={input_placeholder: in_batch})
                if config.use_gan:
                    val_op = sess.run(gen_op,
                                      feed_dict={input_placeholder: in_batch})

                    gan_op.append(val_op)

                # first_pred.append(harm1)
                # cleaner.append(val_harm)
                val_harm = val_harm
                val_outs = np.concatenate((val_harm, val_ap, val_f0, val_vuv),
                                          axis=-1)
                val_outer.append(val_outs)

            val_outer = np.array(val_outer)
            val_outer = utils.overlapadd(val_outer, nchunks_in)
            val_outer[:, -1] = np.round(val_outer[:, -1])
            val_outer = val_outer[:targs.shape[0], :]
            val_outer = np.clip(val_outer, 0.0, 1.0)

            #Test purposes only
            # first_pred = np.array(first_pred)
            # first_pred = utils.overlapadd(first_pred, nchunks_in)

            # cleaner = np.array(cleaner)
            # cleaner = utils.overlapadd(cleaner, nchunks_in)

            f0_output = val_outer[:, -2] * (
                (max_feat[-2] - min_feat[-2]) + min_feat[-2])
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output = utils.new_base_to_hertz(f0_output)
            f0_gt = targs[:, -2]
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt = utils.new_base_to_hertz(f0_gt)
            f0_outputs = []
            gt_outputs = []
            for i, f0_o in enumerate(f0_output):
                f0_outputs.append(
                    str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o))

            for i, f0_o in enumerate(f0_gt):
                gt_outputs.append(
                    str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o))

            utils.list_to_file(
                f0_outputs, './ikala_eval/net_out/' + file_name[:-4] + '.pv')
            utils.list_to_file(gt_outputs,
                               './ikala_eval/sac_gt/' + file_name[:-4] + '.pv')
            #     f0_difference = np.nan_to_num(abs(f0_gt-f0_output))
            #     f0_greater = np.where(f0_difference>config.f0_threshold)

            #     diff_per = f0_greater[0].shape[0]/len(f0_output)
            #     diffs.append(str(1-diff_per))
            utils.progress(count, len(files))
Example #25
def cross_comp():
    ikala_gt_dir = './ikala_eval/ikala_gt/'
    net_out_dir = './ikala_eval/net_out/'
    sac_gt_dir = './ikala_eval/sac_gt/'

    file_list = [
        x for x in os.listdir(net_out_dir)
        if x.endswith('.pv') and not x.startswith('.')
    ]

    output = []

    for file_name in file_list:
        out_time, out_freq = mir_eval.io.load_time_series(net_out_dir +
                                                          file_name)

        for i, freq in enumerate(out_freq):
            if float(freq) == 0.0:
                out_freq[i] = 0
            else:
                out_freq[i] = utils.f0_to_hertz(float(freq))

        out_freq, out_vuv = mir_eval.melody.freq_to_voicing(out_freq)

        ref_time_o, ref_freq_o = mir_eval.io.load_time_series(ikala_gt_dir +
                                                              file_name)

        for i, freq in enumerate(ref_freq_o):
            if float(freq) == 0.0:
                ref_freq_o[i] = 0
            else:
                ref_freq_o[i] = utils.f0_to_hertz(float(freq))

        plt.figure(1)
        plt.plot(out_freq)
        plt.plot(ref_freq_o)
        plt.show()

        # import pdb;pdb.set_trace()

        haha = mir_eval.melody.evaluate(ref_time_o, ref_freq_o, out_time,
                                        out_freq)

        out_string = file_name

        for key in haha.keys():
            out_string = out_string + ';' + str(haha[key])

        # import pdb;pdb.set_trace()

        # ref_freq_o, ref_vuv_o = mir_eval.melody.freq_to_voicing(ref_freq_o)
        # ref_freq,ref_vuv = mir_eval.melody.resample_melody_series(ref_time_o,ref_freq_o, ref_vuv_o,out_time)

        # out_freq_o, out_vuv_o = mir_eval.melody.resample_melody_series(out_time,out_freq, out_vuv,ref_time_o)

        # raw_pitch_accuracy_10_o = mir_eval.melody.raw_pitch_accuracy(ref_vuv_o,ref_freq_o,out_vuv_o,out_freq_o, cent_tolerance = 10)
        # raw_pitch_accuracy_25_o = mir_eval.melody.raw_pitch_accuracy(ref_vuv_o,ref_freq_o,out_vuv_o,out_freq_o, cent_tolerance = 25)
        # raw_pitch_accuracy_50_o = mir_eval.melody.raw_pitch_accuracy(ref_vuv_o,ref_freq_o,out_vuv_o,out_freq_o, cent_tolerance = 50)
        # raw_chroma_accuracy_o = mir_eval.melody.raw_chroma_accuracy(ref_vuv_o,ref_freq_o,out_vuv_o,out_freq_o)

        # raw_pitch_accuracy_10 = mir_eval.melody.raw_pitch_accuracy(ref_vuv,ref_freq,out_vuv,out_freq, cent_tolerance = 10)
        # raw_pitch_accuracy_25 = mir_eval.melody.raw_pitch_accuracy(ref_vuv,ref_freq,out_vuv,out_freq, cent_tolerance = 25)
        # raw_pitch_accuracy_50 = mir_eval.melody.raw_pitch_accuracy(ref_vuv,ref_freq,out_vuv,out_freq, cent_tolerance = 50)
        # raw_chroma_accuracy = mir_eval.melody.raw_chroma_accuracy(ref_vuv,ref_freq,out_vuv,out_freq)

        # import pdb;pdb.set_trace()
        output.append(out_string)

        # output.append(file_name+';'+str(raw_pitch_accuracy_10)+';'+str(raw_pitch_accuracy_25)+';'+str(raw_pitch_accuracy_50)+';'+str(raw_chroma_accuracy)+';'+str(raw_pitch_accuracy_10_o)+';'+str(raw_pitch_accuracy_25_o)+';'+str(raw_pitch_accuracy_50_o)+';'+str(raw_chroma_accuracy_o))

    utils.list_to_file(output, './ikala_eval/mir_eval_results.txt')
Example #26
def cluster(student_vects, request_type, vect_type, base_dir, dissim_path):
    cluster_data = []
    student_list = []
    student_output = []
    for student in student_vects:
        cluster_data.append(student_vects[student])
        student_output.append(
            str(student.id_num) + "," + str(student.grade_adj) + "," +
            str(student.age))
        student_list.append(student)

    #ms = MeanShift().fit_predict(cluster_data)
    #ward = AgglomerativeClustering(n_clusters=5, linkage='ward').fit_predict(cluster_data)
    #utils.list_to_file(base_dir+"/test_labels_"+request_type+"_"+vect_type, student_output)
    #utils.list_to_file("test_clusters_ms", ms)
    n_set = [5]
    #dissim_list = utils.list_from_file(dissim_path,"\n", ",", False)
    #dissim_dict = ca.format_dissim_list(dissim_list)
    """
    for n in n_set:
        pred_clusters = KMeans(n_clusters=n).fit(cluster_data)
        analysis_set = ca.cluster_analysis(cluster_data, pred_clusters.labels_, student_list, dissim_dict)
        os.mkdir(base_dir+"/"+str(n)+"_run")
        for clust in analysis_set:
            ca.print_stats(analysis_set[clust], clust, base_dir+"/"+str(n)+"_run")
            utils.list_to_file("test_clusters_kmeans_"+str(n)+"_"+request_type+"_"+vect_type, pred_clusters.labels_)

    """
    utils.list_to_file(
        "test_labels_2_2" + vect_type + "_" + request_type + ".csv",
        student_output)

    af = AffinityPropagation().fit(cluster_data)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    #print('Estimated number of clusters: %d' % n_clusters_)
    #print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    #print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    #print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    #print("Adjusted Rand Index: %0.3f"
    #      % metrics.adjusted_rand_score(labels_true, labels))
    #print("Adjusted Mutual Information: %0.3f"
    #      % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(cluster_data, labels, metric='sqeuclidean'))
    utils.list_to_file("test_clusters_af_" + vect_type + "_" + request_type,
                       af.labels_)
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, af.labels_))

    print("km 10")
    pred_clusters = KMeans(n_clusters=10).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_kmeans_10_" + vect_type + "_" + request_type,
        pred_clusters.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, pred_clusters.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, pred_clusters.labels_))
    print("km 15")
    pred_clusters = KMeans(n_clusters=15).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_kmeans_15_" + vect_type + "_" + request_type,
        pred_clusters.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, pred_clusters.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, pred_clusters.labels_))
    print("km 5")
    pred_clusters = KMeans(n_clusters=5).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_kmeans_5_" + vect_type + "_" + request_type,
        pred_clusters.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, pred_clusters.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, pred_clusters.labels_))
    #pred_clusters = KMeans(n_clusters=15).fit(cluster_data)
    #utils.list_to_file("test_clusters_kmeans_15_cls_core", pred_clusters.labels_)
    #utils.list_to_file("test_clusters_ward", ward)
    print("ward5")
    ward = AgglomerativeClustering(n_clusters=5).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_ward_5clust" + vect_type + "_" + request_type,
        ward.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ward.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, ward.labels_))
    print("ward10")
    ward = AgglomerativeClustering(n_clusters=10).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_ward_10clust" + vect_type + "_" + request_type,
        ward.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ward.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, ward.labels_))
    print("ward15")
    ward = AgglomerativeClustering(n_clusters=15).fit(cluster_data)
    utils.list_to_file(
        "test_clusters_ward_15clust" + vect_type + "_" + request_type,
        ward.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ward.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, ward.labels_))
    print("ms")
    ms = MeanShift().fit(cluster_data)
    utils.list_to_file("test_clusters_ms" + vect_type + "_" + request_type,
                       ms.labels_)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
        cluster_data, ms.labels_, metric='sqeuclidean'))
    print("DB Index: %0.3f" %
          metrics.davies_bouldin_score(cluster_data, ms.labels_))
Example #27
def train(_):
    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')
        tf.summary.histogram('inputs', input_placeholder)
        target_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size,
                                                   config.max_phr_len,
                                                   config.output_features),
                                            name='target_placeholder')
        tf.summary.histogram('targets', target_placeholder)

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

            # tf.summary.histogram('initial_output', op)

            tf.summary.histogram('harm', harm)

            tf.summary.histogram('ap', ap)

            tf.summary.histogram('f0', f0)

            tf.summary.histogram('vuv', vuv)

        if config.use_gan:

            with tf.variable_scope('Generator') as scope:
                gen_op = modules.GAN_generator(harm)
            with tf.variable_scope('Discriminator') as scope:
                D_real = modules.GAN_discriminator(
                    target_placeholder[:, :, :60], input_placeholder)
                scope.reuse_variables()
                D_fake = modules.GAN_discriminator(gen_op + harmy,
                                                   input_placeholder)

            # Comment out these lines to train without GAN

            D_loss_real = -tf.reduce_mean(tf.log(D_real + 1e-12))
            D_loss_fake = -tf.reduce_mean(tf.log(1. - (D_fake + 1e-12)))

            D_loss = D_loss_real + D_loss_fake

            D_summary_real = tf.summary.scalar('Discriminator_Loss_Real',
                                               D_loss_real)
            D_summary_fake = tf.summary.scalar('Discriminator_Loss_Fake',
                                               D_loss_fake)

            G_loss_GAN = -tf.reduce_mean(tf.log(D_fake + 1e-12))
            G_loss_diff = tf.reduce_sum(
                tf.abs(gen_op + harmy - target_placeholder[:, :, :60]) *
                (1 - target_placeholder[:, :, -1:])) * 0.5
            G_loss = G_loss_GAN + G_loss_diff

            G_summary_GAN = tf.summary.scalar('Generator_Loss_GAN', G_loss_GAN)
            G_summary_diff = tf.summary.scalar('Generator_Loss_diff',
                                               G_loss_diff)

            vars = tf.trainable_variables()

            # import pdb;pdb.set_trace()

            d_params = [
                v for v in vars if v.name.startswith('Discriminator/D')
            ]
            g_params = [v for v in vars if v.name.startswith('Generator/G')]

            # import pdb;pdb.set_trace()

            # d_optimizer_grad = tf.train.GradientDescentOptimizer(learning_rate=config.gan_lr).minimize(D_loss, var_list=d_params)
            # g_optimizer = tf.train.GradientDescentOptimizer(learning_rate=config.gan_lr).minimize(G_loss, var_list=g_params)

            d_optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=config.gan_lr).minimize(D_loss,
                                                      var_list=d_params)
            # g_optimizer_diff = tf.train.AdamOptimizer(learning_rate=config.gan_lr).minimize(G_loss_diff, var_list=g_params)
            g_optimizer = tf.train.AdamOptimizer(
                learning_rate=config.gan_lr).minimize(G_loss,
                                                      var_list=g_params)

        # initial_loss = tf.reduce_sum(tf.abs(op - target_placeholder[:,:,:60])*np.linspace(1.0,0.7,60)*(1-target_placeholder[:,:,-1:]))

        harm_loss = tf.reduce_sum(
            tf.abs(harm - target_placeholder[:, :, :60]) *
            np.linspace(1.0, 0.7, 60) * (1 - target_placeholder[:, :, -1:]))

        ap_loss = tf.reduce_sum(
            tf.abs(ap - target_placeholder[:, :, 60:-2]) *
            (1 - target_placeholder[:, :, -1:]))

        f0_loss = tf.reduce_sum(
            tf.abs(f0 - target_placeholder[:, :, -2:-1]) *
            (1 - target_placeholder[:, :, -1:]))

        # vuv_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=, logits=vuv))

        vuv_loss = tf.reduce_mean(
            tf.reduce_sum(binary_cross(target_placeholder[:, :, -1:], vuv)))

        loss = harm_loss + ap_loss + vuv_loss + f0_loss * config.f0_weight

        # initial_summary = tf.summary.scalar('initial_loss', initial_loss)

        harm_summary = tf.summary.scalar('harm_loss', harm_loss)

        ap_summary = tf.summary.scalar('ap_loss', ap_loss)

        f0_summary = tf.summary.scalar('f0_loss', f0_loss)

        vuv_summary = tf.summary.scalar('vuv_loss', vuv_loss)

        loss_summary = tf.summary.scalar('total_loss', loss)

        global_step = tf.Variable(0, name='global_step', trainable=False)

        optimizer = tf.train.AdamOptimizer(learning_rate=config.init_lr)

        # optimizer_f0 = tf.train.AdamOptimizer(learning_rate = config.init_lr)

        train_function = optimizer.minimize(loss, global_step=global_step)

        # train_f0 = optimizer.minimize(f0_loss, global_step= global_step)

        # train_harm = optimizer.minimize(harm_loss, global_step= global_step)

        # train_ap = optimizer.minimize(ap_loss, global_step= global_step)

        # train_f0 = optimizer.minimize(f0_loss, global_step= global_step)

        # train_vuv = optimizer.minimize(vuv_loss, global_step= global_step)

        summary = tf.summary.merge_all()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)
        sess = tf.Session()

        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir_m1)

        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        train_summary_writer = tf.summary.FileWriter(
            config.log_dir_m1 + 'train/', sess.graph)
        val_summary_writer = tf.summary.FileWriter(config.log_dir_m1 + 'val/',
                                                   sess.graph)

        start_epoch = int(
            sess.run(tf.train.get_global_step()) /
            (config.batches_per_epoch_train))

        print("Start from: %d" % start_epoch)
        f0_accs = []
        for epoch in xrange(start_epoch, config.num_epochs):
            val_f0_accs = []

            data_generator = data_gen()
            start_time = time.time()

            epoch_loss_harm = 0
            epoch_loss_ap = 0
            epoch_loss_f0 = 0
            epoch_loss_vuv = 0
            epoch_total_loss = 0
            # epoch_initial_loss = 0

            epoch_loss_harm_val = 0
            epoch_loss_ap_val = 0
            epoch_loss_f0_val = 0
            epoch_loss_vuv_val = 0
            epoch_total_loss_val = 0
            # epoch_initial_loss_val = 0

            if config.use_gan:
                epoch_loss_generator_GAN = 0
                epoch_loss_generator_diff = 0
                epoch_loss_discriminator_real = 0
                epoch_loss_discriminator_fake = 0

                val_epoch_loss_generator_GAN = 0
                val_epoch_loss_generator_diff = 0
                val_epoch_loss_discriminator_real = 0
                val_epoch_loss_discriminator_fake = 0

            batch_num = 0
            batch_num_val = 0
            val_generator = data_gen(mode='val')

            # val_generator = get_batches(train_filename=config.h5py_file_val, batches_per_epoch=config.batches_per_epoch_val_m1)

            with tf.variable_scope('Training'):

                for voc, feat in data_generator:
                    voc = np.clip(
                        voc + np.random.rand(config.max_phr_len,
                                             config.input_features) *
                        np.clip(np.random.rand(1), 0.0,
                                config.noise_threshold), 0.0, 1.0)

                    _, step_loss_harm, step_loss_ap, step_loss_f0, step_loss_vuv, step_total_loss = sess.run(
                        [
                            train_function, harm_loss, ap_loss, f0_loss,
                            vuv_loss, loss
                        ],
                        feed_dict={
                            input_placeholder: voc,
                            target_placeholder: feat
                        })
                    # _, step_loss_f0 = sess.run([train_f0, f0_loss], feed_dict={input_placeholder: voc,target_placeholder: feat})

                    if config.use_gan:
                        _, step_dis_loss_real, step_dis_loss_fake = sess.run(
                            [d_optimizer, D_loss_real, D_loss_fake],
                            feed_dict={
                                input_placeholder: voc,
                                target_placeholder: feat
                            })
                        _, step_gen_loss_GAN, step_gen_loss_diff = sess.run(
                            [g_optimizer, G_loss_GAN, G_loss_diff],
                            feed_dict={
                                input_placeholder: voc,
                                target_placeholder: feat
                            })
                    # else :
                    #     _, step_dis_loss_real, step_dis_loss_fake = sess.run([d_optimizer_grad, D_loss_real,D_loss_fake], feed_dict={input_placeholder: voc,target_placeholder: feat})
                    #     _, step_gen_loss_diff = sess.run([g_optimizer_diff, G_loss_diff], feed_dict={input_placeholder: voc,target_placeholder: feat})
                    #     step_gen_loss_GAN = 0

                    # _, step_loss_harm = sess.run([train_harm, harm_loss], feed_dict={input_placeholder: voc,target_placeholder: feat})
                    # _, step_loss_ap = sess.run([train_ap, ap_loss], feed_dict={input_placeholder: voc,target_placeholder: feat})
                    # _, step_loss_f0 = sess.run([train_f0, f0_loss], feed_dict={input_placeholder: voc,target_placeholder: feat})
                    # _, step_loss_vuv = sess.run([train_vuv, vuv_loss], feed_dict={input_placeholder: voc,target_placeholder: feat})

                    # epoch_initial_loss+=step_initial_loss
                    epoch_loss_harm += step_loss_harm
                    epoch_loss_ap += step_loss_ap
                    epoch_loss_f0 += step_loss_f0
                    epoch_loss_vuv += step_loss_vuv
                    epoch_total_loss += step_total_loss

                    if config.use_gan:

                        epoch_loss_generator_GAN += step_gen_loss_GAN
                        epoch_loss_generator_diff += step_gen_loss_diff
                        epoch_loss_discriminator_real += step_dis_loss_real
                        epoch_loss_discriminator_fake += step_dis_loss_fake

                    utils.progress(batch_num,
                                   config.batches_per_epoch_train,
                                   suffix='training done')
                    batch_num += 1

                # epoch_initial_loss = epoch_initial_loss/(config.batches_per_epoch_train *config.batch_size*config.max_phr_len*60)
                epoch_loss_harm = epoch_loss_harm / (
                    config.batches_per_epoch_train * config.batch_size *
                    config.max_phr_len * 60)
                epoch_loss_ap = epoch_loss_ap / (
                    config.batches_per_epoch_train * config.batch_size *
                    config.max_phr_len * 4)
                epoch_loss_f0 = epoch_loss_f0 / (
                    config.batches_per_epoch_train * config.batch_size *
                    config.max_phr_len)
                epoch_loss_vuv = epoch_loss_vuv / (
                    config.batches_per_epoch_train * config.batch_size *
                    config.max_phr_len)
                epoch_total_loss = epoch_total_loss / (
                    config.batches_per_epoch_train * config.batch_size *
                    config.max_phr_len * 66)

                if config.use_gan:

                    epoch_loss_generator_GAN = epoch_loss_generator_GAN / (
                        config.batches_per_epoch_train * config.batch_size)
                    epoch_loss_generator_diff = epoch_loss_generator_diff / (
                        config.batches_per_epoch_train * config.batch_size *
                        config.max_phr_len * 60)
                    epoch_loss_discriminator_real = epoch_loss_discriminator_real / (
                        config.batches_per_epoch_train * config.batch_size)
                    epoch_loss_discriminator_fake = epoch_loss_discriminator_fake / (
                        config.batches_per_epoch_train * config.batch_size)

                summary_str = sess.run(summary,
                                       feed_dict={
                                           input_placeholder: voc,
                                           target_placeholder: feat
                                       })
                train_summary_writer.add_summary(summary_str, epoch)
                # summary_writer.add_summary(summary_str_val, epoch)
                train_summary_writer.flush()

            with tf.variable_scope('Validation'):

                for voc, feat in val_generator:

                    step_loss_harm_val = sess.run(harm_loss,
                                                  feed_dict={
                                                      input_placeholder: voc,
                                                      target_placeholder: feat
                                                  })
                    step_loss_ap_val = sess.run(ap_loss,
                                                feed_dict={
                                                    input_placeholder: voc,
                                                    target_placeholder: feat
                                                })
                    step_loss_f0_val = sess.run(f0_loss,
                                                feed_dict={
                                                    input_placeholder: voc,
                                                    target_placeholder: feat
                                                })
                    step_loss_vuv_val = sess.run(vuv_loss,
                                                 feed_dict={
                                                     input_placeholder: voc,
                                                     target_placeholder: feat
                                                 })
                    step_total_loss_val = sess.run(loss,
                                                   feed_dict={
                                                       input_placeholder: voc,
                                                       target_placeholder: feat
                                                   })

                    epoch_loss_harm_val += step_loss_harm_val
                    epoch_loss_ap_val += step_loss_ap_val
                    epoch_loss_f0_val += step_loss_f0_val
                    epoch_loss_vuv_val += step_loss_vuv_val
                    epoch_total_loss_val += step_total_loss_val

                    if config.use_gan:

                        # Note: these step_* values still hold the results of the
                        # last training batch; the GAN losses are not re-evaluated
                        # on the validation data in this loop.
                        val_epoch_loss_generator_GAN += step_gen_loss_GAN
                        val_epoch_loss_generator_diff += step_gen_loss_diff
                        val_epoch_loss_discriminator_real += step_dis_loss_real
                        val_epoch_loss_discriminator_fake += step_dis_loss_fake

                    utils.progress(batch_num_val,
                                   config.batches_per_epoch_val_m1,
                                   suffix='validation done')
                    batch_num_val += 1

                # f0_accs.append(np.mean(val_f0_accs))

                # epoch_initial_loss_val = epoch_initial_loss_val/(config.batches_per_epoch_val_m1 *config.batch_size*config.max_phr_len*60)
                epoch_loss_harm_val = epoch_loss_harm_val / (
                    batch_num_val * config.batch_size * config.max_phr_len *
                    60)
                epoch_loss_ap_val = epoch_loss_ap_val / (
                    batch_num_val * config.batch_size * config.max_phr_len * 4)
                epoch_loss_f0_val = epoch_loss_f0_val / (
                    batch_num_val * config.batch_size * config.max_phr_len)
                epoch_loss_vuv_val = epoch_loss_vuv_val / (
                    batch_num_val * config.batch_size * config.max_phr_len)
                epoch_total_loss_val = epoch_total_loss_val / (
                    batch_num_val * config.batch_size * config.max_phr_len *
                    66)

                if config.use_gan:

                    val_epoch_loss_generator_GAN = val_epoch_loss_generator_GAN / (
                        config.batches_per_epoch_val_m1 * config.batch_size)
                    val_epoch_loss_generator_diff = val_epoch_loss_generator_diff / (
                        config.batches_per_epoch_val_m1 * config.batch_size *
                        config.max_phr_len * 60)
                    val_epoch_loss_discriminator_real = val_epoch_loss_discriminator_real / (
                        config.batches_per_epoch_val_m1 * config.batch_size)
                    val_epoch_loss_discriminator_fake = val_epoch_loss_discriminator_fake / (
                        config.batches_per_epoch_val_m1 * config.batch_size)

                summary_str = sess.run(summary,
                                       feed_dict={
                                           input_placeholder: voc,
                                           target_placeholder: feat
                                       })
                val_summary_writer.add_summary(summary_str, epoch)
                # summary_writer.add_summary(summary_str_val, epoch)
                val_summary_writer.flush()

            duration = time.time() - start_time

            # np.save('./ikala_eval/accuracies', f0_accs)

            if (epoch + 1) % config.print_every == 0:
                print('epoch %d: Harm Training Loss = %.10f (%.3f sec)' %
                      (epoch + 1, epoch_loss_harm, duration))
                print('        : Ap Training Loss = %.10f ' % (epoch_loss_ap))
                print('        : F0 Training Loss = %.10f ' % (epoch_loss_f0))
                print('        : VUV Training Loss = %.10f ' %
                      (epoch_loss_vuv))
                # print('        : Initial Training Loss = %.10f ' % (epoch_initial_loss))

                if config.use_gan:

                    print('        : Gen GAN Training Loss = %.10f ' %
                          (epoch_loss_generator_GAN))
                    print('        : Gen diff Training Loss = %.10f ' %
                          (epoch_loss_generator_diff))
                    print(
                        '        : Discriminator Training Loss Real = %.10f ' %
                        (epoch_loss_discriminator_real))
                    print(
                        '        : Discriminator Training Loss Fake = %.10f ' %
                        (epoch_loss_discriminator_fake))

                print('        : Harm Validation Loss = %.10f ' %
                      (epoch_loss_harm_val))
                print('        : Ap Validation Loss = %.10f ' %
                      (epoch_loss_ap_val))
                print('        : F0 Validation Loss = %.10f ' %
                      (epoch_loss_f0_val))
                print('        : VUV Validation Loss = %.10f ' %
                      (epoch_loss_vuv_val))

                # if (epoch + 1) % config.save_every == 0 or (epoch + 1) == config.num_epochs:
                # print('        : Mean F0 IKala Accuracy  = %.10f ' % (np.mean(val_f0_accs)))

                # print('        : Mean F0 IKala Accuracy = '+'%{1:.{0}f}%'.format(np.mean(val_f0_accs)))
                # print('        : Initial Validation Loss = %.10f ' % (epoch_initial_loss_val))

                if config.use_gan:

                    print('        : Gen GAN Validation Loss = %.10f ' %
                          (val_epoch_loss_generator_GAN))
                    print('        : Gen diff Validation Loss = %.10f ' %
                          (val_epoch_loss_generator_diff))
                    print(
                        '        : Discriminator Validation Loss Real = %.10f '
                        % (val_epoch_loss_discriminator_real))
                    print(
                        '        : Discriminator Validation Loss Fake = %.10f '
                        % (val_epoch_loss_discriminator_fake))

            if (epoch + 1) % config.save_every == 0 or (
                    epoch + 1) == config.num_epochs:
                utils.list_to_file(
                    val_f0_accs,
                    './ikala_eval/accuracies_' + str(epoch + 1) + '.txt')
                checkpoint_file = os.path.join(config.log_dir_m1, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=epoch)
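
Both this example and the next one finish by handing their results to utils.list_to_file, whose implementation is not shown in these snippets. Note that the argument order even differs between examples (here the list comes first, in the next example the output path comes first), so each project ships its own helper. The sketch below is only an illustrative guess, assuming the (path, items) order used in the next example and a plain one-item-per-line text format.

def list_to_file(path, items):
    """Write each element of `items` to `path`, one element per line (assumed format)."""
    # Nested items such as [score, schedule] pairs fall back to their default
    # str() representation; the real helpers may format these differently.
    with open(path, "w") as out:
        for item in items:
            out.write("{}\n".format(item))
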
Example #28
0
def score_series_set(path, outpath, add_412, add_211, class_type):

    # Score maps are built from sequence analysis; example files showing their format are included in git.
    if class_type.lower() == "transfer":
        seq_score_map = utils.dict_from_file("/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/subset_transfer_sequence_score_map_25.csv",
                                            0,1,"\n", ",", True)
        equiv_score_map = utils.dict_from_file("/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/subset_transfer_concurrent_score_map_25.csv",
                                            0,1,"\n", ",", True)
    elif class_type.lower() == "49_set":#This was some testing work I did
        seq_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/49_cs_sequence_score_map_25.csv",
            0, 1, "\n", ",", True)
        equiv_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/49_cs_concurrent_score_map_25.csv",
            0, 1, "\n", ",", True)
    else:
        seq_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/combo_score_seq_mod_bonus.csv",
            0, 1, "\n", ",", True)
        equiv_score_map = utils.dict_from_file(
            "/Users/thomasolson/Documents/workspace/advising_revamp/group analysis runs/combo_score_equiv_mod_bonus.csv",
            0, 1, "\n", ",", True)

    i = 0

    top_100 = []
    with open(path, "r") as x:
        data = x.readline()
        while data:
            if i % 10000 == 0:
                print(i)
            i+=1
            #if "10_" in data or "9_" in data:
            #    data = x.readline()
            #    continue
            line = data.strip().replace(" ", "").replace("[","").replace("]","").replace("'","").split(",")

            if add_412 or add_211:
                for sem_x in range(0,len(line)):
                    if "CSC340" in line[sem_x] and add_412:
                        sem = line[sem_x].split("_")[0]
                        line.insert(sem_x+1, sem+"_CSC412")
                        break
                    if "CSC210" in line[sem_x] and add_211:
                        sem = line[sem_x].split("_")[0]
                        line.insert(sem_x+1, sem+"_CSC211")
                    #if "PHYS220" in line[sem_x]:  #Typically unneeded due to presence of PHYS230/222 scores that capture same info.
                    #    sem = line[sem_x].split("_")[0]
                    #    line.insert(sem_x+1, sem+"_PHYS222")
                    #if "PHYS230" in line[sem_x]:
                    #    sem = line[sem_x].split("_")[0]
                    #    line.insert(sem_x+1, sem+"_PHYS232")

            score_line = []
            for crs in line:
                if crs.startswith("0"):
                    continue
                score_line.append(crs)

            sem_dict = build_seq_sem_dict(score_line)
            score = score_seq(sem_dict, equiv_score_map, seq_score_map)
            top_100 = update_top_100([score, score_line], top_100)
            data = x.readline()
    print(i)
    if add_412:
        split_path = outpath.split(".")
        prefix = split_path[0]
        split_path[0] = prefix + "_412add"
        outpath = ".".join(split_path)
    if add_211:
        split_path = outpath.split(".")
        prefix = split_path[0]
        split_path[0] = prefix + "_211add"
        outpath = ".".join(split_path)
    utils.list_to_file(outpath, top_100)
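
Example #28 also depends on three helpers that are not part of the snippet: build_seq_sem_dict, score_seq and update_top_100. Their real behavior is unknown here; the sketch below is a minimal illustration only, assuming semester-prefixed course tokens such as "3_CSC210", pairwise lookups into the two score maps, and a list capped at the 100 highest-scoring schedules. The pair-key format used against the score maps is likewise an assumption.

# Illustrative sketches only; key formats and scoring details are assumptions,
# not the actual implementations used by score_series_set.

def build_seq_sem_dict(score_line):
    """Group 'SEM_COURSE' tokens into {semester: [courses]} (token format assumed)."""
    sem_dict = {}
    for token in score_line:
        sem, course = token.split("_", 1)
        sem_dict.setdefault(sem, []).append(course)
    return sem_dict


def score_seq(sem_dict, equiv_score_map, seq_score_map):
    """Sum concurrent (same-semester) and sequential (cross-semester) pair scores."""
    def pair_key(a, b):
        # The real score maps define their own key format; this joiner is a guess.
        return a + "|" + b

    score = 0.0
    sems = sorted(sem_dict, key=int)  # assumes numeric semester labels
    for idx, sem in enumerate(sems):
        courses = sem_dict[sem]
        for a in courses:
            for b in courses:
                if a != b:
                    score += float(equiv_score_map.get(pair_key(a, b), 0))
        for later in sems[idx + 1:]:
            for a in courses:
                for b in sem_dict[later]:
                    score += float(seq_score_map.get(pair_key(a, b), 0))
    return score


def update_top_100(entry, top_100):
    """Keep only the 100 highest-scoring [score, schedule] entries."""
    top_100.append(entry)
    top_100.sort(key=lambda e: e[0], reverse=True)
    return top_100[:100]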