def main(args): r"""Evaluate pattern's accuracy with 10 folds. For a detailed explanation, see the html doc.:: ____ / __ \ /) | | | | _ _ _ __ ___ _ _ __ | | | || | | || '_ ` _ \ | || '_ \ | |__| || |_| || | | | | || || | | | \___\_\ \__,_||_| |_| |_||_||_| |_| Quantitative modeling of inflection """ np.random.seed(0) # make random generator determinist now = time.strftime("%Hh%M_%Y%m%d") segments.initialize(args.segments) paradigms, features = prepare_data(args) files = [Path(file).stem for file in args.paradigms] general_infos = {"Qumin_version": get_repository_version(), "lexemes": paradigms.shape[0], "paradigms": ";".join(files), "day_time": now} tasks = prepare_arguments(paradigms, args.iterations, args.methods, features) if args.workers == 1: results = list(chain(*(evaluate(t) for t in tqdm(tasks)))) else: pool = Pool(args.workers) results = list(chain(*tqdm(pool.imap_unordered(evaluate, tasks)))) pool.close() results = pd.DataFrame(results) for info in general_infos: results[info] = general_infos[info] results.to_csv("../Results/Patterns/eval_patterns_{}_{}.csv".format(now, "_".join(files))) print_summary(results, general_infos) figs = to_heatmap(results, paradigms.columns.levels[1].tolist()) for name, fig in figs: fig.savefig("../Results/Patterns/eval_patterns_heatmap_{}_{}_{}.png".format(now, name, "_".join(files)), dpi=300, bbox_inches='tight', pad_inches=0.5)
def main(args): r""" Infer Inflection classes as a lattice from alternation patterns. ____ / __ \ /) | | | | _ _ _ __ ___ _ _ __ | | | || | | || '_ ` _ \ | || '_ \ | |__| || |_| || | | | | || || | | | \___\_\ \__,_||_| |_| |_||_||_| |_| Quantitative modeling of inflection """ from os import path, makedirs import time now = time.strftime("%Hh%M") day = time.strftime("%Y%m%d") # Loading files and paths features_file_name = args.segments data_file_path = args.patterns data_file_name = path.basename(data_file_path) version = get_repository_version().rstrip("_") # Setting up the output path. result_dir = "../Results/{}/{}".format(args.folder, day) makedirs(result_dir, exist_ok=True) result_prefix = "{}/{}_{}_{}_{}_{}_{}lattice".format(result_dir, data_file_name, version, day, now, "aoc" if args.aoc else "full", "bipartite_" if args.bipartite else "_") if features_file_name != "ORTHO": # Initializing segments print("Initializing segments...") segments.initialize(features_file_name, sep="\t") print("Reading patterns...") pat_table, _ = patterns.from_csv(data_file_path) # pat_table = pat_table.applymap(str) # pat_table.columns = [x+" ~ "+y for x,y in pat_table.columns] collections = True comp = None if args.bipartite is not None: comp = "<comp>" try: pat_table2, _ = patterns.from_csv(args.bipartite) pat_table2.columns = [(comp + c1, c2) for (c1, c2) in pat_table2.columns] except: pat_table2 = pd.read_csv(args.bipartite, index_col=0).fillna("") pat_table2.columns = [comp + c for c in pat_table2.columns] pat_table = pat_table.join(pat_table2) else: print("Reading patterns...") pat_table = pd.read_csv(data_file_path, index_col=0) collections = False microclasses = find_microclasses(pat_table.applymap(str)) print("Building the lattice...") lattice = ICLattice(pat_table.loc[list(microclasses), :], microclasses, collections=collections, comp_prefix=comp, AOC=args.aoc, keep_names=(not args.shorten)) if args.stat: with open(result_prefix + "_stats.txt", "w", encoding="utf-8") as flow: print(lattice.stats().to_frame().T.to_latex(), file=flow) print(lattice.stats().to_frame().T.to_latex()) if args.png: lattice.draw(result_prefix + ".png", figsize=(20, 10), title=None, point=True) if args.pdf: lattice.draw(result_prefix + ".pdf", figsize=(20, 10), title=None, point=True) if args.html: print("Exporting to html:", result_prefix + ".html") lattice.to_html(result_prefix + ".html") if args.cxt: print("Exporting context to file:", result_prefix + ".cxt") lattice.context.tofile(result_prefix + ".cxt", frmat='cxt') if args.first: print("Here is the first level of the hierarchy:") print("Root:") obj, common = lattice.nodes.attributes["objects"], lattice.nodes.attributes["common"] if obj or common: print("\tdefines:", obj, common) for child in lattice.nodes.children: extent, common = child.labels, child.attributes["common"] print("extent:", extent, "\n\tdefines:", common, ">")
def main(args): r"""Find pairwise alternation patterns from paradigms. For a detailed explanation, see the html doc.:: ____ / __ \ /) | | | | _ _ _ __ ___ _ _ __ | | | || | | || '_ ` _ \ | || '_ \ | |__| || |_| || | | | | || || | | | \___\_\ \__,_||_| |_| |_||_||_| |_| Quantitative modeling of inflection """ from os import path, makedirs import time now = time.strftime("%Hh%M") day = time.strftime("%Y%m%d") # Loading files and paths kind = args.kind defective = args.defective overabundant = args.overabundant features_file_name = args.segments data_file_path = args.paradigms data_file_name = path.basename(data_file_path).rstrip("_") version = get_repository_version() # Setting up the output path. result_dir = "../Results/{}/".format(args.folder) makedirs(result_dir, exist_ok=True) result_prefix = "{}{}_{}_{}_{}_".format(result_dir, data_file_name, version, day, now) is_of_pattern_type = kind.startswith("patterns") segcheck = True # Initializing segments if features_file_name != "ORTHO": segments.initialize(features_file_name, sep="\t") elif is_of_pattern_type: raise argparse.ArgumentTypeError( "You can't find patterns on orthographic material.") else: segcheck = False patterns.ORTHO = True method = { 'globalAlt': 'global', 'localAlt': 'local', 'patternsLevenshtein': 'levenshtein', 'patternsPhonsim': 'similarity', 'patternsSuffix': 'suffix', 'patternsPrefix': 'prefix', 'patternsBaseline': 'baseline' } merge_cols = False if is_of_pattern_type: merge_cols = True paradigms = create_paradigms(data_file_path, defective=defective, overabundant=overabundant, merge_cols=merge_cols, segcheck=segcheck) print("Looking for patterns...") if kind.startswith("endings"): patterns_df = patterns.find_endings(paradigms) if kind.endswith("Pairs"): patterns_df = patterns.make_pairs(patterns_df) print(patterns_df) elif is_of_pattern_type: patterns_df, dic = patterns.find_patterns(paradigms, method[kind], optim_mem=args.optim_mem, gap_prop=args.gap_proportion) else: patterns_df = patterns.find_alternations(paradigms, method[kind]) if merge_cols and not args.merge_cols: # Re-build duplicate columns for a, b in patterns_df.columns: if "#" in a: cols = a.split("#") for c in cols: patterns_df[(c, b)] = patterns_df[(a, b)] patterns_df.drop((a, b), axis=1, inplace=True) for x, y in combinations(cols, 2): patterns_df[(x, y)] = patterns.Pattern.new_identity((x, y)) for a, b in patterns_df.columns: if "#" in b: cols = b.split("#") for c in cols: patterns_df[(a, c)] = patterns_df[(a, b)] patterns_df.drop((a, b), axis=1, inplace=True) for x, y in combinations(cols, 2): patterns_df[(x, y)] = patterns.Pattern.new_identity((x, y)) if patterns_df.isnull().values.any(): print("Warning: error, some patterns are None") print(patterns_df[patterns_df.isnull().values]) microclasses = find_microclasses(patterns_df.applymap(str)) filename = result_prefix + "_microclasses.txt" print("\nFound ", len(microclasses), " microclasses.\nPrinting microclasses to ", filename) with open(filename, "w", encoding="utf-8") as flow: for m in sorted(microclasses, key=lambda m: len(microclasses[m])): flow.write("\n\n{} ({}) \n\t".format(m, len(microclasses[m])) + ", ".join(microclasses[m])) patfilename = result_prefix + "_" + kind + ".csv" print("Printing patterns (importable by other scripts) to " + patfilename) if is_of_pattern_type: if args.optim_mem: patterns.to_csv( patterns_df, patfilename, pretty=True) # uses str because optim_mem already used repr print( "Since you asked for args.optim_mem, I will not export the human_readable file " ) else: 
patterns.to_csv(patterns_df, patfilename, pretty=False) # uses repr pathumanfilename = result_prefix + "_human_readable_" + kind + ".csv" print("Printing pretty patterns (for manual examination) to " + pathumanfilename) patterns.to_csv(patterns_df, pathumanfilename, pretty=True) # uses str else: patterns_df.to_csv(patfilename, sep=",")
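# Illustrative sketch only: the attributes read by the pattern-finding `main` above.
# The kind string must be one of the keys of the `method` mapping defined in the code;
# the file paths and the gap_proportion value are hypothetical placeholders.
def _example_find_patterns_invocation():
    from argparse import Namespace
    args = Namespace(kind="patternsPhonsim",            # one of the keys of `method`
                     defective=False,
                     overabundant=False,
                     segments="../Data/segments.csv",   # hypothetical feature file
                     paradigms="../Data/paradigms.csv", # hypothetical paradigm table
                     folder="Patterns",
                     optim_mem=False,
                     gap_proportion=0.24,               # hypothetical value
                     merge_cols=False)
    main(args)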
def main(args): r"""Compute entropies of flexional paradigms' distributions. For a detailed explanation, see the corresponding ipython Notebook and the html doc.:: ____ / __ \ /) | | | | _ _ _ __ ___ _ _ __ | | | || | | || '_ ` _ \ | || '_ \ | |__| || |_| || | | | | || || | | | \___\_\ \__,_||_| |_| |_||_||_| |_| Quantitative modeling of inflection """ patterns_file_path = args.patterns paradigms_file_path = args.paradigms data_file_name = path.basename(patterns_file_path).rstrip("_") verbose = args.verbose features_file_name = args.segments import time now = time.strftime("%Hh%M") day = time.strftime("%Y%m%d") # if compress and args.probabilities: # print("WARNING: Printing probabilitie log isn't possible" # " if we compress the data, so we won't compress.") # compress = False result_dir = "../Results/{}/{}".format(args.folder, day) makedirs(result_dir, exist_ok=True) version = get_repository_version() preds = sorted(args.nPreds) onePred = preds[0] == 1 if onePred: preds.pop(0) result_prefix = "{}/{}_{}_{}_{}_".format(result_dir, data_file_name, version, day, now) if onePred: #TODO: Changer la gestion du fichier de log logfile_name = result_prefix + "onePred_log.log" if args.nPreds: logfile_name = result_prefix + "nPreds_log.log" else: logfile_name = result_prefix + ".log" if verbose or args.probabilities: logfile = open(logfile_name, "w", encoding="utf-8") # Initialize the class of segments. segments.initialize(features_file_name, sep="\t") # Patterns pat_table, pat_dic = patterns.from_csv(patterns_file_path, defective=True, overabundant=False) # Inflectional paradigms: columns are cells, rows are lexemes. paradigms = create_paradigms(paradigms_file_path, defective=True, overabundant=False, merge_cols=args.cols_merged, segcheck=True) if pat_table.shape[0] < paradigms.shape[0]: print( "It looks like you ignored defective rows when computing patterns. I'll drop all defectives." ) paradigms = paradigms[(paradigms != "").all(axis=1)] sanity_check = verbose and len(pat_table.columns) < 10 if args.features is not None: features = create_features(args.features) else: features = None if args.bipartite: result_prefix = "{}/{}_{}_{}_{}_bipartite".format( result_dir, data_file_name, version, day, now) paradigms2 = create_paradigms(args.bipartite[1], defective=True, overabundant=False, merge_cols=args.cols_merged, segcheck=True) pat_table2, pat_dic2 = patterns.from_csv(args.bipartite[0], defective=True, overabundant=False) distrib = SplitPatternDistribution( [paradigms, paradigms2], [pat_table, pat_table2], [pat_dic, pat_dic2], args.names, logfile=logfile if verbose or args.probabilities else None, features=features) if args.comp: ent_file1 = "{}onepredEntropies-{}.csv".format( result_prefix, args.names[0]) ent_file2 = "{}onepredEntropies-{}.csv".format( result_prefix, args.names[1]) I = "{}EntropiesI-{}{}.csv".format(result_prefix, *args.names) NMI = "{}EntropiesNMI-{}{}.csv".format(result_prefix, *args.names) distrib.distribs[0].entropy_matrix() entropies1 = distrib.distribs[0].entropies[1] distrib.distribs[1].entropy_matrix() entropies2 = distrib.distribs[1].entropies[1] mutual = distrib.mutual_information() normmutual = distrib.mutual_information(normalize=True) print("\nWriting to:", "\n\t".join([ent_file1, ent_file2, I, NMI])) entropies1.to_csv(ent_file1, sep="\t") entropies2.to_csv(ent_file2, sep="\t") mutual.to_csv(I, sep="\t") normmutual.to_csv(NMI, sep="\t") if args.verbose: # mean on df's index, then on Series' values. 
mean1 = entropies1.mean().mean() mean2 = entropies2.mean().mean() mean3 = mutual.mean().mean() mean4 = normmutual.mean().mean() print("Mean remaining H(c1 -> c2) for " + args.names[0], mean1) print("Mean remaining H(c1 -> c2) for " + args.names[1], mean2) print("Mean I({},{})".format(*args.names), mean3) print("Mean NMI({},{})".format(*args.names), mean4) else: distrib = PatternDistribution(paradigms, pat_table, pat_dic, features=features) if onePred: ent_file = "{}onePredEntropies.csv".format(result_prefix) effectifs_file = "{}onePredEntropiesEffectifs.csv".format( result_prefix) distrib.entropy_matrix() entropies = distrib.entropies[1] effectifs = distrib.effectifs[1] if args.stacked: entropies = entropies.stack() entropies.index = [ ' -> '.join(index[::-1]) for index in entropies.index.values ] print("\nWriting to: {}\n\tand {}".format(ent_file, effectifs_file)) entropies.to_csv(ent_file, sep="\t") effectifs.to_csv(effectifs_file, sep="\t") if args.verbose: # mean on df's index, then on Series' values. mean = entropies.mean().mean() print("Mean H(c1 -> c2) entropy: ", mean) print("Mean H(c1 -> c2) entropy: ", mean, file=logfile) if args.probabilities: check = distrib.one_pred_distrib_log(logfile, sanity_check=sanity_check) if sanity_check: scsuffix = "{}onePredEntropies_slow_method.csv" check_file = scsuffix.format(result_prefix) print("\nWriting slowly computed " "entropies to: {}".format(check_file)) check.to_csv(check_file, sep="\t") if preds: if args.importFile: distrib.read_entropy_from_file(args.importFile) for n in preds: n_ent_file = "{}{}PredsEntropies.csv".format(result_prefix, n) effectifs_file = "{}{}PredsEntropiesEffectifs.csv".format( result_prefix, n) distrib.n_preds_entropy_matrix(n) n_entropies = distrib.entropies[n] effectifs = distrib.effectifs[n] print("\nWriting to: {}\n\tand {}".format(n_ent_file, effectifs_file)) if args.stacked: n_entropies = n_entropies.stack() n_entropies.index = [ ' -> '.join(index[::-1]) for index in n_entropies.index.values ] n_entropies.to_csv(n_ent_file, sep="\t") effectifs.to_csv(effectifs_file, sep="\t") if args.verbose: # mean on df's index, then on Series' values. mean = n_entropies.mean().mean() print("Mean H(c1, ..., c{!s} -> c)" " entropy: ".format(n), mean) print("Mean H(c1, ..., c{!s} -> c)" " entropy: ".format(n), mean, file=logfile) if args.probabilities: n_check = distrib.n_preds_distrib_log( logfile, n, sanity_check=sanity_check) if sanity_check: scsuffix = "{}{}PredsEntropies_slow_method.csv" n_check_file = scsuffix.format(result_prefix, n) print("\nWriting slowly computed" " entropies to: {}".format(n_check_file)) n_check.to_csv(n_check_file, sep="\t") if onePred and verbose: distrib.value_check(n, logfile=logfile if verbose else None) print() if verbose or args.probabilities: print("\nWrote log to: {}".format(logfile_name)) logfile.close()
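# Illustrative sketch only: the attributes consulted by the entropy `main` above, with
# hypothetical paths and values. nPreds=[1] computes single-predictor entropies only;
# adding larger integers triggers the n-predictors loop further down.
def _example_entropy_invocation():
    from argparse import Namespace
    args = Namespace(patterns="../Results/some_patterns.csv",   # hypothetical patterns file
                     paradigms="../Data/paradigms.csv",          # hypothetical paradigm table
                     segments="../Data/segments.csv",
                     folder="Entropies",
                     nPreds=[1],
                     verbose=False,
                     probabilities=False,
                     features=None,
                     bipartite=None, names=None, comp=False,
                     cols_merged=False,
                     stacked=False,
                     importFile=None)
    main(args)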
def main(args): r"""Cluster lexemes in macroclasses according to alternation patterns. We strongly recommend the default setting for the measure (-m) and the algorithm (-a) For a detailed explanation, see the html doc.:: ____ / __ \ /) | | | | _ _ _ __ ___ _ _ __ | | | || | | || '_ ` _ \ | || '_ \ | |__| || |_| || | | | | || || | | | \___\_\ \__,_||_| |_| |_||_||_| |_| Quantitative modeling of inflection """ from os import path, makedirs import time import re now = time.strftime("%Hh%M") day = time.strftime("%Y%m%d") # Loading files and paths features_file_name = args.segments data_file_path = args.patterns data_file_name = path.basename(data_file_path).rstrip("_") version = get_repository_version() print(data_file_name) pattern_type_match = re.match(r".+_(.+)\.csv", data_file_name) if pattern_type_match is None: print("Did you rename the patterns file ? As a result, I do not know which type of pattern you used..") kind = "unknown" else: kind = pattern_type_match.groups()[0] # Setting up the output path. result_dir = "../Results/{}/{}".format(args.folder, day) makedirs(result_dir, exist_ok=True) result_prefix = "{}/{}_{}_{}_{}_".format(result_dir, data_file_name, version, day, now) # Initializing segments if features_file_name != "ORTHO": segments.initialize(features_file_name, sep="\t") pat_table, pat_dic = patterns.from_csv(data_file_path, defective=False, overabundant=False) pat_table = pat_table.applymap(str) else: pat_table = pd.read_csv(data_file_path, index_col=0) result_prefix += args.algorithm + "_" + args.measure measures = {"BU": {"DL": descriptionlength.BUDLClustersBuilder, "CD": distances.CompressionDistClustersBuilder, "UPGMA": distances.UPGMAClustersBuilder}, "TD": {"DL": descriptionlength.TDDLClustersBuilder}} algorithm_choice = {"BU": algorithms.bottom_up_clustering, "TD": algorithms.top_down_clustering} preferences = {"prefix": result_prefix, "clustering_algorithm": algorithm_choice[args.algorithm], "verbose": args.verbose, "debug": args.debug} attr = {"DL": "DL", "CD": "dist", "UPGMA": "dist"} # if args.randomised: # func = preferences["clustering_algorithm"] # randomised_algo = partial(algorithms.randomised, func, n=args.randomised) # preferences["clustering_algorithm"] = randomised_algo node = algorithms.hierarchical_clustering(pat_table, measures[args.algorithm][args.measure], **preferences) if args.measure == "DL": DL = "Min :" + str(find_min_attribute(node, "DL")) else: DL = "" experiment_id = " ".join([args.algorithm, args.measure, " on ", kind, DL, "(", version, day, now, ")", ]) # Saving png figure if MATPLOTLIB_LOADED: fig = plt.figure(figsize=(10, 20)) figname = result_prefix + "_figure.png" print("Drawing figure to: {}".format(figname)) node.draw(horizontal=True, square=True, leavesfunc=lambda x: x.labels[0] + " (" + str(x.attributes["size"]) + ")", nodefunc=lambda x: "{0:.3f}".format(x.attributes[attr[args.measure]]), keep_above_macroclass=True) fig.suptitle(experiment_id) fig.savefig(result_prefix + "_figure.png", bbox_inches='tight', pad_inches=.5) # Saving text tree print("Printing tree to: {}".format(result_prefix + "_tree.txt")) string_tree = repr(node) flow = open(result_prefix + "_tree.txt", "w", encoding="utf8") flow.write(string_tree) flow.write("\n" + experiment_id) flow.close()