Example #1
def main(args):
    r"""Evaluate pattern's accuracy with 10 folds.

    For a detailed explanation, see the html doc.::
      ____
     / __ \                    /)
    | |  | | _   _  _ __ ___   _  _ __
    | |  | || | | || '_ ` _ \ | || '_ \
    | |__| || |_| || | | | | || || | | |
     \___\_\ \__,_||_| |_| |_||_||_| |_|
      Quantitative modeling of inflection

    """
    np.random.seed(0)  # make the random generator deterministic
    now = time.strftime("%Hh%M_%Y%m%d")

    segments.initialize(args.segments)
    paradigms, features = prepare_data(args)

    files = [Path(file).stem for file in args.paradigms]

    general_infos = {"Qumin_version": get_repository_version(), "lexemes": paradigms.shape[0],
                     "paradigms": ";".join(files), "day_time": now}

    tasks = prepare_arguments(paradigms, args.iterations, args.methods, features)
    if args.workers == 1:
        results = list(chain(*(evaluate(t) for t in tqdm(tasks))))
    else:
        pool = Pool(args.workers)
        results = list(chain(*tqdm(pool.imap_unordered(evaluate, tasks))))
        pool.close()
        pool.join()  # wait for worker processes to exit cleanly

    results = pd.DataFrame(results)
    for info, value in general_infos.items():
        results[info] = value
    results.to_csv("../Results/Patterns/eval_patterns_{}_{}.csv".format(now, "_".join(files)))

    print_summary(results, general_infos)
    figs = to_heatmap(results, paradigms.columns.levels[1].tolist())
    for name, fig in figs:
        fig.savefig("../Results/Patterns/eval_patterns_heatmap_{}_{}_{}.png".format(now, name, "_".join(files)),
                    dpi=300, bbox_inches='tight', pad_inches=0.5)
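
# Usage sketch (hypothetical): a minimal driver for the entry point above.
# Argument names are inferred from the attribute accesses in main(); every
# path and value is a placeholder, and the module-level names main() relies
# on (np, time, Path, chain, Pool, tqdm, pd, segments, prepare_data,
# prepare_arguments, evaluate, ...) are assumed to be imported elsewhere.
from argparse import Namespace

args = Namespace(
    segments="data/segments.csv",      # placeholder feature file
    paradigms=["data/paradigms.csv"],  # stems of these paths label the run
    iterations=10,                     # forwarded to prepare_arguments
    methods=["suffix"],                # placeholder list of methods
    workers=1,                         # 1 = sequential; >1 = multiprocessing
)
main(args)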
Example #2
def main(args):
    r""" Infer Inflection classes as a lattice from alternation patterns.
      ____
     / __ \                    /)
    | |  | | _   _  _ __ ___   _  _ __
    | |  | || | | || '_ ` _ \ | || '_ \
    | |__| || |_| || | | | | || || | | |
     \___\_\ \__,_||_| |_| |_||_||_| |_|
      Quantitative modeling of inflection

    """
    from os import path, makedirs
    import time
    now = time.strftime("%Hh%M")
    day = time.strftime("%Y%m%d")

    # Loading files and paths

    features_file_name = args.segments
    data_file_path = args.patterns
    data_file_name = path.basename(data_file_path)
    version = get_repository_version().rstrip("_")

    # Setting up the output path.
    result_dir = "../Results/{}/{}".format(args.folder, day)
    makedirs(result_dir, exist_ok=True)
    result_prefix = "{}/{}_{}_{}_{}_{}_{}lattice".format(result_dir, data_file_name, version, day, now,
                                                         "aoc" if args.aoc else "full",
                                                         "bipartite_" if args.bipartite else "_")

    if features_file_name != "ORTHO":

        # Initializing segments
        print("Initializing segments...")
        segments.initialize(features_file_name, sep="\t")

        print("Reading patterns...")
        pat_table, _ = patterns.from_csv(data_file_path)
        # pat_table = pat_table.applymap(str)
        # pat_table.columns = [x+" ~ "+y for x,y in pat_table.columns]
        collections = True
        comp = None
        if args.bipartite is not None:
            comp = "<comp>"
            try:
                pat_table2, _ = patterns.from_csv(args.bipartite)
                pat_table2.columns = [(comp + c1, c2) for (c1, c2) in pat_table2.columns]
            except Exception:  # not a patterns file: fall back to a plain CSV
                pat_table2 = pd.read_csv(args.bipartite, index_col=0).fillna("")
                pat_table2.columns = [comp + c for c in pat_table2.columns]
            pat_table = pat_table.join(pat_table2)
    else:
        print("Reading patterns...")
        pat_table = pd.read_csv(data_file_path, index_col=0)
        collections = False

    microclasses = find_microclasses(pat_table.applymap(str))

    print("Building the lattice...")
    lattice = ICLattice(pat_table.loc[list(microclasses), :], microclasses,
                        collections=collections, comp_prefix=comp, AOC=args.aoc, keep_names=(not args.shorten))

    if args.stat:
        with open(result_prefix + "_stats.txt", "w", encoding="utf-8") as flow:
            print(lattice.stats().to_frame().T.to_latex(), file=flow)
            print(lattice.stats().to_frame().T.to_latex())

    if args.png:
        lattice.draw(result_prefix + ".png", figsize=(20, 10), title=None, point=True)

    if args.pdf:
        lattice.draw(result_prefix + ".pdf", figsize=(20, 10), title=None, point=True)

    if args.html:
        print("Exporting to html:", result_prefix + ".html")
        lattice.to_html(result_prefix + ".html")

    if args.cxt:
        print("Exporting context to file:", result_prefix + ".cxt")
        lattice.context.tofile(result_prefix + ".cxt", frmat='cxt')

    if args.first:
        print("Here is the first level of the hierarchy:")
        print("Root:")
        obj, common = lattice.nodes.attributes["objects"], lattice.nodes.attributes["common"]
        if obj or common:
            print("\tdefines:", obj, common)
        for child in lattice.nodes.children:
            extent, common = child.labels, child.attributes["common"]
            print("extent:", extent, "\n\tdefines:", common, ">")
Example #3
def main(args):
    r"""Find pairwise alternation patterns from paradigms.

    For a detailed explanation, see the html doc.::
      ____
     / __ \                    /)
    | |  | | _   _  _ __ ___   _  _ __
    | |  | || | | || '_ ` _ \ | || '_ \
    | |__| || |_| || | | | | || || | | |
     \___\_\ \__,_||_| |_| |_||_||_| |_|
      Quantitative modeling of inflection

    """
    from os import path, makedirs
    import time
    now = time.strftime("%Hh%M")
    day = time.strftime("%Y%m%d")

    # Loading files and paths
    kind = args.kind
    defective = args.defective
    overabundant = args.overabundant
    features_file_name = args.segments
    data_file_path = args.paradigms
    data_file_name = path.basename(data_file_path).rstrip("_")

    version = get_repository_version()
    # Setting up the output path.
    result_dir = "../Results/{}/".format(args.folder)
    makedirs(result_dir, exist_ok=True)
    result_prefix = "{}{}_{}_{}_{}_".format(result_dir, data_file_name,
                                            version, day, now)

    is_of_pattern_type = kind.startswith("patterns")
    segcheck = True

    # Initializing segments
    if features_file_name != "ORTHO":
        segments.initialize(features_file_name, sep="\t")
    elif is_of_pattern_type:
        raise argparse.ArgumentTypeError(
            "You can't find patterns on orthographic material.")
    else:
        segcheck = False
        patterns.ORTHO = True

    method = {
        'globalAlt': 'global',
        'localAlt': 'local',
        'patternsLevenshtein': 'levenshtein',
        'patternsPhonsim': 'similarity',
        'patternsSuffix': 'suffix',
        'patternsPrefix': 'prefix',
        'patternsBaseline': 'baseline'
    }

    merge_cols = False
    if is_of_pattern_type:
        merge_cols = True

    paradigms = create_paradigms(data_file_path,
                                 defective=defective,
                                 overabundant=overabundant,
                                 merge_cols=merge_cols,
                                 segcheck=segcheck)

    print("Looking for patterns...")
    if kind.startswith("endings"):
        patterns_df = patterns.find_endings(paradigms)
        if kind.endswith("Pairs"):
            patterns_df = patterns.make_pairs(patterns_df)
            print(patterns_df)
    elif is_of_pattern_type:
        patterns_df, dic = patterns.find_patterns(paradigms,
                                                  method[kind],
                                                  optim_mem=args.optim_mem,
                                                  gap_prop=args.gap_proportion)
    else:
        patterns_df = patterns.find_alternations(paradigms, method[kind])

    if merge_cols and not args.merge_cols:  # Re-build duplicate columns
        for a, b in patterns_df.columns:
            if "#" in a:
                cols = a.split("#")
                for c in cols:
                    patterns_df[(c, b)] = patterns_df[(a, b)]
                patterns_df.drop((a, b), axis=1, inplace=True)
                for x, y in combinations(cols, 2):
                    patterns_df[(x, y)] = patterns.Pattern.new_identity((x, y))

        for a, b in patterns_df.columns:
            if "#" in b:
                cols = b.split("#")
                for c in cols:
                    patterns_df[(a, c)] = patterns_df[(a, b)]
                patterns_df.drop((a, b), axis=1, inplace=True)
                for x, y in combinations(cols, 2):
                    patterns_df[(x, y)] = patterns.Pattern.new_identity((x, y))

    if patterns_df.isnull().values.any():
        print("Warning: error, some patterns are None")
        print(patterns_df[patterns_df.isnull().values])

    microclasses = find_microclasses(patterns_df.applymap(str))
    filename = result_prefix + "_microclasses.txt"
    print("\nFound ", len(microclasses),
          " microclasses.\nPrinting microclasses to ", filename)
    with open(filename, "w", encoding="utf-8") as flow:
        for m in sorted(microclasses, key=lambda m: len(microclasses[m])):
            flow.write("\n\n{} ({}) \n\t".format(m, len(microclasses[m])) +
                       ", ".join(microclasses[m]))

    patfilename = result_prefix + "_" + kind + ".csv"
    print("Printing patterns (importable by other scripts) to " + patfilename)
    if is_of_pattern_type:
        if args.optim_mem:
            patterns.to_csv(
                patterns_df, patfilename,
                pretty=True)  # uses str because optim_mem already used repr
            print("Since optim_mem was requested, "
                  "I will not export the human-readable file.")
        else:
            patterns.to_csv(patterns_df, patfilename,
                            pretty=False)  # uses repr
            pathumanfilename = result_prefix + "_human_readable_" + kind + ".csv"
            print("Printing pretty patterns (for manual examination) to " +
                  pathumanfilename)
            patterns.to_csv(patterns_df, pathumanfilename,
                            pretty=True)  # uses str
    else:
        patterns_df.to_csv(patfilename, sep=",")
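
# Usage sketch (hypothetical): a minimal driver for the pattern-finding
# entry point above. Argument names follow the attribute accesses in
# main(); values are placeholders, and module-level names (segments,
# patterns, create_paradigms, find_microclasses, combinations, ...) are
# assumed.
from argparse import Namespace

args = Namespace(
    kind="patternsPhonsim",        # any key of the `method` mapping above
    defective=False,               # keep or drop defective rows
    overabundant=False,            # keep or drop overabundant cells
    segments="data/segments.csv",  # "ORTHO" only works for non-pattern kinds
    paradigms="data/paradigms.csv",
    folder="Patterns",             # subfolder of ../Results/
    optim_mem=False,               # trade the pretty export for less memory
    gap_proportion=0.5,            # placeholder alignment gap cost
    merge_cols=False,              # whether to keep identical columns merged
)
main(args)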
Example #4
def main(args):
    r"""Compute entropies of flexional paradigms' distributions.

    For a detailed explanation, see the corresponding ipython Notebook
    and the html doc.::

          ____
         / __ \                    /)
        | |  | | _   _  _ __ ___   _  _ __
        | |  | || | | || '_ ` _ \ | || '_ \
        | |__| || |_| || | | | | || || | | |
         \___\_\ \__,_||_| |_| |_||_||_| |_|
          Quantitative modeling of inflection

    """
    from os import path, makedirs
    import time

    patterns_file_path = args.patterns
    paradigms_file_path = args.paradigms
    data_file_name = path.basename(patterns_file_path).rstrip("_")

    verbose = args.verbose
    features_file_name = args.segments

    now = time.strftime("%Hh%M")
    day = time.strftime("%Y%m%d")

    # if compress and args.probabilities:
    #     print("WARNING: Printing probabilitie log isn't possible"
    #           " if we compress the data, so we won't compress.")
    #     compress = False

    result_dir = "../Results/{}/{}".format(args.folder, day)
    makedirs(result_dir, exist_ok=True)
    version = get_repository_version()
    preds = sorted(args.nPreds)
    onePred = preds[0] == 1
    if onePred:
        preds.pop(0)
    result_prefix = "{}/{}_{}_{}_{}_".format(result_dir, data_file_name,
                                             version, day, now)

    if onePred:  # TODO: change how the log file is managed
        logfile_name = result_prefix + "onePred_log.log"
    elif preds:
        logfile_name = result_prefix + "nPreds_log.log"
    else:
        logfile_name = result_prefix + ".log"

    if verbose or args.probabilities:
        logfile = open(logfile_name, "w", encoding="utf-8")

    # Initialize the class of segments.
    segments.initialize(features_file_name, sep="\t")

    # Patterns
    pat_table, pat_dic = patterns.from_csv(patterns_file_path,
                                           defective=True,
                                           overabundant=False)

    # Inflectional paradigms: columns are cells, rows are lexemes.
    paradigms = create_paradigms(paradigms_file_path,
                                 defective=True,
                                 overabundant=False,
                                 merge_cols=args.cols_merged,
                                 segcheck=True)

    if pat_table.shape[0] < paradigms.shape[0]:
        print(
            "It looks like you ignored defective rows when computing patterns. I'll drop all defectives."
        )
        paradigms = paradigms[(paradigms != "").all(axis=1)]

    sanity_check = verbose and len(pat_table.columns) < 10

    if args.features is not None:
        features = create_features(args.features)
    else:
        features = None

    if args.bipartite:

        result_prefix = "{}/{}_{}_{}_{}_bipartite".format(
            result_dir, data_file_name, version, day, now)
        paradigms2 = create_paradigms(args.bipartite[1],
                                      defective=True,
                                      overabundant=False,
                                      merge_cols=args.cols_merged,
                                      segcheck=True)
        pat_table2, pat_dic2 = patterns.from_csv(args.bipartite[0],
                                                 defective=True,
                                                 overabundant=False)

        distrib = SplitPatternDistribution(
            [paradigms, paradigms2], [pat_table, pat_table2],
            [pat_dic, pat_dic2],
            args.names,
            logfile=logfile if verbose or args.probabilities else None,
            features=features)
        if args.comp:
            ent_file1 = "{}onepredEntropies-{}.csv".format(
                result_prefix, args.names[0])
            ent_file2 = "{}onepredEntropies-{}.csv".format(
                result_prefix, args.names[1])
            I = "{}EntropiesI-{}{}.csv".format(result_prefix, *args.names)
            NMI = "{}EntropiesNMI-{}{}.csv".format(result_prefix, *args.names)

            distrib.distribs[0].entropy_matrix()
            entropies1 = distrib.distribs[0].entropies[1]
            distrib.distribs[1].entropy_matrix()
            entropies2 = distrib.distribs[1].entropies[1]
            mutual = distrib.mutual_information()
            normmutual = distrib.mutual_information(normalize=True)

            print("\nWriting to:", "\n\t".join([ent_file1, ent_file2, I, NMI]))
            entropies1.to_csv(ent_file1, sep="\t")
            entropies2.to_csv(ent_file2, sep="\t")
            mutual.to_csv(I, sep="\t")
            normmutual.to_csv(NMI, sep="\t")
            if args.verbose:
                #  mean on df's index, then on Series' values.
                mean1 = entropies1.mean().mean()
                mean2 = entropies2.mean().mean()
                mean3 = mutual.mean().mean()
                mean4 = normmutual.mean().mean()
                print("Mean remaining H(c1 -> c2) for " + args.names[0], mean1)
                print("Mean remaining H(c1 -> c2) for " + args.names[1], mean2)
                print("Mean I({},{})".format(*args.names), mean3)
                print("Mean NMI({},{})".format(*args.names), mean4)

    else:
        distrib = PatternDistribution(paradigms,
                                      pat_table,
                                      pat_dic,
                                      features=features)

    if onePred:
        ent_file = "{}onePredEntropies.csv".format(result_prefix)
        effectifs_file = "{}onePredEntropiesEffectifs.csv".format(
            result_prefix)
        distrib.entropy_matrix()
        entropies = distrib.entropies[1]
        effectifs = distrib.effectifs[1]

        if args.stacked:
            entropies = entropies.stack()
            entropies.index = [
                ' -> '.join(index[::-1]) for index in entropies.index.values
            ]
        print("\nWriting to: {}\n\tand {}".format(ent_file, effectifs_file))
        entropies.to_csv(ent_file, sep="\t")
        effectifs.to_csv(effectifs_file, sep="\t")
        if args.verbose:
            #  mean on df's index, then on Series' values.
            mean = entropies.mean().mean()
            print("Mean H(c1 -> c2) entropy: ", mean)
            print("Mean H(c1 -> c2) entropy: ", mean, file=logfile)

        if args.probabilities:
            check = distrib.one_pred_distrib_log(logfile,
                                                 sanity_check=sanity_check)

            if sanity_check:
                scsuffix = "{}onePredEntropies_slow_method.csv"
                check_file = scsuffix.format(result_prefix)

                print("\nWriting slowly computed "
                      "entropies to: {}".format(check_file))

                check.to_csv(check_file, sep="\t")

    if preds:

        if args.importFile:
            distrib.read_entropy_from_file(args.importFile)

        for n in preds:
            n_ent_file = "{}{}PredsEntropies.csv".format(result_prefix, n)
            effectifs_file = "{}{}PredsEntropiesEffectifs.csv".format(
                result_prefix, n)
            distrib.n_preds_entropy_matrix(n)
            n_entropies = distrib.entropies[n]
            effectifs = distrib.effectifs[n]
            print("\nWriting to: {}\n\tand {}".format(n_ent_file,
                                                      effectifs_file))
            if args.stacked:
                n_entropies = n_entropies.stack()
                n_entropies.index = [
                    ' -> '.join(index[::-1])
                    for index in n_entropies.index.values
                ]
            n_entropies.to_csv(n_ent_file, sep="\t")
            effectifs.to_csv(effectifs_file, sep="\t")
            if args.verbose:
                #  mean on df's index, then on Series' values.
                mean = n_entropies.mean().mean()
                print("Mean H(c1, ..., c{!s} -> c)"
                      "  entropy: ".format(n), mean)
                print("Mean H(c1, ..., c{!s} -> c)"
                      "  entropy: ".format(n),
                      mean,
                      file=logfile)
            if args.probabilities:
                n_check = distrib.n_preds_distrib_log(
                    logfile, n, sanity_check=sanity_check)

                if sanity_check:
                    scsuffix = "{}{}PredsEntropies_slow_method.csv"
                    n_check_file = scsuffix.format(result_prefix, n)
                    print("\nWriting slowly computed"
                          " entropies to: {}".format(n_check_file))
                    n_check.to_csv(n_check_file, sep="\t")

            if onePred and verbose:
                distrib.value_check(n, logfile=logfile if verbose else None)

    print()

    if verbose or args.probabilities:
        print("\nWrote log to: {}".format(logfile_name))
        logfile.close()
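
# Usage sketch (hypothetical): a minimal driver for the entropy entry point
# above. Argument names are taken from the attribute accesses in main();
# values are placeholders, and module-level names (segments, patterns,
# create_paradigms, create_features, PatternDistribution,
# SplitPatternDistribution, ...) are assumed.
from argparse import Namespace

args = Namespace(
    patterns="results/patterns.csv",   # placeholder patterns file
    paradigms="data/paradigms.csv",
    segments="data/segments.csv",
    folder="Entropies",                # subfolder of ../Results/
    nPreds=[1],                        # 1 = single-predictor entropies only
    verbose=False,
    probabilities=False,               # also log full distributions
    features=None,                     # optional extra features file
    bipartite=None,                    # e.g. ["patterns2.csv", "paradigms2.csv"]
    names=None,                        # labels for the two bipartite systems
    comp=False,                        # compare the two systems (bipartite)
    cols_merged=False,
    stacked=False,                     # long instead of wide entropy tables
    importFile=None,                   # reuse previously computed entropies
)
main(args)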
Example #5
def main(args):
    r"""Cluster lexemes in macroclasses according to alternation patterns.

    We strongly recommend the default settings for the measure (-m) and the algorithm (-a).
    For a detailed explanation, see the html doc.::
      ____
     / __ \                    /)
    | |  | | _   _  _ __ ___   _  _ __
    | |  | || | | || '_ ` _ \ | || '_ \
    | |__| || |_| || | | | | || || | | |
     \___\_\ \__,_||_| |_| |_||_||_| |_|
      Quantitative modeling of inflection

    """
    from os import path, makedirs
    import time
    import re
    now = time.strftime("%Hh%M")
    day = time.strftime("%Y%m%d")

    # Loading files and paths
    features_file_name = args.segments
    data_file_path = args.patterns
    data_file_name = path.basename(data_file_path).rstrip("_")
    version = get_repository_version()
    print(data_file_name)

    pattern_type_match = re.match(r".+_(.+)\.csv", data_file_name)
    if pattern_type_match is None:
        print("Did you rename the patterns file ? As a result, I do not know which type of pattern you used..")
        kind = "unknown"
    else:
        kind = pattern_type_match.groups()[0]

    # Setting up the output path.
    result_dir = "../Results/{}/{}".format(args.folder, day)
    makedirs(result_dir, exist_ok=True)
    result_prefix = "{}/{}_{}_{}_{}_".format(result_dir, data_file_name, version, day, now)

    # Initializing segments

    if features_file_name != "ORTHO":
        segments.initialize(features_file_name, sep="\t")
        pat_table, pat_dic = patterns.from_csv(data_file_path, defective=False, overabundant=False)
        pat_table = pat_table.applymap(str)
    else:
        pat_table = pd.read_csv(data_file_path, index_col=0)

    result_prefix += args.algorithm + "_" + args.measure

    measures = {"BU": {"DL": descriptionlength.BUDLClustersBuilder,
                       "CD": distances.CompressionDistClustersBuilder,
                       "UPGMA": distances.UPGMAClustersBuilder},
                "TD": {"DL": descriptionlength.TDDLClustersBuilder}}

    algorithm_choice = {"BU": algorithms.bottom_up_clustering,
                        "TD": algorithms.top_down_clustering}

    preferences = {"prefix": result_prefix,
                   "clustering_algorithm": algorithm_choice[args.algorithm],
                   "verbose": args.verbose,
                   "debug": args.debug}

    attr = {"DL": "DL", "CD": "dist", "UPGMA": "dist"}

    # if args.randomised:
    #     func = preferences["clustering_algorithm"]
    #     randomised_algo = partial(algorithms.randomised, func, n=args.randomised)
    #     preferences["clustering_algorithm"] = randomised_algo

    node = algorithms.hierarchical_clustering(pat_table, measures[args.algorithm][args.measure], **preferences)

    if args.measure == "DL":
        DL = "Min :" + str(find_min_attribute(node, "DL"))
    else:
        DL = ""

    experiment_id = " ".join([args.algorithm, args.measure, "on", kind, DL, "(", version, day, now, ")"])

    # Saving png figure
    if MATPLOTLIB_LOADED:
        fig = plt.figure(figsize=(10, 20))
        figname = result_prefix + "_figure.png"
        print("Drawing figure to: {}".format(figname))
        node.draw(horizontal=True,
                  square=True,
                  leavesfunc=lambda x: x.labels[0] + " (" + str(x.attributes["size"]) + ")",
                  nodefunc=lambda x: "{0:.3f}".format(x.attributes[attr[args.measure]]),
                  keep_above_macroclass=True)

        fig.suptitle(experiment_id)
        fig.savefig(figname,
                    bbox_inches='tight', pad_inches=.5)

    # Saving text tree
    print("Printing tree to: {}".format(result_prefix + "_tree.txt"))
    string_tree = repr(node)
    with open(result_prefix + "_tree.txt", "w", encoding="utf-8") as flow:
        flow.write(string_tree)
        flow.write("\n" + experiment_id)
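
# Usage sketch (hypothetical): a minimal driver for the clustering entry
# point above. Argument names mirror main(); values are placeholders, and
# module-level names (segments, patterns, pd, algorithms, descriptionlength,
# distances, find_min_attribute, MATPLOTLIB_LOADED, plt, ...) are assumed.
from argparse import Namespace

args = Namespace(
    segments="data/segments.csv",  # or "ORTHO"
    patterns="results/X_patternsPhonsim.csv",  # suffix is parsed for `kind`
    folder="Macroclasses",         # subfolder of ../Results/
    algorithm="BU",                # "BU" (bottom-up) or "TD" (top-down)
    measure="DL",                  # "DL", "CD" or "UPGMA"; "TD" needs "DL"
    verbose=False,
    debug=False,
)
main(args)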