Example #1
from pathlib import Path
import tensorflow as tf

from classes import Alphabet

ipa = Alphabet(Path("../data/alphabets/ipa.csv"))

chars = "abcdefghijklmnop"

for char in chars:
    vec = ipa.create_char(char).vector
    char_ = ipa.get_char_by_feature_vector(vec)
    print(char, char_)

vec = [0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
vec_ = []
for i in vec:
    if i == 0:
        vec_.append(0.1)
    elif i == 1:
        vec_.append(0.9)
print(ipa.get_char_by_feature_vector(vec_))

import numpy as np
from utils import create_model

model, optimizer, loss_object = create_model(len(vec), 64, 32, len(vec))
vec = tf.keras.backend.expand_dims(vec, axis=0)
vec = tf.dtypes.cast(vec, tf.float32)

losses = []
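# (Hypothetical continuation sketch: the example is cut off here. Mirroring the
#  training loops in the later examples, a few training steps that try to
#  reconstruct the single vector `vec` from itself might look like this.)
for step in range(100):
    with tf.GradientTape() as tape:
        output = model(vec)
        loss = loss_object(vec, output)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
    losses.append(float(loss))
print(ipa.get_char_by_feature_vector(output))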
Example #2
def main():
    global encoding
    args = parser_args()
    # determine whether the model should use feature encodings or character embeddings
    assert args.orthographic in [
        0, 1
    ], "Too many instances of --orthographic switch, should be 0 or 1"
    orthographic = bool(args.orthographic)
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [
        0, 1
    ], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)
    # load data
    data_file = None
    if args.data == "ipa":
        encoding = 'utf-16'
        data_file = Path("../data/romance_swadesh_ipa.csv")
    elif args.data == "asjp":
        encoding = 'ascii'
        data_file = Path("../data/romance_swadesh_asjp.csv")
    assert data_file.exists() and data_file.is_file(
    ), "Data file {} does not exist".format(data_file)
    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)
    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")
    # load data from file
    assert alphabet_file.exists() and alphabet_file.is_file(
    ), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file,
                        encoding=encoding,
                        orthographic=orthographic)
    assert isinstance(args.epochs,
                      int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs
    print("alphabet:")
    print(alphabet)

    # initialize model
    model, optimizer, loss_object = create_model(
        input_dim=alphabet.get_feature_dim(),
        embedding_dim=28,
        context_dim=128,
        output_dim=alphabet.get_feature_dim())

    model.summary()

    print("data_file: {}".format(data_file.absolute()))
    print("model: {}, orthographic={}, aligned={}".format(
        args.model, orthographic, aligned))
    print("alphabet: {}, read from {}".format(args.model,
                                              alphabet_file.absolute()))
    print("epochs: {}".format(epochs))

    # create cognate sets

    cognate_sets = []

    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]

    for li, line in enumerate(data[HEADER_ROW:]):
        if aligned:
            if line == "" or li % 2 != 0:
                continue
        else:
            if line == "" or li % 2 == 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        cognate_dict = {}
        assert len(langs) == len(
            words), "Langs / Words mismatch, expected {}, got {}".format(
                len(langs), len(words))
        for lang, word in zip(langs, words):
            cognate_dict[lang] = alphabet.translate(word)
        cs = CognateSet(id=id,
                        concept=concept,
                        ancestor='latin',
                        cognate_dict=cognate_dict,
                        alphabet=alphabet)
        cognate_sets.append(cs)

    # maybe we needn't do the evaluation, since we mainly want to know how
    # the model behaves with the different inputs

    #split_index = int(valid_size * len(cognate_sets))
    #train_data = cognate_sets[:split_index]
    #valid_data = cognate_sets[split_index:]
    #print("train size: {}".format(len(train_data)))
    #print("valid size: {}".format(len(valid_data)))
    #cognate_sets = cognate_sets[10:30]

    words_true = []
    words_pred = []
    epoch_losses = []
    batch_losses = []

    for epoch in range(epochs):
        # reset lists
        epoch_losses.clear()
        words_true.clear()
        words_pred.clear()
        # iterate over the cognate sets
        for i, cs in enumerate(cognate_sets):
            # reset the batch loss and the list of output characters
            batch_losses.clear()
            output_characters = []
            # iterate over the character embeddings
            for j, char_embeddings in enumerate(cs):
                # add a dimension to the latin character embedding (ancestor embedding)
                # we add a dimension because we use a batch size of 1 and TensorFlow does not
                # automatically insert the batch size dimension
                target = tf.keras.backend.expand_dims(char_embeddings.pop(
                    cs.ancestor).to_numpy(),
                                                      axis=0)
                # convert the latin character embedding to float32 to match the dtype of the model output
                target = tf.dtypes.cast(target, tf.float32)
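                # (Illustrative check, not in the original: after expand_dims the
                #  ancestor embedding is a single-item batch, so its first axis
                #  has size 1.)
                assert target.shape[0] == 1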
                # iterate through the embeddings
                # initialize the GradientTape
                with tf.GradientTape(persistent=True) as tape:
                    for lang, embedding in char_embeddings.items():
                        # add a dimension to the embeddings
                        data = tf.keras.backend.expand_dims(
                            embedding.to_numpy(), axis=0)
                        output = model(data)
                        # calculate the loss
                        loss = loss_object(target, output)
                        epoch_losses.append(float(loss))
                        batch_losses.append(float(loss))
                        # calculate the gradients
                        gradients = tape.gradient(loss,
                                                  model.trainable_weights)
                        # backpropagate
                        optimizer.apply_gradients(
                            zip(gradients, model.trainable_weights))
                        # convert the character vector into a character
                    output_char = alphabet.get_char_by_feature_vector(output)
                    # append the converted vectors to a list so we can see the reconstructed word
                    output_characters.append(output_char)
            # append the reconstructed word and the ancestor to the true/pred lists
            words_pred.append("".join(output_characters))
            words_true.append(str(cs.get_ancestor()))
            # clear the list of output characters so we can create another word
            output_characters.clear()
            print("Batch {}, mean loss={}".format(i, np.mean(batch_losses)))
        # calculate distances
        ld = LevenshteinDistance(true=words_true, pred=words_pred)
        print("Epoch {} finished".format(epoch + 1))
        print("Mean loss={}".format(epoch, np.mean(epoch_losses)))
        ld.print_distances()
        ld.print_percentiles()

    # do so again after training has finished, but now also save the plots
    ld = LevenshteinDistance(true=words_true, pred=words_pred)
    ld.print_distances()
    ld.print_percentiles()
    ld.plot_distances(Path("../data/out/distances.png"))
    ld.plot_percentiles(Path("../data/out/percentiles.png"))
Example #3
def main():
    global encoding

    args = parse_args()

    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)

    # and decide between feature encodings and character embeddings
    assert args.ortho in [0, 1], "Too many instances of --ortho switch, should be 0 or 1"
    ortho = bool(args.ortho)

    # load data
    data_file = Path(args.data)
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)
    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)
    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")
    elif args.model == 'latin':
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/latin.csv")
    # load data from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, ortho=ortho)

    # number of epochs
    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    # number of hidden layers
    # assert args.n_hidden > 0, "Number of hidden layers should be at least 1 ;)"
    # n_hidden = args.n_hidden

    # determine output directories, create them if they do not exist
    out_tag = "_{}".format(args.out_tag)
    # and tag for files with train/test indices
    indices_tag = args.out_tag
    plots_dir = Path("../out/plots{}_many2one".format(out_tag))
    if not plots_dir.exists():
        plots_dir.mkdir(parents=True)
    results_dir = Path("../out/results{}_many2one".format(out_tag))
    if not results_dir.exists():
        results_dir.mkdir(parents=True)
    # create file for results
    result_file_path = results_dir / "m2one_{}{}{}.txt".format(args.model,
                                                               "_aligned" if aligned else "",
                                                               "_ortho" if ortho else "")
    result_file_path.touch()
    result_file = result_file_path.open('w', encoding=encoding)

    # determine ancestor
    ancestor = args.ancestor

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]

    # import tensorflow here to comply with the wiki entry https://wiki.lsv.uni-saarland.de/doku.php?id=cluster
    import tensorflow as tf
    # set random seed for weights
    tf.random.set_seed(seed=42)

    # start data extraction
    for li, line in enumerate(data[HEADER_ROW:]):
        # have to do that because the file with the latin characters doesn't contain aligned cognate sets
        if args.model == 'latin':
            if line == "":
                continue
        # but the other two do
        elif aligned:
            if line == "" or li % 2 == 0:
                continue
        # the unaligned case
        else:
            if line == "" or li % 2 != 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            cognate_dict[lang] = alphabet.translate(word)
        cognate_set = CognateSet(id=id,
                                 concept=concept,
                                 ancestor=ancestor,
                                 cognate_dict=cognate_dict,
                                 alphabet=alphabet)
        cognate_sets.append(cognate_set)


    # prepare train_test_split
    total_data = {str(i + 1): cognate_set for i, cognate_set in enumerate(cognate_sets)}
    train_indices = set(total_data.keys())
    runs = cross_validation_runs(5, train_indices)
    # test_indices = Path("../data/{}_test_indices.txt".format(indices_tag)).open('r').read().split("\n")
    # train_data = {i: cognate_set for i, cognate_set in data.items() if i in train_indices}
    # test_data = {i: cognate_set for i, cognate_set in data.items() if i in test_indices}
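    # (Illustrative sanity check, not in the original script: judging from how
    #  run['train'] and run['test'] are used below, cross_validation_runs is
    #  assumed to return one dict of index collections per fold, disjoint and
    #  jointly covering train_indices.)
    for fold in runs:
        assert set(fold['train']).isdisjoint(fold['test']), "Folds overlap"
        assert (set(fold['train']) | set(fold['test'])) == train_indices, "Folds do not cover all indices"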

    # define model
    model, optimizer, loss_object = create_many_to_one_model(lstm_dim=128,
                                                             timesteps=len(langs) - 1,
                                                             data_dim=alphabet.feature_dim,
                                                             fc_dim=100,
                                                             output_dim=alphabet.feature_dim)
    model.summary()

    # save model weights for reset
    initial_weights = model.get_weights()

    words_true = []
    words_pred = []
    wts = []
    wps = []
    epoch_losses = []
    batch_losses = []

    # Training with cross-validation
    for i, run in enumerate(runs):
        print("***** Cross-validation run [{}/{}] *****".format(i + 1, len(runs)))
        # reload initial model weights
        model.set_weights(initial_weights)
        # get train & test folds
        train_data = {i: cognate_set for i, cognate_set in total_data.items() if i in run['train']}
        test_data = {i: cognate_set for i, cognate_set in total_data.items() if i in run['test']}
        print("***** Start training *****")
        for epoch in range(1, epochs + 1):
            words_true.clear()
            words_pred.clear()
            batch_losses.clear()
            for batch, cognate_set in train_data.items():
                output_characters = []
                for lang_array in cognate_set:
                    target = tf.keras.backend.expand_dims(lang_array.pop(ancestor).to_numpy(), axis=0)
                    target = tf.dtypes.cast(target, tf.float32)
                    data = []
                    for lang, vec in lang_array.items():
                        data.append(list(vec))
                    data = np.array(data)
                    data = tf.keras.backend.expand_dims(data, axis=0)
                    data = tf.dtypes.cast(data, tf.float32)
                    # data = tf.reshape(data, (1, -1))
                    with tf.GradientTape() as tape:
                        output = model(data)
                        loss = loss_object(target, output)
                        batch_losses.append(float(loss))
                        gradients = tape.gradient(loss, model.trainable_weights)
                        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                        output_characters.append(alphabet.get_char_by_vector(output))
                words_pred.append("".join(output_characters))
                words_true.append(str(cognate_set.ancestor_word))
                # print("".join(output_characters), str(cognate_set.ancestor_word))
                if int(batch) % 100 == 0:
                    print("Epoch [{}/{}], Batch [{}/{}]".format(epoch, epochs, batch, len(cognate_sets)))
            # calculate mean epoch loss
            mean_loss = np.mean(batch_losses)
            epoch_losses.append(mean_loss)
            print("Epoch[{}]/[{}], mean batch loss = {}".format(epoch, epochs, mean_loss))
            # calculate levenshtein distance
            ld = LevenshteinDistance(true=words_true, pred=words_pred)
            ld.print_distances()
            ld.print_percentiles()

        words_pred.clear()
        words_true.clear()
        print("***** Training finished *****")
        print()

        # Testing
        # Do the same thing as above with the test data, but don't collect the gradients
        # and don't backpropagate
        print("***** Start testing *****")
        for i, cognate_set in test_data.items():
            output_characters = []
            for lang_array in cognate_set:
                target = tf.keras.backend.expand_dims(lang_array.pop(ancestor).to_numpy(), axis=0)
                target = tf.dtypes.cast(target, tf.float32)
                data = []
                for lang, vec in lang_array.items():
                    data.append(list(vec))
                data = np.array(data)
                data = tf.keras.backend.expand_dims(data, axis=0)
                data = tf.dtypes.cast(data, tf.float32)
                output = model(data)
                # loss = loss_object(target, output)
                output_characters.append(alphabet.get_char_by_vector(output))
            # compile the reconstructed word
            words_pred.append("".join(output_characters))
            # save the true word for the distance calculation
            words_true.append(str(cognate_set.ancestor_word))
        wts.extend(words_true)
        wps.extend(words_pred)

        # create plots
        ld = LevenshteinDistance(words_true, words_pred)
        ld.print_distances()
        ld.print_percentiles()
        print("***** Testing finished *****")

    # save results after last run
    outfile = plots_dir / "many2one_test_{}{}{}.jpg".format(
        args.model, "_aligned" if aligned else "", "_ortho" if ortho else "")
    title = "Model [Test]: LSTM {}{}{}\n 5 cross-validation folds" \
        .format(", " + args.model, ", aligned" if aligned else "", ", orthographic" if ortho else "")
    ld = LevenshteinDistance(wts, wps)
    plot_results(title=title,
                 distances={"=<" + str(d): count / 5 for d, count in ld.distances.items()},
                 percentiles={"=<" + str(d): perc for d, perc in ld.percentiles.items()},
                 mean_dist=ld.mean_distance,
                 mean_dist_norm=ld.mean_distance_normalized,
                 losses=[],
                 outfile=Path(outfile),
                 testing=True)
Example #4
from classes import Alphabet, Text, Partitioning, Key
from program import decode_partition_fast, fitness, decode_partition_slow, decode_sort, decode_fitness

# Example SymbolSets
SymbolSets = {
    "Standard": Alphabet([*"abcdefghijklmnopqrstuvwxyz"]),
    "Reversed": Alphabet([*"zyxwvutsrqponmlkjihgfedcba"]),
    "Symbols": Alphabet([*"!§$%&/()=?-_{[]}#'+~:;.,@<"]),
    "Standard27": Alphabet([*"abcdefghijklmnopqrstuvwxyz "]),
    "Reversed27": Alphabet([*" zyxwvutsrqponmlkjihgfedcba"]),
    "Symbols27": Alphabet([*"!§$%&/()=?-_{[]}#'+~:;.,@<|"])
}
# Example Partitionings
Partitioning_ = {
    "Slow": Partitioning((6, 10, 9, 1)),
    "Normal": Partitioning((4, 2, 5, 5, 3, 5, 1, 1)),
    "Fast": Partitioning((4, 2, 5, 2, 3, 3, 3, 2, 1, 1)),
    "Slow27": Partitioning((6, 10, 9, 1, 1)),
    "Normal27": Partitioning((4, 2, 5, 5, 3, 5, 1, 1, 1)),
    "Fast27": Partitioning((4, 2, 5, 2, 3, 3, 3, 2, 1, 1, 1))
}
replace = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}

# Choosing the SymbolSets and the Partitioning
symbols_ref = SymbolSets["Standard"]
symbols_enc = SymbolSets["Reversed"]
partitioning = Partitioning_["Fast"]

# Creating the encryption key
key = Key.new(symbols_ref, symbols_enc)
Example #5
from lingpy import rc
from pathlib import Path

from classes import Ipa2Asjp, Alphabet

ipa = Alphabet(Path("../data/alphabets/ipa.csv"))
sca = rc('asjp')
converter = Ipa2Asjp(sca, ["ː"])

romance_ipa_path = Path("../data/romance_ciobanu_ipa.csv")
romance_ipa = romance_ipa_path.open(encoding='utf-16').read()

out_path = Path("../data/romance_ciobanu_asjp.csv")
out_path.touch()
out_file = out_path.open('w', encoding='utf-16')

langs = ["latin", "italian", "spanish", "french", "portuguese", "romanian"]
col_names = ["id", "concept"] + langs

header = "id,concept,latin,italian,spanish,french,portuguese,romanian\n"
out_file.write(header)
print(header)

for line in romance_ipa.split("\n")[1:]:
    s = ""
    if line != "":
        row = line.split(",")
        assert len(row) == len(col_names), "Expected {} fields, found {}"\
            .format(len(col_names), len(row))
        # create row data dict
        row_data = {
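        # (Hypothetical continuation sketch: the example is cut off above. The
        #  convert() method below is a guess, not a confirmed API. The remaining
        #  body presumably zips the column names with the row, converts each IPA
        #  word to ASJP, and writes one CSV line, roughly:)
        #     row_data = {col: val for col, val in zip(col_names, row)}
        #     s = ",".join([row_data["id"], row_data["concept"]] +
        #                  [converter.convert(row_data[lang]) for lang in langs])
        #     out_file.write(s + "\n")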
Example #6
from pathlib import Path

from classes import Alphabet

id_col = 0
langs = ['latin', 'italian', 'spanish', 'french', 'portuguese', 'romanian']

path_to_ipa = Path("../data/alphabets/ipa.csv")
path_to_romance_data = Path("../data/romance_swadesh_ipa.csv")

ipa = Alphabet(path_to_ipa)
romance_data = path_to_romance_data.open(encoding='utf-16').read().split("\n")
cols = romance_data[0].split(",")

romance_raw = [
    romance_data[i] for i in range(1,
                                   len(romance_data) - 1) if i % 2 != 0
]
romance_aligned = [
    romance_data[i] for i in range(2, len(romance_data)) if i % 2 == 0
]

assert len(romance_raw) == len(romance_aligned), "aligned and raw data of different length: {}, {}"\
    .format(len(romance_raw), len(romance_aligned))

data = {}

for category, lines in {
        'raw': romance_raw,
        'aligned': romance_aligned
}.items():
Example #7
def train():

    # Command line call I used:
    # python ciobanu_rnn.py --data=ipa --model=ipa --epochs=10 --out_tag=test --ancestor=ancestor

    global encoding
    args = parser_args()
    # determine whether the model should use feature encodings or character embeddings
    assert args.ortho in [
        0, 1
    ], "Too many instances of --ortho switch, should be 0 or 1"
    ortho = bool(args.ortho)
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [
        0, 1
    ], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)
    # load data
    data_file = None
    if args.data == "ipa":
        encoding = 'utf-16'
        data_file = Path("../data/romance_ciobanu_ipa.csv")
    elif args.data == "asjp":
        encoding = 'ascii'
        data_file = Path("../data/romance_ciobanu_asjp.csv")
    assert data_file.exists() and data_file.is_file(
    ), "Data file {} does not exist".format(data_file)
    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)
    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")
    # load data from file
    assert alphabet_file.exists() and alphabet_file.is_file(
    ), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, ortho=ortho)
    assert isinstance(args.epochs,
                      int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    # ancestor
    ancestor = args.ancestor

    # determine output directories, create them if they do not exist
    out_tag = "_{}".format(args.out_tag)
    plots_dir = Path("../out/plots{}_deep".format(out_tag))
    if not plots_dir.exists():
        plots_dir.mkdir(parents=True)
    results_dir = Path("../out/results{}_deep".format(out_tag))
    if not results_dir.exists():
        results_dir.mkdir(parents=True)
    # create file for results
    result_file_path = results_dir / "deep_{}{}{}.txt".format(
        args.model, "_aligned" if aligned else "", "_ortho" if ortho else "")
    result_file_path.touch()
    result_file = result_file_path.open('w', encoding=encoding)

    print("alphabet:")
    print(alphabet)

    # initialize model
    model, optimizer, loss_object = create_model(
        input_dim=alphabet.get_feature_dim(),
        embedding_dim=28,
        context_dim=128,
        output_dim=alphabet.get_feature_dim())

    model.summary()

    print("data_file: {}".format(data_file.absolute()))
    print("model: {}, orthographic={}, aligned={}".format(
        args.model, ortho, aligned))
    print("alphabet: {}, read from {}".format(args.model,
                                              alphabet_file.absolute()))
    print("epochs: {}".format(epochs))

    # create cognate sets

    cognate_sets = []

    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]
    print("langs")
    print(langs)

    for li, line in enumerate(data[HEADER_ROW:]):
        if aligned:
            if line == "" or li % 2 != 0:
                continue
        else:
            if line == "" or li % 2 == 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        # print("words")
        # print(words)
        cognate_dict = {}
        assert len(langs) == len(
            words), "Langs / Words mismatch, expected {}, got {}".format(
                len(langs), len(words))
        for lang, word in zip(langs, words):
            # print("lang, word")
            # print(lang, word)
            cognate_dict[lang] = alphabet.translate(word)
        cs = CognateSet(id=id,
                        concept=concept,
                        ancestor=ancestor,
                        cognate_dict=cognate_dict,
                        alphabet=alphabet)
        cognate_sets.append(cs)

    # maybe we needn't do the evaluation, since we mainly want to know how
    # the model behaves with the different inputs

    split_index = int(valid_size * len(cognate_sets))
    train_data = cognate_sets[:split_index]
    valid_data = cognate_sets[split_index:]
    print("train size: {}".format(len(train_data)))
    print("valid size: {}".format(len(valid_data)))
    # cognate_sets = cognate_sets[10:30]
    # print("cognate_sets in ral")
    # print(cognate_sets)

    words_true = []
    words_pred = []
    epoch_losses = []
    batch_losses = []

    for epoch in range(epochs):
        # reset lists
        epoch_losses.clear()
        words_true.clear()
        words_pred.clear()
        # iterate over the cognate sets
        for i, cs in enumerate(cognate_sets):
            # reset the batch loss and the list of output characters
            batch_losses.clear()
            output_characters = []
            # iterate over the character embeddings
            for j, char_embeddings in enumerate(cs):
                # add a dimension to the latin character embedding (ancestor embedding)
                # we add a dimension because we use a batch size of 1 and TensorFlow does not
                # automatically insert the batch size dimension
                target = tf.keras.backend.expand_dims(char_embeddings.pop(
                    cs.ancestor).to_numpy(),
                                                      axis=0)
                # convert the ancestor character embedding to float32 to match the dtype of the model output
                target = tf.dtypes.cast(target, tf.float32)
                # iterate through the embeddings
                # initialize the GradientTape
                with tf.GradientTape(persistent=True) as tape:
                    for lang, embedding in char_embeddings.items():
                        # add a dimension to the embeddings
                        data = tf.keras.backend.expand_dims(
                            embedding.to_numpy(), axis=0)
                        output = model(data)
                        # calculate the loss
                        loss = loss_object(target, output)
                        epoch_losses.append(float(loss))
                        batch_losses.append(float(loss))
                        # calculate the gradients
                        gradients = tape.gradient(loss,
                                                  model.trainable_weights)
                        # backpropagate
                        optimizer.apply_gradients(
                            zip(gradients, model.trainable_weights))
                        # convert the character vector into a character
                    output_char = alphabet.get_char_by_feature_vector(output)
                    # append the converted vectors to a list so we can see the reconstructed word
                    output_characters.append(output_char)
            # append the reconstructed word and the ancestor to the true/pred lists
            words_pred.append("".join(output_characters))
            words_true.append(str(cs.get_ancestor()))
            # clear the list of output characters so we can create another word
            output_characters.clear()
            print("Batch {}, mean loss={}".format(i, np.mean(batch_losses)))
        # calculate distances
        ld = LevenshteinDistance(true=words_true, pred=words_pred)
        print("Epoch {} finished".format(epoch + 1))
        print("Mean loss={}".format(epoch, np.mean(epoch_losses)))
        ld.print_distances()
        ld.print_percentiles()
        if epoch == epochs - 1:
            outfile = "../out/plots_swadesh_deep/deep_{}{}{}.jpg".format(
                args.model, "_aligned" if aligned else "",
                "_ortho" if ortho else "")
            title = "Model: deep net{}{}{}".format(
                ", " + args.model, ", aligned" if aligned else "",
                ", orthographic" if ortho else "")
            plot_results(title=title,
                         distances={
                             "=<" + str(d): count
                             for d, count in ld.distances.items()
                         },
                         percentiles={
                             "=<" + str(d): perc
                             for d, perc in ld.percentiles.items()
                         },
                         mean_dist=ld.mean_distance,
                         mean_dist_norm=ld.mean_distance_normalized,
                         losses=epoch_losses,
                         outfile=Path(outfile))
            # save reconstructed words (but only if the edit distance is at least one)
            import nltk
            for t, p in zip(words_true, words_pred):
                distance = nltk.edit_distance(t, p)
                if distance > 0:
                    line = "{},{},distance={}\n".format(
                        t, p, nltk.edit_distance(t, p))
                    result_file.write(line)
            result_file.close()
Example #8
import lingpy as lp
from pathlib import Path
from typing import List

from classes import Alphabet


def lst_to_str(lst: List[str]):
    s = ""
    for c in lst:
        s += c
    return s
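
# (Quick illustrative usage check for the helper above; equivalent to "".join(lst).)
assert lst_to_str(["a", "b", "c"]) == "abc"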


ipa_csv_path = Path("../data/alphabets/ipa.csv")
ipa = Alphabet(ipa_csv_path)
romance_data_path = Path("../data/romance_ipa_partial.csv")
romance_data = romance_data_path.open(encoding='utf-16').read()
out_path = Path("../data/romance_ipa_aligned.csv")
out_file = out_path.open('w')

langs = ["latin", "italian", "spanish", "french", "romanian"]

header = "id,concept,latin,italian,spanish, french,romanian\n"

out_file.write(header)

cognate_sets = {}
for line in romance_data.split("\n")[1:51]:
    s = ""
    data = line.split(",")