from pathlib import Path

import numpy as np
import tensorflow as tf

from classes import Alphabet
from utils import create_model

ipa = Alphabet(Path("../data/alphabets/ipa.csv"))

# sanity check: every character should survive a round trip through its feature vector
chars = "abcdefghijklmnop"
for char in chars:
    vec = ipa.create_char(char).vector
    char_ = ipa.get_char_by_feature_vector(vec)
    print(char, char_)

# a noisy feature vector (0.1/0.9 instead of 0/1) should still resolve to a character
vec = [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vec_ = []
for i in vec:
    if i == 0:
        vec_.append(0.1)
    elif i == 1:
        vec_.append(0.9)
print(ipa.get_char_by_feature_vector(vec_))

model, optimizer, loss_object = create_model(len(vec), 64, 32, len(vec))
vec = tf.keras.backend.expand_dims(vec, axis=0)
vec = tf.dtypes.cast(vec, tf.float32)
losses = []
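# --- hedged example --------------------------------------------------------
# get_char_by_feature_vector lives in classes.py, which is not shown in this
# section. A minimal sketch of how such a lookup can tolerate the noisy
# 0.1/0.9 input probed above, assuming the alphabet keeps a
# char -> feature-vector mapping. All names below are illustrative, not the
# repo's actual API.
import numpy as np

def nearest_char(features, query):
    """Return the character whose feature vector is closest to `query`
    in Euclidean distance (hypothetical helper)."""
    chars = list(features.keys())
    matrix = np.array([features[c] for c in chars], dtype=np.float32)
    q = np.asarray(query, dtype=np.float32)
    return chars[int(np.argmin(np.linalg.norm(matrix - q, axis=1)))]

# a 0.1/0.9 vector still snaps to the same character as its 0/1 original
print(nearest_char({"a": [0, 1, 0], "b": [1, 0, 0]}, [0.1, 0.9, 0.1]))  # -> a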
def main():
    global encoding
    args = parser_args()
    # determine whether the model should use feature encodings or character embeddings
    assert args.orthographic in [0, 1], "Too many instances of --orthographic switch, should be 0 or 1"
    orthographic = bool(args.orthographic)
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)

    # load data
    data_file = None
    if args.data == "ipa":
        encoding = 'utf-16'
        data_file = Path("../data/romance_swadesh_ipa.csv")
    elif args.data == "asjp":
        encoding = 'ascii'
        data_file = Path("../data/romance_swadesh_asjp.csv")
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)

    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)

    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")

    # load alphabet from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, orthographic=orthographic)

    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    print("alphabet:")
    print(alphabet)

    # initialize model
    model, optimizer, loss_object = create_model(input_dim=alphabet.get_feature_dim(),
                                                 embedding_dim=28,
                                                 context_dim=128,
                                                 output_dim=alphabet.get_feature_dim())
    model.summary()
    print("data_file: {}".format(data_file.absolute()))
    print("model: {}, orthographic={}, aligned={}".format(args.model, orthographic, aligned))
    print("alphabet: {}, read from {}".format(args.model, alphabet_file.absolute()))
    print("epochs: {}".format(epochs))

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]

    for li, line in enumerate(data[HEADER_ROW:]):
        # aligned and unaligned rows alternate in the data file
        if aligned:
            if line == "" or li % 2 != 0:
                continue
        else:
            if line == "" or li % 2 == 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            cognate_dict[lang] = alphabet.translate(word)
        cs = CognateSet(id=id,
                        concept=concept,
                        ancestor='latin',
                        cognate_dict=cognate_dict,
                        alphabet=alphabet)
        cognate_sets.append(cs)

    # maybe we needn't do the evaluation, since we mainly want to know how
    # the model behaves with the different inputs
    #split_index = int(valid_size * len(cognate_sets))
    #train_data = cognate_sets[:split_index]
    #valid_data = cognate_sets[split_index:]
    #print("train size: {}".format(len(train_data)))
    #print("valid size: {}".format(len(valid_data)))
    #cognate_sets = cognate_sets[10:30]

    words_true = []
    words_pred = []
    epoch_losses = []
    batch_losses = []
    output_characters = []

    for epoch in range(epochs):
        # reset lists
        epoch_losses.clear()
        words_true.clear()
        words_pred.clear()
        # iterate over the cognate sets
        for i, cs in enumerate(cognate_sets):
            # reset batch loss
            batch_losses.clear()
            # iterate over the character embeddings
            for j, char_embeddings in enumerate(cs):
                # add a dimension to the latin character embedding (ancestor embedding);
                # we use a batch size of 1 and TensorFlow does not insert
                # the batch dimension automatically
                target = tf.keras.backend.expand_dims(char_embeddings.pop(cs.ancestor).to_numpy(), axis=0)
                # convert the latin character embedding to float32 to match the dtype of the model output
                target = tf.dtypes.cast(target, tf.float32)
                # iterate through the embeddings inside the GradientTape
                with tf.GradientTape(persistent=True) as tape:
                    for lang, embedding in char_embeddings.items():
                        # add a batch dimension to the embedding
                        data = tf.keras.backend.expand_dims(embedding.to_numpy(), axis=0)
                        output = model(data)
                        # calculate the loss
                        loss = loss_object(target, output)
                        epoch_losses.append(float(loss))
                        batch_losses.append(float(loss))
                        # calculate the gradients and backpropagate
                        gradients = tape.gradient(loss, model.trainable_weights)
                        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                # convert the output vector back into a character
                output_char = alphabet.get_char_by_feature_vector(output)
                # collect the converted characters so we can see the reconstructed word
                output_characters.append(output_char)
            # append the reconstructed word and the ancestor to the true/pred lists
            words_pred.append("".join(output_characters))
            words_true.append(str(cs.get_ancestor()))
            # clear the list of output characters so we can create another word
            output_characters.clear()
            print("Batch {}, mean loss={}".format(i, np.mean(batch_losses)))
        # calculate distances
        ld = LevenshteinDistance(true=words_true, pred=words_pred)
        print("Epoch {} finished".format(epoch + 1))
        print("Mean loss={}".format(np.mean(epoch_losses)))
        ld.print_distances()
        ld.print_percentiles()

    # do so again after training has finished, but now also save the plots
    ld = LevenshteinDistance(true=words_true, pred=words_pred)
    ld.print_distances()
    ld.print_percentiles()
    ld.plot_distances(Path("../data/out/distances.png"))
    ld.plot_percentiles(Path("../data/out/percentiles.png"))
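# --- hedged example --------------------------------------------------------
# LevenshteinDistance is one of the project's own classes and is not shown in
# this section. A rough sketch of the metric it reports, built on
# nltk.edit_distance (which the Ciobanu training script below also uses).
# The class shape is assumed, not the repo's actual implementation.
import nltk
import numpy as np

class EditDistanceReport:
    """Hypothetical stand-in for the repo's LevenshteinDistance."""

    def __init__(self, true, pred):
        self.distances = [nltk.edit_distance(t, p) for t, p in zip(true, pred)]
        self.mean_distance = float(np.mean(self.distances))
        # normalize by the true word's length so long words don't dominate
        self.mean_distance_normalized = float(
            np.mean([d / max(len(t), 1) for d, t in zip(self.distances, true)]))

report = EditDistanceReport(true=["kantare"], pred=["kantar"])
print(report.mean_distance, report.mean_distance_normalized)  # 1.0 ~0.14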
def main():
    global encoding
    args = parse_args()
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)
    # and decide between feature encodings and character embeddings
    assert args.ortho in [0, 1], "Too many instances of --ortho switch, should be 0 or 1"
    ortho = bool(args.ortho)

    # load data
    data_file = Path(args.data)
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)

    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)

    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")
    elif args.model == 'latin':
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/latin.csv")

    # load alphabet from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, ortho=ortho)

    # number of epochs
    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    # number of hidden layers
    # assert args.n_hidden > 0, "Number of hidden layers should be at least 1 ;)"
    # n_hidden = args.n_hidden

    # determine output directories, create them if they do not exist
    out_tag = "_{}".format(args.out_tag)
    # and tag for files with train/test indices
    indices_tag = args.out_tag
    plots_dir = Path("../out/plots{}_many2one".format(out_tag))
    if not plots_dir.exists():
        plots_dir.mkdir(parents=True)
    results_dir = Path("../out/results{}_many2one".format(out_tag))
    if not results_dir.exists():
        results_dir.mkdir(parents=True)

    # create file for results
    result_file_path = results_dir / "m2one_{}{}{}.txt".format(args.model,
                                                               "_aligned" if aligned else "",
                                                               "_ortho" if ortho else "")
    result_file_path.touch()
    result_file = result_file_path.open('w', encoding=encoding)

    # determine ancestor
    ancestor = args.ancestor

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]

    # import tensorflow here to comply with the wiki entry
    # https://wiki.lsv.uni-saarland.de/doku.php?id=cluster
    import tensorflow as tf
    # set random seed for weights
    tf.random.set_seed(seed=42)

    # start data extraction
    for li, line in enumerate(data[HEADER_ROW:]):
        # the file with the latin characters doesn't contain aligned cognate sets
        if args.model == 'latin':
            if line == "":
                continue
        # but the other two do
        elif aligned:
            if line == "" or li % 2 == 0:
                continue
        # the unaligned case
        else:
            if line == "" or li % 2 != 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            cognate_dict[lang] = alphabet.translate(word)
        cognate_set = CognateSet(id=id,
                                 concept=concept,
                                 ancestor=ancestor,
                                 cognate_dict=cognate_dict,
                                 alphabet=alphabet)
        cognate_sets.append(cognate_set)

    # prepare train/test split
    total_data = {str(i + 1): cognate_set for i, cognate_set in enumerate(cognate_sets)}
    train_indices = set(total_data.keys())
    runs = cross_validation_runs(5, train_indices)
    # test_indices = Path("../data/{}_test_indices.txt".format(indices_tag)).open('r').read().split("\n")
    # train_data = {i: cognate_set for i, cognate_set in data.items() if i in train_indices}
    # test_data = {i: cognate_set for i, cognate_set in data.items() if i in test_indices}

    # define model
    model, optimizer, loss_object = create_many_to_one_model(lstm_dim=128,
                                                             timesteps=len(langs) - 1,
                                                             data_dim=alphabet.feature_dim,
                                                             fc_dim=100,
                                                             output_dim=alphabet.feature_dim)
    model.summary()

    # save model weights so every cross-validation run starts from the same initialization
    initial_weights = model.get_weights()

    words_true = []
    words_pred = []
    wts = []
    wps = []
    epoch_losses = []
    batch_losses = []

    # training with cross-validation
    for run_index, run in enumerate(runs):
        print("***** Cross-validation run [{}/{}] *****".format(run_index + 1, len(runs)))
        # reload initial model weights
        model.set_weights(initial_weights)
        # get train & test folds
        train_data = {i: cognate_set for i, cognate_set in total_data.items() if i in run['train']}
        test_data = {i: cognate_set for i, cognate_set in total_data.items() if i in run['test']}

        print("***** Start training *****")
        for epoch in range(1, epochs + 1):
            words_true.clear()
            words_pred.clear()
            batch_losses.clear()
            for batch, cognate_set in train_data.items():
                output_characters = []
                for lang_array in cognate_set:
                    # the ancestor word is the target, the descendant words are the input
                    target = tf.keras.backend.expand_dims(lang_array.pop(ancestor).to_numpy(), axis=0)
                    target = tf.dtypes.cast(target, tf.float32)
                    data = []
                    for lang, vec in lang_array.items():
                        data.append(list(vec))
                    data = np.array(data)
                    data = tf.keras.backend.expand_dims(data, axis=0)
                    data = tf.dtypes.cast(data, tf.float32)
                    # data = tf.reshape(data, (1, -1))
                    with tf.GradientTape() as tape:
                        output = model(data)
                        loss = loss_object(target, output)
                    batch_losses.append(float(loss))
                    gradients = tape.gradient(loss, model.trainable_weights)
                    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                    output_characters.append(alphabet.get_char_by_vector(output))
                words_pred.append("".join(output_characters))
                words_true.append(str(cognate_set.ancestor_word))
                # print("".join(output_characters), str(cognate_set.ancestor_word))
                if int(batch) % 100 == 0:
                    print("Epoch [{}/{}], Batch [{}/{}]".format(epoch, epochs, batch, len(cognate_sets)))
            # calculate mean epoch loss
            mean_loss = np.mean(batch_losses)
            epoch_losses.append(mean_loss)
            print("Epoch [{}/{}], mean batch loss = {}".format(epoch, epochs, mean_loss))
            # calculate levenshtein distance
            ld = LevenshteinDistance(true=words_true, pred=words_pred)
            ld.print_distances()
            ld.print_percentiles()

        words_pred.clear()
        words_true.clear()
        print("***** Training finished *****")
        print()

        # Testing: do the same thing as above with the test data,
        # but don't collect the gradients and don't backpropagate
        print("***** Start testing *****")
        for _, cognate_set in test_data.items():
            output_characters = []
            for lang_array in cognate_set:
                target = tf.keras.backend.expand_dims(lang_array.pop(ancestor).to_numpy(), axis=0)
                target = tf.dtypes.cast(target, tf.float32)
                data = []
                for lang, vec in lang_array.items():
                    data.append(list(vec))
                data = np.array(data)
                data = tf.keras.backend.expand_dims(data, axis=0)
                data = tf.dtypes.cast(data, tf.float32)
                output = model(data)
                # loss = loss_object(target, output)
                output_characters.append(alphabet.get_char_by_vector(output))
            # compile the reconstructed word
            words_pred.append("".join(output_characters))
            # save the true word for the distance calculation
            words_true.append(str(cognate_set.ancestor_word))
        # collect results across folds
        wts.extend(words_true)
        wps.extend(words_pred)
        # create plots
        ld = LevenshteinDistance(words_true, words_pred)
        ld.print_distances()
        ld.print_percentiles()
        print("***** Testing finished *****")

    # save results after the last run; distance counts are divided by the number of folds
    outfile = plots_dir / "many2one_test_{}{}{}.jpg".format(args.model,
                                                            "_aligned" if aligned else "",
                                                            "_ortho" if ortho else "")
    title = "Model [Test]: LSTM {}{}{}\n 5 cross-validation folds" \
        .format(", " + args.model,
                ", aligned" if aligned else "",
                ", orthographic" if ortho else "")
    ld = LevenshteinDistance(wts, wps)
    plot_results(title=title,
                 distances={"=<" + str(d): count / 5 for d, count in ld.distances.items()},
                 percentiles={"=<" + str(d): perc for d, perc in ld.percentiles.items()},
                 mean_dist=ld.mean_distance,
                 mean_dist_norm=ld.mean_distance_normalized,
                 losses=[],
                 outfile=Path(outfile),
                 testing=True)
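# --- hedged example --------------------------------------------------------
# cross_validation_runs is a project helper that is not shown in this section.
# A minimal sketch of a 5-fold splitter over the index set; the
# {'train': ..., 'test': ...} shape is inferred from how run['train'] and
# run['test'] are used above. Illustrative only.
import random

def cross_validation_runs_sketch(k, indices):
    """Partition `indices` into k folds; each run tests on one fold and
    trains on the remaining k-1 folds."""
    indices = list(indices)
    random.shuffle(indices)
    folds = [indices[i::k] for i in range(k)]
    return [{'train': set(indices) - set(fold), 'test': set(fold)}
            for fold in folds]

runs = cross_validation_runs_sketch(5, {str(i) for i in range(1, 101)})
print(len(runs), len(runs[0]['train']), len(runs[0]['test']))  # 5 80 20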
from classes import Alphabet, Text, Partitioning, Key
from program import decode_partition_fast, fitness, decode_partition_slow, decode_sort, decode_fitness

# Example SymbolSets
SymbolSets = {
    "Standard": Alphabet([*"abcdefghijklmnopqrstuvwxyz"]),
    "Reversed": Alphabet([*"zyxwvutsrqponmlkjihgfedcba"]),
    "Symbols": Alphabet([*"!§$%&/()=?-_{[]}#'+~:;.,@<"]),
    "Standard27": Alphabet([*"abcdefghijklmnopqrstuvwxyz "]),
    "Reversed27": Alphabet([*" zyxwvutsrqponmlkjihgfedcba"]),
    "Symbols27": Alphabet([*"!§$%&/()=?-_{[]}#'+~:;.,@<|"])
}

# Example Partitionings
Partitioning_ = {
    "Slow": Partitioning((6, 10, 9, 1)),
    "Normal": Partitioning((4, 2, 5, 5, 3, 5, 1, 1)),
    "Fast": Partitioning((4, 2, 5, 2, 3, 3, 3, 2, 1, 1)),
    "Slow27": Partitioning((6, 10, 9, 1, 1)),
    "Normal27": Partitioning((4, 2, 5, 5, 3, 5, 1, 1, 1)),
    "Fast27": Partitioning((4, 2, 5, 2, 3, 3, 3, 2, 1, 1, 1))
}

# replacements for German umlauts and ß
replace = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}

# Choosing the SymbolSets and the Partitioning
symbols_ref = SymbolSets["Standard"]
symbols_enc = SymbolSets["Reversed"]
partitioning = Partitioning_["Fast"]

# Creating the encryption key
key = Key.new(symbols_ref, symbols_enc)
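# --- hedged example --------------------------------------------------------
# The replace mapping above suggests German input text is normalized to the
# 26-letter alphabet before a key is applied. A minimal sketch of that step;
# the helper name is illustrative, not part of the repo.
def normalize_german(text):
    """Rewrite umlauts and ß so the text fits a 26-letter alphabet."""
    for src, dst in {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}.items():
        text = text.replace(src, dst)
    return text

print(normalize_german("größe über alles"))  # -> groesse ueber alles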
from lingpy import rc
from pathlib import Path

from classes import Ipa2Asjp, Alphabet

ipa = Alphabet(Path("../data/alphabets/ipa.csv"))
sca = rc('asjp')
converter = Ipa2Asjp(sca, ["ː"])

romance_ipa_path = Path("../data/romance_ciobanu_ipa.csv")
romance_ipa = romance_ipa_path.open(encoding='utf-16').read()
out_path = Path("../data/romance_ciobanu_asjp.csv")
out_path.touch()
out_file = out_path.open('w', encoding='utf-16')

langs = ["latin", "italian", "spanish", "french", "portuguese", "romanian"]
col_names = ["id", "concept"] + langs
header = "id,concept,latin,italian,spanish,french,portuguese,romanian\n"
out_file.write(header)
print(header)

for line in romance_ipa.split("\n")[1:]:
    s = ""
    if line != "":
        row = line.split(",")
        assert len(row) == len(col_names), "Expected {} fields, found {}"\
            .format(len(col_names), len(row))
        # create row data dict
        row_data = {col: val for col, val in zip(col_names, row)}
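# --- hedged example --------------------------------------------------------
# The loop above breaks off right after row_data is built. A sketch of how it
# might finish, assuming Ipa2Asjp exposes a convert(word)-style method; only
# its constructor appears above, so that method name is an assumption.
def convert_row(line, col_names, langs, converter):
    """Hypothetical continuation: convert one CSV row from IPA to ASJP."""
    row = line.split(",")
    row_data = {col: val for col, val in zip(col_names, row)}
    converted = [row_data["id"], row_data["concept"]]
    converted += [converter.convert(row_data[lang]) for lang in langs]  # assumed API
    return ",".join(converted) + "\n"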
from pathlib import Path

from classes import Alphabet

id_col = 0
langs = ['latin', 'italian', 'spanish', 'french', 'portuguese', 'romanian']

path_to_ipa = Path("../data/alphabets/ipa.csv")
path_to_romance_data = Path("../data/romance_swadesh_ipa.csv")
ipa = Alphabet(path_to_ipa)

romance_data = path_to_romance_data.open(encoding='utf-16').read().split("\n")
cols = romance_data[0].split(",")

# raw and aligned rows alternate: odd lines are raw, even lines are aligned
romance_raw = [romance_data[i] for i in range(1, len(romance_data) - 1) if i % 2 != 0]
romance_aligned = [romance_data[i] for i in range(2, len(romance_data)) if i % 2 == 0]
assert len(romance_raw) == len(romance_aligned), "aligned and raw data of different length: {}, {}"\
    .format(len(romance_raw), len(romance_aligned))

data = {}
for category, lines in {'raw': romance_raw, 'aligned': romance_aligned}.items():
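# --- toy illustration -------------------------------------------------------
# The odd/even indexing reflects the file layout: after the header, each
# cognate set occupies two consecutive lines, raw first, aligned second.
# Made-up rows, not the real data:
rows = ["header",
        "1,raw_a", "1,aligned_a",
        "2,raw_b", "2,aligned_b"]
raw = [rows[i] for i in range(1, len(rows) - 1) if i % 2 != 0]
aligned = [rows[i] for i in range(2, len(rows)) if i % 2 == 0]
print(raw)      # ['1,raw_a', '2,raw_b']
print(aligned)  # ['1,aligned_a', '2,aligned_b']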
def train():
    # Command line call I used:
    # python ciobanu_rnn.py --data=ipa --model=ipa --epochs=10 --out_tag=test --model=ipa --ancestor=ancestor
    global encoding
    args = parser_args()
    # determine whether the model should use feature encodings or character embeddings
    assert args.ortho in [0, 1], "Too many instances of --ortho switch, should be 0 or 1"
    ortho = bool(args.ortho)
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)

    # load data
    data_file = None
    if args.data == "ipa":
        encoding = 'utf-16'
        data_file = Path("../data/romance_ciobanu_ipa.csv")
    elif args.data == "asjp":
        encoding = 'ascii'
        data_file = Path("../data/romance_ciobanu_asjp.csv")
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)

    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)

    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")

    # load alphabet from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, ortho=ortho)

    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    # ancestor
    ancestor = args.ancestor

    # determine output directories, create them if they do not exist
    out_tag = "_{}".format(args.out_tag)
    plots_dir = Path("../out/plots{}_deep".format(out_tag))
    if not plots_dir.exists():
        plots_dir.mkdir(parents=True)
    results_dir = Path("../out/results{}_deep".format(out_tag))
    if not results_dir.exists():
        results_dir.mkdir(parents=True)

    # create file for results
    result_file_path = results_dir / "deep_{}{}{}.txt".format(args.model,
                                                              "_aligned" if aligned else "",
                                                              "_ortho" if ortho else "")
    result_file_path.touch()
    result_file = result_file_path.open('w', encoding=encoding)

    print("alphabet:")
    print(alphabet)

    # initialize model
    model, optimizer, loss_object = create_model(input_dim=alphabet.get_feature_dim(),
                                                 embedding_dim=28,
                                                 context_dim=128,
                                                 output_dim=alphabet.get_feature_dim())
    model.summary()
    print("data_file: {}".format(data_file.absolute()))
    print("model: {}, orthographic={}, aligned={}".format(args.model, ortho, aligned))
    print("alphabet: {}, read from {}".format(args.model, alphabet_file.absolute()))
    print("epochs: {}".format(epochs))

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]
    print("langs")
    print(langs)

    for li, line in enumerate(data[HEADER_ROW:]):
        if aligned:
            if line == "" or li % 2 != 0:
                continue
        else:
            if line == "" or li % 2 == 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        # print("words")
        # print(words)
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            # print("lang, word")
            # print(lang, word)
            cognate_dict[lang] = alphabet.translate(word)
        cs = CognateSet(id=id,
                        concept=concept,
                        ancestor=ancestor,
                        cognate_dict=cognate_dict,
                        alphabet=alphabet)
        cognate_sets.append(cs)

    # maybe we needn't do the evaluation, since we mainly want to know how
    # the model behaves with the different inputs
    split_index = int(valid_size * len(cognate_sets))
    train_data = cognate_sets[:split_index]
    valid_data = cognate_sets[split_index:]
    print("train size: {}".format(len(train_data)))
    print("valid size: {}".format(len(valid_data)))
    # cognate_sets = cognate_sets[10:30]

    words_true = []
    words_pred = []
    epoch_losses = []
    batch_losses = []
    output_characters = []

    for epoch in range(epochs):
        # reset lists
        epoch_losses.clear()
        words_true.clear()
        words_pred.clear()
        # iterate over the cognate sets
        for i, cs in enumerate(cognate_sets):
            # reset batch loss
            batch_losses.clear()
            # iterate over the character embeddings
            for j, char_embeddings in enumerate(cs):
                # add a dimension to the ancestor character embedding;
                # we use a batch size of 1 and TensorFlow does not insert
                # the batch dimension automatically
                target = tf.keras.backend.expand_dims(char_embeddings.pop(cs.ancestor).to_numpy(), axis=0)
                # convert the ancestor character embedding to float32 to match the dtype of the model output
                target = tf.dtypes.cast(target, tf.float32)
                # iterate through the embeddings inside the GradientTape
                with tf.GradientTape(persistent=True) as tape:
                    for lang, embedding in char_embeddings.items():
                        # add a batch dimension to the embedding
                        data = tf.keras.backend.expand_dims(embedding.to_numpy(), axis=0)
                        output = model(data)
                        # calculate the loss
                        loss = loss_object(target, output)
                        epoch_losses.append(float(loss))
                        batch_losses.append(float(loss))
                        # calculate the gradients and backpropagate
                        gradients = tape.gradient(loss, model.trainable_weights)
                        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                # convert the output vector back into a character
                output_char = alphabet.get_char_by_feature_vector(output)
                # collect the converted characters so we can see the reconstructed word
                output_characters.append(output_char)
            # append the reconstructed word and the ancestor to the true/pred lists
            words_pred.append("".join(output_characters))
            words_true.append(str(cs.ancestor))
            # clear the list of output characters so we can create another word
            output_characters.clear()
            print("Batch {}, mean loss={}".format(i, np.mean(batch_losses)))
        # calculate distances
        ld = LevenshteinDistance(true=words_true, pred=words_pred)
        print("Epoch {} finished".format(epoch + 1))
        print("Mean loss={}".format(np.mean(epoch_losses)))
        ld.print_distances()
        ld.print_percentiles()

        # save the plots after the last epoch
        if epoch == epochs - 1:
            outfile = "../out/plots_swadesh_deep/deep_{}{}{}.jpg".format(args.model,
                                                                         "_aligned" if aligned else "",
                                                                         "_ortho" if ortho else "")
            title = "Model: deep net{}{}{}".format(", " + args.model,
                                                   ", aligned" if aligned else "",
                                                   ", orthographic" if ortho else "")
            plot_results(title=title,
                         distances={"=<" + str(d): count for d, count in ld.distances.items()},
                         percentiles={"=<" + str(d): perc for d, perc in ld.percentiles.items()},
                         mean_dist=ld.mean_distance,
                         mean_dist_norm=ld.mean_distance_normalized,
                         losses=epoch_losses,
                         outfile=Path(outfile))
            # save reconstructed words (but only if the edit distance is at least one)
            import nltk
            for t, p in zip(words_true, words_pred):
                distance = nltk.edit_distance(t, p)
                if distance > 0:
                    line = "{},{},distance={}\n".format(t, p, distance)
                    result_file.write(line)
            result_file.close()
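# --- hedged example --------------------------------------------------------
# plot_results is another project helper not shown in this section. A rough
# matplotlib sketch matching the call signature used above; layout and
# styling are guesses, not the repo's actual figure.
import matplotlib.pyplot as plt

def plot_results_sketch(title, distances, percentiles, mean_dist,
                        mean_dist_norm, losses, outfile, testing=False):
    """Illustrative stand-in for the repo's plot_results."""
    fig, axes = plt.subplots(1, 3 if losses else 2, figsize=(12, 4))
    axes[0].bar(list(distances.keys()), list(distances.values()))
    axes[0].set_title("distances (mean={:.2f}, norm={:.2f})".format(mean_dist, mean_dist_norm))
    axes[1].bar(list(percentiles.keys()), list(percentiles.values()))
    axes[1].set_title("percentiles")
    if losses:
        axes[2].plot(losses)
        axes[2].set_title("mean epoch loss")
    fig.suptitle(title)
    fig.tight_layout()
    fig.savefig(outfile)
    plt.close(fig)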
import lingpy as lp
from pathlib import Path
from typing import List

from classes import Alphabet


def lst_to_str(lst: List[str]) -> str:
    return "".join(lst)


ipa_csv_path = Path("../data/alphabets/ipa.csv")
ipa = Alphabet(ipa_csv_path)

romance_data_path = Path("../data/romance_ipa_partial.csv")
romance_data = romance_data_path.open(encoding='utf-16').read()
out_path = Path("../data/romance_ipa_aligned.csv")
out_file = out_path.open('w')

langs = ["latin", "italian", "spanish", "french", "romanian"]
header = "id,concept,latin,italian,spanish,french,romanian\n"
out_file.write(header)

cognate_sets = {}
for line in romance_data.split("\n")[1:51]:
    s = ""
    data = line.split(",")
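# --- hedged example --------------------------------------------------------
# The script imports lingpy and writes to romance_ipa_aligned.csv, so the
# truncated loop presumably aligns each cognate set. A minimal sketch using
# lingpy's documented Multiple class; how the repo actually invokes it is
# not shown above.
from lingpy import Multiple

msa = Multiple(["woldemort", "waldemar", "wladimir"])  # toy cognate set
msa.prog_align()  # progressive multiple alignment
print(msa)
aligned_rows = msa.alm_matrix  # aligned segments, gaps marked with "-"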