from pathlib import Path

import numpy as np
import tensorflow as tf

from classes import Alphabet
from utils import create_model

ipa = Alphabet(Path("../data/alphabets/ipa.csv"))

# sanity check: every character should survive a round trip through its feature vector
chars = "abcdefghijklmnop"
for char in chars:
    vec = ipa.create_char(char).vector
    char_ = ipa.get_char_by_feature_vector(vec)
    print(char, char_)

# a noisy feature vector (0.1/0.9 instead of 0/1) should still resolve to a character
vec = [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
vec_ = []
for i in vec:
    if i == 0:
        vec_.append(0.1)
    elif i == 1:
        vec_.append(0.9)
print(ipa.get_char_by_feature_vector(vec_))

model, optimizer, loss_object = create_model(len(vec), 64, 32, len(vec))
vec = tf.keras.backend.expand_dims(vec, axis=0)
vec = tf.dtypes.cast(vec, tf.float32)
losses = []
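# --- hedged example --------------------------------------------------------
# get_char_by_feature_vector lives in classes.py, which is not shown in this
# section. A minimal sketch of how such a lookup can tolerate the noisy
# 0.1/0.9 input probed above, assuming the alphabet keeps a
# char -> feature-vector mapping. All names below are illustrative, not the
# repo's actual API.
import numpy as np

def nearest_char(features, query):
    """Return the character whose feature vector is closest to `query`
    in Euclidean distance (hypothetical helper)."""
    chars = list(features.keys())
    matrix = np.array([features[c] for c in chars], dtype=np.float32)
    q = np.asarray(query, dtype=np.float32)
    return chars[int(np.argmin(np.linalg.norm(matrix - q, axis=1)))]

# a 0.1/0.9 vector still snaps to the same character as its 0/1 original
print(nearest_char({"a": [0, 1, 0], "b": [1, 0, 0]}, [0.1, 0.9, 0.1]))  # -> a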
def main():
    global encoding
    args = parser_args()
    # determine whether the model should use feature encodings or character embeddings
    assert args.orthographic in [0, 1], "Too many instances of --orthographic switch, should be 0 or 1"
    orthographic = bool(args.orthographic)
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)

    # load data
    data_file = None
    if args.data == "ipa":
        encoding = 'utf-16'
        data_file = Path("../data/romance_swadesh_ipa.csv")
    elif args.data == "asjp":
        encoding = 'ascii'
        data_file = Path("../data/romance_swadesh_asjp.csv")
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)

    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)

    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")

    # load alphabet from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, orthographic=orthographic)

    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    print("alphabet:")
    print(alphabet)

    # initialize model
    model, optimizer, loss_object = create_model(input_dim=alphabet.get_feature_dim(),
                                                 embedding_dim=28,
                                                 context_dim=128,
                                                 output_dim=alphabet.get_feature_dim())
    model.summary()
    print("data_file: {}".format(data_file.absolute()))
    print("model: {}, orthographic={}, aligned={}".format(args.model, orthographic, aligned))
    print("alphabet: {}, read from {}".format(args.model, alphabet_file.absolute()))
    print("epochs: {}".format(epochs))

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]

    for li, line in enumerate(data[HEADER_ROW:]):
        # aligned and unaligned rows alternate in the data file
        if aligned:
            if line == "" or li % 2 != 0:
                continue
        else:
            if line == "" or li % 2 == 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            cognate_dict[lang] = alphabet.translate(word)
        cs = CognateSet(id=id,
                        concept=concept,
                        ancestor='latin',
                        cognate_dict=cognate_dict,
                        alphabet=alphabet)
        cognate_sets.append(cs)

    # maybe we needn't do the evaluation, since we mainly want to know how
    # the model behaves with the different inputs
    #split_index = int(valid_size * len(cognate_sets))
    #train_data = cognate_sets[:split_index]
    #valid_data = cognate_sets[split_index:]
    #print("train size: {}".format(len(train_data)))
    #print("valid size: {}".format(len(valid_data)))
    #cognate_sets = cognate_sets[10:30]

    words_true = []
    words_pred = []
    epoch_losses = []
    batch_losses = []
    output_characters = []

    for epoch in range(epochs):
        # reset lists
        epoch_losses.clear()
        words_true.clear()
        words_pred.clear()
        # iterate over the cognate sets
        for i, cs in enumerate(cognate_sets):
            # reset batch loss
            batch_losses.clear()
            # iterate over the character embeddings
            for j, char_embeddings in enumerate(cs):
                # add a dimension to the latin character embedding (ancestor embedding);
                # we use a batch size of 1 and TensorFlow does not insert
                # the batch dimension automatically
                target = tf.keras.backend.expand_dims(char_embeddings.pop(cs.ancestor).to_numpy(), axis=0)
                # convert the latin character embedding to float32 to match the dtype of the model output
                target = tf.dtypes.cast(target, tf.float32)
                # iterate through the embeddings inside the GradientTape
                with tf.GradientTape(persistent=True) as tape:
                    for lang, embedding in char_embeddings.items():
                        # add a batch dimension to the embedding
                        data = tf.keras.backend.expand_dims(embedding.to_numpy(), axis=0)
                        output = model(data)
                        # calculate the loss
                        loss = loss_object(target, output)
                        epoch_losses.append(float(loss))
                        batch_losses.append(float(loss))
                        # calculate the gradients and backpropagate
                        gradients = tape.gradient(loss, model.trainable_weights)
                        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                # convert the output vector back into a character
                output_char = alphabet.get_char_by_feature_vector(output)
                # collect the converted characters so we can see the reconstructed word
                output_characters.append(output_char)
            # append the reconstructed word and the ancestor to the true/pred lists
            words_pred.append("".join(output_characters))
            words_true.append(str(cs.get_ancestor()))
            # clear the list of output characters so we can create another word
            output_characters.clear()
            print("Batch {}, mean loss={}".format(i, np.mean(batch_losses)))
        # calculate distances
        ld = LevenshteinDistance(true=words_true, pred=words_pred)
        print("Epoch {} finished".format(epoch + 1))
        print("Mean loss={}".format(np.mean(epoch_losses)))
        ld.print_distances()
        ld.print_percentiles()

    # do so again after training has finished, but now also save the plots
    ld = LevenshteinDistance(true=words_true, pred=words_pred)
    ld.print_distances()
    ld.print_percentiles()
    ld.plot_distances(Path("../data/out/distances.png"))
    ld.plot_percentiles(Path("../data/out/percentiles.png"))
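# --- hedged example --------------------------------------------------------
# LevenshteinDistance is one of the project's own classes and is not shown in
# this section. A rough sketch of the metric it reports, built on
# nltk.edit_distance (which the Ciobanu training script below also uses).
# The class shape is assumed, not the repo's actual implementation.
import nltk
import numpy as np

class EditDistanceReport:
    """Hypothetical stand-in for the repo's LevenshteinDistance."""

    def __init__(self, true, pred):
        self.distances = [nltk.edit_distance(t, p) for t, p in zip(true, pred)]
        self.mean_distance = float(np.mean(self.distances))
        # normalize by the true word's length so long words don't dominate
        self.mean_distance_normalized = float(
            np.mean([d / max(len(t), 1) for d, t in zip(self.distances, true)]))

report = EditDistanceReport(true=["kantare"], pred=["kantar"])
print(report.mean_distance, report.mean_distance_normalized)  # 1.0 ~0.14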
def main():
    global encoding
    args = parse_args()
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)
    # and decide between feature encodings and character embeddings
    assert args.ortho in [0, 1], "Too many instances of --ortho switch, should be 0 or 1"
    ortho = bool(args.ortho)

    # load data
    data_file = Path(args.data)
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)

    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)

    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")
    elif args.model == 'latin':
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/latin.csv")

    # load alphabet from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, ortho=ortho)

    # number of epochs
    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    # number of hidden layers
    # assert args.n_hidden > 0, "Number of hidden layers should be at least 1 ;)"
    # n_hidden = args.n_hidden

    # determine output directories, create them if they do not exist
    out_tag = "_{}".format(args.out_tag)
    # and tag for files with train/test indices
    indices_tag = args.out_tag
    plots_dir = Path("../out/plots{}_many2one".format(out_tag))
    if not plots_dir.exists():
        plots_dir.mkdir(parents=True)
    results_dir = Path("../out/results{}_many2one".format(out_tag))
    if not results_dir.exists():
        results_dir.mkdir(parents=True)

    # create file for results
    result_file_path = results_dir / "m2one_{}{}{}.txt".format(args.model,
                                                               "_aligned" if aligned else "",
                                                               "_ortho" if ortho else "")
    result_file_path.touch()
    result_file = result_file_path.open('w', encoding=encoding)

    # determine ancestor
    ancestor = args.ancestor

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]

    # import tensorflow here to comply with the wiki entry
    # https://wiki.lsv.uni-saarland.de/doku.php?id=cluster
    import tensorflow as tf
    # set random seed for weights
    tf.random.set_seed(seed=42)

    # start data extraction
    for li, line in enumerate(data[HEADER_ROW:]):
        # the file with the latin characters doesn't contain aligned cognate sets
        if args.model == 'latin':
            if line == "":
                continue
        # but the other two do
        elif aligned:
            if line == "" or li % 2 == 0:
                continue
        # the unaligned case
        else:
            if line == "" or li % 2 != 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            cognate_dict[lang] = alphabet.translate(word)
        cognate_set = CognateSet(id=id,
                                 concept=concept,
                                 ancestor=ancestor,
                                 cognate_dict=cognate_dict,
                                 alphabet=alphabet)
        cognate_sets.append(cognate_set)

    # prepare train/test split
    total_data = {str(i + 1): cognate_set for i, cognate_set in enumerate(cognate_sets)}
    train_indices = set(total_data.keys())
    runs = cross_validation_runs(5, train_indices)
    # test_indices = Path("../data/{}_test_indices.txt".format(indices_tag)).open('r').read().split("\n")
    # train_data = {i: cognate_set for i, cognate_set in data.items() if i in train_indices}
    # test_data = {i: cognate_set for i, cognate_set in data.items() if i in test_indices}

    # define model
    model, optimizer, loss_object = create_many_to_one_model(lstm_dim=128,
                                                             timesteps=len(langs) - 1,
                                                             data_dim=alphabet.feature_dim,
                                                             fc_dim=100,
                                                             output_dim=alphabet.feature_dim)
    model.summary()

    # save model weights so every cross-validation run starts from the same initialization
    initial_weights = model.get_weights()

    words_true = []
    words_pred = []
    wts = []
    wps = []
    epoch_losses = []
    batch_losses = []

    # training with cross-validation
    for run_index, run in enumerate(runs):
        print("***** Cross-validation run [{}/{}] *****".format(run_index + 1, len(runs)))
        # reload initial model weights
        model.set_weights(initial_weights)
        # get train & test folds
        train_data = {i: cognate_set for i, cognate_set in total_data.items() if i in run['train']}
        test_data = {i: cognate_set for i, cognate_set in total_data.items() if i in run['test']}

        print("***** Start training *****")
        for epoch in range(1, epochs + 1):
            words_true.clear()
            words_pred.clear()
            batch_losses.clear()
            for batch, cognate_set in train_data.items():
                output_characters = []
                for lang_array in cognate_set:
                    # the ancestor word is the target, the descendant words are the input
                    target = tf.keras.backend.expand_dims(lang_array.pop(ancestor).to_numpy(), axis=0)
                    target = tf.dtypes.cast(target, tf.float32)
                    data = []
                    for lang, vec in lang_array.items():
                        data.append(list(vec))
                    data = np.array(data)
                    data = tf.keras.backend.expand_dims(data, axis=0)
                    data = tf.dtypes.cast(data, tf.float32)
                    # data = tf.reshape(data, (1, -1))
                    with tf.GradientTape() as tape:
                        output = model(data)
                        loss = loss_object(target, output)
                    batch_losses.append(float(loss))
                    gradients = tape.gradient(loss, model.trainable_weights)
                    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                    output_characters.append(alphabet.get_char_by_vector(output))
                words_pred.append("".join(output_characters))
                words_true.append(str(cognate_set.ancestor_word))
                # print("".join(output_characters), str(cognate_set.ancestor_word))
                if int(batch) % 100 == 0:
                    print("Epoch [{}/{}], Batch [{}/{}]".format(epoch, epochs, batch, len(cognate_sets)))
            # calculate mean epoch loss
            mean_loss = np.mean(batch_losses)
            epoch_losses.append(mean_loss)
            print("Epoch [{}/{}], mean batch loss = {}".format(epoch, epochs, mean_loss))
            # calculate levenshtein distance
            ld = LevenshteinDistance(true=words_true, pred=words_pred)
            ld.print_distances()
            ld.print_percentiles()

        words_pred.clear()
        words_true.clear()
        print("***** Training finished *****")
        print()

        # Testing: do the same thing as above with the test data,
        # but don't collect the gradients and don't backpropagate
        print("***** Start testing *****")
        for _, cognate_set in test_data.items():
            output_characters = []
            for lang_array in cognate_set:
                target = tf.keras.backend.expand_dims(lang_array.pop(ancestor).to_numpy(), axis=0)
                target = tf.dtypes.cast(target, tf.float32)
                data = []
                for lang, vec in lang_array.items():
                    data.append(list(vec))
                data = np.array(data)
                data = tf.keras.backend.expand_dims(data, axis=0)
                data = tf.dtypes.cast(data, tf.float32)
                output = model(data)
                # loss = loss_object(target, output)
                output_characters.append(alphabet.get_char_by_vector(output))
            # compile the reconstructed word
            words_pred.append("".join(output_characters))
            # save the true word for the distance calculation
            words_true.append(str(cognate_set.ancestor_word))
        # collect results across folds
        wts.extend(words_true)
        wps.extend(words_pred)
        # create plots
        ld = LevenshteinDistance(words_true, words_pred)
        ld.print_distances()
        ld.print_percentiles()
        print("***** Testing finished *****")

    # save results after the last run; distance counts are divided by the number of folds
    outfile = plots_dir / "many2one_test_{}{}{}.jpg".format(args.model,
                                                            "_aligned" if aligned else "",
                                                            "_ortho" if ortho else "")
    title = "Model [Test]: LSTM {}{}{}\n 5 cross-validation folds" \
        .format(", " + args.model,
                ", aligned" if aligned else "",
                ", orthographic" if ortho else "")
    ld = LevenshteinDistance(wts, wps)
    plot_results(title=title,
                 distances={"=<" + str(d): count / 5 for d, count in ld.distances.items()},
                 percentiles={"=<" + str(d): perc for d, perc in ld.percentiles.items()},
                 mean_dist=ld.mean_distance,
                 mean_dist_norm=ld.mean_distance_normalized,
                 losses=[],
                 outfile=Path(outfile),
                 testing=True)
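# --- hedged example --------------------------------------------------------
# cross_validation_runs is a project helper that is not shown in this section.
# A minimal sketch of a 5-fold splitter over the index set; the
# {'train': ..., 'test': ...} shape is inferred from how run['train'] and
# run['test'] are used above. Illustrative only.
import random

def cross_validation_runs_sketch(k, indices):
    """Partition `indices` into k folds; each run tests on one fold and
    trains on the remaining k-1 folds."""
    indices = list(indices)
    random.shuffle(indices)
    folds = [indices[i::k] for i in range(k)]
    return [{'train': set(indices) - set(fold), 'test': set(fold)}
            for fold in folds]

runs = cross_validation_runs_sketch(5, {str(i) for i in range(1, 101)})
print(len(runs), len(runs[0]['train']), len(runs[0]['test']))  # 5 80 20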
from classes import Alphabet, Text, Partitioning, Key
from program import decode_partition_fast, fitness, decode_partition_slow, decode_sort, decode_fitness

# Example SymbolSets
SymbolSets = {
    "Standard": Alphabet([*"abcdefghijklmnopqrstuvwxyz"]),
    "Reversed": Alphabet([*"zyxwvutsrqponmlkjihgfedcba"]),
    "Symbols": Alphabet([*"!§$%&/()=?-_{[]}#'+~:;.,@<"]),
    "Standard27": Alphabet([*"abcdefghijklmnopqrstuvwxyz "]),
    "Reversed27": Alphabet([*" zyxwvutsrqponmlkjihgfedcba"]),
    "Symbols27": Alphabet([*"!§$%&/()=?-_{[]}#'+~:;.,@<|"])
}

# Example Partitionings
Partitioning_ = {
    "Slow": Partitioning((6, 10, 9, 1)),
    "Normal": Partitioning((4, 2, 5, 5, 3, 5, 1, 1)),
    "Fast": Partitioning((4, 2, 5, 2, 3, 3, 3, 2, 1, 1)),
    "Slow27": Partitioning((6, 10, 9, 1, 1)),
    "Normal27": Partitioning((4, 2, 5, 5, 3, 5, 1, 1, 1)),
    "Fast27": Partitioning((4, 2, 5, 2, 3, 3, 3, 2, 1, 1, 1))
}

# replacements for German umlauts and ß
replace = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}

# Choosing the SymbolSets and the Partitioning
symbols_ref = SymbolSets["Standard"]
symbols_enc = SymbolSets["Reversed"]
partitioning = Partitioning_["Fast"]

# Creating the encryption key
key = Key.new(symbols_ref, symbols_enc)
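# --- hedged example --------------------------------------------------------
# The replace mapping above suggests German input text is normalized to the
# 26-letter alphabet before a key is applied. A minimal sketch of that step;
# the helper name is illustrative, not part of the repo.
def normalize_german(text):
    """Rewrite umlauts and ß so the text fits a 26-letter alphabet."""
    for src, dst in {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}.items():
        text = text.replace(src, dst)
    return text

print(normalize_german("größe über alles"))  # -> groesse ueber alles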
from lingpy import rc
from pathlib import Path

from classes import Ipa2Asjp, Alphabet

ipa = Alphabet(Path("../data/alphabets/ipa.csv"))
sca = rc('asjp')
converter = Ipa2Asjp(sca, ["ː"])

romance_ipa_path = Path("../data/romance_ciobanu_ipa.csv")
romance_ipa = romance_ipa_path.open(encoding='utf-16').read()
out_path = Path("../data/romance_ciobanu_asjp.csv")
out_path.touch()
out_file = out_path.open('w', encoding='utf-16')

langs = ["latin", "italian", "spanish", "french", "portuguese", "romanian"]
col_names = ["id", "concept"] + langs
header = "id,concept,latin,italian,spanish,french,portuguese,romanian\n"
out_file.write(header)
print(header)

for line in romance_ipa.split("\n")[1:]:
    s = ""
    if line != "":
        row = line.split(",")
        assert len(row) == len(col_names), "Expected {} fields, found {}"\
            .format(len(col_names), len(row))
        # create row data dict
        row_data = {col: val for col, val in zip(col_names, row)}
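# --- hedged example --------------------------------------------------------
# The loop above breaks off right after row_data is built. A sketch of how it
# might finish, assuming Ipa2Asjp exposes a convert(word)-style method; only
# its constructor appears above, so that method name is an assumption.
def convert_row(line, col_names, langs, converter):
    """Hypothetical continuation: convert one CSV row from IPA to ASJP."""
    row = line.split(",")
    row_data = {col: val for col, val in zip(col_names, row)}
    converted = [row_data["id"], row_data["concept"]]
    converted += [converter.convert(row_data[lang]) for lang in langs]  # assumed API
    return ",".join(converted) + "\n"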
from pathlib import Path

from classes import Alphabet

id_col = 0
langs = ['latin', 'italian', 'spanish', 'french', 'portuguese', 'romanian']

path_to_ipa = Path("../data/alphabets/ipa.csv")
path_to_romance_data = Path("../data/romance_swadesh_ipa.csv")
ipa = Alphabet(path_to_ipa)

romance_data = path_to_romance_data.open(encoding='utf-16').read().split("\n")
cols = romance_data[0].split(",")

# raw and aligned rows alternate: odd lines are raw, even lines are aligned
romance_raw = [romance_data[i] for i in range(1, len(romance_data) - 1) if i % 2 != 0]
romance_aligned = [romance_data[i] for i in range(2, len(romance_data)) if i % 2 == 0]
assert len(romance_raw) == len(romance_aligned), "aligned and raw data of different length: {}, {}"\
    .format(len(romance_raw), len(romance_aligned))

data = {}
for category, lines in {'raw': romance_raw, 'aligned': romance_aligned}.items():
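# --- toy illustration -------------------------------------------------------
# The odd/even indexing reflects the file layout: after the header, each
# cognate set occupies two consecutive lines, raw first, aligned second.
# Made-up rows, not the real data:
rows = ["header",
        "1,raw_a", "1,aligned_a",
        "2,raw_b", "2,aligned_b"]
raw = [rows[i] for i in range(1, len(rows) - 1) if i % 2 != 0]
aligned = [rows[i] for i in range(2, len(rows)) if i % 2 == 0]
print(raw)      # ['1,raw_a', '2,raw_b']
print(aligned)  # ['1,aligned_a', '2,aligned_b']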
def train():
    # Command line call I used:
    # python ciobanu_rnn.py --data=ipa --model=ipa --epochs=10 --out_tag=test --model=ipa --ancestor=ancestor
    global encoding
    args = parser_args()
    # determine whether the model should use feature encodings or character embeddings
    assert args.ortho in [0, 1], "Too many instances of --ortho switch, should be 0 or 1"
    ortho = bool(args.ortho)
    # determine whether to use the aligned or unaligned data
    assert args.aligned in [0, 1], "Too many instances of --aligned switch, should be 0 or 1"
    aligned = bool(args.aligned)

    # load data
    data_file = None
    if args.data == "ipa":
        encoding = 'utf-16'
        data_file = Path("../data/romance_ciobanu_ipa.csv")
    elif args.data == "asjp":
        encoding = 'ascii'
        data_file = Path("../data/romance_ciobanu_asjp.csv")
    assert data_file.exists() and data_file.is_file(), "Data file {} does not exist".format(data_file)

    # determine model
    assert args.model in MODELS, "Model should be one of {}".format(MODELS)

    # determine path to alphabet file & encoding
    alphabet_file = None
    if args.model == "ipa":
        encoding = 'utf-16'
        alphabet_file = Path("../data/alphabets/ipa.csv")
    elif args.model == "asjp":
        encoding = 'ascii'
        alphabet_file = Path("../data/alphabets/asjp.csv")

    # load alphabet from file
    assert alphabet_file.exists() and alphabet_file.is_file(), "Alphabet file {} does not exist".format(alphabet_file)
    alphabet = Alphabet(alphabet_file, encoding=encoding, ortho=ortho)

    assert isinstance(args.epochs, int), "Epochs not int, but {}".format(type(args.epochs))
    assert args.epochs > 0, "Epochs out of range: {}".format(args.epochs)
    epochs = args.epochs

    # ancestor
    ancestor = args.ancestor

    # determine output directories, create them if they do not exist
    out_tag = "_{}".format(args.out_tag)
    plots_dir = Path("../out/plots{}_deep".format(out_tag))
    if not plots_dir.exists():
        plots_dir.mkdir(parents=True)
    results_dir = Path("../out/results{}_deep".format(out_tag))
    if not results_dir.exists():
        results_dir.mkdir(parents=True)

    # create file for results
    result_file_path = results_dir / "deep_{}{}{}.txt".format(args.model,
                                                              "_aligned" if aligned else "",
                                                              "_ortho" if ortho else "")
    result_file_path.touch()
    result_file = result_file_path.open('w', encoding=encoding)

    print("alphabet:")
    print(alphabet)

    # initialize model
    model, optimizer, loss_object = create_model(input_dim=alphabet.get_feature_dim(),
                                                 embedding_dim=28,
                                                 context_dim=128,
                                                 output_dim=alphabet.get_feature_dim())
    model.summary()
    print("data_file: {}".format(data_file.absolute()))
    print("model: {}, orthographic={}, aligned={}".format(args.model, ortho, aligned))
    print("alphabet: {}, read from {}".format(args.model, alphabet_file.absolute()))
    print("epochs: {}".format(epochs))

    # create cognate sets
    cognate_sets = []
    data = data_file.open(encoding='utf-16').read().split("\n")
    cols = data[HEADER_ROW].split(COLUMN_SEPARATOR)
    langs = cols[2:]
    print("langs")
    print(langs)

    for li, line in enumerate(data[HEADER_ROW:]):
        if aligned:
            if line == "" or li % 2 != 0:
                continue
        else:
            if line == "" or li % 2 == 0:
                continue
        row_split = line.split(COLUMN_SEPARATOR)
        id = row_split[ID_COLUMN]
        concept = row_split[CONCEPT_COLUMN]
        words = row_split[CONCEPT_COLUMN + 1:]
        # print("words")
        # print(words)
        cognate_dict = {}
        assert len(langs) == len(words), "Langs / Words mismatch, expected {}, got {}".format(len(langs), len(words))
        for lang, word in zip(langs, words):
            # print("lang, word")
            # print(lang, word)
            cognate_dict[lang] = alphabet.translate(word)
        cs = CognateSet(id=id,
                        concept=concept,
                        ancestor=ancestor,
                        cognate_dict=cognate_dict,
                        alphabet=alphabet)
        cognate_sets.append(cs)

    # maybe we needn't do the evaluation, since we mainly want to know how
    # the model behaves with the different inputs
    split_index = int(valid_size * len(cognate_sets))
    train_data = cognate_sets[:split_index]
    valid_data = cognate_sets[split_index:]
    print("train size: {}".format(len(train_data)))
    print("valid size: {}".format(len(valid_data)))
    # cognate_sets = cognate_sets[10:30]

    words_true = []
    words_pred = []
    epoch_losses = []
    batch_losses = []
    output_characters = []

    for epoch in range(epochs):
        # reset lists
        epoch_losses.clear()
        words_true.clear()
        words_pred.clear()
        # iterate over the cognate sets
        for i, cs in enumerate(cognate_sets):
            # reset batch loss
            batch_losses.clear()
            # iterate over the character embeddings
            for j, char_embeddings in enumerate(cs):
                # add a dimension to the ancestor character embedding;
                # we use a batch size of 1 and TensorFlow does not insert
                # the batch dimension automatically
                target = tf.keras.backend.expand_dims(char_embeddings.pop(cs.ancestor).to_numpy(), axis=0)
                # convert the ancestor character embedding to float32 to match the dtype of the model output
                target = tf.dtypes.cast(target, tf.float32)
                # iterate through the embeddings inside the GradientTape
                with tf.GradientTape(persistent=True) as tape:
                    for lang, embedding in char_embeddings.items():
                        # add a batch dimension to the embedding
                        data = tf.keras.backend.expand_dims(embedding.to_numpy(), axis=0)
                        output = model(data)
                        # calculate the loss
                        loss = loss_object(target, output)
                        epoch_losses.append(float(loss))
                        batch_losses.append(float(loss))
                        # calculate the gradients and backpropagate
                        gradients = tape.gradient(loss, model.trainable_weights)
                        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
                # convert the output vector back into a character
                output_char = alphabet.get_char_by_feature_vector(output)
                # collect the converted characters so we can see the reconstructed word
                output_characters.append(output_char)
            # append the reconstructed word and the ancestor to the true/pred lists
            words_pred.append("".join(output_characters))
            words_true.append(str(cs.ancestor))
            # clear the list of output characters so we can create another word
            output_characters.clear()
            print("Batch {}, mean loss={}".format(i, np.mean(batch_losses)))
        # calculate distances
        ld = LevenshteinDistance(true=words_true, pred=words_pred)
        print("Epoch {} finished".format(epoch + 1))
        print("Mean loss={}".format(np.mean(epoch_losses)))
        ld.print_distances()
        ld.print_percentiles()

        # save the plots after the last epoch
        if epoch == epochs - 1:
            outfile = "../out/plots_swadesh_deep/deep_{}{}{}.jpg".format(args.model,
                                                                         "_aligned" if aligned else "",
                                                                         "_ortho" if ortho else "")
            title = "Model: deep net{}{}{}".format(", " + args.model,
                                                   ", aligned" if aligned else "",
                                                   ", orthographic" if ortho else "")
            plot_results(title=title,
                         distances={"=<" + str(d): count for d, count in ld.distances.items()},
                         percentiles={"=<" + str(d): perc for d, perc in ld.percentiles.items()},
                         mean_dist=ld.mean_distance,
                         mean_dist_norm=ld.mean_distance_normalized,
                         losses=epoch_losses,
                         outfile=Path(outfile))
            # save reconstructed words (but only if the edit distance is at least one)
            import nltk
            for t, p in zip(words_true, words_pred):
                distance = nltk.edit_distance(t, p)
                if distance > 0:
                    line = "{},{},distance={}\n".format(t, p, distance)
                    result_file.write(line)
            result_file.close()
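# --- hedged example --------------------------------------------------------
# plot_results is another project helper not shown in this section. A rough
# matplotlib sketch matching the call signature used above; layout and
# styling are guesses, not the repo's actual figure.
import matplotlib.pyplot as plt

def plot_results_sketch(title, distances, percentiles, mean_dist,
                        mean_dist_norm, losses, outfile, testing=False):
    """Illustrative stand-in for the repo's plot_results."""
    fig, axes = plt.subplots(1, 3 if losses else 2, figsize=(12, 4))
    axes[0].bar(list(distances.keys()), list(distances.values()))
    axes[0].set_title("distances (mean={:.2f}, norm={:.2f})".format(mean_dist, mean_dist_norm))
    axes[1].bar(list(percentiles.keys()), list(percentiles.values()))
    axes[1].set_title("percentiles")
    if losses:
        axes[2].plot(losses)
        axes[2].set_title("mean epoch loss")
    fig.suptitle(title)
    fig.tight_layout()
    fig.savefig(outfile)
    plt.close(fig)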
import lingpy as lp
from pathlib import Path
from typing import List

from classes import Alphabet


def lst_to_str(lst: List[str]) -> str:
    return "".join(lst)


ipa_csv_path = Path("../data/alphabets/ipa.csv")
ipa = Alphabet(ipa_csv_path)

romance_data_path = Path("../data/romance_ipa_partial.csv")
romance_data = romance_data_path.open(encoding='utf-16').read()
out_path = Path("../data/romance_ipa_aligned.csv")
out_file = out_path.open('w')

langs = ["latin", "italian", "spanish", "french", "romanian"]
header = "id,concept,latin,italian,spanish,french,romanian\n"
out_file.write(header)

cognate_sets = {}
for line in romance_data.split("\n")[1:51]:
    s = ""
    data = line.split(",")
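# --- hedged example --------------------------------------------------------
# The script imports lingpy and writes to romance_ipa_aligned.csv, so the
# truncated loop presumably aligns each cognate set. A minimal sketch using
# lingpy's documented Multiple class; how the repo actually invokes it is
# not shown above.
from lingpy import Multiple

msa = Multiple(["woldemort", "waldemar", "wladimir"])  # toy cognate set
msa.prog_align()  # progressive multiple alignment
print(msa)
aligned_rows = msa.alm_matrix  # aligned segments, gaps marked with "-"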