def __init__(self, embed_size, hidden_size, panphon, model_name, load_model=False, train_file=None, val_file=None):
    """Build the bidirectional-LSTM encoder model and optionally load saved weights.

    Args:
        embed_size (int): dimensionality of the input embeddings
        hidden_size (int): total BiLSTM hidden size (split evenly across the
            forward and backward directions)
        panphon (bool): if True, project fixed panphon feature vectors into
            the embedding space instead of learning lookup tables
        model_name (str): path used to save/load the DyNet parameters
        load_model (bool): if True, populate weights from ``model_name``
        train_file (str): path to the training data, read eagerly
        val_file (str): optional path to validation data
    """
    self.model_name = model_name
    self.model = dy.ParameterCollection()
    self.panphon = panphon
    if self.panphon:
        # Linear projection from panphon's articulatory feature vectors
        # (length VEC_SIZE) into the embedding space.
        self.ft = pp.FeatureTable()
        self.ws_panphon = self.model.add_parameters((embed_size, VEC_SIZE))
        self.bs_panphon = self.model.add_parameters((embed_size))
    else:
        # Auto-growing vocabularies: ids are assigned in first-seen order.
        self.source_vocab = defaultdict(lambda: len(self.source_vocab))
        self.target_vocab = defaultdict(lambda: len(self.target_vocab))
        self.source_lookup = self.model.add_lookup_parameters((len(self.source_vocab), embed_size))
        self.target_lookup = self.model.add_lookup_parameters((len(self.target_vocab), embed_size))
    self.training_data = self.read_train(train_file)
    if val_file:
        self.validation_data = self.read_data(val_file)
    # Each direction of the BiLSTM gets half of hidden_size. Use integer
    # division: on Python 3, hidden_size / 2 yields a float, which is not a
    # valid DyNet layer dimension.
    half_hidden = hidden_size // 2
    self.source_lstm_forward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
    self.source_lstm_backward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
    self.target_lstm_forward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
    self.target_lstm_backward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
    # load model only if flag is true. will overwrite existing model if flag
    # is false. set flag to True for fine-tuning or encoding
    if load_model:
        self.model.populate(self.model_name)
        print("Populated! " + self.model_name)
    print('done')
def __init__(self, code, preproc=True, postproc=True, ligatures=False, rev=False, rev_preproc=True, rev_postproc=True):
    """Constructs the backend object epitran uses for most languages

    Args:
        code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
        preproc (bool): if True, apply preprocessor
        postproc (bool): if True, apply postprocessors
        ligatures (bool): if True, use phonetic ligatures for affricates
                          instead of standard IPA
        rev (bool): if True, load reverse transliteration
        rev_preproc (bool): if True, apply preprocessor when reverse transliterating
        rev_postproc (bool): if True, apply postprocessor when reverse transliterating
    """
    self.rev = rev
    # Forward grapheme-to-phoneme map plus a regex matching its keys.
    self.g2p = self._load_g2p_map(code, False)
    self.regexp = self._construct_regex(self.g2p.keys())
    self.puncnorm = PuncNorm()
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
    # Language-specific pre/post rewrite rules and diacritic stripping.
    self.preprocessor = PrePostProcessor(code, 'pre', False)
    self.postprocessor = PrePostProcessor(code, 'post', False)
    self.strip_diacritics = StripDiacritics(code)
    # Remember the behavior flags for use at transliteration time.
    self.preproc = preproc
    self.postproc = postproc
    self.ligatures = ligatures
    self.rev_preproc = rev_preproc
    self.rev_postproc = rev_postproc
    if rev:
        # Mirror of the forward pipeline for phoneme-to-grapheme direction.
        self.rev_g2p = self._load_g2p_map(code, True)
        self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
        self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
        self.rev_postprocessor = PrePostProcessor(code, 'post', True)
    # Counts of inputs that mapped to nothing (diagnostics).
    self.nils = defaultdict(int)
def __init__(self, incl_stress=True, incl_syllables=True):
    """Load and preprocess the pronunciation rows from ``self.PATH``.

    Args:
        incl_stress (bool): if False, strip stress markers ("1") from segments
        incl_syllables (bool): if False, strip syllable markers ("-")
    """
    def preproc(row):
        # Tokenize the row into space-separated segment symbols.
        row = row.split()
        if not incl_stress:
            # List comprehensions (not map) keep `row` a concrete list, which
            # the mutation branch below depends on.
            row = [seg.replace("1", "") for seg in row]
        if not incl_syllables:
            row = [seg.replace("-", "") for seg in row]
        if incl_stress and incl_syllables:
            # Then let's move the stress to the beginning of its syllable.
            last_syl_marker_idx = -1
            for idx, unit in enumerate(row[:]):
                last_syl_marker_idx = idx if unit == self.SYLL else last_syl_marker_idx
                if self.STRESS in unit:
                    row[idx] = unit.replace(self.STRESS, "")
                    # NOTE(review): inserting while iterating a snapshot means
                    # later `idx` values are shifted if multiple stresses occur
                    # in one row — presumably rows carry at most one stress;
                    # confirm against the data file.
                    row.insert(last_syl_marker_idx + 1, self.STRESS)
        # Map segments through the IPA table, passing unknowns through as-is.
        return " ".join([
            self.ipa_map[segment] if segment in self.ipa_map else segment
            for segment in row
        ])

    # Use a context manager so the file handle is closed deterministically
    # (the original `open(...).readlines()` leaked the handle).
    with open(self.PATH) as data_file:
        raw_rows = data_file.readlines()
    self.rows = [preproc(row) for row in raw_rows]
    self._phonemes = self.get_phonemes()
    self.feature_table = panphon.FeatureTable()
def __init__(self, code):
    """Construct a transliteration backend for the language given by ``code``.

    Args:
        code (str): language code used to locate the g2p map and processors
    """
    # Grapheme-to-phoneme mapping and a regex over its keys.
    self.g2p = self._load_g2p_map(code)
    self.regexp = self._construct_regex()
    # Punctuation normalization table and its target values.
    self.puncnorm = self._load_punc_norm_map()
    self.puncnorm_vals = self.puncnorm.values()
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
    self.preprocessor = PrePostProcessor(code, 'pre')
def __init__(self, infile=sys.stdin):
    """Validate Unicode IPA from file relative to panphon database.

    infile -- File from which input is taken; by default, STDIN.
    """
    # Matches whitespace and the punctuation characters stripped before
    # validation.
    # NOTE(review): re.V1 exists only in the third-party `regex` module, not
    # the stdlib `re` — presumably `re` is an alias for `regex` at the top of
    # this file; confirm the import.
    self.ws_punc_regex = re.compile(r'[," \t\n]', re.V1 | re.U)
    self.ft = panphon.FeatureTable()
    # Validation runs eagerly at construction time.
    self._validate_file(infile)
def main(infiles, output):
    """Tally the phoneme space across all input files and print it.

    Args:
        infiles: iterable of input file paths to scan
        output: destination passed through to ``print_space``
    """
    transliterator = epitran.flite.Flite()
    feature_table = panphon.FeatureTable()
    counts = Counter()
    for filename in infiles:
        logging.debug(u'Scanning:\t{}'.format(filename).encode('utf-8'))
        counts.update(add_file(transliterator, feature_table, filename))
    print_space(output, counts)
def main(code, op, infiles, output):
    """Tally the phoneme space across all input files and print it.

    Args:
        code: language code passed to ``epitran.Epitran``
        op: if truthy, use the ``add_file_op`` handler, else ``add_file_gen``
        infiles: iterable of input file paths to scan
        output: destination passed through to ``print_space``
    """
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    # Select the per-file handler once: `op` never changes inside the loop,
    # so recomputing the choice per file (as before) was wasted work.
    add_file = add_file_op if op else add_file_gen
    for fn in infiles:
        logging.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
def main(fn):
    """Print the size and X-SAMPA form of the phone inventory found in ``fn``."""
    ft = panphon.FeatureTable()
    xs = epitran.xsampa.XSampa()
    # NOTE(review): csv.reader(..., encoding=...) is the unicodecsv (Python 2)
    # API — the stdlib csv module takes no encoding kwarg, and 'rb' mode
    # matches that library too; confirm which csv module this file imports.
    with open(fn, 'rb') as f:
        reader = csv.reader(f, encoding='utf-8')
        # Skip the header row.
        next(reader)
        phones = set()
        for orth, phon in reader:
            # Accumulate every segment panphon can safely parse.
            phones = phones.union(set(ft.segs_safe(phon)))
    print(len(phones))
    print(sorted(list(map(xs.ipa2xs, phones))))
def __init__(self, model, C2I):
    """Register the embedding and combination parameters on ``model``.

    Args:
        model: DyNet parameter collection to add parameters to
        C2I: character-to-index mapping sized for the character embeddings
    """
    self.model = model
    self.C2I = C2I
    # One embedding per phonological feature state (+, -, not relevant).
    # NOTE(review): 22 presumably matches panphon's feature count — confirm
    # against ft.names.
    self.E_plus = model.add_lookup_parameters((22, EMBEDDING_SIZE))
    self.E_minus = model.add_lookup_parameters((22, EMBEDDING_SIZE))
    self.E_not_relevant = model.add_lookup_parameters((22, EMBEDDING_SIZE))
    # Plain character embeddings.
    self.E = model.add_lookup_parameters((len(C2I), EMBEDDING_SIZE))
    self.ft = panphon.FeatureTable()
    # One embedding per language tag listed below (plus a separator token).
    self.E_lang = model.add_lookup_parameters((7, EMBEDDING_SIZE))
    self.langs = ["s", "i", "r", "f", "p", "l", "sep"]
    # Projects a concatenated pair of embeddings back to EMBEDDING_SIZE.
    self.W_combine = model.add_parameters((EMBEDDING_SIZE, 2 * EMBEDDING_SIZE))
def __init__(self, filename, panphon=False):
    """Load the dataset in ``filename``.

    When ``panphon`` is True, segments are represented via panphon feature
    vectors; otherwise characters are mapped to integer ids through
    auto-growing source/target vocabularies.
    """
    self.panphon = panphon
    if not self.panphon:
        # Vocabulary ids are handed out in first-seen order; touching UNK
        # here reserves it an id before any real data is read.
        self.source_vocab = defaultdict(lambda: len(self.source_vocab))
        self.target_vocab = defaultdict(lambda: len(self.target_vocab))
        self.source_vocab[UNK]
        self.target_vocab[UNK]
        char_function = self.char2int
    else:
        self.ft = pp.FeatureTable()
        char_function = None
    self.data = self.read_data(filename, char_function)
def main():
    """Partition the wav/pronunciation mapping into train/dev/test splits.

    Splitting is done over distinct pronunciations (70/10/20) so that a
    pronunciation never appears in more than one partition.
    """
    feature_table = panphon.FeatureTable()
    with open('mapping.yml') as f:
        mapping = yaml.load(f, Loader=yaml.Loader)

    # Group wav files by their tone-and-diacritic pronunciation key.
    phon2wav = defaultdict(list)
    for wav, phon in mapping.items():
        phon2wav[phon['tone_dias']].append(wav)

    # Deterministic shuffle of the distinct pronunciations.
    phons = list(phon2wav.keys())
    random.seed(256)
    random.shuffle(phons)

    n = len(phons)
    split1, split2 = int(0.7 * n), int(0.8 * n)
    serialize_partition('train', mapping, phons[:split1], feature_table, phon2wav)
    serialize_partition('dev', mapping, phons[split1:split2], feature_table, phon2wav)
    serialize_partition('test', mapping, phons[split2:], feature_table, phon2wav)
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
    """Construct a Flite "wrapper"

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
                          standard IPA
        cedict_file (str): path to CC-CEDict dictionary (included for
                           compatibility; unused here)
    """
    # Resolve the mapping CSV bundled with the package.
    arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(arpabet)
    # Splits text into alternating letter and non-letter chunks (apostrophes
    # count as word-internal letters).
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    self.puncnorm = PuncNorm()
    self.ligatures = ligatures
    self.ft = panphon.FeatureTable()
def __init__(self, arpabet='arpabet', ligatures=False, **kwargs):
    """Construct a Flite "wrapper"

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
                          standard IPA
    """
    # Resolve the mapping CSV bundled with the package.
    arpabet = pkg_resources.resource_filename(
        __name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(arpabet)
    # Splits text into alternating letter and non-letter chunks (apostrophes
    # count as word-internal letters).
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    # Matches any single ASCII letter.
    self.regexp = re.compile(r'[A-Za-z]')
    self.puncnorm = PuncNorm()
    self.ligatures = ligatures
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
def setUp(self):
    """Create fresh panphon feature-table and X-SAMPA fixtures per test."""
    self.ft = panphon.FeatureTable()
    self.xs = panphon.xsampa.XSampa()
def __init__(self): """Construct an IPA-XSampa conversion object """ self.trie = self._read_ipa2xs() self.ft = panphon.FeatureTable()
def acousticArrayValuesNeg(word):
    """Per-feature sum of the negative entries in ``word``'s panphon array.

    Positive and zero feature values contribute nothing; the result is one
    total per feature column.
    """
    feature_names = ['syl', 'son', 'cons', 'cont', 'delrel', 'lat', 'nas',
                     'strid', 'voi', 'sg', 'cg', 'ant', 'cor', 'distr', 'lab',
                     'hi', 'lo', 'back', 'round', 'velaric', 'tense', 'long']
    table = panphon.FeatureTable()
    feats = table.word_array(feature_names, word)
    # Mask keeps negative entries, zeroes out everything else.
    negative_part = (feats < 0) * feats
    return negative_part.sum(axis=0)
def setUp(self):
    """Create fresh distance and feature-table fixtures per test."""
    # `feature_model` is presumably a module-level test constant — confirm
    # against the rest of the test file.
    self.dist = distance.Distance(feature_model=feature_model)
    self.ft = panphon.FeatureTable()
def phonemeCount(word):
    """Return the number of segments panphon parses out of ``word``."""
    feature_names = ['syl', 'son', 'cons', 'cont', 'delrel', 'lat', 'nas',
                     'strid', 'voi', 'sg', 'cg', 'ant', 'cor', 'distr', 'lab',
                     'hi', 'lo', 'back', 'round', 'velaric', 'tense', 'long']
    table = panphon.FeatureTable()
    # One row per parsed segment, one column per feature; only the row
    # count matters here.
    feats = table.word_array(feature_names, word)
    return feats.shape[0]
"""Script to make ground truth phonological feature representations. $ python src/features.py """ import os import fire import pandas as pd import panphon from tqdm import tqdm from utils import write from wikipron import LANGUAGES, load_inventory FEATURES = panphon.FeatureTable() def feature_vector(phoneme): return FEATURES.word_fts(phoneme)[0].numeric() def create_features(language): inventory = load_inventory(language) phonemes = inventory["Phoneme"] representations = { phoneme: feature_vector(phoneme) for phoneme in phonemes } features = pd.DataFrame(representations, index=FEATURES.names).T os.makedirs(f"data/phoneme/features/{language}", exist_ok=True)
import panphon import difflib import unicodedata ft = panphon.FeatureTable() def prefilter(string): string = string.replace('d̥', "t") string = string.replace("ɡ̥", "k") string = string.replace("b̥", "b") string = string.replace("'", "ʼ") return string s = u"thi" errorlist = [] segdict = ft.ipa_segs(s) segpile = u"ɪaaːăbʲbʷb̞b̥cddʒdʲdːd̚d̥d͡zd͡ʑd͡ʒd͡ʒːeeːe̞ffʲfʷfːɡɡʲɡʷɡːɡ̟ʲhhʷiiːi̞i̥i̯jkk'kxkʰkʲkʷkʷ'kːk̟ʲk̟̚k͡p̚llʲlːmmʲmʷmːnnʲnːn̺ooːo̞o̥pp'pfpʰpʲpʷpːp̚rrːssʲsːtt'tstsʰtɕtɕʰtʃtʰtʲtʷ'tːt̚t̪t̪ʰt̪̚t͡st͡sʼt͡ɕt͡ɬt͡ʃt͡ʃʲt͡ʃʼt͡ʃːuuəuːvvʲvʷvːv̞v̞ʲwxyzzʲäæçðøŋŋ̟ŋ͡mœɐɐ̞ɑɓɔɕɕːɗəɛɟɡɡ̥ɣɤɤɐ̞ɤ̆ɥɦɨɪɫɯɯ̟ɯ̥ɰɱɲɴɸɹɹ̩ɻɻ̩ɽɾɾʲɾ̠ʀʂʃʃʲːʊʋʋʲʌʎʏʐʑʒʒ͡ɣʔʝββ̞θχḁ" segpilebad = u"ăb̥d̚d̥d͡zd͡ʑd͡ʒd͡ʒːeeːe̞ɡɡʲɡʷɡːɡ̟ʲhhʷk'kxkʰkʲkʷkʷ'k̟ʲk̟̚k͡p̚llʲlːmmʲmʷmːnnʲnːn̺ooːo̞o̥pp'pfpʰpʲpʷpːp̚rrːssʲsːtt'tstsʰtɕtɕʰtʃtʰtʲtʷ'tːt̚t̪t̪ʰt̪̚t͡st͡sʼt͡ɕt͡ɬt͡ʃt͡ʃʲt͡ʃʼt͡ʃːuuəuːvvʲvʷvːv̞v̞ʲwxyzzʲäæçðøŋŋ̟ŋ͡mœɐɐ̞ɑɓɔɕɕːɗəɛɟɡɡ̥ɣɤɤɐ̞ɤ̆ɥɦɨɪɫɯɯ̟ɯ̥ɰɱɲɴɸɹɹ̩ɻɻ̩ɽɾɾʲɾ̠ʀʂʃʃʲːʊʋʋʲʌʎʏʐʑʒʒ͡ɣʔʝββ̞θχḁ" seglist = [ u"ɪ", u"a", u"aː", u"ă", u"b", u"bʲ", u"bʷ", u"bː", u"b̞", u"b̥", u"c", u"d", u"dʒ", u"dʲ", u"dː", u"d̚", u"d̥", u"d͡z", u"d͡ʑ", u"d͡ʒ", u"d͡ʒː", u"e", u"eː", u"e̞", u"f", u"fʲ", u"fʷ", u"fː", u"ɡ", u"ɡʲ", u"ɡʷ", u"ɡː", u"ɡ̟ʲ", u"h", u"hʷ", u"i", u"iː", u"i̞", u"i̥", u"i̯", u"j", u"k", u"k'", u"kx", u"kʰ", u"kʲ", u"kʷ", u"kʷ'", u"kː", u"k̟ʲ", u"k̟̚", u"k͡p̚", u"l", u"lʲ", u"lː", u"m", u"mʲ", u"mʷ", u"mː", u"n", u"nʲ", u"nː", u"n̺", u"o", u"oː", u"o̞", u"o̥", u"p", u"p'", u"pf", u"pʰ", u"pʲ", u"pʷ", u"pː", u"p̚", u"r", u"rː", u"s", u"sʲ", u"sː", u"t", u"t'", u"ts", u"tsʰ", u"tɕ", u"tɕʰ", u"tʃ", u"tʰ", u"tʲ", u"tʷ'", u"tː", u"t̚", u"t̪", u"t̪ʰ", u"t̪̚", u"t͡s", u"t͡sʼ", u"t͡ɕ", u"t͡ɬ", u"t͡ʃ", u"t͡ʃʲ", u"t͡ʃʼ", u"t͡ʃː", u"u", u"uə",
def __init__(self):
    """Initialize with a panphon feature table for segment feature lookups."""
    self.feature_table = panphon.FeatureTable()
class XYGram:
    """Compare strings across two languages via feature-combination n-grams.

    Strings are transliterated to IPA with epitran, mapped to panphon
    feature arrays, and expanded into "XY-grams": keys built from all
    combinations (up to ``max_features``) of the non-negative features of
    each position in windows up to ``max_offset`` long.
    """
    # Subset of panphon features used to characterize each segment.
    features = [
        'syl', 'son', 'cont', 'nas', 'ant', 'cor', 'hi', 'lo', 'back'
    ]
    # Shared feature table for all instances.
    ft = panphon.FeatureTable()

    def __init__(self, lang1, lang2, max_offset=3, max_features=3):
        # One transliterator per language; index 0 is lang1, 1 is lang2.
        self.epi = (epitran.Epitran(lang1), epitran.Epitran(lang2))
        self.max_offset = max_offset
        # Cannot combine more features than exist.
        self.max_features = min(max_features, len(XYGram.features))

    def _allFeatureCombos(self, v):
        """Return all combinations of ``v`` of sizes 1..max_features."""
        result = []
        for r in range(1, self.max_features + 1):
            result += list(itertools.combinations(v, r))
        return result

    # lang: 1 or 2 based on which of the two languages
    def generateXYGram(self, s, lang):
        """Build the XY-gram count dictionary for string ``s``."""
        epi = self.epi[lang - 1]
        # One row of feature values per IPA segment of the transliteration.
        ft_vector = XYGram.ft.word_array(XYGram.features, epi.transliterate(s))
        # NOTE(review): the loops below index ft_vector by character
        # positions of the ORIGINAL string `s`, but rows correspond to IPA
        # segments of the transliteration — the two lengths need not match;
        # confirm this alignment is intended.
        d = {}
        for i in range(len(s)):
            for j in range(i + 1, min(i + self.max_offset, len(s) + 1)):
                fv = ft_vector[i:j]
                # Indices of non-negative features at each position.
                tmp1 = [ [ k for k, x in enumerate(v) if x >= 0 ] for v in fv ]
                # All feature-index combinations per position.
                tmp2 = [ self._allFeatureCombos(v) for v in tmp1 ]
                # Cartesian product across positions yields the gram keys.
                keys = list(itertools.product(*tmp2))
                for k in keys:
                    d[k] = d.get(k, 0) + 1
        return d

    # Prereq: v1, v2 int lists of equal length
    def cosineSimilarity(self, v1, v2):
        """Cosine similarity of two equal-length vectors."""
        if (len(v1) != len(v2)):
            raise ValueError
        return 1 - spatial.distance.cosine(v1, v2)

    # Prereq: v1, v2 int lists of equal length
    def jaccardSimilarity(self, v1, v2):
        """Jaccard similarity of two equal-length vectors."""
        if (len(v1) != len(v2)):
            raise ValueError
        # NOTE(review): jaccard_similarity_score was deprecated/removed in
        # modern scikit-learn (replaced by jaccard_score) — confirm the
        # pinned sklearn version.
        return jaccard_similarity_score(v1, v2)

    def compareXYGram(self, xy1, xy2):
        """Similarity between two XY-gram count dictionaries."""
        # Vectorize dictionaries with same keys
        v1 = []
        v2 = []
        k1 = set(xy1.keys())
        k2 = set(xy2.keys())
        k = k1.union(k2)
        for key in k:
            v1.append(xy1.get(key, 0))
            v2.append(xy2.get(key, 0))
        return self.jaccardSimilarity(v1, v2)

    def compareRaw(self, s1, s2):
        """Compare raw strings: s1 in language 1 against s2 in language 2."""
        xy1 = self.generateXYGram(s1, 1)
        xy2 = self.generateXYGram(s2, 2)
        return self.compareXYGram(xy1, xy2)
def setUp(self):
    """Create a fresh panphon feature-table fixture per test."""
    self.ft = panphon.FeatureTable()