def get_nat_classes(featlines, **kwargs):
    '''
    Build the natural classes implied by a feature file.

    takes as input a tuple: (featnames, seglines) -- see read_feat_file
    returns a dictionary of natural classes as keys, and segments as values
    {-son,+cons: [p, t, n, l, ...]}
    kwargs are passed through to the messages module.
    '''
    segdict = segs_to_feats(featlines)
    featdict = make_feat_vectors(featlines)  # kept for parity with original; unused below
    # find all pairs of segs that share feature values.
    # BUG FIX: the original tested `overlap in natclasslist`, comparing a
    # set against a list of lists -- that never matched, so every overlap
    # was appended again. Tracking frozensets makes the dedup effective;
    # the final dictionary is unchanged, only redundant work is removed.
    seen = set()
    natclasslist = []
    for seg in segdict:
        for otherseg in segdict:
            overlap = frozenset(segdict[seg]) & frozenset(segdict[otherseg])
            if overlap and overlap not in seen:
                seen.add(overlap)
                natclasslist.append(list(overlap))
    natclassdic = {}
    # compile lists of segments that natural classes expand to:
    for cl in natclasslist:
        clname = ','.join(sorted(cl))
        natclassdic[clname] = []
        # for every seg, check if its feature values are in that natural
        # class description
        for seg in segdict:
            if set(cl).issubset(set(segdict[seg])):
                natclassdic[clname].append(seg)
    kwargs['message'] = f'\nNumber of natural classes: {len(natclassdic)}'
    msg.env_render(**kwargs)
    return natclassdic
def findOrphans(learningdata, featurefile, verbose=False):
    '''
    Compare the segments in a learning data file against a feature file.

    learningdata and featurefile are file paths.
    Returns the list of "orphan" segments: segments that occur in the
    data but are not defined in the feature file.
    BUG FIX: the original fell off the end (returning None) when every
    segment was defined; now an empty list is returned in that case too,
    so callers can always treat the result as a list.
    '''
    learningdatasegs = collectLDSegs(learningdata)
    featurefilesegs = collectFSegs(featurefile)
    # regex metacharacters in segment symbols break the gain-based learner
    for x in ['+', '-', '*', '|']:
        if x in featurefilesegs:
            msg.env_render(
                message=
                f"Please do not use {x} in your segment list. Choose a symbol that is not used in regular expressions. Your feature file is incompatible with the gain-based version of the MaxEnt Phonotactic Learner."
            )
    if verbose:
        msg.env_render(
            message=
            f"\nThe segments in your data file are: \n {','.join(sorted(learningdatasegs))}"
        )
        msg.env_render(
            message=
            f"\nThe segments in your Features.txt file are: \n {','.join(sorted(featurefilesegs))}"
        )
    orphans = list(set(learningdatasegs) - set(featurefilesegs))
    if len(orphans) == 0:
        msg.env_render(
            message=f'\nAll the segments are defined in the feature file.')
    elif verbose:
        msg.env_render(
            message=f"\nyour orphan segments are: \n {','.join(orphans)}")
    return orphans
def write_new_ld_file(clusters, oldpath, newpath, threshold=1, **kwargs):
    '''
    Rewrite a learning data file, fusing identified clusters into complex segments.

    clusters is created by get_new_segs above.
    oldpath and newpath are the locations of LearningData.txt files, old and new.
    threshold defaults to 1 (inseparability measure).
    bigram clusters are sorted by their inseparability value; thus, if
    something that eventually becomes a trigram or tetragram has two-way
    parts in the current inseparability table, the bigram that is higher
    on the list will be replaced first.
    '''
    clustlist = sorted([x for x in clusters if clusters[x] >= threshold],
                       key=clusters.get,
                       reverse=True)
    with open(oldpath, 'r', encoding='utf-8') as f, \
            open(newpath, 'w', encoding='utf-8') as out:
        for line in f:
            word = line.strip()
            rest = []
            if '\t' in line:
                word = line.split('\t')[0]
                rest = line.split('\t')[1:]
            for clust in clustlist:
                # only fuse the cluster at word edges or between spaces
                x = r'(^|\s)' + (clust) + r'(\s|$)'
                y = r'\1' + ''.join(clust.split(" ")) + r'\2'
                word = resub(x, y, word)
            if '\t' in line:
                # BUG FIX: the original wrote the literal text "word\trest"
                # (f-string with no braces); write the rewritten word plus
                # the remaining tab-separated columns (the last column keeps
                # its trailing newline from split()).
                out.write(word + '\t' + '\t'.join(rest))
            else:
                out.write(word + '\n')
    msg.env_render(
        message=
        f"\n\nWrote modified learning data to {newpath.split('simulation')[1]}",
        **kwargs)
def collectLDSegs(somepath):
    '''
    Collect the unique segment symbols used in a learning data file.

    somepath: path to a LearningData.txt-style file with space-separated
    segments, one word per line.
    Returns the segments in first-seen order. If the file does not exist,
    a message is rendered and an empty list is returned.
    '''
    segs = []
    seen = set()  # O(1) membership test instead of scanning the list each time
    try:
        with open(somepath, 'r', encoding='utf-8') as ldatafile:
            for line in ldatafile:
                for seg in line.strip('\n').split():
                    if seg not in seen:
                        seen.add(seg)
                        segs.append(seg.strip())
    except FileNotFoundError:
        msg.env_render(message=f"\n\n\nNo file at {somepath} \n\n\n\n")
    return segs
def plot_insep(simpath, threshold=1, takefirst=15, show=False, ftype='pdf'):
    '''
    Render inseparability plots for a simulation directory.

    Searches the contents of simpath for inseparability.txt files and
    produces plots of them, saving each next to the inseparability.txt
    file it represents. Success or failure is reported via the messages
    module.
    '''
    try:
        pins.plot_all_vert(simpath, threshold, takefirst, show, ftype)
        success_note = f'\n {ftype.upper()} Plot generated for {simpath}'
        msg.env_render(message=success_note)
    except FileNotFoundError:
        failure_note = f'\n Check that {simpath} exists and can be written to'
        msg.env_render(message=failure_note)
def get_vocoids(featfilepath, **kwargs):
    '''
    first argument is a path to a features.txt file.
    returns a list of vowel and glide symbols ([-cons] / [-consonantal]).
    kwargs are passed to the messages module for error handling.
    '''
    featlines = read_feat_file(featfilepath)
    feats = make_feat_vectors(featlines)
    if '-cons' in feats:
        return feats['-cons']
    elif '-consonantal' in feats:
        return feats['-consonantal']
    else:
        # BUG FIX: assign into kwargs (as get_consonants does) instead of
        # passing message= alongside **kwargs, which raised a duplicate-
        # keyword TypeError whenever the caller already supplied 'message'.
        kwargs[
            'message'] = f'\nThe feature file {featfilepath.split("simulation")[1]} does not have a column for -cons or -consonantal. The learner needs this feature to separate vocoids from true consonants. Fix this and try again.'
        msg.env_render(**kwargs)
def get_consonants(featfilepath, **kwargs):
    '''
    Given a full path to a features.txt file, return a list of all the
    symbols that are specified as -syll or -syllabic. Those feature names
    are special. kwargs are passed on to the messages module for error
    handling.
    '''
    feats = make_feat_vectors(read_feat_file(featfilepath))
    for consfeat in ('-syll', '-syllabic'):
        if consfeat in feats:
            return feats[consfeat]
    # neither feature column is present: report and return None
    kwargs[
        'message'] = f'\nThe feature file {featfilepath.split("simulation")[1]} does not have a column for -syll or -syllabic. The learner needs this feature to separate consonants from vowels. Fix this and try again.'
    msg.env_render(**kwargs)
def read_feat_file(featfilepath, **kwargs):
    '''
    Read a features.txt file, tab-formatted according to Hayes and Wilson
    rules.

    Returns a tuple (featnames, seglines):
    featnames: [syll, cons, son, dor, ...]
    seglines:  [[k, -, -, -, +, ...], ...]
    Renders a message and exits via SystemExit if the file is missing.
    '''
    try:
        with open(featfilepath, 'r', encoding='utf-8') as featfile:
            raw = featfile.readlines()
        names = raw[0].strip().split('\t')
        rows = [entry.strip().split('\t') for entry in raw[1:]]
        return (names, rows)
    except FileNotFoundError:
        msg.env_render(message=f'could not open {featfilepath}', **kwargs)
        raise SystemExit
def make_custom_proj(feats, featfilepath, outpath, **kwargs):
    '''
    feats: some feature(s) defining a natural class. e.g., '+son' or
        '-son,-cont'. If more than 1, must be a comma-separated string,
        no spaces.
    featfilepath: path to feature file from which to read natural classes.
    outpath: where to put projections.txt.
    this function writes a projections file in the format used by Wilson's
    MaxEnt learner:
    projname    feats_defining_class    feats_visible_on_proj    ngrams
    default proj always included.
    this is a stand-alone function, it opens the feature file rather than
    be fed pre-read lines.
    '''
    featl = [x.strip() for x in feats.split(',')]
    msg.env_render(message=f'\n{featl}', **kwargs)
    thesegs = feats_to_segs_wrapper(feats, featfilepath)
    # BUG FIX: list.append() returns None, so the original assigned None
    # to feats_to_project and then crashed in ','.join(). Append on a
    # separate statement instead.
    feats_to_project = seglist_to_feats(thesegs, read_feat_file(featfilepath))
    feats_to_project.append('wb')
    with open(outpath, 'w', encoding='utf-8') as f:
        # BUG FIX: the two projection rows were fused onto one line; a
        # newline must separate the default row from the custom row.
        f.write('\t'.join(['default', 'any', 'all', '3']) + '\n')
        f.write('\t'.join(
            [feats, ''.join(featl), ','.join(feats_to_project), '2', '3']))
def segs_to_feats(featlines, **kwargs):
    '''
    takes as input a tuple: (featnames, seglines) -- see read_feat_file
    returns a dictionary with segment name keys and +feat -feat lists as
    values (features valued '0' are omitted):
    {k: [-syll, -cons, -son, +dor, ...], p: [], etc}
    Renders a warning for any feature value other than '+', '-', or '0';
    as in the original, the offending value is still recorded.
    '''
    featnames, seglines = featlines
    segdict = {}
    for line in seglines:
        seg = line[0]
        segdict[seg] = []
        # pair each feature name with its value by position instead of
        # calling featnames.index() on every iteration (O(n^2) -> O(n));
        # assumes feature names are unique, as the file format requires
        for feat, featvalue in zip(featnames, line[1:]):
            if featvalue not in ('+', '-', '0'):
                kwargs[
                    'message'] = 'Your feature file is malformed. Feature values have to be "+", "-", or "0".'
                msg.env_render(**kwargs)
            if featvalue != '0':
                segdict[seg].append(featvalue + feat)
    return segdict
def check_new_segs(newpath, oldfeats, newfeats, **kwargs):
    '''
    given an interim data file, checks to see which segs from old feats
    are no longer in the new data. also checks which segs from the new
    feature file are not in the new data, because that can happen if the
    complex segment is really a trigram:
    old data: d z n d m b ... (but really, both 'nd' and 'ndzh' should be
    complex segs--see fijian)
    Returns (missing_segs, 'missing'), (extra_segs, 'extra'), or
    (False, False) when everything lines up.
    '''
    ldsegs = dc.collectLDSegs(newpath)
    oldfeatsegs = sorted(pnc.segs_to_feats(oldfeats, **kwargs).keys())
    newfeatsegs = sorted(pnc.segs_to_feats(newfeats, **kwargs).keys())
    missing_segs = sorted([x for x in ldsegs if not x in newfeatsegs])
    extra_segs = sorted(list(set(newfeatsegs) - set(ldsegs)))
    if missing_segs:
        # BUG FIX: assign into kwargs like the other branches do; passing
        # message= alongside **kwargs raised a duplicate-keyword TypeError
        # whenever the caller already supplied 'message' in kwargs.
        kwargs[
            'message'] = f"\n\nThe feature file is missing the following segments: \n{' '.join(missing_segs)}"
        msg.env_render(**kwargs)
        return (missing_segs, 'missing')
    if extra_segs:
        kwargs[
            'message'] = f"\n\nThe following segments are in the feature file but are not in the data file, and will be removed from feature file:\n{' '.join(extra_segs)}"
        msg.env_render(**kwargs)
        return (extra_segs, 'extra')
    kwargs[
        'message'] = "\n\nYour segments are all defined in the feature file."
    msg.env_render(**kwargs)
    return (False, False)
def check_feats(featlines, **kwargs):
    '''
    Check that every segment can be uniquely identified by its features.

    Reports (seg, otherseg) pairs where the feature specifications of one
    seg are a subset of the other's. When this holds, the first seg cannot
    be uniquely identified using its features, so the user should be told.
    Returns True if the feature file is well-formed, False otherwise.
    '''
    segdict = segs_to_feats(featlines)
    problemsegs = []
    # BUG FIX: itertools.combinations yields each unordered pair once, so
    # the original only tested seg <= otherseg and silently missed cases
    # where otherseg's features are a subset of seg's. permutations tests
    # both directions. (Segments with identical feature sets are now
    # reported in both orders, which is harmless in a diagnostic listing.)
    for seg, otherseg in itertools.permutations(segdict.keys(), 2):
        if set(segdict[seg]).issubset(set(segdict[otherseg])):
            problemsegs.append((seg, otherseg))
    if not problemsegs:
        kwargs[
            'message'] = "\nThe new feature file is well-formed. all the segments can be uniquely identified.\n"
        msg.env_render(**kwargs)
        return True
    kwargs[
        'message'] = "\nThe new feature file does not allow certain segments to be distinguished from each other:\n\n"
    msg.env_render(**kwargs)
    for pair in problemsegs:
        kwargs[
            'message'] = f'\n{pair[0]} has a subset of the features of {pair[1]}'
        msg.env_render(**kwargs)
    return False
def bidir_prob_wrapper(featpath, datapath, **kwargs):
    '''
    A wrapper function for insep().

    featpath leads to Features.txt, and datapath leads to
    LearningData.txt. Returns the dictionary of inseparability values.
    '''
    # choose vocoids or consonants as the segment inventory to examine
    if kwargs.get('vowels'):
        msg.env_render(message='\nGetting vocoids...', **kwargs)
        conslist = pnc.get_vocoids(featpath, **kwargs)
    else:
        msg.env_render(message="\nGetting consonants...", **kwargs)
        conslist = pnc.get_consonants(featpath, **kwargs)
    msg.env_render(message='\n' + ', '.join(conslist) + '\n', **kwargs)
    msg.env_render(message="\nGetting clusters...", **kwargs)
    clustlist = list_clusters(conslist, 2)
    msg.env_render(message="\nCounting clusters...", **kwargs)
    clustercount = count_clusters(clustlist, datapath, 2)
    msg.env_render(message="\nCalculating probabilities...", **kwargs)
    unigram_counts = uni_counts(conslist, datapath)
    bidic = insep(clustercount, unigram_counts)
    return bidic
# NOTE(review): this chunk begins mid-function -- everything through the
# second `return orphans` is the tail of findOrphans() (duplicated from the
# datachecker module); only the __main__ block below is complete in this view.
        msg.env_render(
            message=f"\nyour orphan segments are: \n {','.join(orphans)}")
        return orphans
    else:
        #pass
        return orphans


if __name__ == '__main__':
    import sys
    import os
    # usage text shown when the script is invoked without usable arguments
    helpmessage = 'please provide the full locations of the learning data file and the feature file as follows: \n$ python3 datachecker.py /home/me/Desktop/LearningData.txt /home/me/Desktop/Features.txt. Alternatively:\n\n $ python3 datachecker.py russian/wds_t_s_t_sh dirc\n\n this will check the LearningData.txt against the Features.txt file.'
    if 'dirc' in sys.argv:
        # 'dirc' mode: resolve LearningData.txt / Features.txt relative to
        # ../data/<languagename>
        basepath = os.path.dirname(os.getcwd())
        lgname = sys.argv[1]
        learningdata = os.path.join(basepath, 'data', lgname,
                                    'LearningData.txt')
        featurefile = os.path.join(basepath, 'data', lgname, 'Features.txt')
        findOrphans(learningdata, featurefile, verbose=True)
    elif len(sys.argv) > 1:
        # explicit-paths mode: both files given on the command line
        learningdata = sys.argv[1]
        featurefile = sys.argv[2]
        msg.env_render(message=f'learning data: {learningdata}')
        # NOTE(review): the stray leading double quote in the message below
        # looks like a typo in the user-facing string -- confirm intent
        msg.env_render(message=f'"feature file: {featurefile}')
        findOrphans(learningdata, featurefile, verbose=True)
    else:
        # NOTE(review): with no arguments, learningdata/featurefile are
        # undefined here, so this raises NameError (not FileNotFoundError)
        # before the help message can be shown -- verify against callers
        try:
            findOrphans(learningdata, featurefile, verbose=True)
        except FileNotFoundError:
            msg.env_render(message=helpmessage)
# plt.suptitle("%s\n\n" % (ptit), **libfont) if maxval<=1: plt.xlim(0,1.1) else: plt.xlim(0) plt.subplots_adjust(left=0.20, bottom=0.20, wspace=0.4) if show: plt.show() fig.savefig(os.path.join(simpath, '.'.join(['insep_plots', ftype]))) #because matplotlib does not take out the trash fig.clf() plt.close(fig) if __name__=='__main__': import sys if 'help' in sys.argv: msg.env_render(message="Looks inside the language/simulation directory and plots inseparability values for the top 15 clusters in each iteration. The individual plots will be placed at the same level as inseparability.txt files that inspired them.\n\nUsage:\n\n$ python3 plot_insep.py languagename\n\n. The 'languagename' argument is a full path to the location of the 'simulation' folder that contains the inseparability.txt file. You can also plot simulations for languages in the 'data' folder.") try: plot_all_vert(sys.argv[1], threshold=1, takefirst=15, show=False, ftype='pdf') plot_all_vert(sys.argv[1], threshold=1, takefirst=15, show=False, ftype='png') print("plotted " + sys.argv[1]) except FileNotFoundError: simpath = os.path.join(os.path.dirname(os.getcwd()), 'data', sys.argv[1], 'simulation') plot_all_vert(simpath, threshold=1, takefirst=15, show=False, ftype = 'pdf') plot_all_vert(simpath, threshold=1, takefirst=15, show=False, ftype = 'png') msg.env_render(message=f'saved figures for {sys.argv[1]}"')
# NOTE(review): the statements through the second f.write() duplicate the
# tail of make_custom_proj() (the chunk starts mid-function); only the
# __main__ block below is complete in this view.
    thesegs = feats_to_segs_wrapper(feats, featfilepath)
    # NOTE(review): list.append() returns None, so feats_to_project is None
    # here and ','.join(feats_to_project) below will raise TypeError --
    # confirm against the canonical definition of make_custom_proj
    feats_to_project = seglist_to_feats(
        thesegs, read_feat_file(featfilepath)).append('wb')
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write('\t'.join(['default', 'any', 'all', '3']))
        f.write('\t'.join(
            [feats, ''.join(featl), ','.join(feats_to_project), '2', '3']))


if __name__ == '__main__':
    import sys
    HelpString = '\n\nThis utility finds natural classes in a feature file formatted according to Hayes and Wilson (2009, Linguistic Inquiry) conventions. Basic usage: \n\n$ python3 pynatclasses.py /home/full/path/to/file/Features.txt /home/full/path/to/output.txt\n\n You can also get all the consonants from a feature file from a command line call: \n $ python3 pynatclasses.py /home/full/path/to/Features.txt cons\n\n This last option requires there being a -syll or -syllabic feature in the file.\n\n\n To see other options, import it into python and try help(pynatclasses)'
    CLError = '\n\nPlease provide the name of a feature file and a place to save the natural classes to. \n\nFor example: "python3 pynatclasses.py /home/you/Desktop/features.txt /home/you/Desktop/natclasses.txt"\n\n'
    basepath = os.path.dirname(os.path.dirname(os.getcwd()))
    if "help" in sys.argv:
        msg.env_render(message=HelpString)
    elif 'check' in sys.argv:
        # 'check' mode: verify the feature file distinguishes all segments
        feats = os.path.join(basepath, 'data', sys.argv[1], 'Features.txt')
        featlines = read_feat_file(feats)
        check_feats(featlines)
    elif 'cus' in sys.argv:
        # 'cus' mode: write a custom projections file
        # NOTE(review): the output path is hard-coded to one user's Desktop --
        # confirm this is intentional scaffolding
        feats = os.path.join(basepath, 'data', sys.argv[1], 'Features.txt')
        make_custom_proj(sys.argv[2], feats,
                         '/home/maria/Desktop/projections.txt')
    elif not "cons" in sys.argv:
        # default mode: write all natural classes to the given output file
        try:
            feats = sys.argv[1]
            outfile = sys.argv[2]
            outwrite_classes(feats, outfile)
        except IndexError:
            msg.env_render(message=CLError)
def complexify(**kwargs):
    '''
    the algorithm.
    feats is a full path to Features.txt or another appropriately formatted feature file
    ld is a full path to a LearningData.txt or other learning data file, with space-separated words
    outdir is a writable directory where the learner will save results. the function creates a subfolder inside this directory, called 'simulation', and creates new versions of learning data and features. NOTE: any existing simulation directories will be deleted without warning.
    threshold is the cutoff for the inseparability measure. clusters above the threshold get converted into complex segments. defaults to 1.
    '''
    # all parameters arrive via kwargs so they can be forwarded wholesale
    # to the messaging and helper functions
    if not 'vowels' in kwargs:
        vowels = False
    else:
        vowels = kwargs['vowels']
    feats = kwargs.get('feats')
    ld = kwargs.get('ld')
    outdir = kwargs.get('outdir')
    threshold = kwargs.get('threshold')
    alpha = kwargs.get('alpha')
    # all progress messages are mirrored into this report file
    ofpth = os.path.join(outdir, 'simulation_report.txt')
    kwargs['outfilepath'] = ofpth
    msg.env_render(message="\nSearching for complex segments.", **kwargs)
    msg.env_render(
        message=
        f"\nInseparability threshold: {threshold}\nAlpha level for Fisher's Exact Test: {alpha}",
        **kwargs)
    oldfeats = pnc.read_feat_file(
        feats, outfilepath=ofpth
    )  #before starting the recursion, first version of features is arg passed to the command
    msg.env_render(message="\nChecking feature file...\n", **kwargs)
    if (vowels == False) and (not any(
        [feat.startswith('syll') for feat in oldfeats[0]])):
        # consonant mode requires a syll(abic) feature to find consonants
        message = "\nYou need to have a 'syll(abic)' feature in your feature file. The learner needs a list of consonants, [-syll], to get started.\n"
        # NOTE(review): x is assigned but never rendered or returned --
        # confirm whether it should be part of the message
        x = f"Your features are: [{','.join(oldfeats[0])}]"
        msg.env_render(message=message, **kwargs)
        # NOTE(review): __name__ here is the defining module's name, so this
        # exits only when the module itself is run as a script
        if __name__ == '__main__':
            raise SystemExit
        else:
            return message
    elif vowels and (not any(
        [feat.startswith('cons') for feat in oldfeats[0]])):
        # vowel mode requires a cons(onantal) feature to find vocoids
        message = "\nYou need to have a 'cons(onantal)' feature in your feature file. The learner needs a list of vocoids, [-cons], to get started.\n"
        msg.env_render(message=message, **kwargs)
        if __name__ == '__main__':
            raise SystemExit
    elif pnc.check_feats(oldfeats, **kwargs):
        # feature file is well-formed: start (or restart) the simulation.
        # any existing simulation directory is deleted without warning.
        if os.path.isdir(os.path.join(outdir, 'simulation')):
            shutil.rmtree(os.path.join(outdir, 'simulation'))
        os.mkdir(os.path.join(outdir, 'simulation'))
        step = 1
        # iterate until an iteration finds no new complex segments
        # (step is set to 0 to leave the loop)
        while step:
            wdir = os.path.join(outdir, 'simulation', 'iteration' + str(step))
            os.mkdir(wdir)
            #get numbers from feats and ld files
            temp = get_bidir_transprobs(feats, ld, **kwargs)
            if temp[0] == {}:
                shutil.rmtree(wdir)
            else:
                nc.write_insep(temp, os.path.join(wdir, 'inseparability.txt'))
            # keep only clusters at or above the inseparability threshold
            clusters = {
                k: v
                for (k, v) in temp[0].items() if temp[0][k] >= threshold
            }
            #fisher's test check to avoid unifying clusters that are too infrequent
            for c in clusters.copy():
                counts = temp[1][c].split('\t')
                # NOTE(review): counts[5] is presumably the Fisher's exact
                # test p-value column -- confirm against write_insep's format
                if float(counts[5]) > alpha:
                    del clusters[c]
            if not clusters:
                msg.env_render(
                    message=
                    f'\nNo complex segments identified in {os.path.split(ld)[1]}. That is the final version of your learning data.',
                    **kwargs)
                msg.env_render(message="\n\nSimulation Finished", **kwargs)
                step = 0
            else:
                nc.write_insep(temp, os.path.join(wdir, 'inseparability.txt'))
                msg.tab_render(
                    d=clusters,
                    message=f'\nFound complex segments on iteration {step}:\n',
                    **kwargs)
                # derive a new feature file that includes the fused segments
                newfeats = make_new_feats(
                    feats, get_new_segs(feats, clusters, **kwargs))
                msg.env_render(
                    message="\nChecking learner-generated feature file...\n",
                    **kwargs)
                if not pnc.check_feats(newfeats, **kwargs):
                    msg.env_render(message=msg.messages['badfeatswarning'],
                                   **kwargs)
                write_new_ld_file(clusters, ld,
                                  os.path.join(wdir, 'LearningData.txt'),
                                  **kwargs)
                write_feats_w_check(os.path.join(wdir, 'Features.txt'),
                                    os.path.join(wdir, 'LearningData.txt'),
                                    oldfeats, newfeats, **kwargs)
                msg.env_render(
                    message=f"\nExamining data from iteration {step}.\n",
                    **kwargs)
                # the next iteration reads the files this one just wrote
                ld = os.path.join(wdir, 'LearningData.txt')
                feats = os.path.join(wdir, 'Features.txt')
                step += 1
        # archive the running report inside the simulation folder
        shutil.move(ofpth,
                    os.path.join(outdir, 'simulation',
                                 'simulation_report.txt'))
        return None
    else:
        #failed feature check on first pass, cannot proceed
        if __name__ == "__main__":
            raise SystemExit
        else:
            # when called from the web front end, return an HTML error
            # string built from the partial report
            if os.path.isfile(os.path.join(outdir, 'simulation_report.txt')):
                with open(os.path.join(outdir, 'simulation_report.txt'),
                          'r',
                          encoding='utf-8') as f:
                    errors = f.read().replace('\n', "<br>")
                return "Your feature file does not allow segments to be distinguished from each other. Perhaps try again with <a href='media/generic/Features.txt'>this generic feature file</a>?.<br>Here is how far the learner got:<br>" + errors
default=0.05) args = parser.parse_args() kwargs = vars(args) if args.language: lgpath = os.path.join(os.path.dirname(os.getcwd()), 'data', args.language) kwargs['ld'] = os.path.join(lgpath, 'LearningData.txt') kwargs['feats'] = os.path.join(lgpath, 'Features.txt') kwargs['outdir'] = lgpath simpath = os.path.join(lgpath, 'simulation') try: complexify(**kwargs) plot_insep(simpath, ftype='png') plot_insep(simpath, ftype='pdf') except FileNotFoundError: msg.env_render( message= f'\nCould not locate the Learning Data or Features or output path at data/{language}' ) raise else: try: complexify(**kwargs) plot_insep(os.path.join(args.outdir, 'simulation'), ftype='png') except: msg.env_render( message= f"attempting to plot: {os.path.join(sys.argv[3], 'simulation')} but something went wrong. Are the simulation files at that location?" ) raise