Пример #1
0
def get_nat_classes(featlines, **kwargs):
    '''
    Build the natural classes that can be described with the feature file.

    featlines: a tuple (featnames, seglines), see read_feat_file.
    kwargs: passed on to msg.env_render when the number of classes is reported.

    returns a dictionary with natural class descriptions as keys (sorted,
    comma-joined feature values) and the segments they expand to as values:
    {'+cons,-son': [p, t, ...]}
    '''
    segdict = segs_to_feats(featlines)
    featsets = {seg: set(values) for seg, values in segdict.items()}
    # every non-empty intersection of two segments' feature values describes
    # a class (a segment paired with itself contributes its full description).
    # frozensets as dict keys de-duplicate while keeping first-seen order.
    descriptions = {}
    for seg_feats in featsets.values():
        for other_feats in featsets.values():
            overlap = frozenset(seg_feats & other_feats)
            if overlap:
                descriptions.setdefault(overlap, None)
    # compile lists of segments that natural classes expand to: a segment
    # belongs to a class when it carries all of the class's feature values
    natclassdic = {}
    for cl in descriptions:
        clname = ','.join(sorted(cl))
        natclassdic[clname] = [
            seg for seg, seg_feats in featsets.items() if cl <= seg_feats
        ]
    kwargs['message'] = f'\nNumber of natural classes: {len(natclassdic)}'
    msg.env_render(**kwargs)
    return natclassdic
Пример #2
0
def findOrphans(learningdata, featurefile, verbose=False):
    '''
    Find segments that occur in the learning data but are not defined in the
    feature file.

    learningdata: path to the learning data file (read with collectLDSegs)
    featurefile: path to the feature file (read with collectFSegs)
    verbose: when True, also report the segment inventories of both files and
    the orphan segments themselves

    returns a sorted list of orphan segments; the list is empty when every
    segment is defined.
    '''
    learningdatasegs = collectLDSegs(learningdata)
    featurefilesegs = collectFSegs(featurefile)
    # these symbols are regular expression operators and break the learner
    for x in ['+', '-', '*', '|']:
        if x in featurefilesegs:
            msg.env_render(
                message=
                f"Please do not use {x} in your segment list. Choose a symbol that is not used in regular expressions. Your feature file is incompatible with the gain-based version of the MaxEnt Phonotactic Learner."
            )
    if verbose:
        msg.env_render(
            message=
            f"\nThe segments in your data file are: \n {','.join(sorted(learningdatasegs))}"
        )
        msg.env_render(
            message=
            f"\nThe segments in your Features.txt file are: \n {','.join(sorted(featurefilesegs))}"
        )
    # sorted so that the report and the return value are reproducible
    orphans = sorted(set(learningdatasegs) - set(featurefilesegs))
    if not orphans:
        msg.env_render(
            message='\nAll the segments are defined in the feature file.')
    elif verbose:
        msg.env_render(
            message=f"\nyour orphan segments are: \n {','.join(orphans)}")
    return orphans
Пример #3
0
def write_new_ld_file(clusters, oldpath, newpath, threshold=1, **kwargs):
    '''
    Rewrite a learning data file with inseparable clusters merged into
    single complex segments.

    clusters is a dictionary of space-separated clusters and their
    inseparability values
    oldpath and newpath are the locations of LearningData.txt files, old and new
    threshold defaults to 1 (inseparability measure); only clusters at or
    above it are merged
    kwargs are passed to the messages module

    clusters are sorted by their inseparability value; thus, if something
    that eventually becomes a trigram or tetragram has two-way parts in the
    current inseparability table, the bigram that is higher on the list will
    be replaced first.

    when a line has tab-separated columns, only the first column (the word)
    is modified; the remaining columns are copied over unchanged.
    '''
    import re

    clustlist = sorted([x for x in clusters if clusters[x] >= threshold],
                       key=clusters.get,
                       reverse=True)
    # compile each pattern once rather than once per line. the cluster is
    # escaped so that segment symbols are never read as regex operators.
    substitutions = []
    for clust in clustlist:
        pattern = re.compile(r'(^|\s)' + re.escape(clust) + r'(\s|$)')
        substitutions.append((pattern, ''.join(clust.split(" "))))
    with open(oldpath, 'r', encoding='utf-8') as f:
        with open(newpath, 'w', encoding='utf-8') as out:
            for line in f:
                has_columns = '\t' in line
                if has_columns:
                    # the last element of rest keeps the line's own newline
                    word, *rest = line.split('\t')
                else:
                    word = line.strip()
                for pattern, merged in substitutions:
                    # a function replacement keeps digits and backslashes in
                    # the merged segment from being read as group references
                    word = pattern.sub(
                        lambda m, merged=merged: m.group(1) + merged + m.
                        group(2), word)
                if has_columns:
                    out.write('\t'.join([word] + rest))
                else:
                    out.write(word + '\n')
    # report the path relative to the simulation folder when there is one
    _, found, tail = newpath.partition('simulation')
    shownpath = tail if found else newpath
    msg.env_render(
        message=f"\n\nWrote modified learning data to {shownpath}", **kwargs)
Пример #4
0
def collectLDSegs(somepath):
    '''
    Collect the segments used in a learning data file.

    somepath: path to a file of whitespace-separated segments, one word per line

    returns a list of the distinct segments in order of first appearance.
    if the file does not exist, a message is rendered and the list is empty.
    '''
    # a dict works as an ordered set: constant-time membership, and the keys
    # stay in first-seen order
    seen = {}
    try:
        with open(somepath, 'r', encoding='utf-8') as ldatafile:
            for line in ldatafile:
                # split() with no argument already discards all whitespace,
                # including the trailing newline
                for seg in line.split():
                    seen.setdefault(seg, None)
    except FileNotFoundError:
        msg.env_render(message=f"\n\n\nNo file at {somepath} \n\n\n\n")
    return list(seen)
Пример #5
0
def plot_insep(simpath, threshold=1, takefirst=15, show=False, ftype='pdf'):
    '''
    Plot the inseparability tables found under simpath.

    All arguments are handed to pins.plot_all_vert unchanged. A confirmation
    message is rendered after plotting; if FileNotFoundError is raised, a
    message asking to check simpath is rendered instead.
    '''
    try:
        pins.plot_all_vert(simpath, threshold, takefirst, show, ftype)
        done_note = f'\n {ftype.upper()} Plot generated for {simpath}'
        msg.env_render(message=done_note)
    except FileNotFoundError:
        missing_note = f'\n Check that {simpath} exists and can be written to'
        msg.env_render(message=missing_note)
Пример #6
0
def get_vocoids(featfilepath, **kwargs):
    '''
    first argument is a path to a features.txt file. returns a list of vowel
    and glide symbols: the segments specified as -cons or -consonantal. Those
    feature names are special.
    kwargs are passed to the messages module for error handling.

    if the feature file has neither feature, an error message is rendered and
    None is returned.
    '''
    featlines = read_feat_file(featfilepath)
    feats = make_feat_vectors(featlines)
    for featvalue in ('-cons', '-consonantal'):
        if featvalue in feats:
            return feats[featvalue]
    # show the path relative to the simulation folder when there is one;
    # the original feature file usually lives outside of it
    _, found, tail = featfilepath.partition('simulation')
    shownpath = tail if found else featfilepath
    msg.env_render(
        message=
        f'\nThe feature file {shownpath} does not have a column for -cons or -consonantal. The learner needs this feature to separate vocoids from true consonants. Fix this and try again.',
        **kwargs)
    return None
Пример #7
0
def get_consonants(featfilepath, **kwargs):
    '''
    given a full path to a features.txt file, returns a list of all the
    symbols that are specified as -syll or -syllabic. Those feature names are
    special.
    kwargs are passed on to messages module for error handling.

    if the feature file has neither feature, an error message is rendered and
    None is returned.
    '''
    featlines = read_feat_file(featfilepath)
    feats = make_feat_vectors(featlines)
    for featvalue in ('-syll', '-syllabic'):
        if featvalue in feats:
            return feats[featvalue]
    # show the path relative to the simulation folder when there is one;
    # the original feature file usually lives outside of it
    _, found, tail = featfilepath.partition('simulation')
    shownpath = tail if found else featfilepath
    kwargs[
        'message'] = f'\nThe feature file {shownpath} does not have a column for -syll or -syllabic. The learner needs this feature to separate consonants from vowels. Fix this and try again.'
    msg.env_render(**kwargs)
    return None
Пример #8
0
def read_feat_file(featfilepath, **kwargs):
    '''
    the input argument is a path to features.txt, tab-formatted according to
    Hayes and Wilson rules
    returns a vector of feature names, and a vector of segments plus their
    feature values in order
    featnames: [syll, cons, son, dor, ...]
    seglines: [k, -, -, -, +, ...]

    blank lines after the header are skipped, so a trailing empty line does
    not produce an empty segment row.
    if the file does not exist, a message is rendered (kwargs are passed to
    the messages module) and SystemExit is raised.
    '''
    try:
        with open(featfilepath, 'r', encoding='utf-8') as f:
            feats = f.readlines()
    except FileNotFoundError:
        msg.env_render(message=f'could not open {featfilepath}', **kwargs)
        raise SystemExit
    # strip() also removes the leading tab of the header row, so featnames
    # has one entry fewer than each segment row
    featnames = feats[0].strip().split('\t')
    seglines = [x.strip().split('\t') for x in feats[1:] if x.strip()]
    return (featnames, seglines)
Пример #9
0
def make_custom_proj(feats, featfilepath, outpath, **kwargs):
    '''
    feats: some feature(s) defining a natural class. e.g., '+son' or
    '-son,-cont'. If more than 1, must be a comma-separated string. no spaces
    featfilepath: path to feature file from which to read natural classes.
    outpath: where to put projections.txt.
    kwargs are passed to the messages module.

    this function writes a projections file in the format used by Wilson's
    MaxEnt learner, one projection per line:
    projname    feats_defining_class    feats_visible_on_proj   ngrams
    default proj always included.
    this is a stand-alone function, it opens the feature file rather than be
    fed pre-read lines
    '''
    featl = [x.strip() for x in feats.split(',')]
    msg.env_render(message=f'\n{featl}', **kwargs)
    thesegs = feats_to_segs_wrapper(feats, featfilepath)
    # list.append returns None, so the extended list has to be built as an
    # expression rather than by chaining .append onto the call
    feats_to_project = list(
        seglist_to_feats(thesegs, read_feat_file(featfilepath))) + ['wb']
    default_proj = ['default', 'any', 'all', '3']
    custom_proj = [
        feats, ''.join(featl), ','.join(feats_to_project), '2', '3'
    ]
    with open(outpath, 'w', encoding='utf-8') as f:
        # each projection goes on its own line
        f.write('\t'.join(default_proj) + '\n')
        f.write('\t'.join(custom_proj) + '\n')
Пример #10
0
def segs_to_feats(featlines, **kwargs):
    '''
    takes as input a tuple: (featnames, seglines) see read_feat_file
    returns a dictionary with segment name keys and +feat -feat lists as values
    {k: [-syll, -cons, -son, +dor, ...], p: [], etc}

    features valued '0' are left out of a segment's list. when a value is not
    "+", "-", or "0", a message is rendered (kwargs are passed to the messages
    module) and the value is still recorded as found.
    '''
    featnames = featlines[0]
    seglines = featlines[1]
    segdict = {}
    for line in seglines:
        values = []
        segdict[line[0]] = values
        # column 0 holds the segment symbol, so the value of the i-th feature
        # is in column i + 1. walking the columns by position also keeps
        # repeated feature names from all reading the first matching column.
        for column, feat in enumerate(featnames, start=1):
            featvalue = line[column]
            if featvalue not in ('+', '-', '0'):
                kwargs[
                    'message'] = 'Your feature file is malformed. Feature values have to be "+", "-", or "0".'
                msg.env_render(**kwargs)
            if featvalue != '0':
                values.append(featvalue + feat)
    return segdict
Пример #11
0
def check_new_segs(newpath, oldfeats, newfeats, **kwargs):
    '''
    Compare the segments of an interim data file against the new feature table.

    newpath: path to the interim learning data file
    oldfeats, newfeats: feature tables in the (featnames, seglines) format
    kwargs are passed to the messages module

    returns one of:
    (segments, 'missing') -- segments in the data that the new feature table
    does not define
    (segments, 'extra') -- segments defined in the new feature table that do
    not occur in the data. this can happen if the complex segment is really a
    trigram:
    old data:
    d z
    n d
    m b
    ... (but really, both 'nd' and 'ndzh' should be complex segs--see fijian)
    (False, False) -- data and feature table agree
    missing segments are reported first; extra ones only when none are missing.
    '''
    data_segs = dc.collectLDSegs(newpath)
    # the old table is still run through segs_to_feats, but the resulting
    # segment list is not used below
    pnc.segs_to_feats(oldfeats, **kwargs)
    defined_segs = sorted(pnc.segs_to_feats(newfeats, **kwargs))
    undefined = sorted(seg for seg in data_segs if seg not in defined_segs)
    unused = sorted(set(defined_segs).difference(data_segs))
    if undefined:
        msg.env_render(
            message=
            f"\n\nThe feature file is missing the following segments: \n{' '.join(undefined)}",
            **kwargs)
        return (undefined, 'missing')
    if not unused:
        kwargs[
            'message'] = "\n\nYour segments are all defined in the feature file."
        msg.env_render(**kwargs)
        return (False, False)
    kwargs[
        'message'] = f"\n\nThe following segments are in the feature file but are not in the data file, and will be removed from feature file:\n{' '.join(unused)}"
    msg.env_render(**kwargs)
    return (unused, 'extra')
Пример #12
0
def check_feats(featlines, **kwargs):
    '''
    Check that every segment can be uniquely identified by its features.

    featlines: a tuple (featnames, seglines), see read_feat_file
    kwargs are passed to the messages module

    a pair of segments is a problem when the feature specifications of one
    are a subset of the other's (identical specifications included). when
    this holds, the first seg of the reported pair cannot be uniquely
    identified using its features, so the user is told.

    returns True if there are no such pairs, False otherwise.
    '''
    segdict = segs_to_feats(featlines)
    featsets = {seg: set(values) for seg, values in segdict.items()}
    problemsegs = []
    # combinations yields each pair once, in table order, so the subset
    # relation has to be tested in both directions
    for seg, otherseg in itertools.combinations(featsets, 2):
        if featsets[seg] <= featsets[otherseg]:
            problemsegs.append((seg, otherseg))
        elif featsets[otherseg] <= featsets[seg]:
            problemsegs.append((otherseg, seg))
    if not problemsegs:
        kwargs[
            'message'] = "\nThe new feature file is well-formed. all the segments can be uniquely identified.\n"
        msg.env_render(**kwargs)
        return True
    kwargs[
        'message'] = "\nThe new feature file does not allow certain segments to be distinguished from each other:\n\n"
    msg.env_render(**kwargs)
    for subset_seg, superset_seg in problemsegs:
        kwargs[
            'message'] = f'\n{subset_seg}  has a subset of the features of {superset_seg}'
        msg.env_render(**kwargs)
    return False
Пример #13
0
def bidir_prob_wrapper(featpath, datapath, **kwargs):
    '''
    a wrapper function for insep. featpath leads to Features.txt, and datapath
    leads to LearningData.txt

    works over vocoids when kwargs has a true 'vowels' value, and over
    consonants otherwise. progress messages are rendered along the way, with
    kwargs passed to the messages module. returns whatever insep returns for
    the two-segment cluster counts and the single-segment counts.
    '''
    if kwargs.get('vowels'):
        msg.env_render(message='\nGetting vocoids...', **kwargs)
        segments = pnc.get_vocoids(featpath, **kwargs)
    else:
        msg.env_render(message="\nGetting consonants...", **kwargs)
        segments = pnc.get_consonants(featpath, **kwargs)
    inventory = '\n'+', '.join(segments)+'\n'
    msg.env_render(message=inventory, **kwargs)

    msg.env_render(message="\nGetting clusters...", **kwargs)
    pairs = list_clusters(segments, 2)

    msg.env_render(message="\nCounting clusters...", **kwargs)
    pair_counts = count_clusters(pairs, datapath, 2)

    msg.env_render(message="\nCalculating probabilities...", **kwargs)
    single_counts = uni_counts(segments, datapath)
    return insep(pair_counts, single_counts)
Пример #14
0
            msg.env_render(
                message=f"\nyour orphan segments are: \n {','.join(orphans)}")
            return orphans
        else:
            #pass
            return orphans


if __name__ == '__main__':
    # command line entry point: check a learning data file against a
    # feature file and report the orphan segments.
    import sys
    import os
    helpmessage = 'please provide the full locations of the learning data file and the feature file as follows: \n$ python3 datachecker.py /home/me/Desktop/LearningData.txt /home/me/Desktop/Features.txt. Alternatively:\n\n $ python3 datachecker.py russian/wds_t_s_t_sh dirc\n\n this will check the LearningData.txt against the Features.txt file.'
    if 'dirc' in sys.argv:
        # the first argument names a folder inside the 'data' directory that
        # sits next to the current working directory
        basepath = os.path.dirname(os.getcwd())
        lgname = sys.argv[1]
        learningdata = os.path.join(basepath, 'data', lgname,
                                    'LearningData.txt')
        featurefile = os.path.join(basepath, 'data', lgname, 'Features.txt')
        findOrphans(learningdata, featurefile, verbose=True)
    elif len(sys.argv) > 2:
        # both paths are required, so at least two arguments must be present
        learningdata = sys.argv[1]
        featurefile = sys.argv[2]
        msg.env_render(message=f'learning data: {learningdata}')
        msg.env_render(message=f'feature file: {featurefile}')
        findOrphans(learningdata, featurefile, verbose=True)
    else:
        # not enough arguments to locate the two files: explain the usage
        msg.env_render(message=helpmessage)
Пример #15
0
    #    plt.suptitle("%s\n\n" % (ptit), **libfont)
    if maxval<=1:
        plt.xlim(0,1.1)
    else:
        plt.xlim(0)
    plt.subplots_adjust(left=0.20, bottom=0.20, wspace=0.4)
    if show:
        plt.show()
    fig.savefig(os.path.join(simpath, '.'.join(['insep_plots', ftype])))
    #because matplotlib does not take out the trash
    fig.clf()
    plt.close(fig)



if __name__=='__main__':
    # command line entry point: plot the inseparability tables of one
    # simulation as both pdf and png.
    import sys
    if 'help' in sys.argv or len(sys.argv) < 2:
        msg.env_render(message="Looks inside the language/simulation directory and plots inseparability values for the top 15 clusters in each iteration. The individual plots will be placed at the same level as inseparability.txt files that inspired them.\n\nUsage:\n\n$ python3 plot_insep.py languagename\n\n. The 'languagename' argument is a full path to the location of the 'simulation' folder that contains the inseparability.txt file. You can also plot simulations for languages in the 'data' folder.")
    else:
        try:
            # first treat the argument as a path to a simulation folder
            plot_all_vert(sys.argv[1], threshold=1, takefirst=15, show=False, ftype='pdf')
            plot_all_vert(sys.argv[1], threshold=1, takefirst=15, show=False, ftype='png')
            print("plotted " + sys.argv[1])
        except FileNotFoundError:
            # otherwise treat it as the name of a language in the 'data'
            # folder next to the current working directory
            simpath = os.path.join(os.path.dirname(os.getcwd()), 'data', sys.argv[1], 'simulation')
            plot_all_vert(simpath, threshold=1, takefirst=15, show=False, ftype = 'pdf')
            plot_all_vert(simpath, threshold=1, takefirst=15, show=False, ftype = 'png')
        msg.env_render(message=f'saved figures for {sys.argv[1]}')
    

Пример #16
0
    thesegs = feats_to_segs_wrapper(feats, featfilepath)
    feats_to_project = seglist_to_feats(
        thesegs, read_feat_file(featfilepath)).append('wb')
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write('\t'.join(['default', 'any', 'all', '3']))
        f.write('\t'.join(
            [feats, ''.join(featl), ','.join(feats_to_project), '2', '3']))


if __name__ == '__main__':
    # command line entry point. the mode is chosen by a keyword that may
    # appear anywhere in the arguments: help, check, cus, or cons.
    import sys
    HelpString = '\n\nThis utility finds natural classes in a feature file formatted according to Hayes and Wilson (2009, Linguistic Inquiry) conventions. Basic usage: \n\n$ python3 pynatclasses.py /home/full/path/to/file/Features.txt /home/full/path/to/output.txt\n\n You can also get all the consonants from a feature file from a command line call: \n $ python3 pynatclasses.py /home/full/path/to/Features.txt cons\n\n This last option requires there being a -syll or -syllabic feature in the file.\n\n\n To see other options, import it into python and try help(pynatclasses)'
    CLError = '\n\nPlease provide the name of a feature file and a place to save the natural classes to. \n\nFor example: "python3 pynatclasses.py /home/you/Desktop/features.txt /home/you/Desktop/natclasses.txt"\n\n'
    # two directory levels above the current working directory
    basepath = os.path.dirname(os.path.dirname(os.getcwd()))
    if "help" in sys.argv:
        msg.env_render(message=HelpString)
    elif 'check' in sys.argv:
        # the first argument names a folder under basepath/data whose
        # Features.txt is checked for indistinguishable segments
        feats = os.path.join(basepath, 'data', sys.argv[1], 'Features.txt')
        featlines = read_feat_file(feats)
        check_feats(featlines)
    elif 'cus' in sys.argv:
        # the second argument is the feature description of the class to
        # project, e.g. '-son,-cont'
        feats = os.path.join(basepath, 'data', sys.argv[1], 'Features.txt')
        # NOTE(review): the output location is hard-coded to one user's
        # desktop -- presumably a leftover from development; confirm
        make_custom_proj(sys.argv[2], feats,
                         '/home/maria/Desktop/projections.txt')
    elif not "cons" in sys.argv:
        # basic usage: feature file path first, output path second
        try:
            feats = sys.argv[1]
            outfile = sys.argv[2]
            outwrite_classes(feats, outfile)
        except IndexError:
            # fewer than two arguments were given
            msg.env_render(message=CLError)
    # NOTE(review): no branch here handles the 'cons' keyword described in
    # HelpString -- presumably it follows elsewhere; confirm
Пример #17
0
def complexify(**kwargs):
    '''
    the algorithm.

    keyword arguments (the whole dictionary is also passed on to the helper
    functions and to the messages module):
    feats is a full path to Features.txt or another appropriately formatted feature file
    ld is a full path to a LearningData.txt or other learning data file, with space-separated words
    outdir is a writable directory where the learner will save results.
    the function creates a subfolder inside this directory, called 'simulation', and creates new versions of learning data and features. NOTE: any existing simulation directories will be deleted without warning.
    threshold is the cutoff for the inseparability measure. clusters at or above the threshold get converted into complex segments. defaults to 1.
    alpha is the alpha level for Fisher's Exact Test; clusters whose p-value is above it are not converted.
    vowels, when true, makes the learner work on vocoids ([-cons]) rather than consonants ([-syll]).

    on each iteration the learner computes inseparability for the current
    data, merges the qualifying clusters into complex segments, writes new
    learning data and feature files into simulation/iterationN, and repeats
    on those files until no cluster qualifies.

    returns None when the simulation ran. when the feature file lacks the
    required feature or fails the feature check, returns a string describing
    the problem; when run as a script, raises SystemExit instead.
    '''
    vowels = kwargs.get('vowels', False)
    feats = kwargs.get('feats')
    ld = kwargs.get('ld')
    outdir = kwargs.get('outdir')
    threshold = kwargs.get('threshold')
    if threshold is None:
        # apply the documented default, and keep the helpers that receive
        # kwargs in agreement with it
        threshold = 1
        kwargs['threshold'] = threshold
    alpha = kwargs.get('alpha')
    ofpth = os.path.join(outdir, 'simulation_report.txt')
    kwargs['outfilepath'] = ofpth
    msg.env_render(message="\nSearching for complex segments.", **kwargs)
    msg.env_render(
        message=
        f"\nInseparability threshold: {threshold}\nAlpha level for Fisher's Exact Test: {alpha}",
        **kwargs)
    oldfeats = pnc.read_feat_file(
        feats, outfilepath=ofpth
    )  #before starting the recursion, first version of features is arg passed to the command
    msg.env_render(message="\nChecking feature file...\n", **kwargs)

    # the learner starts from consonants unless vowels is true, so any false
    # value of vowels calls for the syll feature
    if not vowels and not any(
            feat.startswith('syll') for feat in oldfeats[0]):
        message = "\nYou need to have a 'syll(abic)' feature in your feature file. The learner needs a list of consonants, [-syll], to get started.\n"
        msg.env_render(message=message, **kwargs)
        if __name__ == '__main__':
            raise SystemExit
        return message
    if vowels and not any(feat.startswith('cons') for feat in oldfeats[0]):
        message = "\nYou need to have a 'cons(onantal)' feature in your feature file. The learner needs a list of vocoids, [-cons], to get started.\n"
        msg.env_render(message=message, **kwargs)
        if __name__ == '__main__':
            raise SystemExit
        return message

    if not pnc.check_feats(oldfeats, **kwargs):
        #failed feature check on first pass, cannot proceed
        if __name__ == "__main__":
            raise SystemExit
        # the report may not have been written; then there is nothing to add
        errors = ''
        if os.path.isfile(ofpth):
            with open(ofpth, 'r', encoding='utf-8') as f:
                errors = f.read().replace('\n', "<br>")
        return "Your feature file does not allow segments to be distinguished from each other. Perhaps try again with <a href='media/generic/Features.txt'>this generic feature file</a>?.<br>Here is how far the learner got:<br>" + errors

    simdir = os.path.join(outdir, 'simulation')
    if os.path.isdir(simdir):
        shutil.rmtree(simdir)
    os.mkdir(simdir)
    step = 1
    while step:
        wdir = os.path.join(simdir, 'iteration' + str(step))
        os.mkdir(wdir)
        #get numbers from feats and ld files
        temp = get_bidir_transprobs(feats, ld, **kwargs)
        if temp[0] == {}:
            # nothing to report for this iteration, so drop its folder
            shutil.rmtree(wdir)
        else:
            nc.write_insep(temp, os.path.join(wdir, 'inseparability.txt'))
        clusters = {k: v for (k, v) in temp[0].items() if v >= threshold}
        #fisher's test check to avoid unifying clusters that are too infrequent
        for c in clusters.copy():
            counts = temp[1][c].split('\t')
            if float(counts[5]) > alpha:
                del clusters[c]
        if not clusters:
            msg.env_render(
                message=
                f'\nNo complex segments identified in {os.path.split(ld)[1]}. That is the final version of your learning data.',
                **kwargs)
            msg.env_render(message="\n\nSimulation Finished", **kwargs)
            step = 0
        else:
            # NOTE(review): the same table was already written above whenever
            # temp[0] is non-empty; presumably redundant -- confirm
            nc.write_insep(temp, os.path.join(wdir, 'inseparability.txt'))
            msg.tab_render(
                d=clusters,
                message=f'\nFound complex segments on iteration {step}:\n',
                **kwargs)
            newfeats = make_new_feats(
                feats, get_new_segs(feats, clusters, **kwargs))
            msg.env_render(
                message="\nChecking learner-generated feature file...\n",
                **kwargs)
            if not pnc.check_feats(newfeats, **kwargs):
                msg.env_render(message=msg.messages['badfeatswarning'],
                               **kwargs)
            write_new_ld_file(clusters, ld,
                              os.path.join(wdir, 'LearningData.txt'),
                              **kwargs)
            write_feats_w_check(os.path.join(wdir, 'Features.txt'),
                                os.path.join(wdir, 'LearningData.txt'),
                                oldfeats, newfeats, **kwargs)
            msg.env_render(
                message=f"\nExamining data from iteration {step}.\n",
                **kwargs)
            # the next iteration works on the files written by this one
            ld = os.path.join(wdir, 'LearningData.txt')
            feats = os.path.join(wdir, 'Features.txt')
            step += 1
    shutil.move(ofpth, os.path.join(simdir, 'simulation_report.txt'))
    return None
Пример #18
0
                        default=0.05)
    args = parser.parse_args()
    kwargs = vars(args)
    if args.language:
        lgpath = os.path.join(os.path.dirname(os.getcwd()), 'data',
                              args.language)
        kwargs['ld'] = os.path.join(lgpath, 'LearningData.txt')
        kwargs['feats'] = os.path.join(lgpath, 'Features.txt')
        kwargs['outdir'] = lgpath
        simpath = os.path.join(lgpath, 'simulation')
        try:
            complexify(**kwargs)
            plot_insep(simpath, ftype='png')
            plot_insep(simpath, ftype='pdf')
        except FileNotFoundError:
            msg.env_render(
                message=
                f'\nCould not locate the Learning Data or Features or output path at data/{language}'
            )
            raise
    else:
        try:
            complexify(**kwargs)
            plot_insep(os.path.join(args.outdir, 'simulation'), ftype='png')
        except:
            msg.env_render(
                message=
                f"attempting to plot: {os.path.join(sys.argv[3], 'simulation')} but something went wrong. Are the simulation files at that location?"
            )
            raise