# Note: these snippets assume the usual module-level imports of the surrounding
# CorpusTools files (standard-library argparse, os, sys, re, csv, time, plus
# load_binary, the *VariantContext classes, EnvironmentFilter and the analysis
# functions imported from the corpustools package).
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: Kullback-Leibler CL interface')
    parser.add_argument('corpus_file_name',
                        help='Path to corpus file. This can just be the file name if it\'s in the same directory as CorpusTools')
    parser.add_argument('seg1', help='First segment')
    parser.add_argument('seg2', help='Second segment')
    parser.add_argument('side',
                        help='Context to check. Options are \'right\', \'left\' and \'both\'. You can enter just the first letter.')
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate KL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='Specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-o', '--outfile', help='Name of output file (optional)')

    args = parser.parse_args()
    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus_path = args.corpus_file_name
        if not os.path.isfile(corpus_path):
            corpus_path = os.path.join(os.getcwd(), corpus_path)
        corpus = load_binary(corpus_path)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token)

    results = KullbackLeibler(corpus, args.seg1, args.seg2, args.side, outfile=None)

    outfile = args.outfile
    if outfile is not None:
        if not os.path.isfile(outfile):
            outfile = os.path.join(os.getcwd(), outfile)
        if not outfile.endswith('.txt'):
            outfile += '.txt'
        with open(outfile, mode='w', encoding='utf-8-sig') as f:
            print('Seg1,Seg2,Seg1 entropy,Seg2 entropy,Possible UR, Spurious UR\n\r', file=f)
            print(','.join([str(r) for r in results]), file=f)
            print('\n\rContext,Context frequency,{} frequency in context,{} frequency in context\n\r'.format(args.seg1, args.seg2),
                  file=f)
            # Note: allC, freq_c and totalC are not defined in this function as written;
            # the per-context table below assumes they are made available by the KL computation.
            for context, result in allC.items():
                cfrequency = freq_c[context] / totalC
                print('{},{},{},{}\n\r'.format(context, cfrequency, result.seg1 / result.sum(),
                                               result.seg2 / result.sum()), file=f)
        print('Done!')
    else:
        print(results)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: mutual information CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-q', '--query', help='bigram or segment pair, as str separated by comma')
    group.add_argument('-l', '--all_pairwise_mis', action='store_true',
                       help="Calculate MI for all orders of all pairs of segments")
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate MI over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-w', '--in_word', action='store_true',
                        help="Flag: domain for counting unigrams/bigrams set to the word rather than the unigram/bigram; ignores adjacency and word edges (#)")
    parser.add_argument('-e', '--halve_edges', action='store_true',
                        help="Flag: make the number of edge characters (#) equal to the size of the corpus + 1, rather than double the size of the corpus - 1")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type)

    if args.all_pairwise_mis:
        result = all_mis(corpus, halve_edges=args.halve_edges, in_word=args.in_word)
    else:
        query = tuple(args.query.split(','))
        if len(query) < 2:
            print('Warning! Your queried bigram could not be processed. Please separate the two segments with a comma, as in the call: pct_mutualinfo example.corpus m,a')
        result = pointwise_mi(corpus, query, args.halve_edges, args.in_word)

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            if type(result) != list:
                outstr = ('result\t' + '\t'.join([a for a in vars(args)]) + '\n'
                          + str(result) + '\t'
                          + '\t'.join([str(getattr(args, a)) for a in vars(args)]))
                outfile.write(outstr)
            else:
                outstr = 'result\tsegments\t' + '\t'.join([a for a in vars(args)]) + '\n'
                for element in result:
                    outstr += (str(element[1]) + '\t' + str(element[0]) + '\t'
                               + '\t'.join([str(getattr(args, a)) for a in vars(args)]) + '\n')
                outfile.write(outstr)
    else:
        print('No output file name provided.')
        print('The mutual information of the given inputs is {}.'.format(str(result)))
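# Illustrative sketch (standard library only, no corpus needed): how the
# -q/--query value from the sample call in the warning above,
# 'pct_mutualinfo example.corpus m,a', becomes the segment pair handed to
# pointwise_mi(). The query string here is the 'm,a' example from that message.
query_string = 'm,a'
query = tuple(query_string.split(','))
assert query == ('m', 'a') and len(query) >= 2  # a well-formed bigram query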
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: phonological search CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('sequence',
                        help=('Sequence to search for, with segment positions separated by commas,'
                              ' and with sets separated by slashes.'
                              ' E.g. the input i will return all words with the segment [i], while'
                              ' the input a/o,t/p,i,n will return all words with [atin], [apin],'
                              ' [otin], or [opin].'))
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to search within. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    split_sequence = [tuple(pos.split('/')) for pos in args.sequence.split(',')]
    middle = split_sequence[0]
    rhs = split_sequence[1:]
    if len(rhs) == 0:
        rhs = None

    ef = EnvironmentFilter(middle, None, rhs)
    results = phonological_search(corpus, [ef], sequence_type=args.sequence_type)

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            for result in results:
                outfile.write(' '.join(getattr(result[0], args.sequence_type)) + '\n')
        print('Search results written to output file.')
    else:
        print('No output file name provided.')
        print('Your search produced the results below:')
        for result in results:
            print('{}'.format(result[0]))
        print('Total number of results: {}'.format(str(len(results))))
        print('Please specify an output file name with -o to save these results.')
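# Illustrative sketch (standard library only): how the positional 'sequence'
# argument documented above is split into the middle and right-hand-side pieces
# passed to EnvironmentFilter. The input 'a/o,t/p,i,n' is the example given in
# the help text.
sequence = 'a/o,t/p,i,n'
split_sequence = [tuple(pos.split('/')) for pos in sequence.split(',')]
middle = split_sequence[0]        # ('a', 'o'): segment set for the first position
rhs = split_sequence[1:] or None  # [('t', 'p'), ('i',), ('n',)]: the remaining positions
print(middle, rhs)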
def changeFeatureSystem(self):
    path = self.changeWidget.path()
    if path is None:
        self.specifier = None
    else:
        try:
            self.specifier = load_binary(path)
        except OSError:
            return
    self.changeDisplay()
def run(self):
    time.sleep(0.1)
    if self.stopCheck():
        return
    try:
        self.results = load_binary(self.kwargs['path'])
    except PCTError as e:
        self.errorEncountered.emit(e)
        return
    except Exception as e:
        e = PCTPythonError(e)
        self.errorEncountered.emit(e)
        return
    if self.stopCheck():
        return
    self.dataReady.emit(self.results)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: phonological search CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('seg_list', help='Segments to search for, separated by commas')
    parser.add_argument('-e', '--environments',
                        help='Environments in which to search for the segments, written using _ (underscore) notation and separated by commas')
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to search within. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    # fix for argparse's inability to take optional arguments beginning with -
    for i, arg in enumerate(sys.argv):
        if arg == '-e':
            sys.argv[i] = '-e{}'.format(sys.argv[i + 1])
            sys.argv[i + 1] = ''
    sys.argv = [arg for arg in sys.argv if arg != '']

    args = parser.parse_args()
    ####

    corpus = load_binary(args.corpus_file_name)
    segments = args.seg_list.split(',')
    if args.environments:
        args.environments = re.split(r',(?!^|\+|\-|0|\.|1)', args.environments)

    results = corpus.phonological_search(segments, envs=args.environments,
                                         sequence_type=args.sequence_type)

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            for result in results:
                outfile.write(' '.join(getattr(result[0], args.sequence_type)) + '\n')
    else:
        print('No output file name provided.')
        print('Your search produced the results below:')
        for result in results:
            print('{}'.format(result[0]))
        print('Total number of results: {}'.format(str(len(results))))
        print('Please specify an output file name with -o to save these results.')
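# Illustrative sketch of the sys.argv workaround above: argparse treats a value
# that begins with '-' as another option, so '-e' is fused with its value before
# parse_args() runs. The script name and argument values below are made up for
# demonstration only.
argv = ['pct_search.py', 'example.corpus', 'n', '-e', '-t_']
for i, arg in enumerate(argv):
    if arg == '-e':
        argv[i] = '-e{}'.format(argv[i + 1])
        argv[i + 1] = ''
argv = [arg for arg in argv if arg != '']
print(argv)  # ['pct_search.py', 'example.corpus', 'n', '-e-t_']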
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: functional load CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-p', '--pairs_file_name_or_segment',
                       help='Name of file with segment pairs (or target segment if relative_fl is True)')
    group.add_argument('-l', '--all_pairwise_fls', action='store_true',
                       help="Calculate FL for all pairs of segments")
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default='minpair',
                        help='Algorithm to use for calculating functional load: "minpair" for minimal pair count or "deltah" for change in entropy. Defaults to minpair.')
    parser.add_argument('-f', '--frequency_cutoff', type=float, default=0,
                        help='Minimum frequency of words to consider as possible minimal pairs or contributing to lexicon entropy.')
    parser.add_argument('-r', '--relative_count', type=check_bool, default=True,
                        help='For minimal pair FL: whether or not to divide the number of minimal pairs by the number of possible minimal pairs (words with either segment in the proper environment). Defaults to True; pass -r False to set as False.')
    parser.add_argument('-d', '--distinguish_homophones', action='store_true',
                        help="For minimal pair FL: if False, then you'll count sock~shock (sock=clothing) and sock~shock (sock=punch) as just one minimal pair; but if True, you'll overcount alternative spellings of the same word, e.g. axel~actual and axle~actual. False is the value used by Wedel et al.")
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='For change in entropy FL: specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-e', '--relative_fl', action='store_true',
                        help="If True, calculate the relative FL of a single segment by averaging across the functional loads of it and all other segments.")
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate FL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-q', '--environment_lhs', default=None,
                        help="Left hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-w', '--environment_rhs', default=None,
                        help="Right hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-n', '--prevent_normalization', action='store_true',
                        help="For deltah entropy: prevents normalization of the entropy difference by the pre-neutralization entropy. To replicate the Surendran & Niyogi metric, do NOT use this flag.")
    parser.add_argument('-x', '--separate_pairs', action='store_true',
                        help="If present, calculate FL for each pair in the pairs file separately.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    # Parse paths
    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    # Create corpus context
    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token,
                                         frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token,
                                            frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token,
                                               frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token,
                                        frequency_threshold=args.frequency_cutoff)

    # Create environment filters
    if not args.environment_lhs and not args.environment_rhs:
        environment_filters = []
    else:
        if args.environment_lhs:
            split_lhs = [tuple(pos.split('/')) for pos in args.environment_lhs.split(',')]
        else:
            split_lhs = None
        if args.environment_rhs:
            split_rhs = [tuple(pos.split('/')) for pos in args.environment_rhs.split(',')]
        else:
            split_rhs = None
        environment_filters = [EnvironmentFilter([], split_lhs, split_rhs)]

    # Initialize results
    overall_result = None
    detailed_results = {}
    keys_label = ''
    values_label = 'functional load'

    # Determine which function to call
    if args.all_pairwise_fls:
        results = all_pairwise_fls(corpus, relative_fl=args.relative_fl, algorithm=args.algorithm,
                                   relative_count=args.relative_count,
                                   distinguish_homophones=args.distinguish_homophones,
                                   environment_filters=environment_filters,
                                   prevent_normalization=args.prevent_normalization)
        for pair, fl in results:
            detailed_results[pair] = fl
        keys_label = 'segment pair'
        values_label = 'functional load'
    else:
        if args.relative_fl != True:
            try:
                with open(args.pairs_file_name_or_segment) as segpairs_or_segment_file:
                    segpairs_or_segment = [line for line in csv.reader(segpairs_or_segment_file, delimiter='\t') if len(line) > 0]
            except FileNotFoundError:
                raise FileNotFoundError("Did not find the segment pairs file even though 'relative_fl' is set to false. If calculating the relative FL of a single segment, please set 'relative_fl' to True. Otherwise, specify correct filename.")
        else:
            segpairs_or_segment = args.pairs_file_name_or_segment

        if args.algorithm == 'minpair':
            if args.relative_fl:
                results = relative_minpair_fl(corpus, segpairs_or_segment,
                                              relative_count=bool(args.relative_count),
                                              distinguish_homophones=args.distinguish_homophones,
                                              environment_filters=environment_filters)
                overall_result = results[0]
                detailed_results = results[1]
                keys_label = 'segment pair'
            else:
                if args.separate_pairs:
                    for pair in segpairs_or_segment:
                        pair = tuple(pair)
                        detailed_results[pair] = minpair_fl(corpus, [pair],
                                                            relative_count=bool(args.relative_count),
                                                            distinguish_homophones=args.distinguish_homophones,
                                                            environment_filters=environment_filters)[0]
                    keys_label = 'segment pair'
                else:
                    results = minpair_fl(corpus, segpairs_or_segment,
                                         relative_count=bool(args.relative_count),
                                         distinguish_homophones=args.distinguish_homophones,
                                         environment_filters=environment_filters)
                    overall_result = results[0]
                    detailed_results = {mp: '' for mp in results[1]}
                    keys_label = 'minimal pair (all listed regardless of distinguish_homophones value)'
        elif args.algorithm == 'deltah':
            if args.relative_fl:
                results = relative_deltah_fl(corpus, segpairs_or_segment,
                                             environment_filters=environment_filters,
                                             prevent_normalization=args.prevent_normalization)
                overall_result = results[0]
                detailed_results = results[1]
                keys_label = 'segment pair'
            else:
                if args.separate_pairs:
                    for pair in segpairs_or_segment:
                        pair = tuple(pair)
                        detailed_results[pair] = deltah_fl(corpus, [pair],
                                                           environment_filters=environment_filters,
                                                           prevent_normalization=args.prevent_normalization)
                    keys_label = 'segment pair'
                else:
                    overall_result = deltah_fl(corpus, segpairs_or_segment,
                                               environment_filters=environment_filters,
                                               prevent_normalization=args.prevent_normalization)
        else:
            raise Exception('-a / --algorithm must be set to either \'minpair\' or \'deltah\'.')

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            outstr = '{}\t{}\n'.format(keys_label, values_label)
            if overall_result:
                outstr += 'OVERALL\t{}\n'.format(overall_result)
            for key in detailed_results:
                outstr += '{}\t{}\n'.format(key, detailed_results[key])
            outfile.write(outstr)
    else:
        if overall_result:
            easy_result = overall_result
        else:
            easy_result = detailed_results
        print('No output file name provided.')
        print('The functional load of the given inputs is {}.'.format(str(easy_result)))
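# The -r/--relative_count option above uses a check_bool type converter that is
# defined elsewhere in CorpusTools and not shown in this section. Below is a
# minimal sketch of such a converter; the exact set of accepted spellings
# ('False', 'F', '0', 'no') is an assumption, not the library's implementation.
def check_bool(value):
    """Interpret a command-line string such as 'False' as a real boolean."""
    return str(value).strip().lower() not in ('false', 'f', '0', 'no')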
def setUpClass(cls):
    # Note: the corpus path below is hard-coded to a specific developer machine.
    corpus = load_binary(r'C:\Users\Scott\Documents\GitHub\CorpusTools\corpustools\lemurian.corpus')
    inventory = InventoryModel(corpus.inventory, copy_mode=True)
    cls.dialog = FLDialog(main, None, corpus, inventory, False)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: functional load CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-p', '--pairs_file_name_or_segment',
                       help='Name of file with segment pairs (or target segment if relative_fl is True)')
    group.add_argument('-l', '--all_pairwise_fls', action='store_true',
                       help="Calculate FL for all pairs of segments")
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default='minpair',
                        help='Algorithm to use for calculating functional load: "minpair" for minimal pair count or "deltah" for change in entropy. Defaults to minpair.')
    parser.add_argument('-f', '--frequency_cutoff', type=float, default=0,
                        help='Minimum frequency of words to consider as possible minimal pairs or contributing to lexicon entropy.')
    parser.add_argument('-r', '--relative_count', type=check_bool, default=True,
                        help='For minimal pair FL: whether or not to divide the number of minimal pairs by the number of possible minimal pairs (words with either segment).')
    parser.add_argument('-d', '--distinguish_homophones', action='store_true',
                        help="For minimal pair FL: if False, then you'll count sock~shock (sock=clothing) and sock~shock (sock=punch) as just one minimal pair; but if True, you'll overcount alternative spellings of the same word, e.g. axel~actual and axle~actual. False is the value used by Wedel et al.")
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='For change in entropy FL: specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-e', '--relative_fl', action='store_true',
                        help="If True, calculate the relative FL of a single segment by averaging across the functional loads of it and all other segments.")
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate FL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-q', '--environment_lhs', default=None,
                        help="Left hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-w', '--environment_rhs', default=None,
                        help="Right hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-x', '--separate_pairs', action='store_true',
                        help="If present, calculate FL for each pair in the pairs file separately.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    corpus = load_binary(args.corpus_file_name)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token,
                                         frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token,
                                            frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token,
                                               frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token,
                                        frequency_threshold=args.frequency_cutoff)

    if not args.environment_lhs and not args.environment_rhs:
        environment_filter = None
    else:
        if args.environment_lhs:
            split_lhs = [tuple(pos.split('/')) for pos in args.environment_lhs.split(',')]
        else:
            split_lhs = None
        if args.environment_rhs:
            split_rhs = [tuple(pos.split('/')) for pos in args.environment_rhs.split(',')]
        else:
            split_rhs = None
        environment_filter = EnvironmentFilter([], split_lhs, split_rhs)

    if args.all_pairwise_fls:
        result = all_pairwise_fls(corpus, relative_fl=args.relative_fl, algorithm=args.algorithm,
                                  relative_count=args.relative_count,
                                  distinguish_homophones=args.distinguish_homophones,
                                  environment_filter=environment_filter)
    else:
        if args.relative_fl != True:
            try:
                with open(args.pairs_file_name_or_segment) as segpairs_or_segment_file:
                    segpairs_or_segment = [line for line in csv.reader(segpairs_or_segment_file, delimiter='\t') if len(line) > 0]
            except FileNotFoundError:
                raise FileNotFoundError("Did not find the segment pairs file even though 'relative_fl' is set to false. If calculating the relative FL of a single segment, please set 'relative_fl' to True. Otherwise, specify correct filename.")
        else:
            segpairs_or_segment = args.pairs_file_name_or_segment

        if args.algorithm == 'minpair':
            if args.relative_fl:
                result = relative_minpair_fl(corpus, segpairs_or_segment,
                                             relative_count=bool(args.relative_count),
                                             distinguish_homophones=args.distinguish_homophones,
                                             environment_filter=environment_filter)
            else:
                if args.separate_pairs:
                    result = []
                    for pair in segpairs_or_segment:
                        result.append(minpair_fl(corpus, [pair],
                                                 relative_count=bool(args.relative_count),
                                                 distinguish_homophones=args.distinguish_homophones,
                                                 environment_filter=environment_filter))
                else:
                    result = minpair_fl(corpus, segpairs_or_segment,
                                        relative_count=bool(args.relative_count),
                                        distinguish_homophones=args.distinguish_homophones,
                                        environment_filter=environment_filter)
        elif args.algorithm == 'deltah':
            if args.relative_fl:
                result = relative_deltah_fl(corpus, segpairs_or_segment,
                                            environment_filter=environment_filter)
            else:
                if args.separate_pairs:
                    result = []
                    for pair in segpairs_or_segment:
                        result.append(deltah_fl(corpus, [pair], environment_filter=environment_filter))
                else:
                    result = deltah_fl(corpus, segpairs_or_segment, environment_filter=environment_filter)
        else:
            raise Exception('-a / --algorithm must be set to either \'minpair\' or \'deltah\'.')

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            if type(result) != list:
                outstr = ('result\t' + '\t'.join([a for a in vars(args)]) + '\n'
                          + str(result) + '\t'
                          + '\t'.join([str(getattr(args, a)) for a in vars(args)]))
                outfile.write(outstr)
            else:
                outstr = 'result\tsegment(s)\t' + '\t'.join([a for a in vars(args)]) + '\n'
                for element in result:
                    outstr += (str(element[1]) + '\t' + str(element[0]) + '\t'
                               + '\t'.join([str(getattr(args, a)) for a in vars(args)]) + '\n')
                outfile.write(outstr)
    else:
        print('No output file name provided.')
        print('The functional load of the given inputs is {}.'.format(str(result)))
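# Both functional load interfaces above read the -p pairs file with
# csv.reader(..., delimiter='\t'), i.e. one tab-separated segment pair per line.
# Minimal sketch of producing such a file; the file name and the segment pairs
# are illustrative only.
import csv

with open('pairs.txt', 'w', newline='') as pairs_file:
    writer = csv.writer(pairs_file, delimiter='\t')
    writer.writerow(['s', 'z'])
    writer.writerow(['t', 'd'])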