# Note: these snippets assume the usual module-level imports of the surrounding
# CorpusTools files (standard-library argparse, os, sys, re, csv, time, plus
# load_binary, the *VariantContext classes, EnvironmentFilter and the analysis
# functions imported from the corpustools package).
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: Kullback-Leibler CL interface')
    parser.add_argument('corpus_file_name',
                        help='Path to corpus file. This can just be the file name if it\'s in the same directory as CorpusTools')
    parser.add_argument('seg1', help='First segment')
    parser.add_argument('seg2', help='Second segment')
    parser.add_argument('side',
                        help='Context to check. Options are \'right\', \'left\' and \'both\'. You can enter just the first letter.')
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate KL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='Specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-o', '--outfile', help='Name of output file (optional)')

    args = parser.parse_args()
    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus_path = args.corpus_file_name
        if not os.path.isfile(corpus_path):
            corpus_path = os.path.join(os.getcwd(), corpus_path)
        corpus = load_binary(corpus_path)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token)

    results = KullbackLeibler(corpus, args.seg1, args.seg2, args.side, outfile=None)

    outfile = args.outfile
    if outfile is not None:
        if not os.path.isfile(outfile):
            outfile = os.path.join(os.getcwd(), outfile)
        if not outfile.endswith('.txt'):
            outfile += '.txt'
        with open(outfile, mode='w', encoding='utf-8-sig') as f:
            print('Seg1,Seg2,Seg1 entropy,Seg2 entropy,Possible UR, Spurious UR\n\r', file=f)
            print(','.join([str(r) for r in results]), file=f)
            print('\n\rContext,Context frequency,{} frequency in context,{} frequency in context\n\r'.format(args.seg1, args.seg2),
                  file=f)
            # Note: allC, freq_c and totalC are not defined in this function as written;
            # the per-context table below assumes they are made available by the KL computation.
            for context, result in allC.items():
                cfrequency = freq_c[context] / totalC
                print('{},{},{},{}\n\r'.format(context, cfrequency, result.seg1 / result.sum(),
                                               result.seg2 / result.sum()), file=f)
        print('Done!')
    else:
        print(results)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: mutual information CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-q', '--query', help='bigram or segment pair, as str separated by comma')
    group.add_argument('-l', '--all_pairwise_mis', action='store_true',
                       help="Calculate MI for all orders of all pairs of segments")
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate MI over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-w', '--in_word', action='store_true',
                        help="Flag: domain for counting unigrams/bigrams set to the word rather than the unigram/bigram; ignores adjacency and word edges (#)")
    parser.add_argument('-e', '--halve_edges', action='store_true',
                        help="Flag: make the number of edge characters (#) equal to the size of the corpus + 1, rather than double the size of the corpus - 1")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type)

    if args.all_pairwise_mis:
        result = all_mis(corpus, halve_edges=args.halve_edges, in_word=args.in_word)
    else:
        query = tuple(args.query.split(','))
        if len(query) < 2:
            print('Warning! Your queried bigram could not be processed. Please separate the two segments with a comma, as in the call: pct_mutualinfo example.corpus m,a')
        result = pointwise_mi(corpus, query, args.halve_edges, args.in_word)

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            if type(result) != list:
                outstr = ('result\t' + '\t'.join([a for a in vars(args)]) + '\n'
                          + str(result) + '\t'
                          + '\t'.join([str(getattr(args, a)) for a in vars(args)]))
                outfile.write(outstr)
            else:
                outstr = 'result\tsegments\t' + '\t'.join([a for a in vars(args)]) + '\n'
                for element in result:
                    outstr += (str(element[1]) + '\t' + str(element[0]) + '\t'
                               + '\t'.join([str(getattr(args, a)) for a in vars(args)]) + '\n')
                outfile.write(outstr)
    else:
        print('No output file name provided.')
        print('The mutual information of the given inputs is {}.'.format(str(result)))
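# Illustrative sketch (standard library only, no corpus needed): how the
# -q/--query value from the sample call in the warning above,
# 'pct_mutualinfo example.corpus m,a', becomes the segment pair handed to
# pointwise_mi(). The query string here is the 'm,a' example from that message.
query_string = 'm,a'
query = tuple(query_string.split(','))
assert query == ('m', 'a') and len(query) >= 2  # a well-formed bigram query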
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: phonological search CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('sequence',
                        help=('Sequence to search for, with segment positions separated by commas,'
                              ' and with sets separated by slashes.'
                              ' E.g. the input i will return all words with the segment [i], while'
                              ' the input a/o,t/p,i,n will return all words with [atin], [apin],'
                              ' [otin], or [opin].'))
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to search within. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    split_sequence = [tuple(pos.split('/')) for pos in args.sequence.split(',')]
    middle = split_sequence[0]
    rhs = split_sequence[1:]
    if len(rhs) == 0:
        rhs = None

    ef = EnvironmentFilter(middle, None, rhs)
    results = phonological_search(corpus, [ef], sequence_type=args.sequence_type)

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            for result in results:
                outfile.write(' '.join(getattr(result[0], args.sequence_type)) + '\n')
        print('Search results written to output file.')
    else:
        print('No output file name provided.')
        print('Your search produced the results below:')
        for result in results:
            print('{}'.format(result[0]))
        print('Total number of results: {}'.format(str(len(results))))
        print('Please specify an output file name with -o to save these results.')
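# Illustrative sketch (standard library only): how the positional 'sequence'
# argument documented above is split into the middle and right-hand-side pieces
# passed to EnvironmentFilter. The input 'a/o,t/p,i,n' is the example given in
# the help text.
sequence = 'a/o,t/p,i,n'
split_sequence = [tuple(pos.split('/')) for pos in sequence.split(',')]
middle = split_sequence[0]        # ('a', 'o'): segment set for the first position
rhs = split_sequence[1:] or None  # [('t', 'p'), ('i',), ('n',)]: the remaining positions
print(middle, rhs)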
def changeFeatureSystem(self):
    path = self.changeWidget.path()
    if path is None:
        self.specifier = None
    else:
        try:
            self.specifier = load_binary(path)
        except OSError:
            return
    self.changeDisplay()
def run(self):
    time.sleep(0.1)
    if self.stopCheck():
        return
    try:
        self.results = load_binary(self.kwargs['path'])
    except PCTError as e:
        self.errorEncountered.emit(e)
        return
    except Exception as e:
        e = PCTPythonError(e)
        self.errorEncountered.emit(e)
        return
    if self.stopCheck():
        return
    self.dataReady.emit(self.results)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: phonological search CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('seg_list', help='Segments to search for, separated by commas')
    parser.add_argument('-e', '--environments',
                        help='Environments in which to search for the segments, written using _ (underscore) notation and separated by commas')
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to search within. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    # fix for argparse's inability to take optional arguments beginning with -
    for i, arg in enumerate(sys.argv):
        if arg == '-e':
            sys.argv[i] = '-e{}'.format(sys.argv[i + 1])
            sys.argv[i + 1] = ''
    sys.argv = [arg for arg in sys.argv if arg != '']

    args = parser.parse_args()
    ####

    corpus = load_binary(args.corpus_file_name)
    segments = args.seg_list.split(',')
    if args.environments:
        args.environments = re.split(r',(?!^|\+|\-|0|\.|1)', args.environments)

    results = corpus.phonological_search(segments, envs=args.environments,
                                         sequence_type=args.sequence_type)

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            for result in results:
                outfile.write(' '.join(getattr(result[0], args.sequence_type)) + '\n')
    else:
        print('No output file name provided.')
        print('Your search produced the results below:')
        for result in results:
            print('{}'.format(result[0]))
        print('Total number of results: {}'.format(str(len(results))))
        print('Please specify an output file name with -o to save these results.')
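# Illustrative sketch of the sys.argv workaround above: argparse treats a value
# that begins with '-' as another option, so '-e' is fused with its value before
# parse_args() runs. The script name and argument values below are made up for
# demonstration only.
argv = ['pct_search.py', 'example.corpus', 'n', '-e', '-t_']
for i, arg in enumerate(argv):
    if arg == '-e':
        argv[i] = '-e{}'.format(argv[i + 1])
        argv[i + 1] = ''
argv = [arg for arg in argv if arg != '']
print(argv)  # ['pct_search.py', 'example.corpus', 'n', '-e-t_']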
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: functional load CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-p', '--pairs_file_name_or_segment',
                       help='Name of file with segment pairs (or target segment if relative_fl is True)')
    group.add_argument('-l', '--all_pairwise_fls', action='store_true',
                       help="Calculate FL for all pairs of segments")
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default='minpair',
                        help='Algorithm to use for calculating functional load: "minpair" for minimal pair count or "deltah" for change in entropy. Defaults to minpair.')
    parser.add_argument('-f', '--frequency_cutoff', type=float, default=0,
                        help='Minimum frequency of words to consider as possible minimal pairs or contributing to lexicon entropy.')
    parser.add_argument('-r', '--relative_count', type=check_bool, default=True,
                        help='For minimal pair FL: whether or not to divide the number of minimal pairs by the number of possible minimal pairs (words with either segment in the proper environment). Defaults to True; pass -r False to set as False.')
    parser.add_argument('-d', '--distinguish_homophones', action='store_true',
                        help="For minimal pair FL: if False, then you'll count sock~shock (sock=clothing) and sock~shock (sock=punch) as just one minimal pair; but if True, you'll overcount alternative spellings of the same word, e.g. axel~actual and axle~actual. False is the value used by Wedel et al.")
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='For change in entropy FL: specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-e', '--relative_fl', action='store_true',
                        help="If True, calculate the relative FL of a single segment by averaging across the functional loads of it and all other segments.")
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate FL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-q', '--environment_lhs', default=None,
                        help="Left hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-w', '--environment_rhs', default=None,
                        help="Right hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-n', '--prevent_normalization', action='store_true',
                        help="For deltah entropy: prevents normalization of the entropy difference by the pre-neutralization entropy. To replicate the Surendran & Niyogi metric, do NOT use this flag.")
    parser.add_argument('-x', '--separate_pairs', action='store_true',
                        help="If present, calculate FL for each pair in the pairs file separately.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    # Parse paths
    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    # Create corpus context
    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token,
                                         frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token,
                                            frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token,
                                               frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token,
                                        frequency_threshold=args.frequency_cutoff)

    # Create environment filters
    if not args.environment_lhs and not args.environment_rhs:
        environment_filters = []
    else:
        if args.environment_lhs:
            split_lhs = [tuple(pos.split('/')) for pos in args.environment_lhs.split(',')]
        else:
            split_lhs = None
        if args.environment_rhs:
            split_rhs = [tuple(pos.split('/')) for pos in args.environment_rhs.split(',')]
        else:
            split_rhs = None
        environment_filters = [EnvironmentFilter([], split_lhs, split_rhs)]

    # Initialize results
    overall_result = None
    detailed_results = {}
    keys_label = ''
    values_label = 'functional load'

    # Determine which function to call
    if args.all_pairwise_fls:
        results = all_pairwise_fls(corpus, relative_fl=args.relative_fl, algorithm=args.algorithm,
                                   relative_count=args.relative_count,
                                   distinguish_homophones=args.distinguish_homophones,
                                   environment_filters=environment_filters,
                                   prevent_normalization=args.prevent_normalization)
        for pair, fl in results:
            detailed_results[pair] = fl
        keys_label = 'segment pair'
        values_label = 'functional load'
    else:
        if args.relative_fl != True:
            try:
                with open(args.pairs_file_name_or_segment) as segpairs_or_segment_file:
                    segpairs_or_segment = [line for line in csv.reader(segpairs_or_segment_file, delimiter='\t') if len(line) > 0]
            except FileNotFoundError:
                raise FileNotFoundError("Did not find the segment pairs file even though 'relative_fl' is set to false. If calculating the relative FL of a single segment, please set 'relative_fl' to True. Otherwise, specify correct filename.")
        else:
            segpairs_or_segment = args.pairs_file_name_or_segment

        if args.algorithm == 'minpair':
            if args.relative_fl:
                results = relative_minpair_fl(corpus, segpairs_or_segment,
                                              relative_count=bool(args.relative_count),
                                              distinguish_homophones=args.distinguish_homophones,
                                              environment_filters=environment_filters)
                overall_result = results[0]
                detailed_results = results[1]
                keys_label = 'segment pair'
            else:
                if args.separate_pairs:
                    for pair in segpairs_or_segment:
                        pair = tuple(pair)
                        detailed_results[pair] = minpair_fl(corpus, [pair],
                                                            relative_count=bool(args.relative_count),
                                                            distinguish_homophones=args.distinguish_homophones,
                                                            environment_filters=environment_filters)[0]
                    keys_label = 'segment pair'
                else:
                    results = minpair_fl(corpus, segpairs_or_segment,
                                         relative_count=bool(args.relative_count),
                                         distinguish_homophones=args.distinguish_homophones,
                                         environment_filters=environment_filters)
                    overall_result = results[0]
                    detailed_results = {mp: '' for mp in results[1]}
                    keys_label = 'minimal pair (all listed regardless of distinguish_homophones value)'
        elif args.algorithm == 'deltah':
            if args.relative_fl:
                results = relative_deltah_fl(corpus, segpairs_or_segment,
                                             environment_filters=environment_filters,
                                             prevent_normalization=args.prevent_normalization)
                overall_result = results[0]
                detailed_results = results[1]
                keys_label = 'segment pair'
            else:
                if args.separate_pairs:
                    for pair in segpairs_or_segment:
                        pair = tuple(pair)
                        detailed_results[pair] = deltah_fl(corpus, [pair],
                                                           environment_filters=environment_filters,
                                                           prevent_normalization=args.prevent_normalization)
                    keys_label = 'segment pair'
                else:
                    overall_result = deltah_fl(corpus, segpairs_or_segment,
                                               environment_filters=environment_filters,
                                               prevent_normalization=args.prevent_normalization)
        else:
            raise Exception('-a / --algorithm must be set to either \'minpair\' or \'deltah\'.')

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            outstr = '{}\t{}\n'.format(keys_label, values_label)
            if overall_result:
                outstr += 'OVERALL\t{}\n'.format(overall_result)
            for key in detailed_results:
                outstr += '{}\t{}\n'.format(key, detailed_results[key])
            outfile.write(outstr)
    else:
        if overall_result:
            easy_result = overall_result
        else:
            easy_result = detailed_results
        print('No output file name provided.')
        print('The functional load of the given inputs is {}.'.format(str(easy_result)))
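# The -r/--relative_count option above uses a check_bool type converter that is
# defined elsewhere in CorpusTools and not shown in this section. Below is a
# minimal sketch of such a converter; the exact set of accepted spellings
# ('False', 'F', '0', 'no') is an assumption, not the library's implementation.
def check_bool(value):
    """Interpret a command-line string such as 'False' as a real boolean."""
    return str(value).strip().lower() not in ('false', 'f', '0', 'no')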
def setUpClass(cls):
    # Note: the corpus path below is hard-coded to a specific developer machine.
    corpus = load_binary(r'C:\Users\Scott\Documents\GitHub\CorpusTools\corpustools\lemurian.corpus')
    inventory = InventoryModel(corpus.inventory, copy_mode=True)
    cls.dialog = FLDialog(main, None, corpus, inventory, False)
def main():
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description='Phonological CorpusTools: functional load CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-p', '--pairs_file_name_or_segment',
                       help='Name of file with segment pairs (or target segment if relative_fl is True)')
    group.add_argument('-l', '--all_pairwise_fls', action='store_true',
                       help="Calculate FL for all pairs of segments")
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default='minpair',
                        help='Algorithm to use for calculating functional load: "minpair" for minimal pair count or "deltah" for change in entropy. Defaults to minpair.')
    parser.add_argument('-f', '--frequency_cutoff', type=float, default=0,
                        help='Minimum frequency of words to consider as possible minimal pairs or contributing to lexicon entropy.')
    parser.add_argument('-r', '--relative_count', type=check_bool, default=True,
                        help='For minimal pair FL: whether or not to divide the number of minimal pairs by the number of possible minimal pairs (words with either segment).')
    parser.add_argument('-d', '--distinguish_homophones', action='store_true',
                        help="For minimal pair FL: if False, then you'll count sock~shock (sock=clothing) and sock~shock (sock=punch) as just one minimal pair; but if True, you'll overcount alternative spellings of the same word, e.g. axel~actual and axle~actual. False is the value used by Wedel et al.")
    parser.add_argument('-t', '--type_or_token', default='token',
                        help='For change in entropy FL: specifies whether entropy is based on type or token frequency.')
    parser.add_argument('-e', '--relative_fl', action='store_true',
                        help="If True, calculate the relative FL of a single segment by averaging across the functional loads of it and all other segments.")
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help="The attribute of Words to calculate FL over. Normally this will be the transcription, but it can also be the spelling or a user-specified tier.")
    parser.add_argument('-q', '--environment_lhs', default=None,
                        help="Left hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-w', '--environment_rhs', default=None,
                        help="Right hand side of environment filter. Format: positions separated by commas, groups by slashes, e.g. m/n,i matches mi or ni.")
    parser.add_argument('-x', '--separate_pairs', action='store_true',
                        help="If present, calculate FL for each pair in the pairs file separately.")
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()
    ####

    corpus = load_binary(args.corpus_file_name)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, args.type_or_token,
                                         frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, args.type_or_token,
                                            frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, args.type_or_token,
                                               frequency_threshold=args.frequency_cutoff)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, args.type_or_token,
                                        frequency_threshold=args.frequency_cutoff)

    if not args.environment_lhs and not args.environment_rhs:
        environment_filter = None
    else:
        if args.environment_lhs:
            split_lhs = [tuple(pos.split('/')) for pos in args.environment_lhs.split(',')]
        else:
            split_lhs = None
        if args.environment_rhs:
            split_rhs = [tuple(pos.split('/')) for pos in args.environment_rhs.split(',')]
        else:
            split_rhs = None
        environment_filter = EnvironmentFilter([], split_lhs, split_rhs)

    if args.all_pairwise_fls:
        result = all_pairwise_fls(corpus, relative_fl=args.relative_fl, algorithm=args.algorithm,
                                  relative_count=args.relative_count,
                                  distinguish_homophones=args.distinguish_homophones,
                                  environment_filter=environment_filter)
    else:
        if args.relative_fl != True:
            try:
                with open(args.pairs_file_name_or_segment) as segpairs_or_segment_file:
                    segpairs_or_segment = [line for line in csv.reader(segpairs_or_segment_file, delimiter='\t') if len(line) > 0]
            except FileNotFoundError:
                raise FileNotFoundError("Did not find the segment pairs file even though 'relative_fl' is set to false. If calculating the relative FL of a single segment, please set 'relative_fl' to True. Otherwise, specify correct filename.")
        else:
            segpairs_or_segment = args.pairs_file_name_or_segment

        if args.algorithm == 'minpair':
            if args.relative_fl:
                result = relative_minpair_fl(corpus, segpairs_or_segment,
                                             relative_count=bool(args.relative_count),
                                             distinguish_homophones=args.distinguish_homophones,
                                             environment_filter=environment_filter)
            else:
                if args.separate_pairs:
                    result = []
                    for pair in segpairs_or_segment:
                        result.append(minpair_fl(corpus, [pair],
                                                 relative_count=bool(args.relative_count),
                                                 distinguish_homophones=args.distinguish_homophones,
                                                 environment_filter=environment_filter))
                else:
                    result = minpair_fl(corpus, segpairs_or_segment,
                                        relative_count=bool(args.relative_count),
                                        distinguish_homophones=args.distinguish_homophones,
                                        environment_filter=environment_filter)
        elif args.algorithm == 'deltah':
            if args.relative_fl:
                result = relative_deltah_fl(corpus, segpairs_or_segment,
                                            environment_filter=environment_filter)
            else:
                if args.separate_pairs:
                    result = []
                    for pair in segpairs_or_segment:
                        result.append(deltah_fl(corpus, [pair], environment_filter=environment_filter))
                else:
                    result = deltah_fl(corpus, segpairs_or_segment, environment_filter=environment_filter)
        else:
            raise Exception('-a / --algorithm must be set to either \'minpair\' or \'deltah\'.')

    if args.outfile:
        with open(args.outfile, 'w') as outfile:
            if type(result) != list:
                outstr = ('result\t' + '\t'.join([a for a in vars(args)]) + '\n'
                          + str(result) + '\t'
                          + '\t'.join([str(getattr(args, a)) for a in vars(args)]))
                outfile.write(outstr)
            else:
                outstr = 'result\tsegment(s)\t' + '\t'.join([a for a in vars(args)]) + '\n'
                for element in result:
                    outstr += (str(element[1]) + '\t' + str(element[0]) + '\t'
                               + '\t'.join([str(getattr(args, a)) for a in vars(args)]) + '\n')
                outfile.write(outstr)
    else:
        print('No output file name provided.')
        print('The functional load of the given inputs is {}.'.format(str(result)))
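# Both functional load interfaces above read the -p pairs file with
# csv.reader(..., delimiter='\t'), i.e. one tab-separated segment pair per line.
# Minimal sketch of producing such a file; the file name and the segment pairs
# are illustrative only.
import csv

with open('pairs.txt', 'w', newline='') as pairs_file:
    writer = csv.writer(pairs_file, delimiter='\t')
    writer.writerow(['s', 'z'])
    writer.writerow(['t', 'd'])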