def test_basic_corpus_nd(specified_test_corpus):
    """Check neighborhood-density values on the transcription and spelling tiers."""
    tolerance = 0.0001
    cases = [
        (1.0, {'query': specified_test_corpus.find('mata'), 'max_distance': 1}),
        (3.0, {'query': specified_test_corpus.find('nata'), 'max_distance': 2}),
        (1.0, {'query': specified_test_corpus.find('mata'),
               'algorithm': 'phono_edit_distance', 'max_distance': 3}),
    ]
    with CanonicalVariantContext(specified_test_corpus, 'transcription', 'type') as ctx:
        for expected, nd_kwargs in cases:
            density = neighborhood_density(ctx, **nd_kwargs)
            assert abs(density[0] - expected) < tolerance
    # Spelling-based density of 'mata' should also be 1.0.
    with CanonicalVariantContext(specified_test_corpus, 'spelling', 'type') as ctx:
        density = neighborhood_density(
            ctx, query=specified_test_corpus.find('mata'), max_distance=1)
        assert abs(density[0] - 1.0) < tolerance
def test_basic_corpus_nd(specified_test_corpus):
    """Verify neighborhood density for known words against expected values."""
    eps = 0.0001
    checks = (
        ({'query': specified_test_corpus.find('mata'), 'max_distance': 1}, 1.0),
        ({'query': specified_test_corpus.find('nata'), 'max_distance': 2}, 3.0),
        ({'query': specified_test_corpus.find('mata'),
          'algorithm': 'phono_edit_distance', 'max_distance': 3}, 1.0),
    )
    with CanonicalVariantContext(specified_test_corpus, 'transcription', 'type') as context:
        for nd_args, want in checks:
            got = neighborhood_density(context, **nd_args)
            assert abs(got[0] - want) < eps
    # Same query on the spelling tier.
    with CanonicalVariantContext(specified_test_corpus, 'spelling', 'type') as context:
        got = neighborhood_density(context,
                                   **{'query': specified_test_corpus.find('mata'),
                                      'max_distance': 1})
        assert abs(got[0] - 1.0) < eps
def run(self):
    """Worker entry point: compute neighborhood density (or mutation minimal
    pairs) for the queried word(s), or for every word in the corpus, and emit
    the results through the worker's signals.

    Reads its configuration from ``self.kwargs``; results are emitted via
    ``dataReady`` (list of ``[word, value]`` pairs), errors via
    ``errorEncountered``, and cancellation via ``finishedCancelling``.
    """
    # Brief pause before the heavy work starts (presumably to let the GUI
    # show its progress dialog first — TODO confirm).
    time.sleep(0.1)
    kwargs = self.kwargs
    self.results = []
    context = kwargs.pop('context')
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    else:
        # Fix: an unrecognized context previously left `cm` unbound, which
        # crashed below with a NameError; report it through the worker's
        # normal error channel instead.
        self.errorEncountered.emit(PCTError('Unrecognized context type: {}'.format(context)))
        return
    corpus = kwargs['corpusModel'].corpus
    st = kwargs['sequence_type']
    tt = kwargs['type_token']
    att = kwargs.get('attribute', None)
    with cm(corpus, st, tt, att) as c:
        try:
            if 'query' in kwargs:
                # One result per queried word.
                for q in kwargs['query']:
                    if kwargs['algorithm'] != 'substitution':
                        res = neighborhood_density(c, q,
                                                   algorithm=kwargs['algorithm'],
                                                   max_distance=kwargs['max_distance'],
                                                   stop_check=kwargs['stop_check'],
                                                   call_back=kwargs['call_back'])
                    else:
                        res = find_mutation_minpairs(c, q,
                                                     stop_check=kwargs['stop_check'],
                                                     call_back=kwargs['call_back'])
                    if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                        print_neighden_results(kwargs['output_filename'], res[1])
                    if self.stopped:
                        break
                    self.results.append([q, res[0]])
            else:
                # No explicit query: annotate every word in the corpus with a
                # new column holding its density value.
                end = kwargs['corpusModel'].beginAddColumn(att)
                if kwargs['algorithm'] != 'substitution':
                    neighborhood_density_all_words(c,
                                                   algorithm=kwargs['algorithm'],
                                                   max_distance=kwargs['max_distance'],
                                                   num_cores=kwargs['num_cores'],
                                                   call_back=kwargs['call_back'],
                                                   stop_check=kwargs['stop_check'])
                else:
                    find_mutation_minpairs_all_words(c,
                                                     num_cores=kwargs['num_cores'],
                                                     stop_check=kwargs['stop_check'],
                                                     call_back=kwargs['call_back'])
                end = kwargs['corpusModel'].endAddColumn(end)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so the GUI can present them uniformly.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(self.results)
def run(self):
    """Worker entry point: compute neighborhood density (or mutation minimal
    pairs) for queried words, or for the whole corpus, and emit results via
    the worker's signals (``dataReady``, ``errorEncountered``,
    ``finishedCancelling``).
    """
    kwargs = self.kwargs
    self.results = []
    context = kwargs.pop('context')
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    # NOTE(review): any other context value leaves `cm` unbound and the
    # `with` below raises NameError — verify callers only pass these two.
    corpus = kwargs['corpusModel'].corpus
    st = kwargs['sequence_type']
    tt = kwargs['type_token']
    att = kwargs.get('attribute', None)
    ft = kwargs['frequency_cutoff']
    output = list()
    with cm(corpus, st, tt, attribute=att, frequency_threshold=ft) as c:
        try:
            tierdict = defaultdict(list)
            # Create a dict with sequence_type keys for constant-time lookup
            for entry in c:
                w = getattr(entry, kwargs['sequence_type'])
                key = str(w)
                tierdict[key].append(entry)
            if 'query' in kwargs:
                # This will be true when searching for a single word (in the
                # corpus or not).
                last_value_removed = None
                last_key_removed = None
                for q in kwargs['query']:
                    q = ensure_query_is_word(q, c, c.sequence_type, kwargs['tier_type'])
                    # The following code for adding/removing keys is to ensure
                    # that homophones are counted later in the ND algorithm
                    # (if the user wants to), but that words are not considered
                    # their own neighbours. However, we only do this when
                    # comparing inside a corpus; when using a list of external
                    # words we don't want to do this, since it's possible for
                    # the external list to contain words that are in the
                    # corpus, and removing them gives the wrong ND value in
                    # this case.
                    if kwargs['in_corpus']:
                        # Restore the entry removed for the previous query
                        # before removing the current one.
                        if last_value_removed:
                            tierdict[last_key_removed].append(last_value_removed)
                        w = getattr(q, kwargs['sequence_type'])
                        last_key_removed = str(w)
                        #last_value_removed = tierdict[last_key_removed].pop()
                        for i, item in enumerate(tierdict[last_key_removed]):
                            if str(item) == str(q):
                                last_value_removed = tierdict[last_key_removed].pop(i)
                                break
                    # Now we call the actual ND algorithms.
                    if kwargs['algorithm'] != 'substitution':
                        res = neighborhood_density(c, q, tierdict,
                                                   algorithm=kwargs['algorithm'],
                                                   max_distance=kwargs['max_distance'],
                                                   force_quadratic=kwargs['force_quadratic'],
                                                   collapse_homophones=kwargs['collapse_homophones'],
                                                   file_type=kwargs['file_type'],
                                                   tier_type=kwargs['tier_type'],
                                                   sequence_type=kwargs['sequence_type'],
                                                   stop_check=kwargs['stop_check'],
                                                   call_back=kwargs['call_back'])
                    else:
                        res = find_mutation_minpairs(c, q,
                                                     tier_type=kwargs['tier_type'],
                                                     collapse_homophones=kwargs['collapse_homophones'],
                                                     stop_check=kwargs['stop_check'],
                                                     call_back=kwargs['call_back'])
                    if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                        print_neighden_results(kwargs['output_filename'], res[1],
                                               kwargs['output_format'])
                    if self.stopped:
                        break
                    if kwargs['file_list'] is not None:
                        # Accumulate CSV rows: word, density, then neighbors.
                        output.append(','.join([str(q), str(res[0]),
                                                ','.join([str(r) for r in res[1]])]))
                    self.results.append([q, res[0]])
            else:
                # This will be the case if searching the entire corpus.
                end = kwargs['corpusModel'].beginAddColumn(att)
                if kwargs['algorithm'] != 'substitution':
                    results = neighborhood_density_all_words(c, tierdict,
                                                             tier_type=kwargs['tier_type'],
                                                             algorithm=kwargs['algorithm'],
                                                             output_format=kwargs['output_format'],
                                                             max_distance=kwargs['max_distance'],
                                                             num_cores=kwargs['num_cores'],
                                                             call_back=kwargs['call_back'],
                                                             stop_check=kwargs['stop_check'],
                                                             settable_attr=kwargs['attribute'],
                                                             collapse_homophones=kwargs['collapse_homophones'])
                else:
                    results = find_mutation_minpairs_all_words(c, tierdict,
                                                               tier_type=kwargs['tier_type'],
                                                               collapse_homophones=kwargs['collapse_homophones'],
                                                               num_cores=kwargs['num_cores'],
                                                               stop_check=kwargs['stop_check'],
                                                               call_back=kwargs['call_back'])
                end = kwargs['corpusModel'].endAddColumn(end)
                if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                    print_all_neighden_results(kwargs['output_filename'], results)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so the GUI can present them uniformly.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    if output and 'output_filename' in kwargs:
        # Write the accumulated per-word CSV rows collected above.
        with open(kwargs['output_filename'], encoding='utf-8', mode='w') as outf:
            print('Word,Density,Neighbors', file=outf)
            for item in output:
                print(item, file=outf)
    self.dataReady.emit(self.results)
def run(self):
    """Worker entry point: compute neighborhood density (or mutation minimal
    pairs) for queried words, or annotate every corpus word, emitting results
    through the worker's signals.
    """
    # Brief pause before the heavy work starts (presumably to let the GUI
    # show its progress dialog first — TODO confirm).
    time.sleep(0.1)
    kwargs = self.kwargs
    self.results = []
    context = kwargs.pop('context')
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    else:
        # Fix: an unrecognized context previously left `cm` unbound, which
        # crashed below with a NameError; report it through the worker's
        # normal error channel instead.
        self.errorEncountered.emit(PCTError('Unrecognized context type: {}'.format(context)))
        return
    corpus = kwargs['corpusModel'].corpus
    st = kwargs['sequence_type']
    tt = kwargs['type_token']
    att = kwargs.get('attribute', None)
    with cm(corpus, st, tt, att) as c:
        try:
            if 'query' in kwargs:
                # One result per queried word.
                for q in kwargs['query']:
                    if kwargs['algorithm'] != 'substitution':
                        res = neighborhood_density(c, q,
                                                   algorithm=kwargs['algorithm'],
                                                   max_distance=kwargs['max_distance'],
                                                   stop_check=kwargs['stop_check'],
                                                   call_back=kwargs['call_back'])
                    else:
                        res = find_mutation_minpairs(c, q,
                                                     stop_check=kwargs['stop_check'],
                                                     call_back=kwargs['call_back'])
                    if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                        print_neighden_results(kwargs['output_filename'], res[1])
                    if self.stopped:
                        break
                    self.results.append([q, res[0]])
            else:
                # No explicit query: annotate the whole corpus with a new column.
                end = kwargs['corpusModel'].beginAddColumn(att)
                if kwargs['algorithm'] != 'substitution':
                    neighborhood_density_all_words(c,
                                                   algorithm=kwargs['algorithm'],
                                                   max_distance=kwargs['max_distance'],
                                                   num_cores=kwargs['num_cores'],
                                                   call_back=kwargs['call_back'],
                                                   stop_check=kwargs['stop_check'])
                else:
                    find_mutation_minpairs_all_words(c,
                                                     num_cores=kwargs['num_cores'],
                                                     stop_check=kwargs['stop_check'],
                                                     call_back=kwargs['call_back'])
                end = kwargs['corpusModel'].endAddColumn(end)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so the GUI can present them uniformly.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(self.results)
def main():
    """Command-line interface for neighborhood density.

    Loads a binary corpus (first from the standard PCT corpus directory, then
    from the given path), wraps it in the requested variant context, and then
    either finds mutation minimal pairs or computes neighborhood density for a
    single query word or a tab-delimited file of query words.
    """
    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description = \
        'Phonological CorpusTools: neighborhood density CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('query', help='Word to query, or name of file including a list of words')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default= 'edit_distance',
                        help="The algorithm used to determine distance")
    parser.add_argument('-d', '--max_distance', type=int, default = 1,
                        help="Maximum edit distance from the queried word to consider a word a neighbor.")
    parser.add_argument('-s', '--sequence_type', default = 'transcription',
                        help="The name of the tier on which to calculate distance")
    parser.add_argument('-w', '--count_what', default ='type',
                        help="If 'type', count neighbors in terms of their type frequency. If 'token', count neighbors in terms of their token frequency.")
    parser.add_argument('-e', '--trans_delimiter', default='',
                        help="If not empty string, splits the query by this str to make a transcription/spelling list for the query's Word object.")
    parser.add_argument('-m', '--find_mutation_minpairs', action='store_true',
                        help='This flag causes the script not to calculate neighborhood density, but rather to find minimal pairs---see documentation.')
    parser.add_argument('-q', '--force_quadratic_algorithm', action='store_true',
                        help='This flag prevents PCT from using the more efficient linear-time algorithm for edit distance of 1 neighborhoods.')
    parser.add_argument('-o', '--outfile', help='Name of output file')
    args = parser.parse_args()
    ####
    # Prefer the standard PCT corpus directory; fall back to treating the
    # argument as a direct file path.
    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT',
                                          'CorpusTools', 'CORPUS',
                                          args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)
    # Wrap the corpus in the requested variant context.
    # NOTE(review): an unrecognized --context_type silently leaves `corpus`
    # unwrapped — verify downstream functions tolerate a bare corpus.
    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type,
                                         type_or_token=args.count_what)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type,
                                            type_or_token=args.count_what)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type,
                                               type_or_token=args.count_what)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type,
                                        type_or_token=args.count_what)
    if args.find_mutation_minpairs:
        query = ensure_query_is_word(args.query, corpus, args.sequence_type,
                                     args.trans_delimiter)
        matches = find_mutation_minpairs(corpus, query)
        for match in matches[1]:
            print(match)
        print('Total number of matches: {}'.format(str(matches[0])))
    else:
        try: # read query as a file name
            with open(args.query) as queryfile:
                queries = [line[0] for line in csv.reader(queryfile, delimiter='\t') if len(line) > 0]
                queries = [ensure_query_is_word(q, corpus, args.sequence_type,
                                                args.trans_delimiter) for q in queries]
                results = [neighborhood_density(corpus, q,
                                                algorithm = args.algorithm,
                                                max_distance = args.max_distance,
                                                force_quadratic=args.force_quadratic_algorithm)
                           for q in queries]
            if args.outfile:
                with open(args.outfile, 'w') as outfile:
                    for q, r in zip(queries, results):
                        # One tab-separated line per query: word, density,
                        # then each neighbor.
                        outfile.write('{}\t{}'.format(q, str(r[0]))
                                      + ''.join(['\t{}'.format(str(n)) for n in r[1]])
                                      + '\n')
            else:
                # Not caught by the FileNotFoundError handler below, so this
                # propagates to the caller.
                raise Exception('In order to use a file of queries as input, you must provide an output file name using the option -o.')
        except FileNotFoundError: # read query as a single word
            query = ensure_query_is_word(args.query, corpus, args.sequence_type,
                                         args.trans_delimiter)
            result = neighborhood_density(corpus, query,
                                          algorithm = args.algorithm,
                                          max_distance = args.max_distance,
                                          force_quadratic=args.force_quadratic_algorithm)
            if args.outfile:
                with open(args.outfile, 'w') as outfile:
                    outfile.write('{}\t{}'.format(query, str(result[0]))
                                  + ''.join(['\t{}'.format(str(n)) for n in result[1]]))
            else:
                print('No output file name provided.')
                print('The neighborhood density of the given form is {}. For a list of neighbors, please provide an output file name.'.format(str(result[0])))
def run(self):
    """Worker entry point: compute neighborhood density (or mutation minimal
    pairs) for queried words, or annotate every corpus word, emitting results
    through the worker's signals. Optionally accumulates per-word CSV rows
    for file output when a file list was supplied.
    """
    kwargs = self.kwargs
    self.results = []
    context = kwargs.pop('context')
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    else:
        # Fix: an unrecognized context previously left `cm` unbound, which
        # crashed below with a NameError; report it through the worker's
        # normal error channel instead.
        self.errorEncountered.emit(PCTError('Unrecognized context type: {}'.format(context)))
        return
    corpus = kwargs['corpusModel'].corpus
    st = kwargs['sequence_type']
    tt = kwargs['type_token']
    att = kwargs.get('attribute', None)
    ft = kwargs['frequency_cutoff']
    output = list()
    with cm(corpus, st, tt, attribute=att, frequency_threshold=ft) as c:
        try:
            # Create a dict with sequence_type keys for constant-time lookup.
            tierdict = defaultdict(list)
            for entry in c:
                w = getattr(entry, kwargs['sequence_type'])
                tierdict[str(w)].append(entry)
            if 'query' in kwargs:
                # One result per queried word.
                for q in kwargs['query']:
                    if kwargs['algorithm'] != 'substitution':
                        res = neighborhood_density(
                            c, q, tierdict,
                            algorithm=kwargs['algorithm'],
                            max_distance=kwargs['max_distance'],
                            force_quadratic=kwargs['force_quadratic'],
                            file_type=kwargs['file_type'],
                            tier_type=kwargs['tier_type'],
                            stop_check=kwargs['stop_check'],
                            call_back=kwargs['call_back'])
                    else:
                        res = find_mutation_minpairs(
                            c, q,
                            tier_type=kwargs['tier_type'],
                            stop_check=kwargs['stop_check'],
                            call_back=kwargs['call_back'])
                    if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                        print_neighden_results(kwargs['output_filename'], res[1])
                    if self.stopped:
                        break
                    if kwargs['file_list'] is not None:
                        # Fix: `q` is a Word object, not a str — joining it
                        # directly raised TypeError; convert explicitly.
                        output.append(','.join([str(q), str(res[0]),
                                                ','.join([str(r) for r in res[1]])]))
                    self.results.append([q, res[0]])
            else:
                # No explicit query: annotate the whole corpus with a new column.
                end = kwargs['corpusModel'].beginAddColumn(att)
                if kwargs['algorithm'] != 'substitution':
                    neighborhood_density_all_words(
                        c, tierdict,
                        tier_type=kwargs['tier_type'],
                        algorithm=kwargs['algorithm'],
                        max_distance=kwargs['max_distance'],
                        num_cores=kwargs['num_cores'],
                        call_back=kwargs['call_back'],
                        stop_check=kwargs['stop_check'],
                        settable_attr=kwargs['attribute'])
                else:
                    find_mutation_minpairs_all_words(
                        c,
                        tier_type=kwargs['tier_type'],
                        num_cores=kwargs['num_cores'],
                        stop_check=kwargs['stop_check'],
                        call_back=kwargs['call_back'])
                end = kwargs['corpusModel'].endAddColumn(end)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so the GUI can present them uniformly.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    if output and kwargs['file_list']:
        # Write the accumulated per-word CSV rows collected above.
        with open(kwargs['output_filename'], encoding='utf-8', mode='w') as outf:
            print('Word,Density,Neighbors', file=outf)
            for item in output:
                print(item, file=outf)
    self.dataReady.emit(self.results)
def run(self):
    """Worker entry point: compute neighborhood density (or mutation minimal
    pairs) for queried words, or for the whole corpus, and emit results via
    the worker's signals (``dataReady``, ``errorEncountered``,
    ``finishedCancelling``).
    """
    kwargs = self.kwargs
    self.results = []
    context = kwargs.pop('context')
    if context == RestrictedContextWidget.canonical_value:
        cm = CanonicalVariantContext
    elif context == RestrictedContextWidget.frequent_value:
        cm = MostFrequentVariantContext
    # NOTE(review): any other context value leaves `cm` unbound and the
    # `with` below raises NameError — verify callers only pass these two.
    corpus = kwargs['corpusModel'].corpus
    st = kwargs['sequence_type']
    tt = kwargs['type_token']
    att = kwargs.get('attribute', None)
    ft = kwargs['frequency_cutoff']
    output = list()
    with cm(corpus, st, tt, attribute=att, frequency_threshold = ft) as c:
        try:
            tierdict = defaultdict(list)
            # Create a dict with sequence_type keys for constant-time lookup
            for entry in c:
                w = getattr(entry, kwargs['sequence_type'])
                key = str(w)
                tierdict[key].append(entry)
            if 'query' in kwargs:
                # This will be true when searching for a single word (in the
                # corpus or not).
                last_value_removed = None
                last_key_removed = None
                for q in kwargs['query']:
                    q = ensure_query_is_word(q, c, c.sequence_type, kwargs['tier_type'])
                    # The following code for adding/removing keys is to ensure
                    # that homophones are counted later in the ND algorithm
                    # (if the user wants to), but that words are not considered
                    # their own neighbours. However, we only do this when
                    # comparing inside a corpus; when using a list of external
                    # words we don't want to do this, since it's possible for
                    # the external list to contain words that are in the
                    # corpus, and removing them gives the wrong ND value in
                    # this case.
                    if kwargs['in_corpus']:
                        # Restore the entry removed for the previous query
                        # before removing the current one.
                        if last_value_removed:
                            tierdict[last_key_removed].append(last_value_removed)
                        w = getattr(q, kwargs['sequence_type'])
                        last_key_removed = str(w)
                        #last_value_removed = tierdict[last_key_removed].pop()
                        for i, item in enumerate(tierdict[last_key_removed]):
                            if str(item) == str(q):
                                last_value_removed = tierdict[last_key_removed].pop(i)
                                break
                    # Now we call the actual ND algorithms.
                    if kwargs['algorithm'] != 'substitution':
                        res = neighborhood_density(c, q, tierdict,
                                                   algorithm = kwargs['algorithm'],
                                                   max_distance = kwargs['max_distance'],
                                                   force_quadratic=kwargs['force_quadratic'],
                                                   collapse_homophones = kwargs['collapse_homophones'],
                                                   file_type = kwargs['file_type'],
                                                   tier_type = kwargs['tier_type'],
                                                   sequence_type = kwargs['sequence_type'],
                                                   stop_check = kwargs['stop_check'],
                                                   call_back = kwargs['call_back'])
                    else:
                        res = find_mutation_minpairs(c, q,
                                                     tier_type=kwargs['tier_type'],
                                                     collapse_homophones = kwargs['collapse_homophones'],
                                                     stop_check = kwargs['stop_check'],
                                                     call_back = kwargs['call_back'])
                    if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                        print_neighden_results(kwargs['output_filename'], res[1],
                                               kwargs['output_format'])
                    if self.stopped:
                        break
                    if kwargs['file_list'] is not None:
                        # Accumulate CSV rows: word, density, then neighbors.
                        output.append(','.join([str(q), str(res[0]),
                                                ','.join([str(r) for r in res[1]])]))
                    self.results.append([q,res[0]])
            else:
                # This will be the case if searching the entire corpus.
                end = kwargs['corpusModel'].beginAddColumn(att)
                if kwargs['algorithm'] != 'substitution':
                    results = neighborhood_density_all_words(c, tierdict,
                                                             tier_type = kwargs['tier_type'],
                                                             algorithm = kwargs['algorithm'],
                                                             output_format = kwargs['output_format'],
                                                             max_distance = kwargs['max_distance'],
                                                             num_cores = kwargs['num_cores'],
                                                             call_back = kwargs['call_back'],
                                                             stop_check = kwargs['stop_check'],
                                                             settable_attr = kwargs['attribute'],
                                                             collapse_homophones = kwargs['collapse_homophones'])
                else:
                    results = find_mutation_minpairs_all_words(c, tierdict,
                                                               tier_type = kwargs['tier_type'],
                                                               collapse_homophones = kwargs['collapse_homophones'],
                                                               num_cores = kwargs['num_cores'],
                                                               stop_check = kwargs['stop_check'],
                                                               call_back = kwargs['call_back'])
                end = kwargs['corpusModel'].endAddColumn(end)
                if 'output_filename' in kwargs and kwargs['output_filename'] is not None:
                    print_all_neighden_results(kwargs['output_filename'], results)
        except PCTError as e:
            self.errorEncountered.emit(e)
            return
        except Exception as e:
            # Wrap unexpected errors so the GUI can present them uniformly.
            e = PCTPythonError(e)
            self.errorEncountered.emit(e)
            return
    if self.stopped:
        self.finishedCancelling.emit()
        return
    if output and 'output_filename' in kwargs:
        # Write the accumulated per-word CSV rows collected above.
        with open(kwargs['output_filename'], encoding='utf-8', mode='w') as outf:
            print('Word,Density,Neighbors', file=outf)
            for item in output:
                print(item, file=outf)
    self.dataReady.emit(self.results)