def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage string for the current script.
    """
    global attributes
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o in ("-a", "--attributes"):
            attributes = a.split(":")
            for attr in attributes:
                if attr not in WORD_ATTRIBUTES:
                    error("Unknown attribute '%s'!" % attr)
    if attributes is None:
        print("The option -a <attributes> is mandatory.", file=sys.stderr)
        usage(usage_string)
        sys.exit(2)
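# A minimal sketch (not from the original script) of how a callback like the
# one above is typically wired up with getopt; the option strings and the
# n_arg value are illustrative assumptions.
import getopt
import sys

try:
    opts, args = getopt.getopt(sys.argv[1:], "a:", ["attributes="])
except getopt.GetoptError as err:
    print(str(err), file=sys.stderr)
    sys.exit(2)
treat_options(opts, args, 1, usage_string)  # hypothetical: expects 1 file arg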
def handle_candidate(self, candidate, info={}):
    """For each candidate, stores it in a temporary Database (so that it can
    be retrieved later) and also creates a tuple containing the sorting key
    feature values and the candidate ID. All the tuples are stored in a global
    list, that will be sorted once all candidates are read and stored into the
    temporary DB.

    @param candidate The `Candidate` that is being read from the XML file.
    """
    global feat_list, all_feats, feat_list_ok, feat_to_order
    # First, verify that all the features defined as sorting keys are real
    # features, by matching them against the meta-features of the header. This
    # is only performed once, before the first candidate is processed
    if not feat_list_ok:
        for feat_name in feat_list:
            if feat_name not in all_feats:
                # The whole message must be formatted at once: applying "%" to
                # only the last concatenated literal would leave "%(feat)s"
                # unexpanded
                error("%(feat)s is not a valid feature\n"
                      "Please choose features from the list below\n"
                      "%(list)s" % {"feat": feat_name,
                                    "list": "\n".join("* " + f
                                                      for f in all_feats)})
        feat_list_ok = True
    for tp_class in candidate.tpclasses:
        for feat_name in feat_list:
            feat_value = candidate.get_feat_value(feat_name)
            tp_value = candidate.get_tpclass_value(tp_class.name)
            if feat_value != UNKNOWN_FEAT_VALUE \
                    and tp_value != UNKNOWN_FEAT_VALUE:
                # Avoid shadowing the builtin `tuple`
                pair = (float(feat_value), tp_value == "True")
                feat_to_order[tp_class.name][feat_name].append(pair)
def search_terms(self, in_text, query):
    """Extract salient terms from `in_text` using the Yahoo term extraction
    web service and return the raw result list.
    """
    if DEFAULT_LANG != "en":
        print("WARNING: Yahoo terms only works for English", file=sys.stderr)
    input_text = in_text.strip()
    if isinstance(input_text, unicode):
        input_text = input_text.encode('utf-8')
    try:
        url = ('http://search.yahooapis.com/ContentAnalysisService/'
               'V1/termExtraction')
        post_data = urllib.urlencode({"context": input_text,
                                      "appid": YAHOO_APPID,
                                      "query": query,
                                      "output": "json"})
        request = urllib2.Request(url, post_data)
        response = urllib2.urlopen(request)
        results = simplejson.load(response)
        return results["ResultSet"]["Result"]
    except Exception as err:
        error("Got an error -> " + str(err) +
              "\nPLEASE VERIFY YOUR INTERNET CONNECTION")
def _fallback_entity(self, entity, info={}):
    """
    For each entity, stores it in a temporary Database (so that it can be
    retrieved later) and also creates a tuple containing the sorting key
    feature values and the entity ID. All the tuples are stored in a global
    list, that will be sorted once all candidates are read and stored into the
    temporary DB.
    """
    global feat_list
    # First, verify that all the features defined as sorting keys are real
    # features, by matching them against the meta-features of the header. This
    # is only performed once, before the first entity is processed
    if not self.feat_list_ok:
        for feat_name in feat_list:
            if feat_name not in self.all_feats:
                error("\"{feat}\" is not a valid feature\n"
                      "Please choose features from the list below:\n"
                      "{list}".format(feat=feat_name,
                                      list="\n".join("* " + feat
                                              for feat in self.all_feats)))
        self.feat_list_ok = True

    # Store the whole entity in a temporary database
    self.all_entities[unicode(entity.id_number)] = (entity, info)

    # Build up a tuple to be added to a list. The tuple contains the sorting
    # key values followed by the entity ID: the former are used to sort the
    # candidates, the latter to retrieve an entity from the temporary DB
    one_tuple = [self.feat_value(entity, feat_name)
                 for feat_name in feat_list]
    one_tuple.append(entity.id_number)
    self.feat_to_order.append(tuple(one_tuple))
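# Illustrative sketch (made-up values) of the ordering scheme used above:
# each tuple holds the sorting-key feature values followed by the entity ID,
# so sorting the list orders the IDs by their feature values.
feat_to_order = [
    (12.5, 3, "cand_2"),  # (feat_1, feat_2, entity ID)
    (12.5, 1, "cand_7"),
    (3.0, 9, "cand_1"),
]
feat_to_order.sort()  # ascending by feat_1, ties broken by feat_2
print([t[-1] for t in feat_to_order])  # ['cand_1', 'cand_7', 'cand_2']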
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global web_freq
    treat_options_simplest(opts, arg, n_arg, usage_string)
    mode = []
    for (o, a) in opts:
        if o in ("-y", "--yahoo"):
            web_freq = YahooFreq()
            mode.append("yahoo")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            mode.append("google")
    if len(mode) > 1:
        error("At most one of the options -y or -w may be provided")
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global reference_fname
    global mwe_evaluator
    global corpus_filetype_ext
    global reference_filetype_ext

    sentence_aligner_class = NaiveSentenceAligner
    mwe_evaluator_class = ExactMatchMWEEvaluator

    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o in ("-r", "--reference"):
            reference_fname = a
        elif o == "--sentence-aligner":
            # `o in ("--sentence-aligner")` would be a substring test on a
            # plain string, not membership in a one-element tuple
            sentence_aligner_class = SENTENCE_ALIGNERS[a]
        elif o in ("-e", "--evaluator"):
            mwe_evaluator_class = MWE_EVALUATORS[a]
        elif o == "--corpus-from":
            corpus_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if not reference_fname:
        error("No reference file given!")
    sentence_aligner = sentence_aligner_class()
    mwe_evaluator = mwe_evaluator_class(sentence_aligner)
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global input_filetype_ext
    global output_filetype_ext
    global append_pos_tag
    global clean_special
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--append-pos-tag":
            if a in ("coarse", "fine"):
                append_pos_tag = a
            else:
                error("Expected \"coarse\" or \"fine\", found " + a)
        elif o == "--clean-special":
            clean_special = True
        else:
            raise Exception("Bad arg: " + o)
def read_distances(d_filename):
    """
    Reads the distances between categories from a tab-separated file. The
    result will, once the annotation file is read, be converted into a
    category x category matrix; it cannot be built directly because, before
    reading the annotations file, we do not know how many categories Nk will
    be used.

    @param d_filename The input file name from which the data is read
    @return A dictionary mapping a key built from each (sorted) category pair
    to a float with the distance between the two categories.
    """
    try:
        distances_map = {}  # Use a map to remove duplicates if present
        with open(d_filename) as d_data:
            for line in d_data:
                if len(line.strip()) > 0:  # Ignore blank lines
                    try:
                        cat1, cat2, distance = line.strip().split("\t")
                        key = "###SEPARATOR###".join(sorted([cat1, cat2]))
                        distances_map[key] = float(distance)
                    except ValueError:
                        error("ERROR reading distances, expected three values "
                              "separated by TAB, found:\n" + line)
        return distances_map
    except IOError:
        error("\nERROR: Distance file \"%s\" not found" % d_filename)
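# Hypothetical input for the function above: a file "dist.tsv" with
# tab-separated lines
#
#     Verb<TAB>Noun<TAB>0.5
#     Noun<TAB>Adj<TAB>0.25
#
# would yield (note that the key is built from the *sorted* category pair, so
# "Verb Noun" and "Noun Verb" collapse into the same entry):
#
#     {"Noun###SEPARATOR###Verb": 0.5, "Adj###SEPARATOR###Noun": 0.25}
distances_map = read_distances("dist.tsv")  # file name is an assumption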
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global algoname
    global lower_attr
    global input_filetype_ext
    global output_filetype_ext
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-l", "--lemmas"):
            lower_attr = "lemma"
        elif o in ("-a", "--algorithm"):
            algoname = a.lower()
        elif o in ("-m", "-x"):
            error("Deprecated options -x and -m. Run with -h for details")
        else:
            raise Exception("Bad arg: " + o)
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global limit
    global entity_buffer
    global input_filetype_ext
    global output_filetype_ext
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int(a)
                if limit < 0:
                    raise ValueError
                # Only allocate the buffer once the value is known to be valid
                entity_buffer = [None] * limit
            except ValueError:
                error("You must provide a non-negative "
                      "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg: " + o)
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global measures
    global supported_measures
    global main_freq_name
    global join_all_contrastive
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o in ("-m", "--measures"):
            try:
                measures = interpret_measures(a)
            except ValueError as message:
                error(str(message) + "\nargument must be a list separated by "
                      "\":\" and containing the names: "
                      + str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq_name = a
        elif o in ("-a", "--all"):
            join_all_contrastive = True

    if not main_freq_name:
        error("Option -o is mandatory")
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global limit
    global input_filetype_ext
    global output_filetype_ext
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int(a)
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a non-negative "
                      "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg: " + o)
def feat_value(self, entity, feat_name): r"""Return value for given feature name.""" if feat_name.startswith("@"): if feat_name == "@SURFACE": return tuple(w.surface for w in entity) if feat_name == "@LEMMA": return tuple(w.lemma for w in entity) if feat_name == "@POS": return tuple(w.pos for w in entity) error("Bad pseudo-feature name", feat_name=feat_name) return entity.get_feat_value(feat_name)
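# Hedged usage sketch: names starting with "@" are pseudo-features built from
# word attributes, everything else is a regular feature lookup. `sorter` and
# `entity` are hypothetical stand-ins.
key = sorter.feat_value(entity, "@LEMMA")  # e.g. ("kick", "the", "bucket")
score = sorter.feat_value(entity, "mle")   # delegates to get_feat_value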
def __init__(self):
    global algoname
    if algoname == "simple":
        self.handle_sentence = self.handle_sentence_simple  # Redundant, kept for clarity
    elif algoname == "complex":
        self.handle_sentence = self.handle_sentence_complex
    elif algoname == "aggressive":
        self.handle_sentence = self.handle_sentence_aggressive  # Redundant, kept for clarity
    else:
        # Format the whole message: applying "%" to only the second
        # concatenated literal would raise at run time instead of reporting
        # the bad algorithm name
        error("%s is not a valid algorithm\nYou must provide a valid "
              "algorithm (e.g. \"complex\", \"simple\")." % algoname)
def calculate_distances(distances_map, all_categories):
    """
    Generates a distances matrix from the distances map and the correspondence
    between nominal categories and their IDs. This function is called just
    after reading the data when a distances file is provided.

    @param distances_map A dictionary where the keys are strings of the form
    category1###SEPARATOR###category2 and the values are the distances between
    category1 and category2
    @param all_categories A dictionary where the keys are the string nominal
    category names and the values are the integer unique IDs of each category
    @return A symmetric matrix Nk x Nk, with the distance between categories
    represented in the cells. The rows and columns are indexed with the IDs
    from 0 to Nk-1, and the matrix contains 0.0 on the main diagonal. Values
    not specified in distances_map default to the maximum distance seen in the
    map. If no distance file is provided (distances_map is empty), the
    distance between any two different categories is 1.0.
    """
    Nk = len(all_categories.keys())
    distances_matrix = []
    max_distance = 0.0
    for k in range(Nk):
        distances_matrix.append(Nk * [-1.0])
    for key, distance in distances_map.items():
        cats = key.split("###SEPARATOR###")
        try:
            k1, k2 = map(lambda x: all_categories[x], cats)
        except KeyError:
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % cats)
            return None
        if k1 == k2:
            warn("Distance defined between category %s and itself. "
                 "Replacing by 0" % cats[0])
        distances_matrix[k1][k2] = distance
        distances_matrix[k2][k1] = distance
        if distance > max_distance:
            max_distance = distance
    if len(distances_map.keys()) == 0:
        max_distance = 1.0
    # Fill in the non-specified distances with the maximal value
    for k1 in range(Nk):
        distances_matrix[k1][k1] = 0.0  # Distance between categ and itself = 0
        for k2 in range(Nk):
            if distances_matrix[k1][k2] < 0.0:  # Not specified or negative
                distances_matrix[k1][k2] = max_distance
    return distances_matrix
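# Worked example for the function above (hypothetical categories): with IDs
# {"A": 0, "B": 1, "C": 2} and only the A-B distance given, the maximum seen
# distance (0.5) fills every unspecified off-diagonal cell.
m = calculate_distances({"A###SEPARATOR###B": 0.5}, {"A": 0, "B": 1, "C": 2})
# m == [[0.0, 0.5, 0.5],
#       [0.5, 0.0, 0.5],
#       [0.5, 0.5, 0.0]]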
def interpret_length(l, maxormin):
    """
    Transform the argument given to the -a or -i option into an integer, with
    error checks.

    @param l: A string passed as argument to -i or -a
    @param maxormin: A string indicating whether this is "maximum" or "minimum"
    @return: An integer corresponding to l
    """
    try:
        result = int(l)
        if result < 0:
            raise ValueError
        verbose("%s length: %d" % (maxormin, result))
        return result
    except ValueError:
        error("Argument of -i/-a must be a non-negative integer, got "
              + repr(l))
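# Usage sketch: both the -i and -a bounds go through the same validator.
min_len = interpret_length("2", "minimum")  # -> 2, logs "minimum length: 2"
max_len = interpret_length("7", "maximum")  # -> 7
interpret_length("-3", "minimum")           # rejected: calls error()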
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext
    ref_name = None

    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        # Raw strings avoid the invalid "\." escape in the regex literal
        gs_name = re.sub(r".*/", "", re.sub(r"\.xml", "", ref_name))

    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")
def create_patterns_file(ngram_range): """ Create an artificial list of MWE patterns in which all the parts of the words are wildcards. Such artificial patterns match every ngram of size n, which is exactly what we want to do with the option -n. This may seem a weird way to extract ngrams, but it allows a single transparent candidate extraction function, treat_sentence. @param ngram_range String argument of the -n option. """ global patterns, usage_string, shortest_pattern, longest_pattern result = interpret_ngram(ngram_range) if result: (shortest_pattern, longest_pattern) = result patterns.append( build_generic_pattern(shortest_pattern, longest_pattern)) else: error("Invalid argument for -n.")
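# Usage sketch, assuming the usual "min:max" string accepted by
# interpret_ngram (which is expected to return the pair unpacked above).
patterns = []
create_patterns_file("2:5")  # one generic pattern matching 2- to 5-grams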
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global surface_instead_lemmas
    global glue
    global base_attr
    global min_ngram
    global max_ngram
    global min_frequency
    global ngram_counts
    global selected_candidates
    global use_shelve
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-s", "--surface"):
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq"):
            min_frequency = int(a)
        elif o in ("-n", "--ngram"):
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)
        elif o in ("-S", "--shelve"):
            use_shelve = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
def handle_meta(self, meta, info={}): """ Reads the `corpus_size` meta header and initializes a global counter dictionary with zero for each corpus. This dict will contain the total number of candidate frequencies summed up, as in the csmwe original formulation. @param meta The `Meta` header that is being read from the XML file. """ global totals_dict, main_freq_name main_freq_valid = False for corpus_size in meta.corpus_sizes: totals_dict[corpus_size.name] = 0 if corpus_size.name == main_freq_name: main_freq_valid = True if not main_freq_valid: error( "main frequency must be a valid freq. name\nPossible values: " + str(totals_dict.keys()))
def handle_candidate(self, candidate, info={}): """For each candidate and for each `CorpusSize` read from the `Meta` header, generates four features that correspond to the Association Measures described above. @param candidate The `Candidate` that is being read from the XML file. """ global corpussize_dict, main_freq joint_freq = {} singleword_freq = {} backed_off = False # Convert all these integers to floats... for freq in candidate.freqs: joint_freq[freq.name] = (float(abs(freq.value))) singleword_freq[freq.name] = [] if freq.value < 0: backed_off = True for word in candidate: for freq in word.freqs: singleword_freq[freq.name].append(abs(float(freq.value))) # Little trick: negative counts indicate backed-off counts if freq.value < 0: backed_off = True for freq in candidate.freqs: corpus_name = freq.name if not backed_off and corpus_name == "backoff": N = corpussize_dict[main_freq] else: N = corpussize_dict[corpus_name] try: feats = calculate_ams(joint_freq[corpus_name], singleword_freq[corpus_name], N, corpus_name) for feat in feats: candidate.add_feat(feat) except Exception: error( "This should never be printed. The end of the world is here" ) self.chain.handle_candidate(candidate, info)
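# The sign trick from the comment above, in isolation (illustrative value):
# a negative count means "this is a backed-off count", so the magnitude is
# kept and the back-off flag is raised.
value = -7
backed_off = value < 0    # True
freq = float(abs(value))  # 7.0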
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global limit
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o in ("-n", "--number"):
            try:
                limit = int(a)
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a non-negative integer value as "
                      "argument of -n option.")
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global input_patterns
    global input_filetype_ext
    global output_filetype_ext
    global match_distance
    global non_overlapping
    global id_order
    global annotate
    global only_the_matching_subpart

    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--input-from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-p", "--patterns"):
            input_patterns = filetype.parse_entities([a])
        elif o in ("-d", "--match-distance"):
            match_distance = a
        elif o in ("-N", "--non-overlapping"):
            non_overlapping = True
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--annotate":
            annotate = True
        elif o == "--only-matching":
            only_the_matching_subpart = True
        else:
            raise Exception("Bad arg: " + o)

    if input_patterns is None:
        util.error("No patterns provided. Option --patterns is mandatory!")
    if only_the_matching_subpart and annotate:
        util.warn("Switch --only-matching disables --annotate")
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script).

    @param prefix The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    assert prefix.endswith(".info")
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        error("Error opening the index.\nTry again with another index filename")
def calculate_and_print(annotations, Ni, Nc, Nk, categ_names):
    """
    Given the set of annotations read from the files, calculate the agreement
    coefficients and print them in a nice way.

    @param annotations The list of annotations containing one row per item,
    one column per rater, and the nominal categories in the cells
    @param Ni The total number of items I in the data
    @param Nc The total number of raters C in the data
    @param Nk The total number of categories K in the data
    @param categ_names The names of the categories used to annotate, sorted by
    their IDs.
    """
    global calculate_pairwise
    global calculate_confusion
    if Ni != 0 and Nc != 0 and Nk != 0:  # empty file
        if calculate_pairwise:
            pairwise_map = compute_pairwise_all(annotations, Ni, Nc, Nk)
            for pair in pairwise_map.keys():
                print("\nAgreement for pair " + pair)
                (a0, S, pi, kappa, wkappa) = pairwise_map[pair]
                print("ao = %f, S = %f, pi = %f, (Cohen's) kappa = %f, "
                      "weighted kappa = %f" % (a0, S, pi, kappa, wkappa))
            print_matrix_kappa(pairwise_map, Nc)
        # The "%" formatting must happen inside the print() call, not on its
        # return value
        print("\nNc = %(Nc)d raters\nNi = %(Ni)d items\nNk = %(Nk)d "
              "categories\nNc x Ni = %(j)d judgements"
              % {"Nc": Nc, "Ni": Ni, "Nk": Nk, "j": Ni * Nc})
        coeffs = compute_multi(annotations, Ni, Nc, Nk)
        print("\nOverall agreement coefficients for all annotators:")
        print("multi-ao = %f\nmulti-pi (Fleiss' kappa) = %f\nmulti-kappa = %f\n"
              % coeffs)
        coeffs_weighted = compute_weighted_multi(annotations, Ni, Nc, Nk)
        print("Weighted agreement coefficients for all annotators:")
        print("alpha = %f\nalpha-kappa = %f\n" % coeffs_weighted)
        if calculate_confusion:
            confusion, counters = compute_confusion(annotations, Nc)
            print_matrix_confusion(confusion, categ_names, counters, Ni, Nc, Nk)
    else:
        error("You probably provided an empty file")
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global measures
    global supported_measures
    global main_freq
    global not_normalize_mle
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-m", "--measures"):
            try:
                measures = interpret_measures(a)
            except ValueError as message:
                error(str(message) + "\nargument must be a list separated by "
                      "\":\" and containing the names: "
                      + str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq = a
        elif o in ("-u", "--unnorm-mle"):
            not_normalize_mle = True
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
def handle_candidate(self, candidate, info={}): """ For each candidate and for each `CorpusSize` read from the `Meta` header, generates features that correspond to the Contrastive Measures described above. @param candidate The `Candidate` that is being read from the XML file. """ global corpussize_dict global totals_dict global main_freq_name # get the original corpus freq, store the others in contrastive corpus dict # We use plus one smoothing to avoid dealing with zero freqs contrast_freqs = {} if join_all_contrastive: contrast_freqs["all"] = 1 main_freq = None for freq in candidate.freqs: if freq.name == main_freq_name: main_freq = float(freq.value) + 1 elif join_all_contrastive: contrast_freqs["all"] += float(freq.value) else: contrast_freqs[freq.name] = float(freq.value) + 1 for contrast_name in contrast_freqs.keys(): try: feats = calculate_indiv(corpussize_dict[main_freq_name], corpussize_dict[contrast_name], main_freq, contrast_freqs[contrast_name], totals_dict[contrast_name], contrast_name) for feat in feats: candidate.add_feat(feat) except Exception: error("Error in calculating the measures.") self.chain.handle_candidate(candidate, info)
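# Numeric sketch of the plus-one smoothing described above (made-up counts):
# a candidate seen 4 times in the original corpus and 0 times in a
# contrastive corpus gets frequencies 5.0 and 1.0, so the contrastive
# measures never divide by a zero frequency.
main_freq = float(4) + 1                 # -> 5.0
contrast_freqs = {"news": float(0) + 1}  # -> 1.0; "news" is hypothetical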
def handle_meta(self, meta, info={}):
    """Treats the meta information of the file. Besides printing the meta
    header out, it also keeps track of all the meta-features. The list of
    `all_feats` will be used in order to verify that all key features have a
    valid meta-feature. This is important because we need to determine the
    correct type of the feature value, since it might influence sorting order
    (e.g. integers 1 < 2 < 10 but strings "1" < "10" < "2")

    @param meta The `Meta` header that is being read from the XML file.
    """
    global all_feats, usage_string, feat_to_order
    for meta_feat in meta.meta_feats:
        if meta_feat.feat_type in ("integer", "real"):
            all_feats.append(meta_feat.name)
    tp_classes_ok = False
    for meta_tp in meta.meta_tpclasses:
        if meta_tp.feat_type == "{True,False}":
            tp_classes_ok = True
            feat_to_order[meta_tp.name] = {}
            for feat_name in all_feats:
                feat_to_order[meta_tp.name][feat_name] = []
    if not tp_classes_ok:
        error("You must define a boolean TP class")
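# The sorting pitfall mentioned in the docstring, verified: numeric types
# sort by value, strings sort lexicographically.
print(sorted([1, 2, 10]))        # [1, 2, 10]
print(sorted(["1", "2", "10"]))  # ['1', '10', '2']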
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global feat_list
    global ascending
    global print_precs
    # Without this declaration, "--from" would only set a local variable
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("You should provide only one option, -a OR -d. Only the last "
             "one will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global used_attributes
    global basename
    global build_entry
    global use_text_format
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    used_attributes = ["lemma", "pos", "surface", "syn"]
    for (o, a) in opts:
        if o in ("-i", "--index"):
            basename = a
        elif o == "--from":
            input_filetype_ext = a
        elif o in ("-a", "--attributes"):
            used_attributes = a.split(":")
        elif o in ("-m", "--moses"):
            use_text_format = "moses"
        elif o in ("-c", "--conll"):
            use_text_format = "conll"
        elif o in ("-o", "--old"):
            indexlib.Index.use_c_indexer(False)

    if basename is None:
        error("You must provide a filename for the index.\n"
              "Option -i is mandatory.")
def treat_options_csv2xml(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global SEPCHAR
    global SURFACE_FLAG
    for (o, a) in opts:
        if o == "-F":
            # sets a new separator character to be used when splitting a line
            SEPCHAR = a
        elif o == "-s":
            # assigns the word to the "surface" attribute instead of the
            # default "lemma"
            SURFACE_FLAG = 1
        else:
            error("Option " + o + " is not a valid option")
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that handles the command line options of this script.

    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global surface_instead_lemmas
    global glue
    global corpus_from_index
    global base_attr
    global min_ngram
    global max_ngram
    global min_frequency

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-s", "--surface"):
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq"):
            min_frequency = int(a)
        elif o in ("-n", "--ngram"):
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-i", "--index"):
            corpus_from_index = True
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)