def print_candidates(self, chain):
    """Prints an XML file (mwetoolkit-candidates.dtd) from a temporary
    candidates file generated by the treat_sentence callback function.
    Repeated candidates are not printed several times: instead, each base
    form carries a joint frequency of the candidate in the corpus. Since
    the new version of the "count.py" script, this initial frequency is
    only printed if you explicitly ask for it through the -f option.

    @param chain: The handler chain that receives the generated candidates.
    """
    global print_cand_freq, print_source
    verbose("Outputting candidates file...")
    for ngram_basestring, info in self.all_entities.iteritems():
        cand = self.candidate_factory.make()
        cand.from_string(ngram_basestring)
        for corpus_name, (surface_dict, total_freq) in info.iteritems():
            if print_cand_freq:
                freq = Frequency(corpus_name, total_freq)
                cand.add_frequency(freq)
            for occur_string in surface_dict.keys():
                occur_form = Ngram(None, None)
                occur_form.from_string(occur_string)
                sources = surface_dict[occur_string]
                freq_value = len(sources)
                freq = Frequency(corpus_name, freq_value)
                occur_form.add_frequency(freq)
                if print_source:
                    occur_form.add_sources(sources)
                cand.add_occur(occur_form)
        chain.handle_candidate(cand, info)

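# A sketch of the data structure iterated by print_candidates above (the shape
# is inferred from the loop; the exact container is built elsewhere, by the
# treat_sentence callback, and may differ in detail):
#
#   self.all_entities = {
#       "<ngram base string>": {
#           "<corpus name>": ({"<surface form string>": [source_id, ...]},
#                             total_freq),
#       },
#   }
#
# For each base form, the per-surface frequency is simply the number of
# recorded sources, i.e. len(surface_dict[occur_string]).
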
def finish(self, info={}):
    """After we read all input, we can finally be sure about which lines
    need to be printed. Those correspond exactly to the unique lines added
    to the buffer.
    """
    global entity_buffer
    verbose("Output the unified ngrams...")
    for uniq_counter, (entity, info) in enumerate(entity_buffer.values()):
        #entity.id_number = uniq_counter
        if isinstance(entity, Candidate):
            # WARNING: This is sort of specific for the VERBS 2010 paper. This
            # whole script should actually be redefined and documented. But for
            # the moment it's useful and I have no time to be a good programmer
            # -Carlos
            freq_sum = {}
            for freq in entity.freqs:
                freq_entry = freq_sum.get(freq.name, 0)
                freq_entry += int(freq.value)
                freq_sum[freq.name] = freq_entry
            entity.freqs.clear()
            for (name, value) in freq_sum.items():
                entity.add_frequency(Frequency(name, value))
        elif isinstance(entity, Entry):
            pass
        elif isinstance(entity, Sentence):
            pass
        self.chain.handle(entity, info)
    self.chain.finish(info)

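# Toy illustration of the frequency merging performed in finish() above
# (corpus names and counts are made up): a Candidate carrying
# Frequency("bnc", 3), Frequency("bnc", 5) and Frequency("web", 2) ends up
# with exactly two entries after the merge, Frequency("bnc", 8) and
# Frequency("web", 2).
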
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a, None)
            if detector_class is None:
                error("Unknown detector name: " + a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            n_gaps = int(a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')

    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)

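# A sketch of the `opts` value this callback receives from the getopt-style
# parser (option/argument pairs; file names and values are illustrative only):
#
#   opts = [("-c", "candidates.xml"), ("-g", "1"), ("--to", "XML")]
#
# With these options, ContiguousLemmaDetector is kept as the detector class
# and is built with one allowed gap over the MWE list read from
# candidates.xml.
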
def after_file(self, fileobj, info={}):
    global corpus_size_f
    corpus_size_f = float(corpus_size)
    verbose("Selecting ngrams through LocalMaxs...")
    self.localmaxs()
    verbose("Outputting candidates file...")
    for ngram_key in selected_candidates:
        if selected_candidates[ngram_key] \
                and ngram_counts[ngram_key] >= min_frequency:
            self.dump_ngram(ngram_key, None)
    self.chain.after_file(fileobj, info)

def transform_format(rasp):
    """Reads an input file and converts it into mwetoolkit corpus XML format,
    printing the XML file to stdout.

    @param rasp The input, either a file or a pipe.
    """
    global morphg_folder
    global work_path

    if morphg_folder:
        os.chdir(morphg_folder)

    n_line = 0
    l_empty = 2
    first_line = True
    phrase = {}
    l = unicode(rasp.readline(), "utf-8")
    #pdb.set_trace()
    # THIS LOOP IS HORRIBLE. PLEASE REFACTOR ME, PLEEEEASE!!!! - CR 20140905
    while l != "":
        if l == "\n":
            l_empty += 1
            if l_empty == 1:
                sorted_phrase = map(lambda x: x[1], sorted(phrase.items()))
                write_entry(n_line, sorted_phrase)
                phrase = {}
                if n_line % 100 == 0:
                    verbose("Processing sentence number %d" % n_line)
                n_line += 1
            first_line = True
            l = unicode(rasp.readline(), "utf-8")
            continue
        # Sentences that are too long are not parsed, because of the -w word
        # limit passed to the parser
        elif l.startswith("(X "):
            while l != "\n":
                l = unicode(rasp.readline(), "utf-8")
            continue

        if first_line:
            if l_empty >= 1:
                l_empty = 0
                process_line(l, phrase)
                #pdb.set_trace()
                first_line = False
                l = unicode(rasp.readline(), "utf-8")  # ignore line
            else:
                l_empty = 0
                first_line = True
        else:
            process_tree_branch(l, phrase)
        l = unicode(rasp.readline(), "utf-8")

    if l_empty != 1 and len(phrase) != 0:  # save last entry
        write_entry(n_line, map(lambda x: x[1], sorted(phrase.items())))

    if morphg_folder:
        os.chdir(work_path)

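# Rough shape of the parser output that the loop above expects (reconstructed
# from the calls it makes, so details may differ from real RASP output):
# sentences are separated by blank lines; the first line of each block is
# handed to process_line(), every following line (one parse-tree branch) goes
# to process_tree_branch(), and a block starting with "(X " marks a sentence
# the parser skipped because of its -w word limit.
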
def read_data(f_data):
    """Reads the annotation data from a tab-separated file and generates a
    matrix with Ni rows, one per item to annotate, and Nc columns, one per
    rater. The content of the matrix are the categories from K assigned by
    coder c to item i. However, each category is converted to an integer
    unique ID from 0 to Nk-1, so that it is easier to sort the categories.
    Also returns the total number of items, raters and categories.

    @param f_data The input file from which the data is read
    @return A tuple containing, in the first position, the matrix with one
    subject per row, one rater per column, and the annotation category IDs in
    the cells as integers. The second, third and fourth fields of the tuple
    are the number of items Ni, of coders Nc and of categories Nk. The fifth
    field is a list containing the names of the categories sorted by their
    IDs (position 0 contains the name of the category identified by 0, and
    so on).
    """
    global first_rater
    global first_header
    global separator

    if first_header:
        f_data.readline()  # Ignores first row
    annotations = []
    all_categories = {}
    Ni = 0
    Nc = 0
    for line in f_data.readlines():
        if len(line.strip()) > 0:  # Ignore blank lines
            Ni = Ni + 1
            annot_row = line.strip().split(separator)[first_rater:]
            if Nc == 0:
                Nc = len(annot_row)
            elif Nc != len(annot_row):
                raise ValueError, "Row %d: the file must contain the same " \
                                  "number of fields in all rows" % Ni
            # improvement for cases where the file was Space-Tab separated
            clean_annot_row = []  # contains the annotation cleaned from spaces
            for annotation in annot_row:
                clean_annot = annotation.strip()  # Remove spurious spaces
                all_categories[clean_annot] = 1
                clean_annot_row.append(clean_annot)
            annotations.append(clean_annot_row)
    (annotations, categ_names) = categories_to_ids(annotations, all_categories)
    Nk = len(all_categories)
    verbose("\n%d items\n%d raters\n%d categories\n" % (Ni, Nc, Nk))
    return (annotations, Ni, Nc, Nk, categ_names)

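# Minimal usage sketch (the file name "annot.tsv" is hypothetical; assumes the
# module-level first_header / first_rater / separator flags were already set
# by the option handling):
#
#   f_data = open("annot.tsv")
#   (annotations, Ni, Nc, Nk, categ_names) = read_data(f_data)
#   f_data.close()
#   # annotations[i][c] is the integer ID of the category that rater c
#   # assigned to item i; categ_names[k] recovers the original label.
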
def interpret_length(l, maxormin):
    """Transforms the argument given to the -a or -i options into an integer,
    with error checks.

    @param l: A string passed as argument to -i or -a
    @param maxormin: A string indicating whether this is "maximum" or "minimum"
    @return: An integer corresponding to l
    """
    try:
        result = int(l)
        if result < 0:
            raise ValueError
        verbose("%s length: %d" % (maxormin, result))
        return result
    except ValueError:
        error("Argument must be a non-negative integer, got " + repr(l))

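# Behaviour of interpret_length in a nutshell: interpret_length("5", "minimum")
# returns 5 (and logs it in verbose mode), while interpret_length("-1", ...)
# or interpret_length("abc", ...) fall into the except branch and abort
# through error().
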
def open_index(prefix):
    """Open the index files (valid index created by the `index.py` script).

    @param prefix The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    assert prefix.endswith(".info")
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        #pdb.set_trace()
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        error("Error opening the index.\n"
              "Try again with another index filename")

def main():
    """Main function."""
    global corpus_size_f

    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(corpus_path)
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()
    corpus_size_f = float(corpus_size)
    localmaxs()

    verbose("Outputting candidates file...")
    print(XML_HEADER % {"category": "candidates", "ns": ""})
    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))

    id = 0
    for ngram in select:
        if (len(ngram) >= min_ngram and len(ngram) <= max_ngram
                and select[ngram] and ngram_counts[ngram] >= min_frequency):
            dump_ngram(ngram, id)
            id += 1
    print(XML_FOOTER % {"category": "candidates"})

def main(corpus_paths):
    """Main function."""
    global use_shelve, ngram_counts, selected_candidates

    # Dummy file initialization to avoid warnings in PyCharm
    ngram_counts_tmpfile = selected_candidates_tmpfile = None

    if use_shelve:
        verbose("Making temporary file...")
        (ngram_counts, ngram_counts_tmpfile) = make_shelve()
        (selected_candidates, selected_candidates_tmpfile) = make_shelve()

    verbose("Counting ngrams...")
    filetype.parse(corpus_paths, NGramCounterHandler(), input_filetype_ext)

    if use_shelve:
        verbose("Removing temporary files...")
        destroy_shelve(ngram_counts, ngram_counts_tmpfile)
        destroy_shelve(selected_candidates, selected_candidates_tmpfile)

        candidate.add_feat(Feature("n", len(candidate)))

        case_classes = {}
        #pdb.set_trace()
        has_hyphen = False
        for w in candidate:
            case_class = w.get_case_class()
            count_class = case_classes.get(case_class, 0)
            case_classes[case_class] = count_class + 1
            has_hyphen = has_hyphen or "-" in w.lemma
        argmax_case_class = max(zip(case_classes.values(),
                                    case_classes.keys()))[1]
        candidate.add_feat(Feature("capitalized", argmax_case_class))
        candidate.add_feat(Feature("hyphen", str(has_hyphen)))

        self.chain.handle_candidate(candidate, info)


################################################################################
# MAIN SCRIPT

args = read_options("", [], treat_options_simplest, 1, usage_string)

# Done in 2 passes, one to define the type of the feature and another to
# print the feature values for each candidate
verbose("1st pass : recover all POS patterns for meta feature")
# Will ignore meta information and simply recover all the possible patterns
filetype.parse(args, RecovererHandler())

# Second pass to print the metafeat header with all possible pattern values
verbose("2nd pass : add the features")
filetype.parse(args, FeatGeneratorHandler())

def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage string printed if the arguments are wrong.
    """
    global first_header
    global first_rater
    global calculate_pairwise
    global calculate_confusion
    global separator
    global distances_matrix
    global unknown

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--raters"):
            verbose("First row in file ignored -> considered as rater labels")
            first_header = True
        if o in ("-i", "--items"):
            verbose("First column in file ignored -> considered as item labels")
            first_rater = 1
        if o in ("-p", "--pairwise"):
            verbose("Computing pairwise coefficients")
            calculate_pairwise = True
        if o in ("-u", "--unknown"):
            verbose("Unknown value - TODO: implement: " + a)
            unknown = a
        if o in ("-s", "--separator"):
            verbose("Field separator: " + a)
            separator = a
            if len(separator) > 1:
                warn("Multi-char field separator!")
        if o in ("-d", "--distance"):
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances(a)
            if distances_matrix is None:
                warn("Error in distance matrix! Weighted coefficients will "
                     "use 1.0 as default distance")
        if o in ("-c", "--confusion"):
            verbose("Calculating confusion matrices")
            calculate_confusion = True

def get_freq_web1t(surfaces, lemmas, pos):
    """Gets the frequency (number of occurrences) of an ngram in Google's
    Web 1T 5-gram Corpus.
    """
    global build_entry, web1t_data_path

    length = len(surfaces)
    if length > 5:
        warn("Cannot count the frequency of an n-gram, n>5!")
        return 0

    search_term = ' '.join(map(build_entry, surfaces, lemmas, pos))

    # Find the file in which to look for the ngram.
    if length == 1:
        filename = web1t_data_path + "/1gms/vocab.gz"
    else:
        indexfile = web1t_data_path + "/%dgms/%dgm.idx" % (length, length)
        filenames = [x.split("\t") for x in read_file(indexfile).split("\n")]
        filename = None
        for (name, first) in filenames:
            # Assumes byte-value-based ordering!
            if first > search_term:
                break
            else:
                filename = name
        if filename is None:
            return 0
        filename = "%s/%dgms/%s" % (web1t_data_path, length, filename)

    verbose("WEB1T: Opening %s, looking for %s" % (filename, search_term))

    # This has been absurdly slow in Python.
    #file = gzip.open(filename, "rb")
    #
    #search_term += "\t"
    #freq = 0
    #
    #for line in file:
    #    if line.startswith(search_term):
    #        freq = int(line.split("\t")[1])
    #        break
    #
    #print >>sys.stderr, "buenito: %d" % freq
    #
    #file.close()

    file = subprocess.Popen(
        ["zgrep", "--", "^" + re.escape(search_term) + "\t", filename],
        stdout=subprocess.PIPE).stdout
    line = file.read()
    file.close()

    if line:
        freq = int(line.split("\t")[1])
    else:
        freq = 0
    verbose("freq =" + str(freq))
    return freq

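# Sketch of the lookup strategy above: each "<n>gms/<n>gm.idx" file is read as
# one "<data file>\t<first ngram in that file>" line per data file, sorted by
# raw byte value. The loop therefore keeps the last data file whose first
# entry is <= search_term, which is the only file where the ngram can occur
# (hence the "Assumes byte-value-based ordering!" caveat).
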
        suffix_array = index.load("lemma+pos")
    else:
        # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one of the options -u, -w or -i must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

longopts = ["candidates-from=", "corpus-from=", "to=", "yahoo", "google",
            "index=", "ignore-pos", "surface", "old", "lower=", "upper=",
            "vars", "lang=", "no-joint", "bigrams", "univ=", "web1t="]
args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1,
                    usage_string)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    if web_freq:
        web_freq.flush_cache()  # VERY IMPORTANT!

def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The options parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global thresh_source
    global thresh_value
    global equals_name
    global equals_value
    global reverse
    global minlength
    global maxlength
    global min_mweoccurs
    global max_mweoccurs
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-t", "--threshold"):
            threshold = interpret_threshold(a)
            if threshold:
                (thresh_source, thresh_value) = threshold
            else:
                error("The format of the -t argument must be <source>:"
                      "<value>\n<source> must be a valid corpus name and "
                      "<value> must be a non-negative integer")
        elif o in ("-e", "--equals"):
            equals = interpret_equals(a)
            if equals:
                (equals_name, equals_value) = equals
            else:
                error("The format of the -e argument must be <name>:"
                      "<value>\n<name> must be a valid feat name and "
                      "<value> must be a non-empty string")
        elif o in ("-p", "--patterns"):
            verbose("Reading patterns file")
            global patterns
            patterns = filetype.parse_entities([a])
        elif o in ("-r", "--reverse"):
            reverse = True
            verbose("Option REVERSE active")
        elif o in ("-i", "--minlength"):
            minlength = interpret_length(a, "minimum")
        elif o in ("-a", "--maxlength"):
            maxlength = interpret_length(a, "maximum")
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")

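# Argument formats accepted above, as described by the error messages (the
# corpus and feature names are illustrative): "-t bnc:50" keeps candidates
# whose frequency in the corpus named "bnc" is at least 50, and "-e pos:N_N"
# keeps candidates whose feature "pos" equals "N_N".
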
    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-l", "--lemmas"):
            lower_attr = "lemma"
        elif o in ("-a", "--algorithm"):
            algoname = a.lower()
        elif o in ("-m", "-x"):
            error("Deprecated options -x and -m. Run with -h for details")
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

longopts = ["from=", "to=", "algorithm=", "lemmas"]
args = read_options("a:xml", longopts, treat_options, 1, usage_string)

if algoname != "simple":
    verbose("Pass 1: Reading vocabulary from file... please wait")
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

verbose("Pass 2: Lowercasing the words in the file")
filetype.parse(args, LowercaserHandler(), input_filetype_ext)

    for (o, a) in opts:
        if o in ("-m", "--measures"):
            try:
                measures = []
                measures = interpret_measures(a)
            except ValueError as message:
                error(str(message) + "\nargument must be a list separated by "
                      "\":\" containing the names: " + str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq_name = a
        elif o in ("-a", "--all"):
            join_all_contrastive = True

    if not main_freq_name:
        error("Option -o is mandatory")


################################################################################
# MAIN SCRIPT

longopts = ["measures=", "original=", "all"]
args = read_options("m:o:a", longopts, treat_options, 1, usage_string)

for a in args:
    verbose("Pass 1 for " + a)
    # First calculate Nc for each contrastive corpus
    filetype.parse([a], TotalCalculatorHandler())
    verbose("Pass 2 for " + a)
    filetype.parse([a], MeasureCalculatorHandler())

if o in ("-m", "--measures"): try: measures = [] measures = interpret_measures(a) except ValueError as message: error( str(message) + "\nargument must be list separated by " "\":\" and containing the names: " + str(supported_measures)) elif o in ("-o", "--original"): main_freq_name = a elif o in ("-a", "--all"): join_all_contrastive = True if not main_freq_name: error("Option -o is mandatory") ################################################################################ # MAIN SCRIPT longopts = ["measures=", "original=", "all"] args = read_options("m:o:a", longopts, treat_options, 1, usage_string) for a in args: verbose("Pass 1 for " + a) filetype.parse([a], TotalCalculatorHandler()) # First calculate Nc for each contrastive corpus verbose("Pass 2 for " + a) filetype.parse([a], MeasureCalculatorHandler())
def before_file(self, fileobj, info={}):
    if not self.chain:
        self.chain = self.make_printer(info, output_filetype_ext)
    self.chain.before_file(fileobj, info)
    verbose("Annotating corpus with MWEs found in list")

        cache_out[k1] = cache1[k1]

    # Update entries in cache_out if corresponding entry in cache_2 is newer
    for k2 in cache2.keys():
        (freq2, date2) = cache2[k2]
        (freq_out, date_out) = cache_out.get(k2, (-1, None))
        if date_out is None:
            cache_out[k2] = (freq2, date2)
        elif date2 < date_out:
            cache_out[k2] = (freq2, date2)


################################################################################
# MAIN SCRIPT

longopts = []
arg = read_options("", longopts, treat_options_simplest, 3, usage_string)

verbose("Opening files and checking consistency")
cache1_desc = open(arg[0], "r")
cache2_desc = open(arg[1], "r")
cache_out_desc = open(arg[2], "w")
cache1 = cPickle.load(cache1_desc)
cache2 = cPickle.load(cache2_desc)
cache_out = {}

verbose("Combining cache files...")
combine_caches(cache1, cache2, cache_out)

verbose("Writing new cache file...")
cPickle.dump(cache_out, cache_out_desc)

verbose("{c} had {n} entries".format(c=arg[0], n=len(cache1)))
verbose("{c} had {n} entries".format(c=arg[1], n=len(cache2)))
verbose("Result has {n} entries".format(n=len(cache_out)))

if algoname == "simple" : # Redundant, kept for clarity sent_handler = LowercaserHandler.handle_sentence_simple elif algoname == "complex" : sent_handler = LowercaserHandler.handle_sentence_complex elif algoname == "aggressive" : # Redundant, kept for clarity sent_handler = LowercaserHandler.handle_sentence_aggressive else : ctxinfo.error("Bad algorithm name `{name}`", name=algoname) elif o == "-m": ctxinfo.error("Deprecated option. Use --from=Moses instead" ) elif o == "-x": ctxinfo.error("Deprecated option. " \ "Use --from=PlainCorpus instead") else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "algorithm=", "lemmas" ] args = util.read_options( "a:xml", longopts, treat_options, 1, usage_string ) if sent_handler != LowercaserHandler.handle_sentence_simple : util.verbose( "Pass 1: Reading vocabulary from file... please wait" ) filetype.parse(args, VocabReaderHandler(), input_filetype_ext) util.verbose( "Pass 2: Lowercasing the words in the file" ) filetype.parse(args, LowercaserHandler(), input_filetype_ext)
def _fallback_entity(self, entity, info={}):
    """For each candidate, verifies whether its number of occurrences in a
    given source corpus is greater than or equal to the threshold. If no
    source corpus was provided (thresh_source is None), then all the corpora
    will be considered when verifying the threshold constraint. A candidate
    is printed to stdout only if it occurs thresh_value times or more in the
    corpus named thresh_source.

    @param entity: The `Ngram` that is being read from the XML file.
    """
    global thresh_source
    global thresh_value
    global equals_name
    global equals_value
    global reverse
    global patterns
    global maxlength
    global minlength

    print_it = True
    ngram_to_print = entity

    # Threshold test
    if entity.freqs:
        for freq in entity.freqs:
            if thresh_source:
                if (thresh_source == freq.name
                        or thresh_source == freq.name + ".xml") and \
                        freq.value < thresh_value:
                    print_it = False
            else:
                if freq.value < thresh_value:
                    print_it = False

    # Equality test
    if print_it and equals_name:
        print_it = False
        for feat in entity.features:
            if feat.name == equals_name and feat.value == equals_value:
                print_it = True

    # NOTE: Different patterns may match the same ngram, with different
    # results, when the 'ignore' pattern attribute is involved. Currently,
    # we are only printing the first such match.
    if print_it and patterns:
        print_it = False
        words = entity
        for pattern in patterns:
            for (match_ngram, wordnums) in pattern.matches(words,
                    anchored_begin=True, anchored_end=True):
                print_it = True
                ngram_to_print = match_ngram
                break
            if print_it:
                break

    # Filter out elements that are too long or too short
    lenentity = len(entity)
    if lenentity < minlength or lenentity > maxlength:
        print_it = False
        verbose("Filtered out: %d tokens" % lenentity)

    # Filter out sentences with too few/too many MWE candidates
    if info["kind"] == "sentence":
        n = len(entity.mweoccurs)
        if not (min_mweoccurs <= n <= max_mweoccurs):
            print_it = False

    if reverse:
        print_it = not print_it

    if print_it:
        self.chain.handle(ngram_to_print, info)

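# Order of the filtering decisions above, on a hypothetical run with
# "-t bnc:50 -i 2 -a 5" (corpus name illustrative): an entity survives only
# if it passes the frequency threshold, the optional -e equality test, the
# optional pattern match, the length bounds (here 2..5 tokens) and, for
# sentences, the mweoccurs bounds; -r (reverse) then flips the final decision
# before anything is handed to the next handler in the chain.
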