def open_index(prefix):
    """
        Open the index files (valid index created by the `index.py` script)
        and load the index metadata.

        On success this sets the module-level globals `index` (the `Index`
        object), `freq_name` (the index prefix with everything up to the last
        "/" removed) and `the_corpus_size` (the "corpus_size" entry of the
        index metadata). If an IOError or KeyError occurs while loading,
        the problem is reported through `error`.

        @param prefix The name of the index ".info" file. The ".info"
        extension is stripped to obtain the prefix given to `Index`.

        Raises AssertionError if `prefix` does not end with ".info".
    """
    global freq_name, the_corpus_size
    global index
    if not prefix.endswith(".info"):
        # Explicit raise rather than `assert`: asserts are stripped under
        # `python -O`, which would let the slice below silently chop the
        # last five characters of an arbitrary name. The exception type is
        # kept so existing callers see the same failure.
        raise AssertionError(
            "Index filename must end with '.info': " + repr(prefix))
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        # Drop leading directories: keep only what follows the last "/".
        freq_name = re.sub(r".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        error("Error opening the index.\nTry again with another index filename")
def open_index(prefix):
    """
        Open the index files (valid index created by the `index.py` script)
        and load the index metadata.

        On success this sets the module-level globals `index` (the `Index`
        object), `freq_name` (the index prefix with everything up to the last
        "/" removed) and `the_corpus_size` (the "corpus_size" entry of the
        index metadata). If an IOError or KeyError occurs while loading,
        the problem is reported through `error`.

        @param prefix The name of the index ".info" file. The ".info"
        extension is stripped to obtain the prefix given to `Index`.

        Raises AssertionError if `prefix` does not end with ".info".
    """
    global freq_name, the_corpus_size
    global index
    if not prefix.endswith(".info"):
        # Explicit raise rather than `assert`: asserts are stripped under
        # `python -O`, which would let the slice below silently chop the
        # last five characters of an arbitrary name. The exception type is
        # kept so existing callers see the same failure.
        raise AssertionError(
            "Index filename must end with '.info': " + repr(prefix))
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        # Drop leading directories: keep only what follows the last "/".
        freq_name = re.sub(r".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        error("Error opening the index.\nTry again with another index filename")
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        All four parameters are first forwarded to `treat_options_simplest`;
        then every (option, value) pair in `opts` is interpreted and the
        corresponding module-level configuration globals are set.

        @param opts The (option, value) pairs parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string Forwarded to `treat_options_simplest`.

        Raises Exception for an option this function does not recognise.
        Invalid option values are reported through `error`.
    """
    global get_freq_function, build_entry, web_freq
    global the_corpus_size, freq_name
    global low_limit, up_limit
    global count_vars
    global language
    global suffix_array
    global count_joint_frequency
    global count_bigrams
    global web1t_data_path
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext

    surface_flag = False
    ignorepos_flag = False
    # Names of the frequency sources requested; exactly one is expected.
    mode = []

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-i", "--index"):
            open_index(a)
            get_freq_function = get_freq_index
            mode.append("index")
        elif o in ("-y", "--yahoo"):
            # The Yahoo frequency source is no longer available.
            error("THIS OPTION IS DEPRECATED AS YAHOO SHUT DOWN THEIR FREE "
                  "SEARCH API")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-u", "--univ"):
            web_freq = GoogleFreqUniv(a)
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-T", "--web1t"):
            ignorepos_flag = True
            freq_name = "web1t"
            web1t_data_path = a
            the_corpus_size = int(read_file(web1t_data_path + "/1gms/total"))
            get_freq_function = get_freq_web1t
            mode.append("web1t")
        elif o in ("-s", "--surface"):
            surface_flag = True
        elif o in ("-g", "--ignore-pos"):
            ignorepos_flag = True
        elif o in ("--lower", "--upper"):
            # Only the int() conversion is guarded, so the "must be integer"
            # hint is attached to parse failures and not to range errors.
            try:
                limit = int(a)
            except ValueError as message:
                error(str(message) + "\nArgument of " + o +
                      " must be integer")
            else:
                # A limit of -1 means "not set yet" for the other bound.
                if limit < 0:
                    error("Argument of " + o + " must be positive")
                elif o == "--lower":
                    if up_limit == -1 or up_limit >= limit:
                        low_limit = limit
                    else:
                        error("Argument of --lower must not exceed "
                              "argument of --upper")
                else:
                    if low_limit == -1 or low_limit <= limit:
                        up_limit = limit
                    else:
                        error("Argument of --upper must not be below "
                              "argument of --lower")
        elif o in ("-a", "--vars"):
            count_vars = True
        elif o in ("-l", "--lang"):
            language = a
        elif o in ("-J", "--no-joint"):
            count_joint_frequency = False
        elif o in ("-B", "--bigrams"):
            count_bigrams = True
        elif o in ("-o", "--old"):
            Index.use_c_indexer(False)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if mode == ["index"]:
        # Index lookups: the entry format must match the loaded suffix array.
        if surface_flag and ignorepos_flag:
            build_entry = lambda surface, lemma, pos: surface
            suffix_array = index.load("surface")
        elif surface_flag:
            build_entry = lambda surface, lemma, pos: (
                surface + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("surface+pos")
        elif ignorepos_flag:
            build_entry = lambda surface, lemma, pos: lemma
            suffix_array = index.load("lemma")
        else:
            build_entry = lambda surface, lemma, pos: (
                lemma + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("lemma+pos")
    else:
        # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w, -T or -i, must be provided")
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        All four parameters are first forwarded to `treat_options_simplest`;
        then every (option, value) pair in `opts` is interpreted and the
        corresponding module-level configuration globals are set.

        @param opts The (option, value) pairs parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string Forwarded to `treat_options_simplest`.

        Raises Exception for an option this function does not recognise.
        Invalid option values are reported through `error`.
    """
    global get_freq_function, build_entry, web_freq
    global the_corpus_size, freq_name
    global low_limit, up_limit
    global count_vars
    global language
    global suffix_array
    global count_joint_frequency
    global count_bigrams
    global web1t_data_path
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext

    surface_flag = False
    ignorepos_flag = False
    # Names of the frequency sources requested; exactly one is expected.
    mode = []

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-i", "--index"):
            open_index(a)
            get_freq_function = get_freq_index
            mode.append("index")
        elif o in ("-y", "--yahoo"):
            # The Yahoo frequency source is no longer available.
            error("THIS OPTION IS DEPRECATED AS YAHOO SHUT DOWN THEIR FREE "
                  "SEARCH API")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-u", "--univ"):
            web_freq = GoogleFreqUniv(a)
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-T", "--web1t"):
            ignorepos_flag = True
            freq_name = "web1t"
            web1t_data_path = a
            the_corpus_size = int(read_file(web1t_data_path + "/1gms/total"))
            get_freq_function = get_freq_web1t
            mode.append("web1t")
        elif o in ("-s", "--surface"):
            surface_flag = True
        elif o in ("-g", "--ignore-pos"):
            ignorepos_flag = True
        elif o in ("--lower", "--upper"):
            # Only the int() conversion is guarded, so the "must be integer"
            # hint is attached to parse failures and not to range errors.
            try:
                limit = int(a)
            except ValueError as message:
                error(str(message) + "\nArgument of " + o +
                      " must be integer")
            else:
                # A limit of -1 means "not set yet" for the other bound.
                if limit < 0:
                    error("Argument of " + o + " must be positive")
                elif o == "--lower":
                    if up_limit == -1 or up_limit >= limit:
                        low_limit = limit
                    else:
                        error("Argument of --lower must not exceed "
                              "argument of --upper")
                else:
                    if low_limit == -1 or low_limit <= limit:
                        up_limit = limit
                    else:
                        error("Argument of --upper must not be below "
                              "argument of --lower")
        elif o in ("-a", "--vars"):
            count_vars = True
        elif o in ("-l", "--lang"):
            language = a
        elif o in ("-J", "--no-joint"):
            count_joint_frequency = False
        elif o in ("-B", "--bigrams"):
            count_bigrams = True
        elif o in ("-o", "--old"):
            Index.use_c_indexer(False)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if mode == ["index"]:
        # Index lookups: the entry format must match the loaded suffix array.
        if surface_flag and ignorepos_flag:
            build_entry = lambda surface, lemma, pos: surface
            suffix_array = index.load("surface")
        elif surface_flag:
            build_entry = lambda surface, lemma, pos: (
                surface + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("surface+pos")
        elif ignorepos_flag:
            build_entry = lambda surface, lemma, pos: lemma
            suffix_array = index.load("lemma")
        else:
            build_entry = lambda surface, lemma, pos: (
                lemma + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("lemma+pos")
    else:
        # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w, -T or -i, must be provided")