Пример #1
0
            print_cand_freq = True
        elif o in ("-i", "--index") :
            input_filetype_ext = "BinaryIndex"
            warn("Option -i is deprecated; use --from=BinaryIndex")
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--from" :
            input_filetype_ext = a
        elif o == "--to" :
            output_filetype_ext = a
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error("Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1 :
        error("Exactly one option, -p or -n, must be provided")
    if "patterns" in mode:
        global patterns
        patterns = filetype.parse_entities([patterns_file])

################################################################################  
# MAIN SCRIPT

# Long option names handed to read_options, one per line.
longopts = [
    "from=",
    "to=",
    "patterns=",
    "ngram=",
    "index",
    "match-distance=",
    "non-overlapping",
    "freq",
    "ignore-pos",
    "surface",
    "source",
    "id-order=",
]
# treat_options is passed as the option-handling callback; whatever
# read_options returns is then used as the input argument of filetype.parse.
arg = read_options(
    "p:n:id:NfgsS",
    longopts,
    treat_options,
    -1,
    usage_string,
)
filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
Пример #2
0
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one"+\
             " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options, one per line.
longopts = [
    "from=",
    "feat=",
    "asc",
    "desc",
    "precs",
]
# treat_options is passed as the option-handling callback; the value
# returned by read_options becomes the input argument of filetype.parse.
args = read_options(
    "f:adp",
    longopts,
    treat_options,
    1,
    usage_string,
)
# Feed the input through a StatsCollectorHandler, then call print_stats().
filetype.parse(args, StatsCollectorHandler(), input_filetype_ext)
print_stats()
Пример #3
0
    for k1 in cache1.keys() :
        cache_out[ k1 ] = cache1[ k1 ]   
    # Update entries in cache_out if corresponding entry in cache_2 is newer
    for k2 in cache2.keys() :        
        ( freq2, date2 ) = cache2[ k2 ]
        ( freq_out, date_out ) = cache_out.get( k2, ( -1, None ) )
        if date_out is None :
            cache_out[ k2 ] = ( freq2, date2 )
        elif date2 < date_out :
            cache_out[ k2 ] = ( freq2, date2 )

################################################################################     
# MAIN SCRIPT

# This script takes no options of its own; read_options is called with the
# value 3 as its argument-count parameter and the result is indexed 0..2 below.
longopts = []
arg = read_options( "", longopts, treat_options_simplest, 3, usage_string )

verbose( "Opening files and checking consistency" )
# The context managers guarantee that all three descriptors are closed (and
# the output flushed) even when unpickling or combining raises. The files
# are opened in the same order and with the same modes as before.
# NOTE(security): cPickle.load can execute arbitrary code embedded in the
# file; only feed this script cache files that come from a trusted source.
with open( arg[ 0 ], "r" ) as cache1_desc :
    with open( arg[ 1 ], "r" ) as cache2_desc :
        with open( arg[ 2 ], "w" ) as cache_out_desc :
            cache1 = cPickle.load( cache1_desc )
            cache2 = cPickle.load( cache2_desc )
            cache_out = {}
            verbose( "Combining cache files..." )
            combine_caches( cache1, cache2, cache_out )
            verbose( "Writing new cache file..." )
            cPickle.dump( cache_out, cache_out_desc )
# Report the size of both inputs and of the merged result.
verbose( "{c} had {n} entries".format(c=arg[ 0 ], n=len(cache1)) )
verbose( "{c} had {n} entries".format(c=arg[ 1 ], n=len(cache2)) )
verbose( "Result has {n} entries".format(n=len(cache_out)) )
Пример #4
0
    global limit
    global entity_buffer
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int(a)
                entity_buffer = [None] * limit
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a positive " + \
                      "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

# Parse the command line; treat_options is passed as the option callback.
args = read_options(
    "n:",
    ["from=", "to=", "number="],
    treat_options,
    -1,
    usage_string,
)
# `limit` is read here, after read_options has returned, and handed to the
# TailPrinterHandler constructor.
filetype.parse(
    args,
    TailPrinterHandler(limit),
    input_filetype_ext,
)
Пример #5
0
    """
    global combination
    global supported_combination
    global main_freq

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-c", "--combination"):
            try:
                combination = []
                combination = interpret_combinations(a)
            except ValueError as message:
                print >> sys.stderr, message
                print >> sys.stderr, "ERROR: argument must be list separated"+ \
                                     "by \":\" and containing the names: "+\
                                     str( supported_combination )
                usage(usage_string)
                sys.exit(2)
        elif o in ("-o", "--original"):
            main_freq = a


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "combination=",
    "original=",
]
# treat_options is passed as the option-handling callback.
args = read_options(
    "c:o:",
    longopts,
    treat_options,
    -1,
    usage_string,
)

# No file-type hint is passed here: filetype.parse gets only the input
# argument and the handler.
filetype.parse(args, FreqCombinerHandler())
Пример #6
0
    treat_options_simplest( opts, arg, n_arg, usage_string )        

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-l","--lemmas" ) :
            lower_attr = "lemma"
        elif o in ("-a", "--algorithm"):
            algoname = a.lower()
        elif o in ("-m", "-x"):
        	error( "Deprecated options -x and -m. Run with -h for details" )
        else:
            raise Exception("Bad arg: " + o)

 
################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "from=",
    "to=",
    "algorithm=",
    "lemmas",
]
args = read_options("a:xml", longopts, treat_options, 1, usage_string)

# The vocabulary pass is skipped only when the algorithm name is "simple".
if algoname != "simple":
    verbose("Pass 1: Reading vocabulary from file... please wait")
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

# The lowercasing pass always runs, over the same input.
verbose("Pass 2: Lowercasing the words in the file")
filetype.parse(args, LowercaserHandler(), input_filetype_ext)
            if algoname == "simple" :  # Redundant, kept for clarity
                sent_handler = LowercaserHandler.handle_sentence_simple
            elif algoname == "complex" :
                sent_handler = LowercaserHandler.handle_sentence_complex
            elif algoname == "aggressive" :  # Redundant, kept for clarity
                sent_handler = LowercaserHandler.handle_sentence_aggressive
            else :
                ctxinfo.error("Bad algorithm name `{name}`", name=algoname)

        elif o == "-m":
            ctxinfo.error("Deprecated option. Use --from=Moses instead" )
        elif o == "-x":
            ctxinfo.error("Deprecated option. " \
                    "Use --from=PlainCorpus instead")
        else:
            raise Exception("Bad arg: " + o)

 
################################################################################
# MAIN SCRIPT

# Long option names handed to util.read_options.
longopts = [
    "from=",
    "to=",
    "algorithm=",
    "lemmas",
]
args = util.read_options("a:xml", longopts, treat_options, 1, usage_string)

# The vocabulary pass is skipped only when the selected sentence handler is
# LowercaserHandler.handle_sentence_simple.
if sent_handler != LowercaserHandler.handle_sentence_simple:
    util.verbose("Pass 1: Reading vocabulary from file... please wait")
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

# The lowercasing pass always runs, over the same input.
util.verbose("Pass 2: Lowercasing the words in the file")
filetype.parse(args, LowercaserHandler(), input_filetype_ext)
Пример #8
0
        if o in ("-s", "--surface"):
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq"):
            min_frequency = int(a)
        elif o in ("-n", "--ngram"):
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-i", "--index"):
            corpus_from_index = True
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)


################################################################################

# Default settings, assigned before the command line is parsed. The option
# handler assigns the same names when the matching option is given
# (presumably as globals -- confirm against the full treat_options).
glue = scp_glue
base_attr = 'lemma'
corpus_from_index = False
min_frequency = 2
min_ngram, max_ngram = 2, 8

# Long option names handed to read_options.
longopts = [
    "surface",
    "glue=",
    "ngram=",
    "freq=",
    "index",
]
arg = read_options("sG:n:f:i", longopts, treat_options, 1, usage_string)
# The first element of the value returned by read_options is the corpus path.
corpus_path = arg[0]

main()
Пример #9
0
            use_text_format = "moses"
        elif o in ("-c", "--conll"):
            use_text_format = "conll"
        elif o in ("-o", "--old"):
            indexlib.Index.use_c_indexer(False)

    if basename is None:
        error("You must provide a filename for the index.\n"
              "Option -i is mandatory.")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "from=",
    "index=",
    "attributes=",
    "old",
    "moses",
    "conll",
]
arg = read_options("i:a:omc", longopts, treat_options, -1, usage_string)

# Partition the requested attributes: names containing '+' are treated as
# composite, all the others as simple.
simple_attrs = [a for a in used_attributes if '+' not in a]
composite_attrs = [a for a in used_attributes if '+' in a]

# Each '+'-separated component of a composite attribute is appended to
# simple_attrs unless it is already listed there.
for composite in composite_attrs:
    for attr in composite.split('+'):
        if attr not in simple_attrs:
            simple_attrs.append(attr)

# Create the index over the simple attributes, populate it from the input,
# then request one fused array per composite attribute.
index = indexlib.Index(basename, simple_attrs)
indexlib.populate_index(index, arg, input_filetype_ext)
for composite in composite_attrs:
    index.make_fused_array(composite.split('+'))
#index.build_suffix_arrays()
#index.save_main()
Пример #10
0
    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one" + " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options, one per line.
longopts = [
    "from=",
    "feat=",
    "asc",
    "desc",
    "precs",
]
# treat_options is passed as the option-handling callback; the value
# returned by read_options becomes the input argument of filetype.parse.
args = read_options(
    "f:adp",
    longopts,
    treat_options,
    1,
    usage_string,
)
# Feed the input through a StatsCollectorHandler, then call print_stats().
filetype.parse(args, StatsCollectorHandler(), input_filetype_ext)
print_stats()
Пример #11
0
            unknown = a
        if o in ("-s", "--separator") : 
            verbose( "Field separator: " + a )
            separator = a
            if len( separator ) > 1 :
                warn("Multi-char field separator!")
        if o in ("-d", "--distance") :
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances( a )
            if distances_matrix is None :
                warn("Error in distance matrix! Weighted coefficients will use 1.0 as default distance")
        if o in ("-c", "--confusion") :
            verbose( "Calculating confusion matrices" )
            calculate_confusion = True

################################################################################     
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [ "raters", "items", "pairwise", "separator=", "distance=",
              "confusion", "unknown=" ]
arg = read_options( "rips:d:cu:", longopts, treat_options, -1, usage_string )

if len( arg ) == 0 :
    # No positional arguments: read the annotations from standard input.
    (annotations, Ni, Nc, Nk, categ_names) = read_data( sys.stdin )
    calculate_and_print( annotations, Ni, Nc, Nk, categ_names )
else :
    # One independent read-and-report cycle per file named on the command line.
    for a in arg :
        # The context manager closes each file once it has been processed,
        # instead of leaking one open descriptor per input file.
        with open( a ) as input_file :
            (annotations, Ni, Nc, Nk, categ_names) = read_data( input_file )
            calculate_and_print( annotations, Ni, Nc, Nk, categ_names )
Пример #12
0
    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w or -i, must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "candidates-from=",
    "corpus-from=",
    "to=",
    "yahoo",
    "google",
    "index=",
    "ignore-pos",
    "surface",
    "old",
    "lower=",
    "upper=",
    "vars",
    "lang=",
    "no-joint",
    "bigrams",
    "univ=",
    "web1t=",
]
args = read_options(
    "ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    # Runs even when parsing raises: if web_freq is truthy, its cache is
    # flushed before the script ends.
    if web_freq:
        web_freq.flush_cache()  # VERY IMPORTANT!
Пример #13
0
        elif o in ("-p", "--patterns"):
            input_patterns = filetype.parse_entities([a])
        elif o in ("-d", "--match-distance") : 
            match_distance = a
        elif o in ("-N", "--non-overlapping") : 
            non_overlapping = True
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--annotate":
            annotate = True
        elif o == "--only-matching":
            only_the_matching_subpart = True
        else:
            raise Exception("Bad arg " + o)

    if input_patterns is None:
        util.error("No patterns provided. Option --patterns is mandatory!")

    if only_the_matching_subpart and annotate:
        util.warn("Switch --only-matching disables --annotate")


################################################################################
# MAIN SCRIPT

# Long option names handed to util.read_options.
# "non-overlapping" is declared WITHOUT a trailing "=": the option handler
# only sets a boolean for -N/--non-overlapping and never reads a value, and
# the short form "N" below takes no argument either. With getopt-style
# parsing (assumed for util.read_options -- confirm), the former
# "non-overlapping=" spelling made the long form swallow the next
# command-line token as its value.
longopts = ["input-from=", "to=", "patterns=",
        "match-distance=", "non-overlapping", "id-order=", "annotate",
        "only-matching"]
args = util.read_options("p:d:N", longopts, treat_options, -1, usage_string)
# Run the grep handler over the input.
filetype.parse(args, GrepHandler(), input_filetype_ext)
Пример #14
0
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
            
    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name :
        filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext)
        gs_name = re.sub( ".*/", "", re.sub( "\.xml", "", ref_name ) )
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs :
        error("You MUST provide a non-empty reference list!")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "input-from=",
    "reference-from=",
    "reference=",
    "ignore-pos",
    "case",
    "lemma-or-surface",
]
# treat_options is passed as the option-handling callback.
args = read_options("r:gcL", longopts, treat_options, -1, usage_string)

# Feed the input through an EvaluatorHandler.
filetype.parse(
    args,
    EvaluatorHandler(),
    input_filetype_ext,
)
Пример #15
0
    """

    global SEPCHAR
    global SURFACE_FLAG

    for (o, a) in opts:
        if o == "-F":
            # sets a new separator character to be used when spliting a line
            SEPCHAR = a
        elif o == "-s":
            # sets the assignment of a word to the "surface" item.
            # default is set to "lemma".
            SURFACE_FLAG = 1
        else:
            error("Option " + o + " is not a valid option")


################################################################################
# MAIN SCRIPT

if __name__ == '__main__':
    # Only short options are declared (-F with a value, -s as a switch);
    # the list of long options is empty.
    files = read_options(
        "F:s",
        [],
        treat_options_csv2xml,
        2,
        usage_string,
    )

    # Each input gets its own header / meta / candidates / footer sequence
    # written to standard output.
    for file in files:
        initialize(file)
        print(XML_HEADER % {
            "category": "candidates",
            "ns": "",
        })
        getMeta(file)
        getCand(file)
        print(XML_FOOTER % {
            "category": "candidates",
        })
Пример #16
0
    for ( o, a ) in opts:
        if o in ( "-m", "--measures" ) :
            try :
                measures = []
                measures = interpret_measures( a )
            except ValueError as message :
                error( str(message)+"\nargument must be list separated by "
                                    "\":\" and containing the names: "+
                       str( supported_measures ))
        elif o in ( "-o", "--original" ) :
            main_freq_name = a
        elif o in ( "-a", "--all" ) :
            join_all_contrastive = True
    
    if not main_freq_name :
        error( "Option -o is mandatory")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "measures=",
    "original=",
    "all",
]
args = read_options("m:o:a", longopts, treat_options, 1, usage_string)

# Every input is parsed twice, on its own: first with a
# TotalCalculatorHandler, then with a MeasureCalculatorHandler.
for a in args:
    verbose("Pass 1 for " + a)
    filetype.parse([a], TotalCalculatorHandler())
    # (original note: first calculate Nc for each contrastive corpus)
    verbose("Pass 2 for " + a)
    filetype.parse([a], MeasureCalculatorHandler())
Пример #17
0
        
        @param n_arg The number of arguments expected for this script.    
    """
    global simplify
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    simplify = simplify_ptb

    for (o, a) in opts:
        if o in ("-p", "--palavras"):
            simplify = simplify_palavras
        elif o in ("-G", "--genia"):
            simplify = simplify_genia
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = ["from=", "to=", "palavras", "genia"]
# The short form of --genia is spelled "G" to match the option handler,
# which tests for ("-G", "--genia"). The previous lowercase "g" let "-g"
# through to the handler, where it raised "Bad arg: -g", while "-G" was
# never declared in the short-option spec.
# NOTE(review): "x" and "F:" are declared here but have no branch in the
# visible part of the handler -- confirm whether they are still supported.
args = read_options("xF:pG", longopts, treat_options, -1, usage_string)
# Feed the input through a FilterHandler.
filetype.parse(args, FilterHandler(), input_filetype_ext)
Пример #18
0
                warn("Multi-char field separator!")
        if o in ("-d", "--distance"):
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances(a)
            if distances_matrix is None:
                warn(
                    "Error in distance matrix! Weighted coefficients will use 1.0 as default distance"
                )
        if o in ("-c", "--confusion"):
            verbose("Calculating confusion matrices")
            calculate_confusion = True


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "raters", "items", "pairwise", "separator=", "distance=", "confusion",
    "unknown="
]
arg = read_options("rips:d:cu:", longopts, treat_options, -1, usage_string)

if len(arg) == 0:
    # No positional arguments: read the annotations from standard input.
    (annotations, Ni, Nc, Nk, categ_names) = read_data(sys.stdin)
    calculate_and_print(annotations, Ni, Nc, Nk, categ_names)
else:
    # One independent read-and-report cycle per file named on the command line.
    for a in arg:
        # The context manager closes each file once it has been processed,
        # instead of leaking one open descriptor per input file.
        with open(a) as input_file:
            (annotations, Ni, Nc, Nk, categ_names) = read_data(input_file)
            calculate_and_print(annotations, Ni, Nc, Nk, categ_names)
Пример #19
0
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "corpus-from=",
    "candidates-from=",
    "to=",
    "candidates=",
    "detector=",
    "gaps=",
    "source",
    "filter",
    "filter-and-annot",
]
# treat_options is passed as the option-handling callback.
arg = read_options(
    "c:d:g:So:",
    longopts,
    treat_options,
    -1,
    usage_string,
)
# Feed the corpus through an AnnotatorHandler.
filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
Пример #20
0
            use_text_format = "moses"
        elif o in ("-c", "--conll"):
            use_text_format = "conll"            
        elif o in ("-o", "--old"):
            indexlib.Index.use_c_indexer(False)
            
    if basename is None:     
        error("You must provide a filename for the index.\n"
              "Option -i is mandatory.")

                            
################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "from=",
    "index=",
    "attributes=",
    "old",
    "moses",
    "conll",
]
arg = read_options("i:a:omc", longopts, treat_options, -1, usage_string)

# Partition the requested attributes: names containing '+' are treated as
# composite, all the others as simple.
simple_attrs = [a for a in used_attributes if '+' not in a]
composite_attrs = [a for a in used_attributes if '+' in a]

# Each '+'-separated component of a composite attribute is appended to
# simple_attrs unless it is already listed there.
for composite in composite_attrs:
    for attr in composite.split('+'):
        if attr not in simple_attrs:
            simple_attrs.append(attr)


# Create the index over the simple attributes, populate it from the input,
# then request one fused array per composite attribute.
index = indexlib.Index(basename, simple_attrs)
indexlib.populate_index(index, arg, input_filetype_ext)
for composite in composite_attrs:
    index.make_fused_array(composite.split('+'))
#index.build_suffix_arrays()
Пример #21
0
        elif o in ("-i", "--minlength"):
            minlength = interpret_length(a, "minimum")
        elif o in ("-a", "--maxlength"):
            maxlength = interpret_length(a, "maximum")
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "threshold=",
    "equals=",
    "patterns=",
    "reverse",
    "maxlength=",
    "minlength=",
    "min-mweoccurs=",
    "max-mweoccurs=",
    "from=",
    "to=",
]
# treat_options is passed as the option-handling callback.
args = read_options(
    "t:e:p:ra:i:",
    longopts,
    treat_options,
    -1,
    usage_string,
)
# Feed the input through a FilterHandler.
filetype.parse(args, FilterHandler(), input_filetype_ext)
Пример #22
0
    """
    global executable_w
    global executable_beg
    global executable_end
    global input_filetype_ext
    global output_filetype_ext

    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--begin":
            executable_beg = compile(a, "<cmdline:--begin>", "exec")
        elif o == "--end":
            executable_end = compile(a, "<cmdline:--end>", "exec")
        elif o in ("-w", "--each-word"):
            executable_w = compile(a, "<cmdline:--each-word>", "exec")
        else:
            raise Exception("Bad arg " + o)


################################################################################
# MAIN SCRIPT

# Long option names handed to util.read_options; only --each-word has a
# short form (-w).
longopts = [
    "from=",
    "to=",
    "begin=",
    "end=",
    "each-word=",
]
args = util.read_options(
    "w:",
    longopts,
    treat_options,
    -1,
    usage_string,
)
# Feed the input through a TransformHandler.
filetype.parse(args, TransformHandler(), input_filetype_ext)
Пример #23
0
            else:
                l_empty=0
                first_line=True
        else:
            process_tree_branch(l,phrase)
        l=unicode( rasp.readline(), "utf-8" )
    if l_empty != 1 and len(phrase) != 0 : #save last entry
        write_entry(n_line,map( lambda x: x[1], sorted( phrase.items() ) )) 
    if morphg_folder :
        os.chdir( work_path )

###############################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = ["morphg=", "moses"]
arg = read_options( "m:x", longopts, treat_options, -1, usage_string )

# The XML header is printed only when generate_text is falsy.
if not generate_text :
    print( XML_HEADER % { "category": "corpus", "ns": "" } )

if len( arg ) == 0 :
    # No positional arguments: convert standard input.
    transform_format( sys.stdin )
else :
    for a in arg :
        try:
            input_file = open(a, 'r')
        except IOError:
            error( 'Error opening file for reading.' )
            # Only reached if error() returns: skip this file rather than
            # going on with an undefined (or stale, already closed) handle.
            continue
        try:
            transform_format( input_file )
        finally:
            # Close the file even when the conversion raises.
            input_file.close()
               
Пример #24
0
        @param n_arg The number of arguments expected for this script.   
    """
    
    global SEPCHAR
    global SURFACE_FLAG
    
    for ( o , a ) in opts:
        if o == "-F":
            # sets a new separator character to be used when spliting a line
            SEPCHAR = a
        elif o == "-s":
            # sets the assignment of a word to the "surface" item.
            # default is set to "lemma".
            SURFACE_FLAG = 1
        else:
            error("Option " + o + " is not a valid option")

################################################################################
# MAIN SCRIPT

if __name__ == '__main__':
    # Only short options are declared (-F with a value, -s as a switch);
    # the list of long options is empty.
    files = read_options(
        "F:s",
        [],
        treat_options_csv2xml,
        2,
        usage_string,
    )

    # Each input gets its own header / meta / candidates / footer sequence
    # written to standard output.
    for file in files:
        initialize(file)
        print(XML_HEADER % {
            "category": "candidates",
            "ns": "",
        })
        getMeta(file)
        getCand(file)
        print(XML_FOOTER % {
            "category": "candidates",
        })
Пример #25
0
            suffix_array = index.load("lemma+pos")

    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w or -i, must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options.
longopts = [
    "candidates-from=",
    "corpus-from=",
    "to=",
    "yahoo",
    "google",
    "index=",
    "ignore-pos",
    "surface",
    "old",
    "lower=",
    "upper=",
    "vars",
    "lang=",
    "no-joint",
    "bigrams",
    "univ=",
    "web1t=",
]
args = read_options(
    "ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    # Runs even when parsing raises: if web_freq is truthy, its cache is
    # flushed before the script ends.
    if web_freq:
        web_freq.flush_cache()  # VERY IMPORTANT!
Пример #26
0
# Module-level EvitaInfo instance; EvitaPrinter uses it as `filetype_info`.
INFO = EvitaInfo()


class EvitaPrinter(filetype.common.AbstractPrinter):
    filetype_info = INFO
    valid_categories = ["candidates"]

    def handle_candidate(self, candidate, info={}):
        """For each `Candidate`, print the candidate ID, its POS pattern and the 
        list of occurrences one per line
        
        @param candidate The `Candidate` that is being read from the XML file.
        """
        pos = candidate.get_pos_pattern()
        pos = pos.replace(SEPARATOR, " ")
        self.add_string("candid=%(id)s pos=\"%(pos)s\"\n" % \
                {"id": candidate.id_number, "pos": pos})
        for form in candidate.occurs:
            form.set_all(lemma="", pos="")
            occur = form.to_string()
            occur = occur.replace(SEPARATOR, "")
            occur = occur.replace(WORD_SEPARATOR, " ")
            self.add_string(("\"%(occur)s\"\n" % {"occur": occur}).encode('utf-8'))
        self.add_string("\n")

################################################################################     
# MAIN SCRIPT

# No short or long options are declared for this script; -1 is passed as
# the expected positional-argument count (presumably "any number" --
# confirm against read_options).
args = read_options(
    "",
    [],
    treat_options_simplest,
    -1,
    usage_string,
)
# Parse every input with a printer created for the "candidates" category.
filetype.parse(
    args,
    EvitaPrinter("candidates"),
)
Пример #27
0
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error(
            "Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1:
        error("Exactly one option, -p or -n, must be provided")
    if "patterns" in mode:
        global patterns
        patterns = filetype.parse_entities([patterns_file])


################################################################################
# MAIN SCRIPT

# Long options accepted by this script; a trailing "=" marks an option
# that takes a value (presumably getopt conventions -- confirm against
# read_options).
longopts = [
    "from=",
    "to=",
    "patterns=",
    "ngram=",
    "index",
    "match-distance=",
    "non-overlapping",
    "freq",
    "ignore-pos",
    "surface",
    "source",
    "id-order=",
]
# -1 is passed as the expected positional-argument count (presumably
# "any number" -- confirm against read_options).
arg = read_options(
    "p:n:id:NfgsS", longopts, treat_options, -1, usage_string)
filetype.parse(
    arg, CandidatesGeneratorHandler(), input_filetype_ext)
Пример #28
0
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":            
            action_filter = True            
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames,
            c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)

        
################################################################################  
# MAIN SCRIPT


# Long options accepted by this script; a trailing "=" marks an option
# that takes a value (presumably getopt conventions -- confirm against
# read_options).
longopts = [
    "corpus-from=",
    "candidates-from=",
    "to=",
    "candidates=",
    "detector=",
    "gaps=",
    "source",
    "filter",
    "filter-and-annot",
]
# -1 is passed as the expected positional-argument count (presumably
# "any number" -- confirm against read_options).
arg = read_options(
    "c:d:g:So:", longopts, treat_options, -1, usage_string)
filetype.parse(
    arg, AnnotatorHandler(), filetype_corpus_ext)
Пример #29
0
        cache_out[k1] = cache1[k1]
    # Update entries in cache_out if corresponding entry in cache_2 is newer
    for k2 in cache2.keys():
        (freq2, date2) = cache2[k2]
        (freq_out, date_out) = cache_out.get(k2, (-1, None))
        if date_out is None:
            cache_out[k2] = (freq2, date2)
        elif date2 < date_out:
            cache_out[k2] = (freq2, date2)


################################################################################
# MAIN SCRIPT

# This script takes no options, only three positional arguments:
# the two cache files to merge and the output cache file, in that order.
longopts = []
arg = read_options("", longopts, treat_options_simplest, 3, usage_string)

verbose("Opening files and checking consistency")
# NOTE(review): unpickling can execute arbitrary code embedded in the
# stream; only merge cache files that come from a trusted source.
# All three files are opened up front, as before, so a bad path fails
# before any work is done. The `with` statement guarantees they are closed
# (and the output flushed to disk) even if loading or merging raises.
with open(arg[0], "r") as cache1_desc, \
        open(arg[1], "r") as cache2_desc, \
        open(arg[2], "w") as cache_out_desc:
    cache1 = cPickle.load(cache1_desc)
    cache2 = cPickle.load(cache2_desc)
    cache_out = {}
    verbose("Combining cache files...")
    combine_caches(cache1, cache2, cache_out)
    verbose("Writing new cache file...")
    cPickle.dump(cache_out, cache_out_desc)

# Summary of how many entries each input held and how many were written.
verbose("{c} had {n} entries".format(c=arg[0], n=len(cache1)))
verbose("{c} had {n} entries".format(c=arg[1], n=len(cache2)))
verbose("Result has {n} entries".format(n=len(cache_out)))
Пример #30
0
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global lemmapos
    global input_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )

    for ( o, a ) in opts:        
        if o in ("-s", "--surface") : 
            surface_instead_lemmas = True
        elif o in ("-p", "--lemmapos") : 
            lemmapos = True   
        elif o in ("-f", "--freq-source") : 
            freq_source = a
        elif o == "--from":
            input_filetype_ext = a                          
        else:
            raise Exception("Bad arg: " + o)

################################################################################     
# MAIN SCRIPT

# Long options accepted by this script; a trailing "=" marks an option
# that takes a value. "freq-source=" and "from=" must be separate entries:
# without the comma Python concatenates the adjacent literals into the
# single bogus option "freq-source=from=", and neither --freq-source nor
# --from is then recognised.
longopts = ["surface", "lemmapos", "freq-source=", "from="]
args = read_options("spf:", longopts, treat_options, -1, usage_string)
# NOTE(review): the visible part of treat_options assigns freq_source
# without listing it in its `global` declarations, so the value given with
# -f/--freq-source may never reach this call -- confirm.
handler = ft_ucs.UCSPrinter("candidates", freq_source=freq_source,
        lemmapos=lemmapos, surfaces=surface_instead_lemmas)
filetype.parse(args, handler, input_filetype_ext)
Пример #31
0
    for ( o, a ) in opts:
        if o in ("-s", "--surface") : 
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq") :
            min_frequency = int(a)
        elif o in ("-n", "--ngram") :
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-i", "--index") :
            corpus_from_index = True
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)

################################################################################

# Default settings. They are assigned before read_options runs, so that
# treat_options can replace them with command-line values (presumably via
# `global` declarations outside this view -- confirm).
base_attr = 'lemma'
corpus_from_index = False
glue = scp_glue
min_frequency = 2
min_ngram, max_ngram = 2, 8

# Long options accepted by this script; a trailing "=" marks an option
# that takes a value.
longopts = [
    "surface",
    "glue=",
    "ngram=",
    "freq=",
    "index",
]
# 1 is passed as the expected positional-argument count; that single
# argument is kept as the corpus path.
arg = read_options(
    "sG:n:f:i", longopts, treat_options, 1, usage_string)
corpus_path = arg[0]

main()
Пример #32
0
            verbose("Option REVERSE active")

        elif o in ("-i", "--minlength") :
            minlength = interpret_length( a, "minimum" )
        elif o in ("-a", "--maxlength") :
            maxlength = interpret_length( a, "maximum" )
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
            

################################################################################
# MAIN SCRIPT

# Long options accepted by this script; a trailing "=" marks an option
# that takes a value (presumably getopt conventions -- confirm against
# read_options).
longopts = [
    "threshold=",
    "equals=",
    "patterns=",
    "reverse",
    "maxlength=",
    "minlength=",
    "min-mweoccurs=",
    "max-mweoccurs=",
    "from=",
    "to=",
]
# -1 is passed as the expected positional-argument count (presumably
# "any number" -- confirm against read_options).
args = read_options(
    "t:e:p:ra:i:", longopts, treat_options, -1, usage_string)
filetype.parse(
    args, FilterHandler(), input_filetype_ext)
Пример #33
0
        if o in ("-m", "--measures"):
            try:
                measures = []
                measures = interpret_measures(a)
            except ValueError as message:
                error(
                    str(message) + "\nargument must be list separated by "
                    "\":\" and containing the names: " +
                    str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq_name = a
        elif o in ("-a", "--all"):
            join_all_contrastive = True

    if not main_freq_name:
        error("Option -o is mandatory")


################################################################################
# MAIN SCRIPT

# Long options accepted by this script; a trailing "=" marks an option
# that takes a value.
longopts = [
    "measures=",
    "original=",
    "all",
]
args = read_options(
    "m:o:a", longopts, treat_options, 1, usage_string)

# Every input file is parsed twice, each time on its own.
for input_name in args:
    # Pass 1 has to come first: per the original note, Nc must be
    # calculated for each contrastive corpus before the measures
    # (presumably the job of TotalCalculatorHandler -- confirm).
    verbose("Pass 1 for " + input_name)
    filetype.parse([input_name], TotalCalculatorHandler())
    # Pass 2 goes over the same file again with the measure calculator.
    verbose("Pass 2 for " + input_name)
    filetype.parse([input_name], MeasureCalculatorHandler())
Пример #34
0
                print("\t " + str(word.attrib))

        currentMWE = candidateId + ";" + ngram.find('freq').get("value") + ";"

        print()
        for word in ngram.xpath('./w'):
            print(word.attrib)
            currentMWE = currentMWE + str(word.attrib) + ";"
        if currentMWE.count(";") == 4:
            currentMWE = currentMWE + ";"
        validation = raw_input("Is this a MWE: ")
        print("\t You entered: ", validation)
        if validation == ".":
            validatedMWEfile.close()
            exit()
        currentMWE = currentMWE + validation
        validatedMWEfile.write(currentMWE + "\n")
        totalValidated = totalValidated + 1
        print("\t ---> Total validated:", str(totalValidated))

    validatedMWEfile.close()


################################################################################
# MAIN SCRIPT

# No options are declared; 2 is passed as the expected number of
# positional arguments: the candidates file and the output file.
args = read_options(
    "", [], treat_options_simplest, 2, usage_string)
candidates_filename, output_filename = args[0], args[1]
# The annotation routine takes no parameters; the two module-level names
# assigned above are set before it is called.
annotate_candidates()
Пример #35
0
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.
        
        @param usage_string The usage string for the current script.    
    """
    global attributes

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-a", "--attributes"):
            attributes = a.split(":")
            for attr in attributes:
                if attr not in WORD_ATTRIBUTES:
                    error("Unknown attribute '%s'!" % attr)

    if attributes is None:
        print >> sys.stderr, "The option -a <attributes> is mandatory."
        usage(usage_string)
        sys.exit(2)


################################################################################
# MAIN SCRIPT

# The long option must be spelled "attributes=" to match the
# "--attributes" name that treat_options tests for; the previous spelling
# ("atttibutes=") made the long form unusable. The trailing "=" marks an
# option that takes a value.
longopts = ["attributes="]
# -1 is passed as the expected positional-argument count (presumably
# "any number" -- confirm against read_options).
arg = read_options("a:", longopts, treat_options, -1, usage_string)
filetype.parse(arg, TxtGeneratorHandler())
Пример #36
0
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        gs_name = re.sub(".*/", "", re.sub("\.xml", "", ref_name))
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")


################################################################################
# MAIN SCRIPT

# Long options accepted by this script; a trailing "=" marks an option
# that takes a value (presumably getopt conventions -- confirm against
# read_options).
longopts = [
    "input-from=",
    "reference-from=",
    "reference=",
    "ignore-pos",
    "case",
    "lemma-or-surface",
]
# -1 is passed as the expected positional-argument count (presumably
# "any number" -- confirm against read_options).
args = read_options(
    "r:gcL", longopts, treat_options, -1, usage_string)

filetype.parse(
    args, EvaluatorHandler(), input_filetype_ext)