Example No. 1
    def print_candidates(self, chain):
        """Prints a XML file (mwetoolkit-candidates.dtd) from a temporary 
        candidates file generated by the treat_sentence callback function. 
        Repeated candidates are not printed several times: instead, each base 
        form has a joint frequency of the candidate in the corpus. Since the
        new version of the "count.py" script, this initial frequency is only
        printed if you explicitely ask to do it through the -f option.

        @param filename: The file name of the corpus from which we generate the
        candidates.
        """
        global print_cand_freq, print_source
        verbose("Outputting candidates file...")
        for ngram_basestring, info in self.all_entities.iteritems():
            cand = self.candidate_factory.make()
            cand.from_string(ngram_basestring)
            for corpus_name, (surface_dict, total_freq) in info.iteritems():
                if print_cand_freq:
                    freq = Frequency(corpus_name, total_freq)
                    cand.add_frequency(freq)
                for occur_string in surface_dict.keys():
                    occur_form = Ngram(None, None)
                    occur_form.from_string(occur_string)
                    sources = surface_dict[occur_string]
                    freq_value = len(sources)
                    freq = Frequency(corpus_name, freq_value)
                    occur_form.add_frequency(freq)
                    if print_source:
                        occur_form.add_sources(sources)
                    cand.add_occur(occur_form)
            chain.handle_candidate(cand, info)
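A minimal sketch of the nested structure that this loop assumes for self.all_entities; the concrete strings below are illustrative, not taken from the toolkit:

all_entities = {
    "take_decision": {                     # ngram base string -> per-corpus info
        "europarl": (                      # corpus name -> (surface_dict, total_freq)
            {"took a decision": ["s12:w3", "s40:w7"],   # surface form -> source positions
             "takes decisions": ["s55:w1"]},
            3,                             # joint frequency of the base form
        ),
    },
}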
Example No. 2
    def print_candidates(self, chain):
        """Prints a XML file (mwetoolkit-candidates.dtd) from a temporary 
        candidates file generated by the treat_sentence callback function. 
        Repeated candidates are not printed several times: instead, each base 
        form has a joint frequency of the candidate in the corpus. Since the
        new version of the "count.py" script, this initial frequency is only
        printed if you explicitely ask to do it through the -f option.

        @param filename: The file name of the corpus from which we generate the
        candidates.
        """
        global print_cand_freq, print_source
        verbose("Outputting candidates file...")
        for ngram_basestring, info in self.all_entities.iteritems() :
            cand = self.candidate_factory.make()
            cand.from_string(ngram_basestring)
            for corpus_name, (surface_dict, total_freq) in info.iteritems():
                if print_cand_freq :
                   freq = Frequency( corpus_name, total_freq )
                   cand.add_frequency( freq )
                for occur_string in surface_dict.keys() :
                    occur_form = Ngram( None, None )
                    occur_form.from_string(occur_string)
                    sources = surface_dict[occur_string]
                    freq_value = len(sources)
                    freq = Frequency( corpus_name, freq_value )
                    occur_form.add_frequency( freq )
                    if print_source:
                        occur_form.add_sources(sources)
                    cand.add_occur( occur_form )
            chain.handle_candidate(cand, info)
Example No. 3
 def finish(self, info={}):
     """After we read all input, we can finally be sure about which lines
     need to be printed. Those correspond exactly to the unique lines added
     to the buffer.
     """
     global entity_buffer
     verbose("Output the unified ngrams...")
     for uniq_counter, (entity, info) in enumerate(entity_buffer.values()):
         #entity.id_number = uniq_counter
         if isinstance(entity, Candidate):
             # WARNING: This is sort of specific for the VERBS 2010 paper. This
             # whole script should actually be redefined and documented. But for
             # the moment it's useful and I have no time to be a good programmer
             # -Carlos
             freq_sum = {}
             for freq in entity.freqs:
                 freq_entry = freq_sum.get(freq.name, 0)
                 freq_entry += int(freq.value)
                 freq_sum[freq.name] = freq_entry
             entity.freqs.clear()
             for (name, value) in freq_sum.items():
                 entity.add_frequency(Frequency(name, value))
         elif isinstance(entity, Entry):
             pass
         elif isinstance(entity, Sentence):
             pass
         self.chain.handle(entity, info)
     self.chain.finish(info)
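The frequency-merging step above just sums values per frequency name; a standalone equivalent using plain (name, value) pairs instead of Frequency objects:

from collections import Counter

freqs = [("europarl", 2), ("bnc", 5), ("europarl", 3)]   # illustrative input
freq_sum = Counter()
for name, value in freqs:
    freq_sum[name] += int(value)
print(sorted(freq_sum.items()))   # [('bnc', 5), ('europarl', 5)]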
Example No. 4
 def finish(self, info={}):
     """After we read all input, we can finally be sure about which lines
     need to be printed. Those correspond exactly to the unique lines added
     to the buffer.
     """
     global entity_buffer
     verbose( "Output the unified ngrams..." )
     for uniq_counter, (entity, info) in enumerate(entity_buffer.values()):
         #entity.id_number = uniq_counter
         if isinstance( entity, Candidate ) :
             # WARNING: This is sort of specific for the VERBS 2010 paper. This
             # whole script should actually be redefined and documented. But for
             # the moment it's useful and I have no time to be a good programmer
             # -Carlos
             freq_sum = {}
             for freq in entity.freqs :
                 freq_entry = freq_sum.get( freq.name, 0 )
                 freq_entry += int( freq.value )
                 freq_sum[ freq.name ] = freq_entry
             entity.freqs.clear()
             for ( name, value ) in freq_sum.items() :
                 entity.add_frequency( Frequency( name, value ) )
         elif isinstance( entity, Entry ) :
             pass
         elif isinstance( entity, Sentence ) :
             pass          
         self.chain.handle(entity, info)
     self.chain.finish(info)
Example No. 5
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a,None)
            if detector_class is None :
                error("Unkown detector name: "+a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            n_gaps = int(a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":            
            action_filter = True            
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames,
            c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)
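For reference, the opts list this callback iterates over has the (option, argument) shape produced by getopt.getopt; the option strings below are inferred from the branches above and may not match the real script exactly:

import getopt

argv = ["-c", "cands.xml", "-g", "1", "--to", "PlainCandidates"]
opts, args = getopt.getopt(argv, "c:d:Sg:",
        ["candidates=", "detector=", "source", "gaps=",
         "corpus-from=", "candidates-from=", "to=",
         "filter", "filter-and-annot"])
print(opts)   # [('-c', 'cands.xml'), ('-g', '1'), ('--to', 'PlainCandidates')]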
Example No. 6
    def after_file(self, fileobj, info={}):
        global corpus_size_f
        corpus_size_f = float(corpus_size)
        verbose("Selecting ngrams through LocalMaxs...")
        self.localmaxs()
        verbose("Outputting candidates file...")

        for ngram_key in selected_candidates:
            if selected_candidates[ngram_key] and ngram_counts[ngram_key] >= min_frequency:
                self.dump_ngram(ngram_key, None)
        self.chain.after_file(fileobj, info)
Example No. 7
    def after_file(self, fileobj, info={}):
        global corpus_size_f
        corpus_size_f = float(corpus_size)
        verbose("Selecting ngrams through LocalMaxs...")
        self.localmaxs()
        verbose("Outputting candidates file...")

        for ngram_key in selected_candidates:
            if selected_candidates[
                    ngram_key] and ngram_counts[ngram_key] >= min_frequency:
                self.dump_ngram(ngram_key, None)
        self.chain.after_file(fileobj, info)
Example No. 8
def transform_format(rasp):
    """
        Reads an input file and converts it into mwetoolkit corpus XML format,
        printing the XML file to stdout.
    
        @param rasp The input: a file object or a piped stream.
    """
    global morphg_folder
    global work_path
    if morphg_folder:
        os.chdir(morphg_folder)
    n_line = 0
    l_empty = 2
    first_line = True
    phrase = {}
    l = unicode(rasp.readline(), "utf-8")
    #pdb.set_trace()
    # THIS LOOP IS HORRIBLE. PLEASE REFACTOR ME, PLEEEEASE!!!! - CR 20140905
    while l != "":
        if l == "\n":
            l_empty += 1
            if l_empty == 1:
                sorted_phrase = map(lambda x: x[1], sorted(phrase.items()))
                write_entry(n_line, sorted_phrase)
                phrase = {}
                if n_line % 100 == 0:
                    verbose("Processing sentence number %d" % n_line)
                n_line += 1
                first_line = True
            l = unicode(rasp.readline(), "utf-8")
            continue
        # too long sentences not parsed because -w word limit passed to parser
        elif l.startswith("(X "):
            while l != "\n":
                l = unicode(rasp.readline(), "utf-8")
            continue
        if first_line:
            if l_empty >= 1:
                l_empty = 0
                process_line(l, phrase)
                #pdb.set_trace()
                first_line = False
                l = unicode(rasp.readline(), "utf-8")  #ignore line
            else:
                l_empty = 0
                first_line = True
        else:
            process_tree_branch(l, phrase)
        l = unicode(rasp.readline(), "utf-8")
    if l_empty != 1 and len(phrase) != 0:  #save last entry
        write_entry(n_line, map(lambda x: x[1], sorted(phrase.items())))
    if morphg_folder:
        os.chdir(work_path)
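The sorted(phrase.items()) idiom used above collects the dict values in key (token position) order; a standalone illustration:

phrase = {3: "cat", 1: "The", 2: "black"}          # token position -> word
words_in_order = [word for _, word in sorted(phrase.items())]
print(words_in_order)   # ['The', 'black', 'cat']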
Example No. 9
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopt.
    @param arg The argument list parsed by getopt.
    @param n_arg The number of arguments expected for this script.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a, None)
            if detector_class is None:
                error("Unkown detector name: " + a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            n_gaps = int(a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)
Example No. 10
def transform_format(rasp):
    """
        Reads an input file and converts it into mwetoolkit corpus XML format,
        printing the XML file to stdout.
    
        @param rasp The input: a file object or a piped stream.
    """
    global morphg_folder
    global work_path
    if morphg_folder :
        os.chdir( morphg_folder )
    n_line=0
    l_empty=2
    first_line=True    
    phrase = {}
    l=unicode(rasp.readline(),"utf-8")
    #pdb.set_trace()
    # THIS LOOP IS HORRIBLE. PLEASE REFACTOR ME, PLEEEEASE!!!! - CR 20140905    
    while l != "":        
        if l=="\n":
            l_empty+=1
            if l_empty == 1:
                sorted_phrase = map( lambda x: x[1], sorted( phrase.items() ) )
                write_entry(n_line,sorted_phrase)
                phrase = {}
                if n_line % 100 == 0 :
                    verbose( "Processing sentence number %d" % n_line )
                n_line+=1
                first_line=True
            l=unicode( rasp.readline(), "utf-8" )
            continue
        # too long sentences not parsed because -w word limit passed to parser
        elif l.startswith( "(X " ) :
            while l != "\n" :
                l = unicode( rasp.readline(), "utf-8" )
            continue
        if first_line:
            if l_empty>=1:
                l_empty=0
                process_line(l,phrase)
                #pdb.set_trace()
                first_line=False
                l=unicode( rasp.readline(), "utf-8" ) #ignore line
            else:
                l_empty=0
                first_line=True
        else:
            process_tree_branch(l,phrase)
        l=unicode( rasp.readline(), "utf-8" )
    if l_empty != 1 and len(phrase) != 0 : #save last entry
        write_entry(n_line,map( lambda x: x[1], sorted( phrase.items() ) )) 
    if morphg_folder :
        os.chdir( work_path )
Example No. 11
def read_data( f_data ) :
    """
        Reads the annotation data from a tab-separated file and generates a
        matrix with Ni rows, one per item to annotate, and Nc columns, one per 
        rater. The content of the matrix is the category from K assigned
        by coder c to item i. However, each category is converted to a unique
        integer ID from 0 to Nk-1, so that it is easier to sort the categories.
        Also returns the total number of items, raters and categories.
        
        @param f_data The input file from which the data is read
        @return A tuple containing, in the first position, the matrix with one 
        subject per row, one rater per column, and the annotation category IDs 
        in the cells as integers. The second, third and fourth fields of the 
        tuple are the number of items Ni, of coders Nc and of categories Nk. The
        fifth field is a list containing the names of the categories sorted by
        their IDs (position 0 contains the name of category IDentified by 0, and
        so on).
    """
    global first_rater
    global first_header
    global separator
    
    if first_header :
        f_data.readline() # Ignores first row   
    
    annotations = []
    all_categories = {}
    Ni = 0
    Nc = 0
    for line in f_data.readlines() :
        if len(line.strip()) > 0 : # Ignore blank lines
            Ni = Ni + 1
            annot_row = line.strip().split( separator )[first_rater:]
            if Nc == 0 :
                Nc = len( annot_row )
            elif Nc != len( annot_row ) :
                raise ValueError, "Row %d: the file must contain the same \
                                   number of fields in all rows" % Ni
            # improvement for cases where file was Space-Tab separated                                   
            clean_annot_row = [] # contains the annotation cleaned from spaces
            for annotation in annot_row :
                clean_annot = annotation.strip() # Remove spurious spaces
                all_categories[ clean_annot ] = 1
                clean_annot_row.append( clean_annot )          
            annotations.append( clean_annot_row )
    (annotations,categ_names) = categories_to_ids( annotations, all_categories )
    Nk = len( all_categories )
    verbose( "\n%d items\n%d raters\n%d categories\n" % (Ni, Nc, Nk) )
    return ( annotations, Ni, Nc, Nk, categ_names ) 
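categories_to_ids is not shown in this example; a minimal sketch of what it is assumed to do (assign integer IDs to category labels and return the names ordered by ID), which may differ from the toolkit's real helper:

def categories_to_ids(annotations, all_categories):
    categ_names = sorted(all_categories.keys())
    categ_ids = dict((name, i) for i, name in enumerate(categ_names))
    id_matrix = [[categ_ids[c] for c in row] for row in annotations]
    return (id_matrix, categ_names)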
Example No. 12
def read_data(f_data):
    """
        Reads the annotation data from a tab-separated file and generates a
        matrix with Ni rows, one per item to annotate, and Nc columns, one per 
        rater. The content of the matrix is the category from K assigned
        by coder c to item i. However, each category is converted to a unique
        integer ID from 0 to Nk-1, so that it is easier to sort the categories.
        Also returns the total number of items, raters and categories.
        
        @param f_data The input file from which the data is read
        @return A tuple containing, in the first position, the matrix with one 
        subject per row, one rater per column, and the annotation category IDs 
        in the cells as integers. The second, third and fourth fields of the 
        tuple are the number of items Ni, of coders Nc and of categories Nk. The
        fifth field is a list containing the names of the categories sorted by
        their IDs (position 0 contains the name of category IDentified by 0, and
        so on).
    """
    global first_rater
    global first_header
    global separator

    if first_header:
        f_data.readline()  # Ignores first row

    annotations = []
    all_categories = {}
    Ni = 0
    Nc = 0
    for line in f_data.readlines():
        if len(line.strip()) > 0:  # Ignore blank lines
            Ni = Ni + 1
            annot_row = line.strip().split(separator)[first_rater:]
            if Nc == 0:
                Nc = len(annot_row)
            elif Nc != len(annot_row):
                raise ValueError, "Row %d: the file must contain the same \
                                   number of fields in all rows" % Ni
            # improvement for cases where file was Space-Tab separated
            clean_annot_row = []  # contains the annotation cleaned from spaces
            for annotation in annot_row:
                clean_annot = annotation.strip()  # Remove spurious spaces
                all_categories[clean_annot] = 1
                clean_annot_row.append(clean_annot)
            annotations.append(clean_annot_row)
    (annotations, categ_names) = categories_to_ids(annotations, all_categories)
    Nk = len(all_categories)
    verbose("\n%d items\n%d raters\n%d categories\n" % (Ni, Nc, Nk))
    return (annotations, Ni, Nc, Nk, categ_names)
Example No. 13
def interpret_length(l, maxormin):
    """
    Transform argument given to -a or -i options into integer + error checks.

    @param l: A string passed as argument to -i or -a
    @param maxormin: A string indicating whether this is "maximum" or "minimum"
    @return: An integer corresponding to l
    """
    try:
        result = int(l)
        if result < 0:
            raise ValueError
        verbose("%s length: %d" % (maxormin, result))
        return result
    except ValueError:
        error("Argument of must be non-negative integer, got " + repr(l))
Example No. 14
def interpret_length( l, maxormin ):
    """
    Transform argument given to -a or -i options into integer + error checks.

    @param l: A string passed as argument to -i or -a
    @param maxormin: A string indicating whether this is "maximum" or "minimum"
    @return: An integer corresponding to l
    """
    try :
        result = int( l )
        if result < 0:
            raise ValueError
        verbose( "%s length: %d" % (maxormin, result) )
        return result
    except ValueError:
        error("Argument of must be non-negative integer, got " + repr(l))
Example No. 15
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script). 
    @param prefix The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    assert prefix.endswith(".info")
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        #pdb.set_trace()
        the_corpus_size = index.metadata["corpus_size"]
    except IOError:
        error("Error opening the index.\nTry again with another index filename")
    except KeyError:
        error("Error opening the index.\nTry again with another index filename")
Example No. 16
def main():
    """
        Main function.
    """
    global corpus_size_f

    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(corpus_path)    
        parser = xml.sax.make_parser()
        parser.setContentHandler( CorpusXMLHandler( treat_sentence ) ) 
        parser.parse( input_file )
        input_file.close()

    corpus_size_f = float(corpus_size)

    localmaxs()


    verbose("Outputting candidates file...")
    print(XML_HEADER % { "category": "candidates", "ns": "" })
    

    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))

    id = 0

    for ngram in select:
        if (len(ngram) >= min_ngram and len(ngram) <= max_ngram and
            select[ngram] and ngram_counts[ngram] >= min_frequency):
                dump_ngram(ngram, id)
                id += 1

    print(XML_FOOTER % { "category": "candidates" })
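XML_HEADER and XML_FOOTER are toolkit templates not shown here; the %-formatting they rely on works as in this simplified stand-in (the real templates may also carry a DOCTYPE declaration):

XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>\n<%(category)s%(ns)s>'
XML_FOOTER = '</%(category)s>'

print(XML_HEADER % {"category": "candidates", "ns": ""})
print(XML_FOOTER % {"category": "candidates"})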
Example No. 17
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script). 
    @param prefix The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    assert prefix.endswith(".info")
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        #pdb.set_trace()
        the_corpus_size = index.metadata["corpus_size"]
    except IOError:
        error(
            "Error opening the index.\nTry again with another index filename")
    except KeyError:
        error(
            "Error opening the index.\nTry again with another index filename")
Example No. 18
def main():
    """
        Main function.
    """
    global corpus_size_f

    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        input_file = open(corpus_path)
        parser = xml.sax.make_parser()
        parser.setContentHandler(CorpusXMLHandler(treat_sentence))
        parser.parse(input_file)
        input_file.close()

    corpus_size_f = float(corpus_size)

    localmaxs()

    verbose("Outputting candidates file...")
    print(XML_HEADER % {"category": "candidates", "ns": ""})

    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))

    id = 0

    for ngram in select:
        if (len(ngram) >= min_ngram and len(ngram) <= max_ngram
                and select[ngram] and ngram_counts[ngram] >= min_frequency):
            dump_ngram(ngram, id)
            id += 1

    print(XML_FOOTER % {"category": "candidates"})
Example No. 19
def main(corpus_paths):
    """
        Main function.
    """
    global use_shelve, ngram_counts, selected_candidates
    # Dummy file initialization to avoid warnings in PyCharm
    ngram_counts_tmpfile = selected_candidates_tmpfile = None
    if use_shelve:
        verbose("Making temporary file...")
        (ngram_counts, ngram_counts_tmpfile) = make_shelve()
        (selected_candidates, selected_candidates_tmpfile) = make_shelve()

    verbose("Counting ngrams...")
    filetype.parse(corpus_paths, NGramCounterHandler(), input_filetype_ext)

    if use_shelve:
        verbose("Removing temporary files...")
        destroy_shelve(ngram_counts, ngram_counts_tmpfile)
        destroy_shelve(selected_candidates, selected_candidates_tmpfile)
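make_shelve and destroy_shelve are not shown in this example; a plausible minimal sketch using only the standard library (the toolkit's real helpers may differ):

import glob
import os
import shelve
import tempfile

def make_shelve():
    """Create a disk-backed dict in a fresh temporary directory."""
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, "ngrams.shelve")
    return (shelve.open(path, flag="n"), path)

def destroy_shelve(db, path):
    """Close the store and delete its backing file(s) and directory."""
    db.close()
    for fname in glob.glob(path + "*"):
        os.remove(fname)
    os.rmdir(os.path.dirname(path))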
Example No. 21
        candidate.add_feat( Feature( "n", len( candidate ) ) )
        case_classes = {}
        #pdb.set_trace()
        has_hyphen = False
        for w in candidate :
            case_class = w.get_case_class()
            count_class = case_classes.get( case_class, 0 )
            case_classes[ case_class ] = count_class + 1
            has_hyphen = has_hyphen or "-" in w.lemma
        argmax_case_class = max( zip( case_classes.values(), 
                                      case_classes.keys() ) )[ 1 ]
        candidate.add_feat( Feature( "capitalized", argmax_case_class ) )        
        candidate.add_feat( Feature( "hyphen", str( has_hyphen ) ) )
        self.chain.handle_candidate(candidate, info)


################################################################################
# MAIN SCRIPT

args = read_options( "", [], treat_options_simplest, 1, usage_string )

# Done in 2 passes, one to define the type of the feature and another to
# print the feature values for each candidate
verbose( "1st pass : recover all POS patterns for meta feature" )
# Will ignore meta information and simply recover all the possible patterns
filetype.parse(args, RecovererHandler())

# Second pass to print the metafeat header with all possible pattern values
verbose( "2nd pass : add the features" )
filetype.parse(args, FeatGeneratorHandler())
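The max(zip(values, keys)) idiom used above is a compact argmax over a dict; a standalone illustration:

case_classes = {"lowercase": 3, "UPPERCASE": 1, "Firstupper": 3}
argmax_case_class = max(zip(case_classes.values(), case_classes.keys()))[1]
print(argmax_case_class)   # 'lowercase' (ties are broken by the larger key string)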
Example No. 22
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopt.
        @param arg The argument list parsed by getopt.
        @param n_arg The number of arguments expected for this script.
        @param usage_string The usage string printed if the arguments are wrong.
    """
    
    global first_header
    global first_rater
    global calculate_pairwise
    global calculate_confusion
    global separator
    global distances_matrix
    global unknown
    
    treat_options_simplest( opts, arg, n_arg, usage_string )

    for ( o, a ) in opts:        
        if o in ("-r", "--raters") :
            verbose( "First row in file ignored -> considered as rater labels")
            first_header = True     
        if o in ("-i", "--items") : 
            verbose("First column in file ignored -> considered as item labels")        
            first_rater = 1 
        if o in ("-p", "--pairwise") : 
            verbose( "Computing pairwise coefficients" )
            calculate_pairwise = True
        if o in ("-u", "--unknown") : 
            verbose( "Unknown value - TODO: implement: " + a )
            unknown = a
        if o in ("-s", "--separator") : 
            verbose( "Field separator: " + a )
            separator = a
            if len( separator ) > 1 :
                warn("Multi-char field separator!")
        if o in ("-d", "--distance") :
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances( a )
            if distances_matrix is None :
                warn("Error in distance matrix! Weighted coefficients will use 1.0 as default distance")
        if o in ("-c", "--confusion") :
            verbose( "Calculating confusion matrices" )
            calculate_confusion = True
Example No. 23
def get_freq_web1t(surfaces, lemmas, pos):
    """
        Gets the frequency (number of occurrences) of an ngram in Google's
        Web 1T 5-gram Corpus.
    """

    global build_entry, web1t_data_path

    length = len(surfaces)

    if length > 5:
        warn("Cannot count the frequency of an n-gram, n>5!")
        return 0

    search_term = ' '.join(map(build_entry, surfaces, lemmas, pos))

    # Find the file in which to look for the ngram.
    if length == 1:
        filename = web1t_data_path + "/1gms/vocab.gz"
    else:
        indexfile = web1t_data_path + "/%dgms/%dgm.idx" % (length, length)
        filenames = [x.split("\t") for x in read_file(indexfile).split("\n")]
        filename = None
        for (name, first) in filenames:
            # Assumes byte-value-based ordering!
            if first > search_term:
                break
            else:
                filename = name

        if filename is None:
            return 0
        filename = "%s/%dgms/%s" % (web1t_data_path, length, filename)

    verbose("WEB1T: Opening %s, looking for %s" % (filename, search_term))

    # This has been absurdly slow in Python.
    #file = gzip.open(filename, "rb")
    #
    #search_term += "\t"
    #freq = 0
    #
    #for line in file:
    #    if line.startswith(search_term):
    #        freq = int(line.split("\t")[1])
    #        break
    #
    #print >>sys.stderr, "buenito: %d" % freq
    #
    #file.close()

    file = subprocess.Popen(
        ["zgrep", "--", "^" + re.escape(search_term) + "\t", filename],
        stdout=subprocess.PIPE).stdout
    line = file.read()
    file.close()
    if line:
        freq = int(line.split("\t")[1])
    else:
        freq = 0
    verbose("freq =" + str(freq))
    return freq
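The .idx scan above keeps the last file whose first entry is not greater than the search term; a standalone illustration of that selection with made-up index entries (byte-value ordering assumed, as noted in the comment):

filenames = [("2gm-0001.gz", "a a"), ("2gm-0002.gz", "m ."), ("2gm-0003.gz", "t x")]
search_term = "pick up"
filename = None
for (name, first) in filenames:
    if first > search_term:
        break
    filename = name
print(filename)   # '2gm-0002.gz' -> the file whose range would contain "pick up"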
Example No. 24
            suffix_array = index.load("lemma+pos")

    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w or -i, must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

longopts = ["candidates-from=", "corpus-from=", "to=",
            "yahoo", "google", "index=", "ignore-pos", "surface", "old",
            "lower=", "upper=", "vars", "lang=", "no-joint", "bigrams",
            "univ=", "web1t="]
args = read_options("ywi:gsoal:Jbu:T:", longopts,
        treat_options, -1, usage_string)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    if web_freq:
        web_freq.flush_cache()  # VERY IMPORTANT!
Example No. 25
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopt.

        @param arg The argument list parsed by getopt.

        @param n_arg The number of arguments expected for this script.
    """
    global thresh_source
    global thresh_value
    global equals_name
    global equals_value
    global reverse
    global minlength
    global maxlength
    global min_mweoccurs
    global max_mweoccurs
    global input_filetype_ext
    global output_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )    
    
    for ( o, a ) in opts:
        if o in ( "-t", "--threshold" ) : 
            threshold = interpret_threshold( a )
            if threshold :
                (thresh_source, thresh_value) = threshold
            else :
                error( "The format of the -t argument must be <source>:"
                       "<value>\n<source> must be a valid corpus name and "
                       "<value> must be a non-negative integer")
        elif o in ( "-e", "--equals" ) :
            equals = interpret_equals( a )
            if equals :
                ( equals_name, equals_value ) = equals
            else :
                error( "The format of the -e argument must be <name>:"
                       "<value>\n<name> must be a valid feat name and "
                       "<value> must be a non-empty string")

        elif o in ("-p", "--patterns") :
            verbose( "Reading patterns file" )
            global patterns
            patterns = filetype.parse_entities([a])
        elif o in ("-r", "--reverse") :
            reverse = True
            verbose("Option REVERSE active")

        elif o in ("-i", "--minlength") :
            minlength = interpret_length( a, "minimum" )
        elif o in ("-a", "--maxlength") :
            maxlength = interpret_length( a, "maximum" )
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
Example No. 26
    treat_options_simplest( opts, arg, n_arg, usage_string )        

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-l","--lemmas" ) :
            lower_attr = "lemma"
        elif o in ("-a", "--algorithm"):
            algoname = a.lower()
        elif o in ("-m", "-x"):
        	error( "Deprecated options -x and -m. Run with -h for details" )
        else:
            raise Exception("Bad arg: " + o)

 
################################################################################
# MAIN SCRIPT

longopts = [ "from=", "to=", "algorithm=", "lemmas" ]
args = read_options( "a:xml", longopts, treat_options, 1, usage_string )

if algoname != "simple" :
    verbose( "Pass 1: Reading vocabulary from file... please wait" )
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

verbose( "Pass 2: Lowercasing the words in the file" )
filetype.parse(args, LowercaserHandler(), input_filetype_ext)
Example No. 27
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopt.

        @param arg The argument list parsed by getopt.

        @param n_arg The number of arguments expected for this script.
    """
    global thresh_source
    global thresh_value
    global equals_name
    global equals_value
    global reverse
    global minlength
    global maxlength
    global min_mweoccurs
    global max_mweoccurs
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-t", "--threshold"):
            threshold = interpret_threshold(a)
            if threshold:
                (thresh_source, thresh_value) = threshold
            else:
                error("The format of the -t argument must be <source>:"
                      "<value>\n<source> must be a valid corpus name and "
                      "<value> must be a non-negative integer")
        elif o in ("-e", "--equals"):
            equals = interpret_equals(a)
            if equals:
                (equals_name, equals_value) = equals
            else:
                error("The format of the -e argument must be <name>:"
                      "<value>\n<name> must be a valid feat name and "
                      "<value> must be a non-empty string")

        elif o in ("-p", "--patterns"):
            verbose("Reading patterns file")
            global patterns
            patterns = filetype.parse_entities([a])
        elif o in ("-r", "--reverse"):
            reverse = True
            verbose("Option REVERSE active")

        elif o in ("-i", "--minlength"):
            minlength = interpret_length(a, "minimum")
        elif o in ("-a", "--maxlength"):
            maxlength = interpret_length(a, "maximum")
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
Example No. 28
    for ( o, a ) in opts:
        if o in ( "-m", "--measures" ) :
            try :
                measures = []
                measures = interpret_measures( a )
            except ValueError as message :
                error( str(message)+"\nargument must be a list separated by "
                                    "\":\" and containing the names: "+
                       str( supported_measures ))
        elif o in ( "-o", "--original" ) :
            main_freq_name = a
        elif o in ( "-a", "--all" ) :
            join_all_contrastive = True
    
    if not main_freq_name :
        error( "Option -o is mandatory")


################################################################################
# MAIN SCRIPT

longopts = ["measures=", "original=", "all"]
args = read_options( "m:o:a", longopts, treat_options, 1, usage_string )

for a in args :
    verbose( "Pass 1 for " + a )
    filetype.parse([a], TotalCalculatorHandler())
    # First calculate Nc for each contrastive corpus        
    verbose( "Pass 2 for " + a )    
    filetype.parse([a], MeasureCalculatorHandler())
Example No. 29
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopt.
        @param arg The argument list parsed by getopt.
        @param n_arg The number of arguments expected for this script.
        @param usage_string The usage string printed if the arguments are wrong.
    """

    global first_header
    global first_rater
    global calculate_pairwise
    global calculate_confusion
    global separator
    global distances_matrix
    global unknown

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--raters"):
            verbose("First row in file ignored -> considered as rater labels")
            first_header = True
        if o in ("-i", "--items"):
            verbose(
                "First column in file ignored -> considered as item labels")
            first_rater = 1
        if o in ("-p", "--pairwise"):
            verbose("Computing pairwise coefficients")
            calculate_pairwise = True
        if o in ("-u", "--unknown"):
            verbose("Unknown value - TODO: implement: " + a)
            unknown = a
        if o in ("-s", "--separator"):
            verbose("Field separator: " + a)
            separator = a
            if len(separator) > 1:
                warn("Multi-char field separator!")
        if o in ("-d", "--distance"):
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances(a)
            if distances_matrix is None:
                warn(
                    "Error in distance matrix! Weighted coefficients will use 1.0 as default distance"
                )
        if o in ("-c", "--confusion"):
            verbose("Calculating confusion matrices")
            calculate_confusion = True
Example No. 30
        if o in ("-m", "--measures"):
            try:
                measures = []
                measures = interpret_measures(a)
            except ValueError as message:
                error(
                    str(message) + "\nargument must be a list separated by "
                    "\":\" and containing the names: " +
                    str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq_name = a
        elif o in ("-a", "--all"):
            join_all_contrastive = True

    if not main_freq_name:
        error("Option -o is mandatory")


################################################################################
# MAIN SCRIPT

longopts = ["measures=", "original=", "all"]
args = read_options("m:o:a", longopts, treat_options, 1, usage_string)

for a in args:
    verbose("Pass 1 for " + a)
    filetype.parse([a], TotalCalculatorHandler())
    # First calculate Nc for each contrastive corpus
    verbose("Pass 2 for " + a)
    filetype.parse([a], MeasureCalculatorHandler())
Example No. 31
 def before_file(self, fileobj, info={}):
     if not self.chain:
         self.chain = self.make_printer(info, output_filetype_ext)
     self.chain.before_file(fileobj, info)
     verbose("Annotating corpus with MWEs found in list")
Example No. 33
        cache_out[ k1 ] = cache1[ k1 ]   
    # Update entries in cache_out if corresponding entry in cache_2 is newer
    for k2 in cache2.keys() :        
        ( freq2, date2 ) = cache2[ k2 ]
        ( freq_out, date_out ) = cache_out.get( k2, ( -1, None ) )
        if date_out is None :
            cache_out[ k2 ] = ( freq2, date2 )
        elif date2 < date_out :
            cache_out[ k2 ] = ( freq2, date2 )

################################################################################     
# MAIN SCRIPT

longopts = []
arg = read_options( "", longopts, treat_options_simplest, 3, usage_string )

verbose( "Opening files and checking consistency" )
cache1_desc = open( arg[ 0 ], "r" )
cache2_desc = open( arg[ 1 ], "r" )
cache_out_desc = open( arg[ 2 ], "w" )
cache1 = cPickle.load( cache1_desc )
cache2 = cPickle.load( cache2_desc )
cache_out = {}
verbose( "Combining cache files..." )
combine_caches( cache1, cache2, cache_out )
verbose( "Writing new cache file..." )
cPickle.dump( cache_out, cache_out_desc )
verbose( "{c} had {n} entries".format(c=arg[ 0 ], n=len(cache1)) )
verbose( "{c} had {n} entries".format(c=arg[ 1 ], n=len(cache2)) )
verbose( "Result has {n} entries".format(n=len(cache_out)) )
Example No. 34
    # Update entries in cache_out if corresponding entry in cache_2 is newer
    for k2 in cache2.keys():
        (freq2, date2) = cache2[k2]
        (freq_out, date_out) = cache_out.get(k2, (-1, None))
        if date_out is None:
            cache_out[k2] = (freq2, date2)
        elif date2 < date_out:
            cache_out[k2] = (freq2, date2)


################################################################################
# MAIN SCRIPT

longopts = []
arg = read_options("", longopts, treat_options_simplest, 3, usage_string)

verbose("Opening files and checking consistency")
cache1_desc = open(arg[0], "r")
cache2_desc = open(arg[1], "r")
cache_out_desc = open(arg[2], "w")
cache1 = cPickle.load(cache1_desc)
cache2 = cPickle.load(cache2_desc)
cache_out = {}
verbose("Combining cache files...")
combine_caches(cache1, cache2, cache_out)
verbose("Writing new cache file...")
cPickle.dump(cache_out, cache_out_desc)
verbose("{c} had {n} entries".format(c=arg[0], n=len(cache1)))
verbose("{c} had {n} entries".format(c=arg[1], n=len(cache2)))
verbose("Result has {n} entries".format(n=len(cache_out)))
Example No. 35
    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w or -i, must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

longopts = [
    "candidates-from=", "corpus-from=", "to=", "yahoo", "google", "index=",
    "ignore-pos", "surface", "old", "lower=", "upper=", "vars", "lang=",
    "no-joint", "bigrams", "univ=", "web1t="
]
args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1,
                    usage_string)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    if web_freq:
        web_freq.flush_cache()  # VERY IMPORTANT!
Example No. 36
            if algoname == "simple" :  # Redundant, kept for clarity
                sent_handler = LowercaserHandler.handle_sentence_simple
            elif algoname == "complex" :
                sent_handler = LowercaserHandler.handle_sentence_complex
            elif algoname == "aggressive" :  # Redundant, kept for clarity
                sent_handler = LowercaserHandler.handle_sentence_aggressive
            else :
                ctxinfo.error("Bad algorithm name `{name}`", name=algoname)

        elif o == "-m":
            ctxinfo.error("Deprecated option. Use --from=Moses instead" )
        elif o == "-x":
            ctxinfo.error("Deprecated option. " \
                    "Use --from=PlainCorpus instead")
        else:
            raise Exception("Bad arg: " + o)

 
################################################################################
# MAIN SCRIPT

longopts = [ "from=", "to=", "algorithm=", "lemmas" ]
args = util.read_options( "a:xml", longopts, treat_options, 1, usage_string )

if sent_handler != LowercaserHandler.handle_sentence_simple :
    util.verbose( "Pass 1: Reading vocabulary from file... please wait" )
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

util.verbose( "Pass 2: Lowercasing the words in the file" )
filetype.parse(args, LowercaserHandler(), input_filetype_ext)
Example No. 37
    def _fallback_entity(self, entity, info={}) :
        """For each candidate, verifies whether its number of occurrences in a 
        given source corpus is superior or equal to the threshold. If no source
        corpus was provided (thresh_source is None), then all the corpora will
        be considered when verifying the threshold constraint. A candidate is
        printed to stdout only if it occurrs thres_value times or more in the
        corpus names thresh_source.
        
        @param entity: The `Ngram` that is being read from the XML file.
        """
        global thresh_source
        global thresh_value
        global equals_name
        global equals_value
        global reverse
        global patterns
        global maxlength
        global minlength

        print_it = True
        ngram_to_print = entity

        # Threshold test
        if entity.freqs :
            for freq in entity.freqs :
                if thresh_source :
                    if ( thresh_source == freq.name or
                         thresh_source == freq.name + ".xml" ) and \
                         freq.value < thresh_value :
                        print_it = False
                else :
                    if freq.value < thresh_value :
                        print_it = False

        # Equality test
        if print_it and equals_name :
            print_it = False
            for feat in entity.features :
                if feat.name == equals_name and feat.value == equals_value :
                    print_it = True


        # NOTE: Different patterns may match the same ngram, with different
        # results, when the 'ignore' pattern attribute is involved. Currently,
        # we are only printing the first such match.
        if print_it and patterns :
            print_it = False
            words = entity
            for pattern in patterns :
                for (match_ngram, wordnums) in pattern.matches(words,
                        anchored_begin=True, anchored_end=True):
                    print_it = True
                    ngram_to_print = match_ngram
                    break
                if print_it :
                    break

        # Filter out too long or too short elements
        lenentity = len(entity)
        if lenentity < minlength or lenentity > maxlength :
            print_it = False
            verbose("Filtered out: %d tokens" % lenentity)

        # Filter out sentences with too few/too many MWE candidates
        if info["kind"] == "sentence":
            n = len(entity.mweoccurs)
            if not (min_mweoccurs <= n <= max_mweoccurs):
                print_it = False

        if reverse :
            print_it = not print_it

        if print_it :   
            self.chain.handle(ngram_to_print, info)
Example No. 39
    def _fallback_entity(self, entity, info={}):
        """For each candidate, verifies whether its number of occurrences in a 
        given source corpus is superior or equal to the threshold. If no source
        corpus was provided (thresh_source is None), then all the corpora will
        be considered when verifying the threshold constraint. A candidate is
        printed to stdout only if it occurrs thres_value times or more in the
        corpus names thresh_source.
        
        @param entity: The `Ngram` that is being read from the XML file.
        """
        global thresh_source
        global thresh_value
        global equals_name
        global equals_value
        global reverse
        global patterns
        global maxlength
        global minlength

        print_it = True
        ngram_to_print = entity

        # Threshold test
        if entity.freqs:
            for freq in entity.freqs:
                if thresh_source:
                    if ( thresh_source == freq.name or
                         thresh_source == freq.name + ".xml" ) and \
                         freq.value < thresh_value :
                        print_it = False
                else:
                    if freq.value < thresh_value:
                        print_it = False

        # Equality test
        if print_it and equals_name:
            print_it = False
            for feat in entity.features:
                if feat.name == equals_name and feat.value == equals_value:
                    print_it = True

        # NOTE: Different patterns may match the same ngram, with different
        # results, when the 'ignore' pattern attribute is involved. Currently,
        # we are only printing the first such match.
        if print_it and patterns:
            print_it = False
            words = entity
            for pattern in patterns:
                for (match_ngram,
                     wordnums) in pattern.matches(words,
                                                  anchored_begin=True,
                                                  anchored_end=True):
                    print_it = True
                    ngram_to_print = match_ngram
                    break
                if print_it:
                    break

        # Filter out too long or too short elements
        lenentity = len(entity)
        if lenentity < minlength or lenentity > maxlength:
            print_it = False
            verbose("Filtered out: %d tokens" % lenentity)

        # Filter out sentences with too few/too many MWE candidates
        if info["kind"] == "sentence":
            n = len(entity.mweoccurs)
            if not (min_mweoccurs <= n <= max_mweoccurs):
                print_it = False

        if reverse:
            print_it = not print_it

        if print_it:
            self.chain.handle(ngram_to_print, info)