Example No. 1
def load_lm_filter(source_lang, target_lang, metadata_yaml,
                   source_tokenizer_command, target_tokenizer_command):

    logging.debug("Loading LM filter")

    lmFilter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']],
                                   source_lang, target_lang,
                                   source_tokenizer_command,
                                   target_tokenizer_command)
    stats = DualLMStats(metadata_yaml['clean_mean_perp'],
                        metadata_yaml['clean_stddev_perp'],
                        metadata_yaml['noisy_mean_perp'],
                        metadata_yaml['noisy_stddev_perp'])

    fullpath_source_lm = os.path.join(metadata_yaml["yamlpath"],
                                      metadata_yaml['source_lm'])
    if os.path.isfile(fullpath_source_lm):
        source_lm = fullpath_source_lm
    else:
        source_lm = metadata_yaml['source_lm']

    fullpath_target_lm = os.path.join(metadata_yaml["yamlpath"],
                                      metadata_yaml['target_lm'])
    if os.path.isfile(fullpath_target_lm):
        target_lm = fullpath_target_lm
    else:
        target_lm = metadata_yaml['target_lm']

    lmFilter.load(source_lm, target_lm, stats)

    return lmFilter
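
A minimal usage sketch for load_lm_filter (not part of the original source): it assumes the metadata YAML has already been parsed into a dict and that the "yamlpath" key is filled with the directory of the metadata file, as the initialization() example below does. The file path and language codes are hypothetical, and passing None as the tokenizer commands is assumed to fall back to the default tokenizer.

import os
import yaml

metadata_path = "model/metadata.yaml"   # hypothetical path
with open(metadata_path, "r") as f:
    metadata_yaml = yaml.safe_load(f)
# load_lm_filter() resolves 'source_lm'/'target_lm' relative to this directory
metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(metadata_path))

lm_filter = load_lm_filter("en", "es", metadata_yaml,
                           source_tokenizer_command=None,   # assumed to default to the built-in tokenizer
                           target_tokenizer_command=None)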
Example No. 2
def train_fluency_filter(args):
    # Prepare corpora:
    # The input corpus used for training the classifier is split in 2 parts:
    #  - Training data for the LM
    #  - Validation set for estimating the perplexity of clean text
    # The input noisy corpus is used as validation set for estimating the perplexity of noisy text

    logging.info("Training LM-based fluency filter")

    if not (args.noisy_examples_file_sl and args.noisy_examples_file_tl
            and args.lm_file_sl and args.lm_file_tl):
        return None

    inputIsTmp = True
    if args.lm_training_file_sl and args.lm_training_file_tl and args.lm_clean_examples_file_sl and args.lm_clean_examples_file_tl:
        inputIsTmp = False
        lm_train_path_sl = args.lm_training_file_sl
        lm_train_path_tl = args.lm_training_file_tl
        lm_dev_clean_sl = args.lm_clean_examples_file_sl
        lm_dev_clean_tl = args.lm_clean_examples_file_tl
        logging.info("SL LM training corpus: {}".format(lm_train_path_sl))
        logging.info("TL LM training corpus: {}".format(lm_train_path_tl))
        logging.info("SL LM dev clean corpus: {}".format(lm_dev_clean_sl))
        logging.info("TL LM dev clean corpus: {}".format(lm_dev_clean_tl))
        logging.info("SL LM dev noisy corpus: {}".format(
            args.noisy_examples_file_sl))
        logging.info("TL LM dev noisy corpus: {}".format(
            args.noisy_examples_file_tl))
    else:
        logging.info(
            "SL & TL LM training corpora have been obtained from the tab-separated input file (the same one used for training the Random Forest classifier), after randomly removing {} sentences"
            .format(args.lm_dev_size))
        logging.info(
            "SL & TL LM dev clean corpora have been randomly selected from the input file (the same one used for training the Random Forest classifier): {} sentences"
            .format(args.lm_dev_size))
        logging.info("SL LM dev noisy corpus: {}".format(
            args.noisy_examples_file_sl))
        logging.info("TL LM dev noisy corpus: {}".format(
            args.noisy_examples_file_tl))
        lm_train_path_sl, lm_train_path_tl, lm_dev_clean_sl, lm_dev_clean_tl = shuffle_lm_training_text(
            args.input, args.lm_dev_size)

    try:
        ff = DualLMFluencyFilter(LMType.CHARACTER, args.source_lang,
                                 args.target_lang)
        stats = ff.train(lm_train_path_sl, lm_train_path_tl, lm_dev_clean_sl,
                         lm_dev_clean_tl, args.noisy_examples_file_sl,
                         args.noisy_examples_file_tl, args.lm_file_sl,
                         args.lm_file_tl)
    finally:
        if inputIsTmp:
            os.remove(lm_train_path_sl)
            os.remove(lm_train_path_tl)
            os.remove(lm_dev_clean_sl)
            os.remove(lm_dev_clean_tl)
    return stats
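
A hedged sketch of how train_fluency_filter might be called directly, mirroring the attributes the function reads above; every path below is a made-up placeholder. With the lm_training_file_* and lm_clean_examples_file_* attributes left as None, the training and clean-dev corpora are derived from the tab-separated input via shuffle_lm_training_text(), and the temporary files are removed in the finally block.

from argparse import Namespace

args = Namespace(
    input=open("corpus.sl-tl.tsv", "rt"),        # hypothetical tab-separated corpus
    source_lang="en", target_lang="es",
    noisy_examples_file_sl="noisy.sl",           # dev sets for noisy-text perplexity
    noisy_examples_file_tl="noisy.tl",
    lm_file_sl="lm.sl", lm_file_tl="lm.tl",      # output paths for the trained LMs
    lm_training_file_sl=None, lm_training_file_tl=None,
    lm_clean_examples_file_sl=None, lm_clean_examples_file_tl=None,
    lm_dev_size=2000,                            # sentences held out from the input
)
stats = train_fluency_filter(args)               # DualLMStats, or None if LM args are missing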
Example No. 3
def initialization():
    global nline
    global logging_level
    
    nline = 0
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated file to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")    

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")

    groupO.add_argument("--scol", default=3, type=check_positive, help ="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive, help ="Target sentence column (starting in 1)")    


    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file.")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for the classifier. If an accuracy histogram is present in the metadata, the interval with the maximum value is used as the default instead of this one.")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold are removed (classifier score set to 0), unless --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")

    groupO.add_argument('--score_only', action='store_true', help="Only output one column, the bicleaner score", default=False)

    groupO.add_argument('--disable_hardrules', action='store_true', help="Disable bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply hardrules that use language detection")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)
    
    logging_level = logging.getLogger().level    

    if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
        #Getting rid of INFO messages when Moses processes start
        logging.getLogger("MosesTokenizer").setLevel(logging.WARNING)
        logging.getLogger("MosesSentenceSplitter").setLevel(logging.WARNING)
        logging.getLogger("MosesPunctuationNormalizer").setLevel(logging.WARNING)
            
    try: 
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))

        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]

        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            args.clf = joblib.load(metadata_yaml["classifier"])

#        args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]

        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])
        
                
        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        
        # Load the LM fluency filter if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            lmFilter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], args.source_lang, args.target_lang)
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])

            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                source_lm = fullpath_source_lm
            else:
                source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                target_lm = fullpath_target_lm
            else:
                target_lm = metadata_yaml['target_lm']
            lmFilter.load(source_lm, target_lm, stats)
            args.lm_filter = lmFilter
        else:
            args.lm_filter = None
        
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)   
   
    except Exception:
        print("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    if args.score_only and args.keep_lm_result:
        raise AssertionError("Conflicting arguments: cannot output bicleaner score only AND keep language model result")

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
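
The default-threshold logic above simply picks the 0.1-wide score bin with the highest accuracy (np.argmax over accuracy_histogram, times 0.1). A standalone illustration of the same arithmetic, with an invented histogram:

import numpy as np

# Invented accuracy histogram: one accuracy value per 0.1-wide classifier-score bin
accuracy_histogram = [0.50, 0.55, 0.61, 0.68, 0.74, 0.81, 0.87, 0.90, 0.86, 0.80]
threshold = np.argmax(accuracy_histogram) * 0.1
print("Ideal threshold: {:1.1f}".format(threshold))   # prints 0.7: bin 7 has the best accuracy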
Example No. 4
def classifier_process(i, jobs_queue, output_queue, args):
    
    if args.source_tokeniser_path:    
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
        
    # Load LM for fluency scoring
    lm_filter = None
    if args.source_lm and args.target_lm:
        lm_filter = DualLMFluencyFilter(args.lm_type, args.source_lang, args.target_lang)
        lm_filter.load(args.source_lm, args.target_lm, args.lm_filter_stats)
                

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                feats = []
                lm_scores = []

                # Create the following arrays:
                # valid_sentences: booleans, one per input sentence, stating whether each sentence
                #   passed hard rules and LM fluency filtering
                # feats: vectors of classifier input features, one per input sentence that passed
                #   hard rules and LM fluency filtering
                valid_sentences = []
                for line in filein:
                    parts = line.split("\t")
                    sl_sentence = None
                    tl_sentence = None
                    if len(parts) >= max(args.scol, args.tcol):
                        sl_sentence = parts[args.scol - 1]
                        tl_sentence = parts[args.tcol - 1]
                    else:
                        logging.error("ERROR: scol ({}) or tcol ({}) index is above the column count ({})".format(args.scol, args.tcol, len(parts)))

                    if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or not wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args)):
                        # if disable_hardrules is set, the second part of the "and" is always true
                        lm_score = None
                        if lm_filter:
                            lm_score = lm_filter.score(sl_sentence, tl_sentence)
                        if lm_filter and lm_score < args.lm_threshold and not args.keep_lm_result:
                            valid_sentences.append(False)
                        else:
                            features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                            feats.append([float(v) for v in features])
                            lm_scores.append(lm_score)
                            valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)
                    

                predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)

                piter = iter(predictions)
                if lm_filter:
                    lmiter = iter(lm_scores)
                for line, valid_sentence in zip(filein, valid_sentences):
                    if valid_sentence:
                        p = next(piter)
                        if args.score_only:
                            fileout.write("{0:.3f}".format(p[1]))
                        else:
                            fileout.write(line.strip())
                            fileout.write("\t")
                            fileout.write("{0:.3f}".format(p[1]))
                            if lm_filter and args.keep_lm_result:
                                lm_score = next(lmiter)
                                fileout.write("\t")
                                fileout.write("{0:.3f}".format(lm_score))
                        fileout.write("\n")
                    else:
                        if args.score_only:
                            fileout.write("0")
                        else:
                            fileout.write(line.strip("\n"))
                            fileout.write("\t0")
                            if lm_filter and args.keep_lm_result:
                                fileout.write("\t0")
                        fileout.write("\n")

                ojob = (nblock, fileout.name)
                filein.close()
                fileout.close()
             
            if ojob:                    
                output_queue.put(ojob)
                
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
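
A toy, self-contained illustration (not from the original source) of the bookkeeping used in the worker above: features are collected only for sentences that pass the filters, and the classifier scores are later mapped back onto every input line through the valid_sentences mask, with filtered-out lines getting a score of 0. The line contents and scores below are invented.

lines = ["good pair", "bad pair", "another good pair"]
valid_sentences = [True, False, True]      # outcome of hard rules + LM fluency filtering
predictions = [0.93, 0.12]                 # one invented classifier score per valid sentence

piter = iter(predictions)
for line, valid in zip(lines, valid_sentences):
    score = next(piter) if valid else 0.0  # filtered-out lines always get score 0
    print("{}\t{:.3f}".format(line, score))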