def load_lm_filter(source_lang, target_lang, metadata_yaml, source_tokenizer_command, target_tokenizer_command):
    logging.debug("Loading LM filter")
    lm_filter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], source_lang, target_lang,
                                    source_tokenizer_command, target_tokenizer_command)
    stats = DualLMStats(metadata_yaml['clean_mean_perp'],
                        metadata_yaml['clean_stddev_perp'],
                        metadata_yaml['noisy_mean_perp'],
                        metadata_yaml['noisy_stddev_perp'])

    # LM paths in the metadata may be relative to the YAML file: prefer the
    # resolved path when it exists, and fall back to the raw value otherwise
    fullpath_source_lm = os.path.join(metadata_yaml["yamlpath"], metadata_yaml['source_lm'])
    if os.path.isfile(fullpath_source_lm):
        source_lm = fullpath_source_lm
    else:
        source_lm = metadata_yaml['source_lm']

    fullpath_target_lm = os.path.join(metadata_yaml["yamlpath"], metadata_yaml['target_lm'])
    if os.path.isfile(fullpath_target_lm):
        target_lm = fullpath_target_lm
    else:
        target_lm = metadata_yaml['target_lm']

    lm_filter.load(source_lm, target_lm, stats)
    return lm_filter
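# Usage sketch for load_lm_filter() (hypothetical file names and language pair;
# the caller is expected to add a "yamlpath" entry to the parsed metadata,
# pointing at the directory of the YAML file, so relative LM paths resolve):
#
#   with open("training.yaml") as f:
#       metadata = yaml.safe_load(f)
#   metadata["yamlpath"] = os.path.dirname(os.path.abspath("training.yaml"))
#   lm_filter = load_lm_filter("en", "es", metadata, None, None)
#   fluency = lm_filter.score("This is a sentence .", "Esto es una frase .")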
def train_fluency_filter(args):
    # Prepare corpora:
    # The input corpus used for training the classifier is split in 2 parts:
    #  - Training data for the LM
    #  - Validation set for estimating the perplexity of clean text
    # The input noisy corpus is used as the validation set for estimating the
    # perplexity of noisy text.
    logging.info("Training LM-based fluency filter")

    if not (args.noisy_examples_file_sl and args.noisy_examples_file_tl
            and args.lm_file_sl and args.lm_file_tl):
        return None

    inputIsTmp = True
    if args.lm_training_file_sl and args.lm_training_file_tl and args.lm_clean_examples_file_sl and args.lm_clean_examples_file_tl:
        inputIsTmp = False
        lm_train_path_sl = args.lm_training_file_sl
        lm_train_path_tl = args.lm_training_file_tl
        lm_dev_clean_sl = args.lm_clean_examples_file_sl
        lm_dev_clean_tl = args.lm_clean_examples_file_tl
        logging.info("SL LM training corpus: {}".format(lm_train_path_sl))
        logging.info("TL LM training corpus: {}".format(lm_train_path_tl))
        logging.info("SL LM dev clean corpus: {}".format(lm_dev_clean_sl))
        logging.info("TL LM dev clean corpus: {}".format(lm_dev_clean_tl))
        logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl))
        logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl))
    else:
        logging.info("SL & TL LM training corpora have been obtained from the tab-separated input file "
                     "(the same one used for training the Random Forest classifier), "
                     "after randomly removing {} sentences".format(args.lm_dev_size))
        logging.info("SL & TL LM dev clean corpora have been randomly selected from the input file "
                     "(the same one used for training the Random Forest classifier): {} sentences".format(args.lm_dev_size))
        logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl))
        logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl))
        lm_train_path_sl, lm_train_path_tl, lm_dev_clean_sl, lm_dev_clean_tl = shuffle_lm_training_text(args.input, args.lm_dev_size)

    try:
        ff = DualLMFluencyFilter(LMType.CHARACTER, args.source_lang, args.target_lang)
        stats = ff.train(lm_train_path_sl, lm_train_path_tl,
                         lm_dev_clean_sl, lm_dev_clean_tl,
                         args.noisy_examples_file_sl, args.noisy_examples_file_tl,
                         args.lm_file_sl, args.lm_file_tl)
    finally:
        # Remove the temporary splits created by shuffle_lm_training_text()
        if inputIsTmp:
            os.remove(lm_train_path_sl)
            os.remove(lm_train_path_tl)
            os.remove(lm_dev_clean_sl)
            os.remove(lm_dev_clean_tl)
    return stats
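# Sketch of the argparse fields train_fluency_filter() reads (names taken from
# the attribute accesses above; values are hypothetical). When the four lm_*
# corpus paths are unset, training/dev splits are derived from args.input via
# shuffle_lm_training_text() and deleted afterwards:
#
#   args.noisy_examples_file_sl = "noisy.en"   # dev set for noisy perplexity (SL)
#   args.noisy_examples_file_tl = "noisy.es"   # dev set for noisy perplexity (TL)
#   args.lm_file_sl = "lm.en"                  # output path for the SL language model
#   args.lm_file_tl = "lm.es"                  # output path for the TL language model
#   args.lm_training_file_sl = None            # optional pre-built corpora (and _tl,
#   ...                                        # lm_clean_examples_file_sl/_tl)
#   stats = train_fluency_filter(args)         # returns DualLMStats, or None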
def initialization():
    global nline
    global logging_level

    nline = 0
    logging.info("Processing arguments...")

    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description=__doc__)

    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None,
                        help="Tab-separated file to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None,
                        help="Training metadata (YAML file)")

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str,
                        help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str,
                        help="Target language (TL) tokeniser executable absolute path")
    groupO.add_argument("--scol", default=3, type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive,
                        help="Target sentence column (starting in 1)")
    groupO.add_argument('--tmp_dir', default=gettempdir(),
                        help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None,
                        help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5,
                        help="Threshold for the classifier. If an accuracy histogram is present in the metadata, "
                             "the interval with the maximum value is used as the default instead")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5,
                        help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls "
                             "below the threshold are removed (classifier score set to 0), "
                             "unless the option --keep_lm_result is set")
    groupO.add_argument('--keep_lm_result', action='store_true',
                        help="Add an additional column to the results with the language model fluency score "
                             "and do not discard any TU based on that score")
    groupO.add_argument('--score_only', action='store_true', default=False,
                        help="Only output one column, the bicleaner score")
    groupO.add_argument('--disable_hardrules', action='store_true',
                        help="Disable the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true',
                        help="Do not apply hardrules that use language identification")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)

    logging_level = logging.getLogger().level
    if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
        # Get rid of INFO messages when the Moses processes start
        logging.getLogger("MosesTokenizer").setLevel(logging.WARNING)
        logging.getLogger("MosesSentenceSplitter").setLevel(logging.WARNING)
        logging.getLogger("MosesPunctuationNormalizer").setLevel(logging.WARNING)

    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]

        # Paths in the metadata may be relative to the YAML file
        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except:
            args.clf = joblib.load(metadata_yaml["classifier"])
        # args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]

        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])

        # Each histogram bucket covers 0.1 of the score range, so the index of
        # the maximum accuracy gives the ideal threshold
        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold

        # Load LM stuff if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            lm_filter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], args.source_lang, args.target_lang)
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'],
                                metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])

            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                source_lm = fullpath_source_lm
            else:
                source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                target_lm = fullpath_target_lm
            else:
                target_lm = metadata_yaml['target_lm']

            lm_filter.load(source_lm, target_lm, stats)
            args.lm_filter = lm_filter
        else:
            args.lm_filter = None

        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    except:
        print("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)

    # Ensure that the temporary directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    if args.score_only and args.keep_lm_result:
        raise AssertionError("Conflicting arguments: cannot output the bicleaner score only AND keep the language model result")

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
def classifier_process(i, jobs_queue, output_queue, args):
    # Per-worker tokenisers: an external command if given, Moses otherwise
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)

    # Load the LMs for fluency scoring. These attributes only exist when the
    # metadata defines LMs (set via parser.set_defaults), hence getattr
    lm_filter = None
    if getattr(args, 'source_lm', None) and getattr(args, 'target_lm', None):
        lm_filter = DualLMFluencyFilter(args.lm_type, args.source_lang, args.target_lang)
        lm_filter.load(args.source_lm, args.target_lm, args.lm_filter_stats)

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                feats = []
                lm_scores = []

                # Create the following arrays:
                # valid_sentences: boolean, length of input. States whether each sentence
                #                  passed hard rules and LM fluency filtering
                # feats: vector of tuples, input features to the classifier; its length equals
                #        the number of input sentences that passed hard rules + LM fluency filtering
                valid_sentences = []
                for line in filein:
                    parts = line.split("\t")
                    sl_sentence = None
                    tl_sentence = None
                    if len(parts) >= max(args.scol, args.tcol):
                        sl_sentence = parts[args.scol - 1]
                        tl_sentence = parts[args.tcol - 1]
                    else:
                        logging.error("ERROR: scol ({}) or tcol ({}) index above column number ({})".format(args.scol, args.tcol, len(parts)))

                    # If hard rules are disabled, the second operand of the "and" is always true
                    if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or wrong_tu(sl_sentence.strip(), tl_sentence.strip(), args) == False):
                        lm_score = None
                        if lm_filter:
                            lm_score = lm_filter.score(sl_sentence, tl_sentence)
                        if lm_filter and lm_score < args.lm_threshold and not args.keep_lm_result:
                            valid_sentences.append(False)
                        else:
                            features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                            feats.append([float(v) for v in features])
                            lm_scores.append(lm_score)
                            valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)

                predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)
                piter = iter(predictions)
                if lm_filter:
                    lmiter = iter(lm_scores)

                for line, valid_sentence in zip(filein, valid_sentences):
                    if valid_sentence:
                        p = next(piter)
                        if args.score_only:
                            fileout.write("{0:.3f}".format(p[1]))
                        else:
                            fileout.write(line.strip())
                            fileout.write("\t")
                            fileout.write("{0:.3f}".format(p[1]))
                        if lm_filter and args.keep_lm_result:
                            lm_score = next(lmiter)
                            fileout.write("\t")
                            fileout.write("{0:.3f}".format(lm_score))
                        fileout.write("\n")
                    else:
                        if args.score_only:
                            fileout.write("0")
                        else:
                            fileout.write(line.strip("\n"))
                            fileout.write("\t0")
                            if lm_filter and args.keep_lm_result:
                                fileout.write("\t0")
                        fileout.write("\n")

                ojob = (nblock, fileout.name)

            # Both files are closed when the "with" block exits
            if ojob:
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
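# Wiring sketch for classifier_process() (hypothetical; the actual launcher is
# not part of this section). Each worker consumes (block_number, temp_file)
# jobs, and a falsy sentinel makes it exit via the "Exiting worker" branch:
#
#   jobs_queue = multiprocessing.Queue(maxsize=n_workers)
#   output_queue = multiprocessing.Queue()
#   workers = [multiprocessing.Process(target=classifier_process,
#                                      args=(i, jobs_queue, output_queue, args))
#              for i in range(n_workers)]
#   for w in workers:
#       w.start()
#   ...  # split args.input into blocks, write each to a temp file, enqueue jobs
#   for _ in workers:
#       jobs_queue.put(None)  # one sentinel per worker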