def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('r'), default=sys.stdin, help="Previously classified TSV, from which bad examples are extracted")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output with the bad examples selected in the process")
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument('--tmp_dir', type=check_if_folder, default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=None, help="Threshold for the classifier")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
def main(config_file: str = ConfigOption, version: bool = VersionOption):
    """
    This is the entry point of your command line application. The values of the CLI
    params that are passed to this application will show up as parameters to this
    function.

    This docstring is where you describe what your command line application does.
    Try running `python -m {{ cookiecutter.module_name }} --help` to see how this
    shows up in the command line.
    """
    {% if cookiecutter.config_file != 'none' %}config = util.load_config(config_file)
    util.logging_setup(config){% endif %}
    logger.info("Looks like you're all set up. Let's get going!")
def initialization():
    global logging_level
    # Validating & parsing arguments
    parser, groupO, _ = argument_parser()
    args = parser.parse_args()
    # Set up logging
    logging_setup(args)
    logging_level = logging.getLogger().level
    import tensorflow as tf
    # Set number of threads to be used by TensorFlow
    tf.config.threading.set_intra_op_parallelism_threads(args.processes)
    tf.config.threading.set_inter_op_parallelism_threads(args.processes)
    # Load metadata YAML
    args = load_metadata(args, parser)
    return args
def initialization():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Tab-separated bilingual input file")
    groupM = parser.add_argument_group("Mandatory")
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('w'), required=True, help="Training metadata (YAML file)")
    groupM.add_argument('-c', '--classifier', type=argparse.FileType('wb'), required=True, help="Classifier data file")
    groupM.add_argument('-s', '--source_lang', required=True, help="Source language code")
    groupM.add_argument('-t', '--target_lang', required=True, help="Target language code")
    groupM.add_argument('-d', '--source_dictionary', type=argparse.FileType('r'), required=True, help="LR gzipped probabilistic dictionary")
    groupM.add_argument('-D', '--target_dictionary', type=argparse.FileType('r'), required=True, help="RL gzipped probabilistic dictionary")
    groupO = parser.add_argument_group('Options')
    groupO.add_argument('--normalize_by_length', action='store_true', help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true', help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=20, help="Number of max target words to be taken into account, sorted by length")
    groupO.add_argument('--disable_features_quest', action='store_false', help="Disable less important features")
    groupO.add_argument('-g', '--good_examples', type=check_positive_or_zero, default=50000, help="Number of good examples")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--good_test_examples', type=check_positive_or_zero, default=2000, help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('--classifier_type', choices=['svm', 'nn', 'nn1', 'adaboost', 'random_forest'], default="svm", help="Classifier type")
    groupO.add_argument('--dump_features', type=argparse.FileType('w'), default=None, help="Dump training features to file")
    groupO.add_argument('-b', '--block_size', type=check_positive, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count()-1), help="Number of processes to use")
    groupO.add_argument('--wrong_examples_file', type=argparse.FileType('r'), default=None, help="File with wrong examples, used to replace the synthetic examples generated by the default method")
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    args = parser.parse_args()
    # Logging
    logging_setup(args)
    return args
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    parser.add_argument('input', type=argparse.FileType('r'), default=None, help="File to be anonymized")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="File with anonymization annotations")
    parser.add_argument("srclang", type=str, help="Source language (SL) of the input")
    parser.add_argument("trglang", type=str, help="Target language (TL) of the input")
    ## Parameters required
    groupM = parser.add_argument_group('Mandatory')
    groupM.add_argument("--format", choices=["tmx", "cols"], required=True, type=str, help="Input file format. Values: cols, tmx")
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupM.add_argument("--core", default=0, type=int, help="GPU id")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    args = parser.parse_args()
    util.logging_setup(args)
    logging.debug("Arguments processed: {}".format(str(args)))
    if args.format == "tmx" and args.input.name == "<stdin>":
        logging.error("Cannot process TMX from standard input.")
        sys.exit(1)
    logging.info("Arguments processed.")
    return args
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    ## Parameters required
    groupM = parser.add_argument_group('Mandatory')
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('r'), required=True, help="Training metadata (YAML file). Note that explicit command line arguments will overwrite the values from the metadata file")
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-s", "--source_lang", type=str, help="Source language (SL) of the input")
    groupO.add_argument("-t", "--target_lang", type=str, help="Target language (TL) of the input")
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")
    groupO.add_argument('--normalize_by_length', action='store_true', help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true', help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=20, help="Number of max target words to be taken into account, sorted by length")
    groupO.add_argument('--disable_features_quest', action='store_false', help="Disable less important features")
    groupO.add_argument('-g', '--good_examples', type=check_positive_or_zero, default=50000, help="Number of good examples")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--good_test_examples', type=check_positive_or_zero, default=2000, help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for the classifier. If an accuracy histogram is present in the metadata, the threshold that maximizes accuracy will be used as the default instead")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    # Checking if metadata is specified
    preliminary_args = parser.parse_args()
    if preliminary_args.metadata != None:
        # If so, we load values from metadata
        metadata_yaml = yaml.safe_load(preliminary_args.metadata)
        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    # Parse again so that options explicitly specified on the command line overwrite the metadata values
    args = parser.parse_args()
    logging_setup(args)
    # Extra-checks for args here
    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)
    # Load classifier
    args.clf = joblib.load(args.classifier)
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
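# --- Added note: a minimal, self-contained sketch (not part of the original
# program) of the two-pass argparse pattern used in the initialization()
# above: metadata values are installed with parser.set_defaults(), and a
# second parse_args() lets explicit command line flags still win. All names
# below are illustrative only.
def _example_metadata_defaults_override():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--threshold', type=float, default=0.5)
    metadata_defaults = {'threshold': 0.7}    # stands in for values read from the YAML metadata
    parser.set_defaults(**metadata_defaults)  # metadata value becomes the new default
    assert parser.parse_args([]).threshold == 0.7                      # metadata default applies
    assert parser.parse_args(['--threshold', '0.9']).threshold == 0.9  # explicit flag wins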
def initialization():
    global logging_level
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")
    ## Parameters required
    #groupM = parser.add_argument_group('Mandatory')
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokenizer_command", type=str, help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T", "--target_tokenizer_command", type=str, help="Target language (TL) tokenizer full command")
    groupO.add_argument("--scol", default=3, type=check_positive, help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive, help="Target sentence column (starting in 1)")
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-b', '--block_size', type=int, default=200, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count() - 1), help="Number of processes to use")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will be removed (classifier score set to 0)")
    #groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
    groupO.add_argument('--score_only', action='store_true', help="Only output one column, which is the bicleaner score", default=False)
    groupO.add_argument('--disable_hardrules', action='store_true', help="Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lm_filter', action='store_true', help="Disables LM filtering")
    groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply p**n removal")
    groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)
    logging_level = logging.getLogger().level
    try:
        metadata_yaml = yaml.safe_load(args.metadata)
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml["yamlpath"] = yamlpath
        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokenizer_command" in metadata_yaml:
            args.source_tokenizer_command = metadata_yaml["source_tokenizer_command"]
        if "target_tokenizer_command" in metadata_yaml:
            args.target_tokenizer_command = metadata_yaml["target_tokenizer_command"]
        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except:
            args.clf = joblib.load(metadata_yaml["classifier"])
        # args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]
        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])
        try:
            args.sl_word_freqs = WordZipfFreqDist(os.path.join(yamlpath, metadata_yaml["source_word_freqs"]))
        except:
            try:
                args.sl_word_freqs = WordZipfFreqDist(metadata_yaml["source_word_freqs"])
            except:
                args.sl_word_freqs = None
        try:
            args.tl_word_freqs = WordZipfFreqDist(os.path.join(yamlpath, metadata_yaml["target_word_freqs"]))
        except:
            try:
                args.tl_word_freqs = WordZipfFreqDist(metadata_yaml["target_word_freqs"])
            except:
                args.tl_word_freqs = None
        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        # Try loading metadata for LM filtering
        if not args.disable_lm_filter:
            if not ("source_lm" in metadata_yaml and "target_lm" in metadata_yaml):
                args.disable_lm_filter = True
                logging.warning("LM filter not present in metadata, disabling.")
        else:
            logging.info("LM filtering disabled")
        if not args.disable_porn_removal:
            if not ("porn_removal_file" in metadata_yaml and "porn_removal_side" in metadata_yaml):
                args.disable_porn_removal = True
                logging.warning("P**n removal not present in metadata, disabling.")
            else:
                try:
                    args.porn_removal = fasttext.load_model(os.path.join(yamlpath, metadata_yaml['porn_removal_file']))
                except:
                    args.porn_removal = fasttext.load_model(metadata_yaml['porn_removal_file'])
        else:
            logging.info("P**n removal disabled")
        if "disable_lang_ident" in metadata_yaml:
            args.disable_lang_ident = metadata_yaml["disable_lang_ident"]
        else:
            args.disable_lang_ident = False
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        args.metadata_yaml = metadata_yaml
        parser.set_defaults(**metadata_yaml)
    except:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
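# --- Added note: hypothetical helper, not referenced by the code above. The
# repeated try/except pattern in initialization() (load a resource relative to
# the metadata YAML directory first, then fall back to the bare path) could be
# factored out as:
def _load_with_yamlpath_fallback(loader, yamlpath, path):
    # Prefer the path relative to the metadata file; fall back to the path as given.
    try:
        return loader(os.path.join(yamlpath, path))
    except Exception:
        return loader(path)
# e.g.: args.dict_sl_tl = _load_with_yamlpath_fallback(ProbabilisticDictionary, yamlpath, metadata_yaml["source_dictionary"])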
def initialization():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Tab-separated bilingual input file")
    groupM = parser.add_argument_group("Mandatory")
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('w'), required=True, help="Training metadata (YAML file)")
    groupM.add_argument('-c', '--classifier', type=argparse.FileType('wb'), required=True, help="Classifier data file")
    groupM.add_argument('-s', '--source_lang', required=True, help="Source language code")
    groupM.add_argument('-t', '--target_lang', required=True, help="Target language code")
    groupM.add_argument('-d', '--source_dictionary', type=argparse.FileType('r'), required=True, help="LR gzipped probabilistic dictionary")
    groupM.add_argument('-D', '--target_dictionary', type=argparse.FileType('r'), required=True, help="RL gzipped probabilistic dictionary")
    groupO = parser.add_argument_group('Options')
    groupO.add_argument('-S', '--source_tokeniser_path', help="Source language tokeniser absolute path")
    groupO.add_argument('-T', '--target_tokeniser_path', help="Target language tokeniser absolute path")
    groupO.add_argument('--normalize_by_length', action='store_true', help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true', help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=20, help="Number of max target words to be taken into account, sorted by length")
    groupO.add_argument('--disable_features_quest', action='store_false', help="Disable less important features")
    groupO.add_argument('-g', '--good_examples', type=check_positive_or_zero, default=50000, help="Number of good examples")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--good_test_examples', type=check_positive_or_zero, default=10000, help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=10000, help="Number of wrong test examples")
    groupO.add_argument('--classifier_type', choices=['svm', 'nn', 'nn1', 'adaboost', 'random_forest'], default="random_forest", help="Classifier type")
    groupO.add_argument('--dump_features', type=argparse.FileType('w'), default=None, help="Dump training features to file")
    groupO.add_argument('--wrong_examples_file', type=argparse.FileType('r'), default=None, help="File with wrong examples, used to replace the synthetic examples generated by the default method")
    groupO.add_argument('--features_version', type=check_positive, default=FEATURES_VERSION, help="Version of the features")
    # For LM filtering
    groupO.add_argument('--noisy_examples_file_sl', type=str, help="File with noisy text in the SL. These are used to estimate the perplexity of noisy text.")
    groupO.add_argument('--noisy_examples_file_tl', type=str, help="File with noisy text in the TL. These are used to estimate the perplexity of noisy text.")
    groupO.add_argument('--lm_dev_size', type=check_positive_or_zero, default=2000, help="Number of sentences to be removed from clean text before training LMs. These are used to estimate the perplexity of clean text.")
    groupO.add_argument('--lm_file_sl', type=str, help="SL language model output file.")
    groupO.add_argument('--lm_file_tl', type=str, help="TL language model output file.")
    groupO.add_argument('--lm_training_file_sl', type=str, help="SL text from which the SL LM is trained. If this parameter is not specified, the SL LM is trained from the SL side of the input file, after removing --lm_dev_size sentences.")
    groupO.add_argument('--lm_training_file_tl', type=str, help="TL text from which the TL LM is trained. If this parameter is not specified, the TL LM is trained from the TL side of the input file, after removing --lm_dev_size sentences.")
    groupO.add_argument('--lm_clean_examples_file_sl', type=str, help="File with clean text in the SL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_sl and both files must not have common sentences. This option replaces --lm_dev_size.")
    groupO.add_argument('--lm_clean_examples_file_tl', type=str, help="File with clean text in the TL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_tl and both files must not have common sentences. This option replaces --lm_dev_size.")
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    args = parser.parse_args()
    # Logging
    logging_setup(args)
    return args
logging.debug("Return gzipped") #f_dict.close() #afterpruning_dict.close() #afterpruning_dict.seek(0) with open(temp_file_name, 'rb') as ngzd: with gzip.open(f_dict, 'wb') as gzd: shutil.copyfileobj(ngzd, gzd) else: logging.debug("Not gzipped") #f_dict.close() with open(temp_file_name, 'r') as ngzd: with open(f_dict.name, 'wb') as gzd: shutil.copyfile(temp_file_name, f_dict.name) # for i in ngzd: # gzd.write(i) # f_dict.close() if __name__ == '__main__': try: logging_setup() args = initialization() # Parsing parameters logging_setup(args) main(args) # Running main program logging.info("Program finished") except Exception as ex: tb = traceback.format_exc() logging.error(tb) sys.exit(1)
def initialization():
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('r'), default=None, help="Configuration file. Each line must contain a freq_path dict_path pair.")
    ## Output file. Try to open it to check if it exists or can be created
    parser.add_argument('output', type=argparse.FileType('wb+'), default=None, help="Merged probabilistic dictionary.")
    parser.add_argument('--stopwords', type=argparse.FileType('w+'), default="stopwords", help="File with stopwords", required=False)
    parser.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    parser.add_argument('-g', '--gzipped', action='store_true', help="Compresses the output file")
    ## Parameters required
    #groupM = parser.add_argument_group('mandatory arguments')
    #groupM.add_argument('-s', '--source_lang', required=True, help="Source language of the input")
    #groupM.add_argument('-t', '--target_lang', required=True, help="Target language of the input")
    # Options group
    groupO = parser.add_argument_group('options')
    groupO.add_argument('-s', '--stopwords_amount', type=int, default=0, help="Amount of words to mark as stopwords")
    groupO.add_argument('-n', '--prune_ratio', type=float, default=10, help="Ratio used to prune the dictionary: translations whose probability is more than {} times (default) lower than the maximum one are removed.".format(10))
    groupO.add_argument('-f', '--cutoff_freq', type=int, default=1, help="Cutoff frequency for the merged dictionary (all entries with frequency equal or below are removed)")
    groupO.add_argument('-k', '--keep_tmp', action='store_true', default=False, help="Keep the temporary folder instead of removing it")
    groupO.add_argument('-m', '--tmp_dir', type=check_if_folder, default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    # Logging group
    groupL = parser.add_argument_group('logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)
    # Extra-checks for args here
    if (args.prune_ratio != 0):
        # Convert the ratio to log space
        args.prune_ratio = math.log(args.prune_ratio)
    return args
def initialization():
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Dictionary file
    parser.add_argument('dictionary', type=argparse.FileType('r'), default=None, help="Dictionary file. Line format: Target Source Prob")
    ## Output file. Try to open it to check if it exists or can be created
    parser.add_argument('output', type=argparse.FileType('wb+'), default=None, help="Pruned probabilistic dictionary.")
    parser.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    parser.add_argument('-g', '--gzipped', action='store_true', help="Compresses the output file")
    # Options group
    groupO = parser.add_argument_group('options')
    groupO.add_argument('-n', '--prune_ratio', type=float, default=10, help="Ratio used to prune the dictionary: translations whose probability is more than {} times (default) lower than the maximum one are removed.".format(10))
    groupO.add_argument('-k', '--keep_tmp', action='store_true', default=False, help="Keep the temporary folder instead of removing it")
    groupO.add_argument('-m', '--tmp_dir', type=check_if_folder, default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    # Logging group
    groupL = parser.add_argument_group('logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)
    # Extra-checks for args here
    if (args.prune_ratio != 0):
        # Convert the ratio to log space
        args.prune_ratio = math.log(args.prune_ratio)
    return args
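# --- Added note: sketch of why prune_ratio is converted with math.log() in the
# two functions above (assuming the pruning step compares dictionary
# probabilities in log space, which this file does not show). Testing
# log(p_max) - log(p) > log(ratio) is equivalent to p < p_max / ratio:
def _is_pruned(logprob, max_logprob, log_prune_ratio):
    # Keep a translation only if it is within `ratio` times of the best one
    return (max_logprob - logprob) > log_prune_ratio
# e.g. with the default ratio of 10: a translation with p=0.004 against a best
# translation with p=0.5 gives log(0.5) - log(0.004) = 4.83 > log(10) = 2.30, so it is pruned.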
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('r'), default=sys.stdin, help="Tab-separated bilingual input file")
    parser.add_argument('-o', '--output_dir', required=True, type=str, default=os.getcwd(), help="Output directory. Cleaned corpus and dictionary will be created here. Folder will be created if it does not exist")
    parser.add_argument('--giza', required=True, type=str, help="GIZA++ folder path, which contains binaries. Expected scripts in the folder: {}, {} and {}".format(GIZA_MKCLS, GIZA_MGIZA, GIZA_SNT2COOC))
    parser.add_argument('--moses_dir', required=True, type=str, help="Moses scripts folder path, which contains the script {}".format(TRAIN_MODEL_SCRIPT))
    ## Parameters required
    groupM = parser.add_argument_group('Mandatory')
    groupM.add_argument('-s', '--source_lang', required=True, type=str, help="Source language of the input")
    groupM.add_argument('-t', '--target_lang', required=True, type=str, help="Target language of the input")
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument('-m', '--tmp_dir', type=check_if_folder, default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count() - 1), help="Number of processes to use")
    groupO.add_argument('-r', '--giza_ratio', type=float, default=9, help="9-1 sentence ratio limit of GIZA++ (it shouldn't be modified)")
    groupO.add_argument('-n', '--prune_ratio', type=float, default=10, help="Ratio used to prune the dictionary: translations whose probability is more than {} times (default) lower than the maximum one are removed.".format(10))
    groupO.add_argument('--min', type=int, default=1, help="Minimum number of tokens allowed for a sentence")
    groupO.add_argument('--max', type=int, default=50, help="Maximum number of tokens allowed for a sentence")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)
    # Extra-checks for args here
    args.prune_ratio = math.log(args.prune_ratio)
    if not args.output_dir.endswith("/"):
        args.output_dir = args.output_dir + "/"
    if not args.moses_dir.endswith("/"):
        args.moses_dir = args.moses_dir + "/"
    args.output_moses_corpus = args.output_dir + CLEAN_OUTPUT
    args.moses_train_script = args.moses_dir + TRAIN_MODEL_SCRIPT
    # Checking if Moses scripts exist before running previous processes
    if not os.path.isdir(os.path.expanduser(args.output_dir)):
        logging.info("The output folder {} doesn't exist. Creating...".format(args.output_dir))
        os.makedirs(os.path.expanduser(args.output_dir))
        logging.info("Output folder created at {}".format(args.output_dir))
    if not os.path.isfile(args.moses_train_script):
        raise argparse.ArgumentTypeError("Moses script {} cannot be found in path {}".format(TRAIN_MODEL_SCRIPT, args.moses_train_script))
    if not os.path.isfile(args.giza + GIZA_MGIZA) or not os.path.isfile(args.giza + GIZA_MKCLS) or not os.path.isfile(args.giza + GIZA_SNT2COOC):
        raise argparse.ArgumentTypeError("Necessary GIZA++ scripts cannot be found in path {}. Please check if some of the following scripts are missing in the folder: {}, {} and {}".format(args.giza, GIZA_MGIZA, GIZA_MKCLS, GIZA_SNT2COOC))
    # Intermediary files
    args.output_source = open("{}{}".format(args.output_dir, CLEAN_OUTPUT + "." + args.source_lang), "w")
    args.output_target = open("{}{}".format(args.output_dir, CLEAN_OUTPUT + "." + args.target_lang), "w")
    # Final dicts names
    args.dict_sl_tl_final = "{}{}".format(args.output_dir, DICT_FINAL_NAME.format(args.source_lang, args.target_lang))
    args.dict_tl_sl_final = "{}{}".format(args.output_dir, DICT_FINAL_NAME.format(args.target_lang, args.source_lang))
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
def initialization():
    global logging_level
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    parser.add_argument('input', nargs='?', type=argparse.FileType('rt', errors="replace"), default=io.TextIOWrapper(sys.stdin.buffer, errors="replace"), help="Tab-separated bilingual tagged file")
    parser.add_argument('output', nargs='?', type=argparse.FileType('wt'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('--annotated_output', default=False, action='store_true', help="Adds an extra column with each sentence's evaluation (\"keep\" if the sentence is good, otherwise the reason for rejecting it)")
    #groupM = parser.add_argument_group('Mandatory')
    #groupM.add_argument("-s", "--source_lang", type=str, required=True, help="Source language (SL) of the input")
    #groupM.add_argument("-t", "--target_lang", type=str, required=True, help="Target language (TL) of the input")
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count() - 1), help="Number of processes to use")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply rules that use language detection")
    groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule")
    groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply p**n removal")
    groupO.add_argument("-s", "--source_lang", type=str, default=None, help="Source language (SL) of the input")
    groupO.add_argument("-t", "--target_lang", type=str, default=None, help="Target language (TL) of the input")
    groupO.add_argument("--scol", default=1, type=check_positive, help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=2, type=check_positive, help="Target sentence column (starting in 1)")
    groupO.add_argument("-S", "--source_tokenizer_command", default=None, type=str, help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T", "--target_tokenizer_command", default=None, type=str, help="Target language (TL) tokenizer full command")
    # LM filtering
    groupO.add_argument('--disable_lm_filter', default=False, action='store_true', help="Don't apply LM filtering")
    groupO.add_argument('--metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring.")
    #groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score.")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    #groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    args = parser.parse_args()
    logging_setup(args)
    logging_level = logging.getLogger().level
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)
    # Try loading metadata for LM filtering and p**n removal
    if not (args.disable_lm_filter and args.disable_porn_removal) and args.metadata != None:
        logging.info("Loading metadata info")
        try:
            args.metadata_yaml = yaml.safe_load(args.metadata)
            args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))
            if not ("source_lm" in args.metadata_yaml and "target_lm" in args.metadata_yaml):
                args.disable_lm_filter = True
                logging.warning("LM file not present in metadata.")
            if not ("porn_removal_file" in args.metadata_yaml):
                args.disable_porn_removal = True
                logging.warning("P**n removal classifier not present in metadata.")
            else:
                try:
                    args.porn_removal = fasttext.load_model(os.path.join(args.metadata_yaml["yamlpath"], args.metadata_yaml['porn_removal_file']))
                except:
                    args.porn_removal = fasttext.load_model(args.metadata_yaml['porn_removal_file'])
            if "source_tokenizer_command" in args.metadata_yaml:
                args.source_tokenizer_command = args.metadata_yaml["source_tokenizer_command"]
            if "target_tokenizer_command" in args.metadata_yaml:
                args.target_tokenizer_command = args.metadata_yaml["target_tokenizer_command"]
            parser.set_defaults(**args.metadata_yaml)
        except:
            logging.warning("Error loading metadata.")
            args.disable_lm_filter = True
            args.disable_porn_removal = True
            traceback.print_exc()
            #sys.exit(1)
    else:
        if args.metadata == None:
            logging.warning("Metadata file not provided.")
        args.disable_lm_filter = True
        args.disable_porn_removal = True
    if (args.source_lang == None or args.target_lang == None):
        if (args.metadata == None):
            logging.error("No source or target languages provided.")
            sys.exit(1)
        else:
            try:
                if not "metadata_yaml" in args or args.metadata_yaml == None:
                    args.metadata_yaml = yaml.safe_load(args.metadata)
                    #args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))
                args.source_lang = args.metadata_yaml["source_lang"]
                args.target_lang = args.metadata_yaml["target_lang"]
            except:
                traceback.print_exc()
                logging.error("Error retrieving source or target languages from metadata.")
                sys.exit(1)
    if args.disable_lm_filter:
        logging.info("LM filtering disabled.")
    if args.disable_porn_removal:
        logging.info("P**n removal disabled.")
    return args
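# --- Added note: hypothetical usage sketch (the real check lives in the
# hardrules filtering code, and the label name depends on how the classifier
# was trained). Once args.porn_removal holds a loaded fastText model, a
# sentence could be scored like this:
def _looks_porn(model, sentence):
    # fastText's predict() returns ((label, ...), (probability, ...));
    # newlines must be stripped before prediction
    labels, _ = model.predict(sentence.lower().replace("\n", " "))
    return labels[0] == "__label__porn"  # assumed label name, for illustration only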
def initialization():
    global nline
    global logging_level
    nline = 0
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")
    groupO.add_argument("--scol", default=3, type=check_positive, help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive, help="Target sentence column (starting in 1)")
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for the classifier. If an accuracy histogram is present in the metadata, the threshold that maximizes accuracy will be used as the default instead")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will be removed (classifier score set to 0), unless the option --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
    groupO.add_argument('--score_only', action='store_true', help="Only output one column, which is the bicleaner score", default=False)
    groupO.add_argument('--disable_hardrules', action='store_true', help="Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply hardrules that use language detection")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)
    logging_level = logging.getLogger().level
    if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
        # Getting rid of INFO messages when Moses processes start
        logging.getLogger("MosesTokenizer").setLevel(logging.WARNING)
        logging.getLogger("MosesSentenceSplitter").setLevel(logging.WARNING)
        logging.getLogger("MosesPunctuationNormalizer").setLevel(logging.WARNING)
    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml = yaml.safe_load(args.metadata)
        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]
        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except:
            args.clf = joblib.load(metadata_yaml["classifier"])
        # args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]
        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])
        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        # Load LM stuff if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            lmFilter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], args.source_lang, args.target_lang)
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])
            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                source_lm = fullpath_source_lm
            else:
                source_lm = metadata_yaml['source_lm']
            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                target_lm = fullpath_target_lm
            else:
                target_lm = metadata_yaml['target_lm']
            lmFilter.load(source_lm, target_lm, stats)
            args.lm_filter = lmFilter
        else:
            args.lm_filter = None
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    except:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)
    if args.score_only and args.keep_lm_result:
        raise AssertionError("Conflicting arguments: cannot output bicleaner score only AND keep language model result")
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
        tb = traceback.format_exc()
        print("Unable to extract text from TMX")
        logging.error(tb)
        sys.exit(1)
    else:
        sentences = args.input
    source_names_module = binonymizer_core.selectNamesModule(args.srclang)
    target_names_module = binonymizer_core.selectNamesModule(args.trglang)
    binonymizer_process(args, sentences, regex_module, source_names_module, target_names_module, address_module)
    # To do: rebuild TMX files with annotations from binonymizer
    if args.format == "tmx":
        # Rebuild TMX with anon
        logging.warning("********************* Unsupported feature!! ********************")
        pass
    logging.info("Program finished")

if __name__ == '__main__':
    try:
        util.logging_setup()
        args = initialization()  # Parsing parameters
        main(args)  # Running main program
    except Exception as ex:
        tb = traceback.format_exc()
        logging.error(tb)
        sys.exit(1)
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")
    ## Parameters required
    #groupM = parser.add_argument_group('Mandatory')
    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
    groupO.add_argument('-b', '--block_size', type=int, default=200, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for the classifier. If an accuracy histogram is present in the metadata, the threshold that maximizes accuracy will be used as the default instead")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will be removed (classifier score set to 0), unless the option --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)
    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml = yaml.safe_load(args.metadata)
        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]
        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except:
            args.clf = joblib.load(metadata_yaml["classifier"])
        # args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]
        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])
        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        # Load LM stuff if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                args.source_lm = fullpath_source_lm
            else:
                args.source_lm = metadata_yaml['source_lm']
            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                args.target_lm = fullpath_target_lm
            else:
                args.target_lm = metadata_yaml['target_lm']
            args.lm_type = LMType[metadata_yaml['lm_type']]
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])
            args.lm_filter_stats = stats
        else:
            args.source_lm = None
            args.target_lm = None
            args.lm_type = None
            args.lm_filter_stats = None
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    except:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
def initialization():
    global ilines
    global olines

    ilines = 0
    olines = 0

    logging.info("Processing arguments...")
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description=__doc__)

    # Mandatory parameters
    # Input file
    parser.add_argument('input', type=argparse.FileType('rt'), default=None,
                        help="Tab-separated file to be bifixed")
    # Output file (corpus)
    parser.add_argument('output', type=argparse.FileType('w'), default=sys.stdout,
                        help="Fixed corpus")
    # Source language
    parser.add_argument("srclang", type=str,
                        help="Source language (SL) of the input")
    # Target language
    parser.add_argument("trglang", type=str,
                        help="Target language (TL) of the input")

    # Mandatory parameters group
    groupM = parser.add_argument_group('Mandatory')
    # Options group
    groupO = parser.add_argument_group('Optional')

    # Format
    groupO.add_argument("--scol", default=3, type=util.check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=util.check_positive,
                        help="Target sentence column (starting in 1)")
    groupO.add_argument("--sdeferredcol", type=util.check_positive,
                        help="Source deferred standoff annotation column (starting in 1)")
    groupO.add_argument("--tdeferredcol", type=util.check_positive,
                        help="Target deferred standoff annotation column (starting in 1)")

    # Character fixing
    groupO.add_argument('--ignore_characters', default=False, action='store_true',
                        help="Doesn't fix mojibake, orthography, or other character issues")
    # Empty sides
    groupO.add_argument('--ignore_empty', default=False, action='store_true',
                        help="Doesn't remove sentences with empty source or target")
    # Too long sides
    groupO.add_argument('--ignore_long', default=False, action='store_true',
                        help="Doesn't ignore too long sentences")
    # Orthography
    groupO.add_argument('--ignore_orthography', default=False, action='store_true',
                        help="Doesn't apply orthography fixing")
    # Deduplication
    groupO.add_argument('--ignore_duplicates', default=False, action='store_true',
                        help="Doesn't obtain the hashes of parallel sentences")
    groupO.add_argument('--aggressive_dedup', default=False, action='store_true',
                        help="Treats similar sentences as duplicates (marking them with the same hash)")
    # Segmentation
    groupO.add_argument('--ignore_segmentation', default=False, action='store_true',
                        help="Doesn't change segmentation of long sentences")
    groupO.add_argument('--words_before_segmenting', default=15, type=util.check_positive,
                        help="Max words allowed in one side of a parallel sentence before trying to segment it. Set to 0 to apply segmentation to everything.")
    groupO.add_argument('--segmenter', default="nltk", type=str, choices=["nltk", "loomchild"],
                        help="Segmenter module")
    groupO.add_argument('--tmp_dir', default=gettempdir(),
                        help="Temporary directory where the temporary files of this program are created")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    args = parser.parse_args()
    util.logging_setup(args)
    args.dedup = not args.ignore_duplicates  # more friendly usage of the ignore_duplicates flag
    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
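# Illustrative sketch (not part of the original script): how the negative
# "--ignore_duplicates" flag above becomes the positive "args.dedup" attribute,
# shown with a self-contained parser and hypothetical command lines.
import argparse

_demo = argparse.ArgumentParser()
_demo.add_argument('--ignore_duplicates', default=False, action='store_true')

_args = _demo.parse_args([])                        # default run
_args.dedup = not _args.ignore_duplicates
assert _args.dedup is True                          # dedup stays enabled

_args = _demo.parse_args(['--ignore_duplicates'])   # flag passed
_args.dedup = not _args.ignore_duplicates
assert _args.dedup is False                         # dedup disabled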
    output_queue.put(None)
    reduce.join()

    if args.annotated_output:
        args.annotated_output.close()

    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Total: {0} rows".format(nline))
    logging.info("Elapsed time {0:.2f} s".format(elapsed_time))
    logging.info("Throughput: {0} rows/s".format(int((nline * 1.0) / elapsed_time)))


def main(args):
    logging.info("Executing main program...")
    perform_hardrules_filtering(args)
    logging.info("Program finished")


if __name__ == '__main__':
    try:
        logging_setup()
        args = initialization()
        main(args)
    except Exception:
        tb = traceback.format_exc()
        logging.error(tb)
        sys.exit(1)
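# Illustrative sketch (an assumption, not the original pipeline): the
# "output_queue.put(None); reduce.join()" shutdown above looks like the common
# sentinel pattern, where a None item tells the reducer process to stop.
from multiprocessing import Process, Queue

def _reducer(q):
    # Drain the queue until the None sentinel arrives.
    while True:
        item = q.get()
        if item is None:
            break
        print(item)

if __name__ == '__main__':
    q = Queue()
    p = Process(target=_reducer, args=(q,))
    p.start()
    q.put("row 1")
    q.put("row 2")
    q.put(None)   # sentinel, as in output_queue.put(None) above
    p.join()      # wait for the reducer, as in reduce.join() above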
def initialization():
    global logging_level

    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description=__doc__)

    parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
                        help="Tab-separated bilingual input file")

    groupM = parser.add_argument_group("Mandatory")
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('w'), required=True,
                        help="Training metadata (YAML file)")
    groupM.add_argument('-c', '--classifier', type=argparse.FileType('wb'), required=True,
                        help="Classifier data file")
    groupM.add_argument('-s', '--source_lang', required=True,
                        help="Source language")
    groupM.add_argument('-t', '--target_lang', required=True,
                        help="Target language")
    groupM.add_argument('-d', '--source_dictionary', type=argparse.FileType('r'), required=True,
                        help="LR gzipped probabilistic dictionary")
    groupM.add_argument('-D', '--target_dictionary', type=argparse.FileType('r'), required=True,
                        help="RL gzipped probabilistic dictionary")
    groupM.add_argument('-f', '--source_word_freqs', type=argparse.FileType('r'), default=None, required=True,
                        help="L language gzipped list of word frequencies")
    groupM.add_argument('-F', '--target_word_freqs', type=argparse.FileType('r'), default=None, required=True,
                        help="R language gzipped list of word frequencies")

    groupO = parser.add_argument_group('Options')
    groupO.add_argument('-S', '--source_tokenizer_command',
                        help="Source language tokenizer full command")
    groupO.add_argument('-T', '--target_tokenizer_command',
                        help="Target language tokenizer full command")
    groupO.add_argument('--normalize_by_length', action='store_true',
                        help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true',
                        help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=40,
                        help="Number of max target words to be taken into account, sorted by length")
    groupO.add_argument('--disable_features_quest', action='store_false',
                        help="Disable less important features")
    groupO.add_argument('--classifier_type',
                        choices=['mlp', 'extra_trees', 'svm', 'nn', 'nn1', 'adaboost', 'random_forest'],
                        default="extra_trees",
                        help="Classifier type")
    groupO.add_argument('--dump_features', type=argparse.FileType('w'), default=None,
                        help="Dump training features to file")
    groupO.add_argument('-b', '--block_size', type=check_positive, default=10000,
                        help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count() - 1),
                        help="Number of processes to use")
    groupO.add_argument('--wrong_examples_file', type=argparse.FileType('r'), default=None,
                        help="File with wrong examples extracted to replace the synthetic examples generated by the default method")
    groupO.add_argument('--features_version', type=check_positive, default=FEATURES_VERSION,
                        help="Version of the features")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true',
                        help="Don't apply features that use language detection")
    groupO.add_argument('--seed', default=None, type=int,
                        help="Seed for random number generation: by default, no seed is used")
    groupO.add_argument('--relative_paths', action='store_true',
                        help="Ask training to save model files by relative path if they are in the same directory as the metadata. Useful if you are going to train distributable models.")

    # For LM filtering
    groupO.add_argument('--noisy_examples_file_sl', type=str,
                        help="File with noisy text in the SL. These are used to estimate the perplexity of noisy text.")
    groupO.add_argument('--noisy_examples_file_tl', type=str,
                        help="File with noisy text in the TL. These are used to estimate the perplexity of noisy text.")
    groupO.add_argument('--lm_dev_size', type=check_positive_or_zero, default=2000,
                        help="Number of sentences to be removed from clean text before training LMs. These are used to estimate the perplexity of clean text.")
    groupO.add_argument('--lm_file_sl', type=str,
                        help="SL language model output file")
    groupO.add_argument('--lm_file_tl', type=str,
                        help="TL language model output file")
    groupO.add_argument('--lm_training_file_sl', type=str,
                        help="SL text from which the SL LM is trained. If this parameter is not specified, the SL LM is trained from the SL side of the input file, after removing --lm_dev_size sentences.")
    groupO.add_argument('--lm_training_file_tl', type=str,
                        help="TL text from which the TL LM is trained. If this parameter is not specified, the TL LM is trained from the TL side of the input file, after removing --lm_dev_size sentences.")
    groupO.add_argument('--lm_clean_examples_file_sl', type=str,
                        help="File with clean text in the SL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_sl and both files must not have common sentences. This option replaces --lm_dev_size.")
    groupO.add_argument('--lm_clean_examples_file_tl', type=str,
                        help="File with clean text in the TL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_tl and both files must not have common sentences. This option replaces --lm_dev_size.")

    groupO.add_argument('--porn_removal_train', type=argparse.FileType('r'),
                        help="File with training dataset for the FastText classifier. Each sentence must start with '__label__negative' or '__label__positive' according to the FastText convention. It should be lowercased and tokenized.")
    groupO.add_argument('--porn_removal_test', type=argparse.FileType('r'),
                        help="Test set to compute precision and accuracy of the porn removal classifier")
    groupO.add_argument('--porn_removal_file', type=str,
                        help="Porn removal classifier output file")
    groupO.add_argument('--porn_removal_side', choices=['sl', 'tl'], default="sl",
                        help="Whether the porn removal should be applied at the source or at the target language")

    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr,
                        help="Store log to a file")

    args = parser.parse_args()

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    # Logging
    logging_setup(args)
    logging_level = logging.getLogger().level
    return args
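# Illustrative sketch (hypothetical file name and sentences): the input format
# --porn_removal_train expects, per the help text above -- one lowercased,
# tokenized sentence per line, prefixed with a FastText label.
with open("porn_removal_train.txt", "w", encoding="utf-8") as f:
    f.write("__label__negative this is an ordinary sentence .\n")
    f.write("__label__positive some explicit sentence goes here .\n")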