def get_parser() -> argparse.ArgumentParser:
    """
    Creates the cmdline argument parser.
    """
    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    # sourced.engine args
    subparsers = parser.add_subparsers(help="Commands", dest="command")
    extract_parser = subparsers.add_parser(
        "extract", help="Extract features from input repositories",
        formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    extract_parser.set_defaults(handler=code2vec_extract_features)
    add_repo2_args(extract_parser)
    # code2vec specific args
    extract_parser.add_argument('--max-length', type=int, default=5,
                                help="Max path length.", required=False)
    extract_parser.add_argument('--max-width', type=int, default=2,
                                help="Max path width.", required=False)
    extract_parser.add_argument(
        '-o', '--output', type=str, required=True,
        help="Output path for the Code2VecFeatures model.")
    return parser
def main():
    parser = argparse.ArgumentParser()
    # sourced.engine args
    add_repo2_args(parser)
    # code2vec specific args
    parser.add_argument('-g', '--max_length', type=int, default=5,
                        help="Max path length.", required=False)
    parser.add_argument('-w', '--max_width', type=int, default=2,
                        help="Max path width.", required=False)
    args = parser.parse_args()
    code2vec(args)
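# Illustrative sketch (not part of the original module), showing how the parser
# built by get_parser() above is typically driven: argparse stores the callback
# registered with set_defaults(handler=...) on the parsed namespace, so a driver
# only has to look it up and call it. The function name is a made-up example.
def _example_code2vec_entry_point():
    parser = get_parser()
    args = parser.parse_args()
    handler = getattr(args, "handler", None)
    if handler is None:
        # No subcommand was given - show the usage message instead of failing.
        parser.print_help()
        return 1
    return handler(args)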
def get_parser() -> argparse.ArgumentParser:
    """
    Create the cmdline argument parser.
    """
    parser = argparse.ArgumentParser(
        formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)
    parser.add_argument("--log-level", default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    # Create and construct subparsers
    subparsers = parser.add_subparsers(help="Commands", dest="command")

    def add_parser(name, help_message):
        return subparsers.add_parser(
            name, help=help_message,
            formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)

    # ------------------------------------------------------------------------
    preprocessing_parser = subparsers.add_parser(
        "preprocrepos",
        help="Convert siva to parquet files with extracted information.")
    preprocessing_parser.set_defaults(handler=cmd.preprocess_repos)
    preprocessing_parser.add_argument(
        "-x", "--mode", choices=Moder.Options.__all__, default="file",
        help="What to extract from repositories.")
    args.add_repo2_args(preprocessing_parser)
    preprocessing_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the parquet files with bag batches.")
    default_fields = ("blob_id", "repository_id", "content", "path",
                      "commit_hash", "uast", "lang")
    preprocessing_parser.add_argument("-f", "--fields", nargs="+",
                                      default=default_fields,
                                      help="Fields to save.")
    # ------------------------------------------------------------------------
    repos2bow_parser = add_parser(
        "repos2bow", "Convert source code to the bag-of-words model.")
    repos2bow_parser.set_defaults(handler=cmd.repos2bow)
    args.add_df_args(repos2bow_parser)
    args.add_repo2_args(repos2bow_parser)
    args.add_feature_args(repos2bow_parser)
    args.add_bow_args(repos2bow_parser)
    args.add_repartitioner_arg(repos2bow_parser)
    args.add_cached_index_arg(repos2bow_parser)
    # ------------------------------------------------------------------------
    repos2bow_index_parser = add_parser(
        "repos2bow_index",
        "Creates the index, quant and docfreq model of the bag-of-words model.")
    repos2bow_index_parser.set_defaults(handler=cmd.repos2bow_index)
    args.add_df_args(repos2bow_index_parser)
    args.add_repo2_args(repos2bow_index_parser)
    args.add_feature_args(repos2bow_index_parser)
    args.add_repartitioner_arg(repos2bow_index_parser)
    args.add_cached_index_arg(repos2bow_index_parser, create=True)
    # ------------------------------------------------------------------------
    repos2df_parser = add_parser(
        "repos2df",
        "Calculate document frequencies of features extracted from source code.")
    repos2df_parser.set_defaults(handler=cmd.repos2df)
    args.add_df_args(repos2df_parser)
    args.add_repo2_args(repos2df_parser)
    args.add_feature_args(repos2df_parser)
    # ------------------------------------------------------------------------
    repos2ids_parser = subparsers.add_parser(
        "repos2ids", help="Convert source code to a bag of identifiers.")
    repos2ids_parser.set_defaults(handler=cmd.repos2ids)
    args.add_repo2_args(repos2ids_parser)
    args.add_split_stem_arg(repos2ids_parser)
    args.add_repartitioner_arg(repos2ids_parser)
    repos2ids_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Output path to the CSV file with identifiers.")
    repos2ids_parser.add_argument(
        "--idfreq", action="store_true",
        help="Adds identifier frequencies to the output CSV file. "
             "num_repos is the number of repositories in which the identifier appears. "
             "num_files is the number of files in which the identifier appears. "
             "num_occ is the total number of occurrences of the identifier.")
    # ------------------------------------------------------------------------
    repos2coocc_parser = add_parser(
        "repos2coocc",
        "Convert source code to the sparse co-occurrence matrix of identifiers.")
    repos2coocc_parser.set_defaults(handler=cmd.repos2coocc)
    args.add_df_args(repos2coocc_parser)
    args.add_repo2_args(repos2coocc_parser)
    args.add_split_stem_arg(repos2coocc_parser)
    args.add_repartitioner_arg(repos2coocc_parser)
    repos2coocc_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the Cooccurrences model.")
    # ------------------------------------------------------------------------
    repos2roles_and_ids = add_parser(
        "repos2roleids",
        "Convert a UAST to a list of (role, identifier) pairs, where the role is "
        "the merged generic roles under which the identifier was found.")
    repos2roles_and_ids.set_defaults(handler=cmd.repos2roles_and_ids)
    args.add_repo2_args(repos2roles_and_ids)
    args.add_split_stem_arg(repos2roles_and_ids)
    repos2roles_and_ids.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the directory where Spark should store the result. "
             "Inside the directory you will find the result in CSV format, a status "
             "file and checksum files.")
    # ------------------------------------------------------------------------
    repos2identifier_distance = add_parser(
        "repos2id_distance",
        "Convert a UAST to a list of identifier pairs and the distance between them.")
    repos2identifier_distance.set_defaults(handler=cmd.repos2id_distance)
    args.add_repo2_args(repos2identifier_distance)
    args.add_split_stem_arg(repos2identifier_distance)
    repos2identifier_distance.add_argument(
        "-t", "--type", required=True,
        choices=extractors.IdentifierDistance.DistanceType.All,
        help="Distance type.")
    repos2identifier_distance.add_argument(
        "--max-distance", type=int,
        default=extractors.IdentifierDistance.DEFAULT_MAX_DISTANCE,
        help="Maximum distance to save.")
    repos2identifier_distance.add_argument(
        "-x", "--mode", choices=("file", "func"), default="file",
        help="What to extract from repositories.")
    repos2identifier_distance.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the directory where Spark should store the result. "
             "Inside the directory you will find the result in CSV format, a status "
             "file and checksum files.")
    # ------------------------------------------------------------------------
    repos2id_sequence = add_parser(
        "repos2idseq",
        "Convert a UAST to a sequence of identifiers sorted by order of appearance.")
    repos2id_sequence.set_defaults(handler=cmd.repos2id_sequence)
    args.add_repo2_args(repos2id_sequence)
    args.add_split_stem_arg(repos2id_sequence)
    repos2id_sequence.add_argument(
        "--skip-docname", default=False, action="store_true",
        help="Do not save the document name in the CSV file, only the identifier "
             "sequence.")
    repos2id_sequence.add_argument(
        "-x", "--mode", choices=("file", "func"), default="file",
        help="What to extract from repositories.")
    repos2id_sequence.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the directory where Spark should store the result. "
             "Inside the directory you will find the result in CSV format, a status "
             "file and checksum files.")
    # ------------------------------------------------------------------------
    preproc_parser = add_parser(
        "id2vec-preproc",
        "Convert a sparse co-occurrence matrix to the Swivel shards.")
    preproc_parser.set_defaults(handler=cmd.id2vec_preprocess)
    args.add_df_args(preproc_parser)
    preproc_parser.add_argument("-s", "--shard-size", default=4096, type=int,
                                help="The shard (submatrix) size.")
    preproc_parser.add_argument(
        "-i", "--input",
        help="Cooccurrences model produced by repos2coocc.")
    preproc_parser.add_argument("-o", "--output", required=True,
                                help="Output directory.")
    # ------------------------------------------------------------------------
    train_parser = add_parser("id2vec-train",
                              "Train identifier embeddings using Swivel.")
    train_parser.set_defaults(handler=cmd.run_swivel)
    mirror_tf_args(train_parser)
    # ------------------------------------------------------------------------
    id2vec_postproc_parser = add_parser(
        "id2vec-postproc",
        "Combine row and column embeddings produced by Swivel and write them to "
        "an .asdf.")
    id2vec_postproc_parser.set_defaults(handler=cmd.id2vec_postprocess)
    id2vec_postproc_parser.add_argument(
        "-i", "--swivel-data", required=True,
        help="Folder with Swivel row and column embeddings data. "
             "You can get it using the id2vec-train subcommand.")
    id2vec_postproc_parser.add_argument(
        "-o", "--output", required=True,
        help="Output directory for the Id2Vec model.")
    # ------------------------------------------------------------------------
    id2vec_project_parser = add_parser(
        "id2vec-project", "Present an id2vec model in TensorFlow Projector.")
    id2vec_project_parser.set_defaults(handler=cmd.id2vec_project)
    args.add_df_args(id2vec_project_parser, required=False)
    id2vec_project_parser.add_argument("-i", "--input", required=True,
                                       help="id2vec model to present.")
    id2vec_project_parser.add_argument("-o", "--output", required=True,
                                       help="Projector output directory.")
    id2vec_project_parser.add_argument("--no-browser", action="store_true",
                                       help="Do not open the browser.")
    # ------------------------------------------------------------------------
    train_id_split_parser = add_parser(
        "train-id-split", "Train a neural network to split identifiers.")
    train_id_split_parser.set_defaults(handler=cmd.train_id_split)
    # common arguments for CNN/RNN models
    train_id_split_parser.add_argument(
        "-i", "--input", required=True,
        help="Path to the input data in CSV format: "
             "num_files,num_occ,num_repos,token,token_split")
    train_id_split_parser.add_argument(
        "-e", "--epochs", type=int, default=10,
        help="Number of training epochs. The more, the better, but the training "
             "time is proportional.")
    train_id_split_parser.add_argument(
        "-b", "--batch-size", type=int, default=500,
        help="Batch size. Higher values better utilize GPUs but may harm the "
             "convergence.")
    train_id_split_parser.add_argument("-l", "--length", type=int, default=40,
                                       help="RNN sequence length.")
    train_id_split_parser.add_argument("-o", "--output", required=True,
                                       help="Path to store the trained model.")
    train_id_split_parser.add_argument(
        "-t", "--test-ratio", type=float, default=0.2,
        help="Fraction of the dataset to use for evaluation.")
    train_id_split_parser.add_argument(
        "-p", "--padding", default="post", choices=("pre", "post"),
        help="Whether to pad before or after each sequence.")
    train_id_split_parser.add_argument(
        "--optimizer", default="Adam", choices=("RMSprop", "Adam"),
        help="Algorithm to use as an optimizer for the neural net.")
    train_id_split_parser.add_argument("--lr", default=0.001, type=float,
                                       help="Initial learning rate.")
    train_id_split_parser.add_argument(
        "--final-lr", default=0.00001, type=float,
        help="Final learning rate. The decrease from the initial learning rate "
             "is done linearly.")
    train_id_split_parser.add_argument(
        "--samples-before-report", type=int, default=5 * 10**6,
        help="Number of samples between each validation report and training "
             "updates.")
    train_id_split_parser.add_argument(
        "--val-batch-size", type=int, default=2000,
        help="Batch size for validation. It can be increased to speed up the "
             "pipeline but it proportionally increases the memory consumption.")
    train_id_split_parser.add_argument("--seed", type=int, default=1989,
                                       help="Random seed.")
    train_id_split_parser.add_argument(
        "--devices", default="0", help="Device(s) to use. '-1' means CPU.")
    train_id_split_parser.add_argument(
        "--csv-identifier", default=3,
        help="Column name in the CSV file for the raw identifier.")
    train_id_split_parser.add_argument(
        "--csv-identifier-split", default=4,
        help="Column name in the CSV file for the split identifier.")
    train_id_split_parser.add_argument(
        "--include-csv-header", action="store_true",
        help="Treat the first line of the input CSV as a regular line.")
    train_id_split_parser.add_argument(
        "--model", type=str, choices=("RNN", "CNN"), required=True,
        help="Neural network model to use to learn the identifier splitting task.")
    train_id_split_parser.add_argument(
        "-s", "--stack", default=2, type=int,
        help="Number of layers stacked on each other.")
    # RNN specific arguments
    train_id_split_parser.add_argument(
        "--type-cell", default="LSTM",
        choices=("GRU", "LSTM", "CuDNNLSTM", "CuDNNGRU"),
        help="Recurrent layer type to use.")
    train_id_split_parser.add_argument("-n", "--neurons", default=256, type=int,
                                       help="Number of neurons on each layer.")
    # CNN specific arguments
    train_id_split_parser.add_argument(
        "-f", "--filters", default="64,32,16,8",
        help="Number of filters for each kernel size.")
    train_id_split_parser.add_argument("-k", "--kernel-sizes", default="2,4,8,16",
                                       help="Sizes for sliding windows.")
    train_id_split_parser.add_argument(
        "--dim-reduction", default=32, type=int,
        help="Number of 1-d kernels to reduce dimensionality after each layer.")
    # ------------------------------------------------------------------------
    bow2vw_parser = add_parser(
        "bow2vw",
        "Convert a bag-of-words model to the dataset in Vowpal Wabbit format.")
    bow2vw_parser.set_defaults(handler=cmd.bow2vw)
    bow2vw_parser.add_argument("--bow",
                               help="URL or path to a bag-of-words model.")
    bow2vw_parser.add_argument("--id2vec",
                               help="URL or path to the identifier embeddings.")
    bow2vw_parser.add_argument("-o", "--output", required=True,
                               help="Path to the output file.")
    # ------------------------------------------------------------------------
    bigartm_postproc_parser = add_parser(
        "bigartm2asdf",
        "Convert a human-readable BigARTM model to Modelforge format.")
    bigartm_postproc_parser.set_defaults(handler=cmd.bigartm2asdf)
    bigartm_postproc_parser.add_argument("input")
    bigartm_postproc_parser.add_argument("output")
    # ------------------------------------------------------------------------
    bigartm_parser = add_parser(
        "bigartm", "Install bigartm/bigartm to the current working directory.")
    bigartm_parser.set_defaults(handler=install_bigartm)
    bigartm_parser.add_argument(
        "--tmpdir",
        help="Store intermediate files in this directory instead of /tmp.")
    bigartm_parser.add_argument("--output", default=os.getcwd(),
                                help="Output directory.")
    # ------------------------------------------------------------------------
    merge_df = add_parser("merge-df",
                          "Merge DocumentFrequencies models into a single one.")
    merge_df.set_defaults(handler=cmd.merge_df)
    args.add_min_docfreq(merge_df)
    args.add_vocabulary_size_arg(merge_df)
    merge_df.add_argument("-o", "--output", required=True,
                          help="Path to the merged DocumentFrequencies model.")
    merge_df.add_argument("-i", "--input", required=True, nargs="+",
                          help="DocumentFrequencies models input files. "
                               "Use `-i -` to read input files from stdin.")
    merge_df.add_argument(
        "--ordered", action="store_true", default=False,
        help="Save OrderedDocumentFrequencies. If not specified, a "
             "DocumentFrequencies model will be saved.")
    # ------------------------------------------------------------------------
    merge_coocc = add_parser("merge-coocc",
                             "Merge several Cooccurrences models together.")
    merge_coocc.set_defaults(handler=cmd.merge_coocc)
    add_spark_args(merge_coocc)
    merge_coocc.add_argument("-o", "--output", required=True,
                             help="Path to the merged Cooccurrences model.")
    merge_coocc.add_argument("-i", "--input", required=True,
                             help="Cooccurrences models input files. "
                                  "Use `-i -` to read input files from stdin.")
    merge_coocc.add_argument(
        "--docfreq", required=True,
        help="[IN] Specify the OrderedDocumentFrequencies model. "
             "Identifiers that are not present in the model will be ignored.")
    merge_coocc.add_argument(
        "--no-spark", action="store_true", default=False,
        help="Use the local reduction instead of PySpark. "
             "Can be faster and consume less memory if the data fits into RAM.")
    # ------------------------------------------------------------------------
    merge_bow = add_parser("merge-bow", "Merge BOW models into a single one.")
    merge_bow.set_defaults(handler=cmd.merge_bow)
    merge_bow.add_argument("-i", "--input", required=True, nargs="+",
                           help="BOW models input files. "
                                "Use `-i -` to read input files from stdin.")
    merge_bow.add_argument("-o", "--output", required=True,
                           help="Path to the merged BOW model.")
    merge_bow.add_argument(
        "-f", "--features", nargs="+",
        choices=[ex.NAME for ex in extractors.__extractors__.values()],
        default=None,
        help="To keep only specific features. If not specified, all will be kept.")
    # ------------------------------------------------------------------------
    id2role_eval = add_parser(
        "id2role-eval",
        "Compare the embeddings quality on the role prediction problem.")
    id2role_eval.set_defaults(handler=cmd.id2role_eval)
    id2role_eval.add_argument("-m", "--models", required=True, nargs="+",
                              help="Id2Vec models to compare. "
                                   "Use `-m -` to read input files from stdin.")
    id2role_eval.add_argument(
        "-d", "--dataset", required=True,
        help="Dataset directory. You can collect a dataset via the repos2roleids "
             "command.")
    id2role_eval.add_argument("-s", "--seed", default=420,
                              help="Random seed for reproducible results.")
    return parser
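# Illustrative usage sketch (not part of the original module): every subcommand
# registered above stores its callback on the parsed namespace via
# set_defaults(handler=...), so a driver only needs to call args.handler(args).
# The argv below is made up for the example; "bigartm" is used because it has no
# required options.
def _example_dispatch():
    parser = get_parser()
    args = parser.parse_args(["--log-level", "DEBUG", "bigartm"])
    assert args.command == "bigartm"
    assert args.handler is install_bigartm
    # A real driver would first configure logging from args.log_level, then run:
    # args.handler(args)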
def get_parser() -> argparse.ArgumentParser:
    """
    Creates the cmdline argument parser.
    """
    parser = argparse.ArgumentParser(
        formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)
    parser.add_argument("--log-level", default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")
    # Create and construct subparsers
    subparsers = parser.add_subparsers(help="Commands", dest="command")

    def add_parser(name, help_message):
        return subparsers.add_parser(
            name, help=help_message,
            formatter_class=args.ArgumentDefaultsHelpFormatterNoNone)

    # ------------------------------------------------------------------------
    preprocessing_parser = subparsers.add_parser(
        "preprocrepos",
        help="Convert siva to parquet files with extracted information.")
    preprocessing_parser.set_defaults(handler=cmd.preprocess_repos)
    preprocessing_parser.add_argument(
        "-x", "--mode", choices=Moder.Options.__all__, default="file",
        help="What to extract from repositories.")
    args.add_repo2_args(preprocessing_parser)
    args.add_dzhigurda_arg(preprocessing_parser)
    preprocessing_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the parquet files with bag batches.")
    default_fields = ("blob_id", "repository_id", "content", "path",
                      "commit_hash", "uast")
    preprocessing_parser.add_argument("-f", "--fields", nargs="+",
                                      default=default_fields,
                                      help="Fields to save.")
    # ------------------------------------------------------------------------
    repos2bow_parser = add_parser(
        "repos2bow", "Convert source code to the bag-of-words model.")
    repos2bow_parser.set_defaults(handler=cmd.repos2bow)
    args.add_df_args(repos2bow_parser)
    args.add_repo2_args(repos2bow_parser)
    args.add_feature_args(repos2bow_parser)
    args.add_bow_args(repos2bow_parser)
    args.add_repartitioner_arg(repos2bow_parser)
    # ------------------------------------------------------------------------
    repos2df_parser = add_parser(
        "repos2df",
        "Calculate document frequencies of features extracted from source code.")
    repos2df_parser.set_defaults(handler=cmd.repos2df)
    args.add_df_args(repos2df_parser)
    args.add_repo2_args(repos2df_parser)
    args.add_feature_args(repos2df_parser)
    # ------------------------------------------------------------------------
    repos2ids_parser = subparsers.add_parser(
        "repos2ids", help="Convert source code to a bag of identifiers.")
    repos2ids_parser.set_defaults(handler=cmd.repos2ids)
    args.add_repo2_args(repos2ids_parser)
    args.add_split_stem_arg(repos2ids_parser)
    args.add_repartitioner_arg(repos2ids_parser)
    repos2ids_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Output path to the CSV file with identifiers.")
    repos2ids_parser.add_argument(
        "--idfreq", action="store_true",
        help="Adds identifier frequencies to the output CSV file. "
             "num_repos is the number of repositories in which the identifier appears. "
             "num_files is the number of files in which the identifier appears. "
             "num_occ is the total number of occurrences of the identifier.")
    # ------------------------------------------------------------------------
    repos2coocc_parser = add_parser(
        "repos2coocc",
        "Convert source code to the sparse co-occurrence matrix of identifiers.")
    repos2coocc_parser.set_defaults(handler=cmd.repos2coocc)
    args.add_df_args(repos2coocc_parser)
    args.add_repo2_args(repos2coocc_parser)
    args.add_split_stem_arg(repos2coocc_parser)
    args.add_repartitioner_arg(repos2coocc_parser)
    repos2coocc_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the Cooccurrences model.")
    # ------------------------------------------------------------------------
    repos2roles_and_ids = add_parser(
        "repos2roleids",
        "Convert a UAST to a list of (role, identifier) pairs, where the role is "
        "the merged generic roles under which the identifier was found.")
    repos2roles_and_ids.set_defaults(handler=cmd.repos2roles_and_ids)
    args.add_repo2_args(repos2roles_and_ids)
    args.add_split_stem_arg(repos2roles_and_ids)
    repos2roles_and_ids.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the directory where Spark should store the result. "
             "Inside the directory you will find the result in CSV format, a status "
             "file and checksum files.")
    # ------------------------------------------------------------------------
    repos2identifier_distance = add_parser(
        "repos2id_distance",
        "Convert a UAST to a list of identifier pairs and the distance between them.")
    repos2identifier_distance.set_defaults(handler=cmd.repos2id_distance)
    args.add_repo2_args(repos2identifier_distance)
    args.add_split_stem_arg(repos2identifier_distance)
    repos2identifier_distance.add_argument(
        "-t", "--type", required=True,
        choices=IdentifierDistance.DistanceType.All,
        help="Distance type.")
    repos2identifier_distance.add_argument(
        "--max-distance", type=int,
        default=IdentifierDistance.DEFAULT_MAX_DISTANCE,
        help="Maximum distance to save.")
    repos2identifier_distance.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the directory where Spark should store the result. "
             "Inside the directory you will find the result in CSV format, a status "
             "file and checksum files.")
    # ------------------------------------------------------------------------
    repos2id_sequence = add_parser(
        "repos2idseq",
        "Convert a UAST to a sequence of identifiers sorted by order of appearance.")
    repos2id_sequence.set_defaults(handler=cmd.repos2id_sequence)
    args.add_repo2_args(repos2id_sequence)
    args.add_split_stem_arg(repos2id_sequence)
    repos2id_sequence.add_argument(
        "--skip-docname", default=False, action="store_true",
        help="Do not save the document name in the CSV file, only the identifier "
             "sequence.")
    repos2id_sequence.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the directory where Spark should store the result. "
             "Inside the directory you will find the result in CSV format, a status "
             "file and checksum files.")
    # ------------------------------------------------------------------------
    preproc_parser = add_parser(
        "id2vec-preproc",
        "Convert a sparse co-occurrence matrix to the Swivel shards.")
    preproc_parser.set_defaults(handler=cmd.id2vec_preprocess)
    args.add_df_args(preproc_parser)
    preproc_parser.add_argument("-s", "--shard-size", default=4096, type=int,
                                help="The shard (submatrix) size.")
    preproc_parser.add_argument(
        "-i", "--input",
        help="Cooccurrences model produced by repos2coocc.")
    preproc_parser.add_argument("-o", "--output", required=True,
                                help="Output directory.")
    # ------------------------------------------------------------------------
    train_parser = add_parser("id2vec-train",
                              "Train identifier embeddings using Swivel.")
    train_parser.set_defaults(handler=cmd.run_swivel)
    mirror_tf_args(train_parser)
    # ------------------------------------------------------------------------
    id2vec_postproc_parser = add_parser(
        "id2vec-postproc",
        "Combine row and column embeddings produced by Swivel and write them to "
        "an .asdf.")
    id2vec_postproc_parser.set_defaults(handler=cmd.id2vec_postprocess)
    id2vec_postproc_parser.add_argument(
        "-i", "--swivel-data", required=True,
        help="Folder with Swivel row and column embeddings data. "
             "You can get it using the id2vec-train subcommand.")
    id2vec_postproc_parser.add_argument(
        "-o", "--output", required=True,
        help="Output directory for the Id2Vec model.")
    # ------------------------------------------------------------------------
    id2vec_project_parser = add_parser(
        "id2vec-project", "Present an id2vec model in TensorFlow Projector.")
    id2vec_project_parser.set_defaults(handler=cmd.id2vec_project)
    args.add_df_args(id2vec_project_parser, required=False)
    id2vec_project_parser.add_argument("-i", "--input", required=True,
                                       help="id2vec model to present.")
    id2vec_project_parser.add_argument("-o", "--output", required=True,
                                       help="Projector output directory.")
    id2vec_project_parser.add_argument("--no-browser", action="store_true",
                                       help="Do not open the browser.")
    # ------------------------------------------------------------------------
    bow2vw_parser = add_parser(
        "bow2vw",
        "Convert a bag-of-words model to the dataset in Vowpal Wabbit format.")
    bow2vw_parser.set_defaults(handler=cmd.bow2vw)
    bow2vw_parser.add_argument("--bow",
                               help="URL or path to a bag-of-words model.")
    bow2vw_parser.add_argument("--id2vec",
                               help="URL or path to the identifier embeddings.")
    bow2vw_parser.add_argument("-o", "--output", required=True,
                               help="Path to the output file.")
    # ------------------------------------------------------------------------
    bigartm_postproc_parser = add_parser(
        "bigartm2asdf",
        "Convert a human-readable BigARTM model to Modelforge format.")
    bigartm_postproc_parser.set_defaults(handler=cmd.bigartm2asdf)
    bigartm_postproc_parser.add_argument("input")
    bigartm_postproc_parser.add_argument("output")
    # ------------------------------------------------------------------------
    bigartm_parser = add_parser(
        "bigartm", "Install bigartm/bigartm to the current working directory.")
    bigartm_parser.set_defaults(handler=install_bigartm)
    bigartm_parser.add_argument(
        "--tmpdir",
        help="Store intermediate files in this directory instead of /tmp.")
    bigartm_parser.add_argument("--output", default=os.getcwd(),
                                help="Output directory.")
    # ------------------------------------------------------------------------
    dump_parser = add_parser("dump", "Dump a model to stdout.")
    dump_parser.set_defaults(handler=cmd.dump_model)
    dump_parser.add_argument("input",
                             help="Path to the model file, URL or UUID.")
    dump_parser.add_argument("--gcs", default=None, dest="gcs_bucket",
                             help="GCS bucket to use.")
    # ------------------------------------------------------------------------
    merge_df = add_parser("merge-df",
                          "Merge DocumentFrequencies models into a single one.")
    merge_df.set_defaults(handler=cmd.merge_df)
    args.add_filter_arg(merge_df)
    args.add_min_docfreq(merge_df)
    args.add_vocabulary_size_arg(merge_df)
    merge_df.add_argument("-o", "--output", required=True,
                          help="Path to the merged DocumentFrequencies model.")
    merge_df.add_argument("-i", "--input", required=True, nargs="+",
                          help="DocumentFrequencies models input files. "
                               "Use `-i -` to read input files from stdin.")
    merge_df.add_argument(
        "--ordered", action="store_true", default=False,
        help="Save OrderedDocumentFrequencies. If not specified, a "
             "DocumentFrequencies model will be saved.")
    # ------------------------------------------------------------------------
    merge_coocc = add_parser("merge-coocc",
                             "Merge several Cooccurrences models together.")
    merge_coocc.set_defaults(handler=cmd.merge_coocc)
    add_spark_args(merge_coocc)
    args.add_filter_arg(merge_coocc)
    merge_coocc.add_argument("-o", "--output", required=True,
                             help="Path to the merged Cooccurrences model.")
    merge_coocc.add_argument("-i", "--input", required=True,
                             help="Cooccurrences models input files. "
                                  "Use `-i -` to read input files from stdin.")
    merge_coocc.add_argument(
        "--docfreq", required=True,
        help="[IN] Specify the OrderedDocumentFrequencies model. "
             "Identifiers that are not present in the model will be ignored.")
    merge_coocc.add_argument(
        "--no-spark", action="store_true", default=False,
        help="Use the local reduction instead of PySpark. "
             "Can be faster and consume less memory if the data fits into RAM.")
    return parser
def get_parser() -> argparse.ArgumentParser:
    """
    Create the cmdline argument parser.
    """
    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsHelpFormatterNoNone)
    parser.add_argument("--log-level", default="INFO",
                        choices=logging._nameToLevel,
                        help="Logging verbosity.")

    def add_feature_weight_arg(my_parser):
        help_desc = "%s's weight - all features from this extractor will be " \
                    "multiplied by this factor"
        for ex in extractors.__extractors__.values():
            my_parser.add_argument("--%s-weight" % ex.NAME, default=1, type=float,
                                   help=help_desc % ex.__name__)

    def add_cassandra_args(my_parser):
        my_parser.add_argument("--cassandra", default="0.0.0.0:9042",
                               help="Cassandra's host:port.")
        my_parser.add_argument("--keyspace", default="apollo",
                               help="Cassandra's key space.")
        my_parser.add_argument(
            "--tables",
            help="Table name mapping (JSON): bags, hashes, hashtables, hashtables2.")

    def add_wmh_args(my_parser, params_help: str, add_hash_size: bool,
                     required: bool):
        if add_hash_size:
            my_parser.add_argument("--size", type=int, default=128,
                                   help="Hash size.")
        my_parser.add_argument("-p", "--params", required=required,
                               help=params_help)
        my_parser.add_argument("-t", "--threshold", required=required, type=float,
                               help="Jaccard similarity threshold.")
        my_parser.add_argument(
            "--false-positive-weight", type=float, default=0.5,
            help="Used to adjust the relative importance of minimizing false "
                 "positives count when optimizing for the Jaccard similarity "
                 "threshold.")
        my_parser.add_argument(
            "--false-negative-weight", type=float, default=0.5,
            help="Used to adjust the relative importance of minimizing false "
                 "negatives count when optimizing for the Jaccard similarity "
                 "threshold.")

    def add_template_args(my_parser, default_template):
        my_parser.add_argument("--batch", type=int, default=100,
                               help="Number of hashes to query at a time.")
        my_parser.add_argument("--template", default=default_template,
                               help="Jinja2 template to render.")

    # Create and construct subparsers
    subparsers = parser.add_subparsers(help="Commands", dest="command")
    # ------------------------------------------------------------------------
    warmup_parser = subparsers.add_parser("warmup",
                                          help="Initialize source{d} engine.")
    warmup_parser.set_defaults(handler=warmup)
    add_engine_args(warmup_parser, default_packages=[CASSANDRA_PACKAGE])
    # ------------------------------------------------------------------------
    db_parser = subparsers.add_parser(
        "resetdb", help="Destructively initialize the database.")
    db_parser.set_defaults(handler=reset_db)
    add_cassandra_args(db_parser)
    db_parser.add_argument(
        "--hashes-only", action="store_true",
        help="Only clear the tables: hashes, hashtables, hashtables2. "
             "Do not touch the rest.")
    # ------------------------------------------------------------------------
    preprocess_parser = subparsers.add_parser(
        "preprocess",
        help="Creates the index, quant and docfreq model of the bag-of-words model.")
    preprocess_parser.set_defaults(handler=preprocess)
    add_df_args(preprocess_parser)
    add_repo2_args(preprocess_parser)
    add_feature_args(preprocess_parser)
    add_repartitioner_arg(preprocess_parser)
    preprocess_parser.add_argument(
        "--cached-index-path", default=None,
        help="[OUT] Path to the docfreq model holding the document's index.")
    # ------------------------------------------------------------------------
    source2bags_parser = subparsers.add_parser(
        "bags", help="Convert source code to weighted sets.")
    source2bags_parser.set_defaults(handler=source2bags)
    add_bow_args(source2bags_parser)
    add_dzhigurda_arg(source2bags_parser)
    add_repo2_args(source2bags_parser, default_packages=[CASSANDRA_PACKAGE])
    add_feature_args(source2bags_parser)
    add_cassandra_args(source2bags_parser)
    add_df_args(source2bags_parser)
    add_repartitioner_arg(source2bags_parser)
    source2bags_parser.add_argument(
        "--cached-index-path", default=None,
        help="[IN] Path to the docfreq model holding the document's index.")
    # ------------------------------------------------------------------------
    hash_parser = subparsers.add_parser(
        "hash", help="Run MinHashCUDA on the bag batches.")
    hash_parser.set_defaults(handler=hash_batches)
    hash_parser.add_argument("-i", "--input",
                             help="Path to the directory with Parquet files.")
    hash_parser.add_argument("--seed", type=int, default=int(time()),
                             help="Random generator's seed.")
    hash_parser.add_argument("--mhc-verbosity", type=int, default=1,
                             help="MinHashCUDA logs verbosity level.")
    hash_parser.add_argument(
        "--devices", type=int, default=0,
        help="OR-ed indices of NVIDIA devices to use. 0 means all.")
    add_wmh_args(hash_parser, "Path to the output file with WMH parameters.",
                 True, True)
    add_cassandra_args(hash_parser)
    add_spark_args(hash_parser, default_packages=[CASSANDRA_PACKAGE])
    add_feature_weight_arg(hash_parser)
    add_repartitioner_arg(hash_parser)
    # ------------------------------------------------------------------------
    query_parser = subparsers.add_parser("query",
                                         help="Query for similar files.")
    query_parser.set_defaults(handler=query)
    mode_group = query_parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument("-i", "--id", help="Query for this id (id mode).")
    mode_group.add_argument("-c", "--file",
                            help="Query for this file (file mode).")
    query_parser.add_argument(
        "--docfreq", help="Path to OrderedDocumentFrequencies (file mode).")
    query_parser.add_argument(
        "--min-docfreq", default=1, type=int,
        help="The minimum document frequency of each feature.")
    query_parser.add_argument("--bblfsh", default="localhost:9432",
                              help="Babelfish server's address.")
    query_parser.add_argument("--precise", action="store_true",
                              help="Calculate the precise set.")
    add_wmh_args(query_parser, "Path to the Weighted MinHash parameters.",
                 False, False)
    add_feature_args(query_parser, required=False)
    add_template_args(query_parser, "query.md.jinja2")
    add_cassandra_args(query_parser)
    # ------------------------------------------------------------------------
    cc_parser = subparsers.add_parser(
        "cc",
        help="Load the similar pairs of files and run connected components analysis.")
    cc_parser.set_defaults(handler=find_connected_components)
    add_cassandra_args(cc_parser)
    cc_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to connected components ASDF model.")
    # ------------------------------------------------------------------------
    dumpcc_parser = subparsers.add_parser(
        "dumpcc", help="Output the connected components to stdout.")
    dumpcc_parser.set_defaults(handler=dumpcc)
    dumpcc_parser.add_argument("-i", "--input", required=True,
                               help="Path to connected components ASDF model.")
    # ------------------------------------------------------------------------
    community_parser = subparsers.add_parser(
        "cmd",
        help="Run Community Detection analysis on the connected components "
             "from \"cc\".")
    community_parser.set_defaults(handler=detect_communities)
    community_parser.add_argument(
        "-i", "--input", required=True,
        help="Path to connected components ASDF model.")
    community_parser.add_argument(
        "-o", "--output", required=True,
        help="[OUT] Path to the communities ASDF model.")
    community_parser.add_argument(
        "--edges", choices=("linear", "quadratic", "1", "2"), default="linear",
        help="The method to generate the graph's edges: bipartite - "
             "linear and fast, but may not fit some of the CD algorithms, "
             "or all to all within a bucket - quadratic and slow, but "
             "surely fits all the algorithms.")
    cmd_choices = [k[10:] for k in dir(Graph) if k.startswith("community_")]
    community_parser.add_argument(
        "-a", "--algorithm", choices=cmd_choices, default="walktrap",
        help="The community detection algorithm to apply.")
    community_parser.add_argument(
        "-p", "--params", type=json.loads, default={},
        help="Parameters for the algorithm (**kwargs, JSON format).")
    community_parser.add_argument("--no-spark", action="store_true",
                                  help="Do not use Spark.")
    add_spark_args(community_parser)
    # ------------------------------------------------------------------------
    dumpcmd_parser = subparsers.add_parser(
        "dumpcmd", help="Output the detected communities to stdout.")
    dumpcmd_parser.set_defaults(handler=dumpcmd)
    dumpcmd_parser.add_argument("input",
                                help="Path to the communities ASDF model.")
    add_template_args(dumpcmd_parser, "report.md.jinja2")
    add_cassandra_args(dumpcmd_parser)
    # ------------------------------------------------------------------------
    evalcc_parser = subparsers.add_parser(
        "evalcc",
        help="Evaluate the communities: calculate the precise similarity and the "
             "fitness metric.")
    evalcc_parser.set_defaults(handler=evaluate_communities)
    evalcc_parser.add_argument("-t", "--threshold", required=True, type=float,
                               help="Jaccard similarity threshold.")
    evalcc_parser.add_argument("-i", "--input", required=True,
                               help="Path to the communities model.")
    add_spark_args(evalcc_parser, default_packages=[CASSANDRA_PACKAGE])
    add_cassandra_args(evalcc_parser)
    # TODO: retable [.....] -> [.] [.] [.] [.] [.]
    return parser
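# Illustrative usage sketch (not part of the original module): the "query"
# subcommand defined above puts -i/--id and -c/--file into a mutually exclusive
# group, so exactly one of them must be supplied. The id value is a hypothetical
# placeholder, and the sketch assumes the shared helper arguments (features,
# Cassandra, WMH, template) keep their defaults.
def _example_query_invocation():
    parser = get_parser()
    args = parser.parse_args(["query", "-i", "some-document-id"])
    assert args.handler is query
    assert args.file is None  # file mode was not selected
    # Passing both -i and -c, or neither, makes argparse exit with a usage error.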