def main():
    log = logging.getLogger(__name__)
    parser = get_cli_parser()
    args = parser.parse_args()

    config_path = args.config
    generate_config = args.generate_config
    config_overwrite = args.overwrite
    is_debug = args.debug
    label = args.label
    file_globs = args.file_globs

    llevel = logging.DEBUG if is_debug else logging.INFO
    initialize_logging(logging.getLogger(__name__), llevel)
    initialize_logging(logging.getLogger('smqtk'), llevel)
    log.debug("Showing debug messages.")

    config = get_default_config()
    config_loaded = False
    if config_path and os.path.isfile(config_path):
        with open(config_path) as f:
            log.info("Loading configuration: %s", config_path)
            config.update(json.load(f))
        config_loaded = True
    output_config(generate_config, config, log, config_overwrite, 100)

    if not config_loaded:
        log.error("No configuration provided")
        exit(101)

    classify_files(config, label, file_globs)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    llevel = logging.DEBUG if args.verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config = default_config()
    if args.config:
        if osp.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    bin_utils.output_config(args.output_config, config, log, True)

    # Default config options for this util are valid for running, so no
    # "has config loaded" check here.

    port = int(config['port'])
    authkey = str(config['authkey'])

    mgr = ProxyManager(('', port), authkey)
    mgr.get_server().serve_forever()
def main():
    log = logging.getLogger(__name__)
    parser = get_cli_parser()
    args = parser.parse_args()

    config_path = args.config
    generate_config = args.generate_config
    config_overwrite = args.overwrite
    iqr_state_fp = args.iqr_state
    is_debug = args.debug

    llevel = logging.DEBUG if is_debug else logging.INFO
    initialize_logging(logging.getLogger(), llevel)
    log.debug("Showing debug messages.")

    config = get_default_config()
    config_loaded = False
    if config_path and os.path.isfile(config_path):
        with open(config_path) as f:
            log.info("Loading configuration: %s", config_path)
            config.update(json.load(f))
        config_loaded = True
    output_config(generate_config, config, log, config_overwrite, 100)

    if not config_loaded:
        log.error("No configuration provided")
        exit(101)

    if not os.path.isfile(iqr_state_fp):
        log.error("IQR Session info JSON filepath was invalid")
        exit(102)

    train_classifier_iqr(config, iqr_state_fp)
def main():
    log = logging.getLogger(__name__)
    parser = get_cli_parser()
    args = parser.parse_args()

    config_path = args.config
    generate_config = args.generate_config
    config_overwrite = args.overwrite
    is_debug = args.verbose
    label = args.label
    file_globs = args.file_globs

    llevel = logging.DEBUG if is_debug else logging.INFO
    initialize_logging(logging.getLogger('__main__'), llevel)
    initialize_logging(logging.getLogger('smqtk'), llevel)
    log.debug("Showing debug messages.")

    config = get_default_config()
    config_loaded = False
    if config_path and os.path.isfile(config_path):
        with open(config_path) as f:
            log.info("Loading configuration: %s", config_path)
            config.update(json.load(f))
        config_loaded = True
    output_config(generate_config, config, log, config_overwrite, 100)

    if not config_loaded:
        log.error("No configuration provided")
        exit(101)

    classify_files(config, label, file_globs)
def main():
    parser = bin_utils.SMQTKOptParser()
    parser.add_option("-c", "--config", type=str,
                      help="Path to the configuration file.")
    parser.add_option("-v", "--verbose", action="store_true", default=False,
                      help="Add debugging log messages.")
    opts, args = parser.parse_args()

    bin_utils.initialize_logging(logging.getLogger(),
                                 logging.INFO - (10 * opts.verbose))

    config_file = opts.config
    assert config_file is not None, \
        "Not given a configuration file for the server!"
    assert osp.exists(config_file), "Given config file path does not exist."
    assert not osp.isdir(config_file), "Given config file is a directory!"

    config = SafeConfigCommentParser()
    parsed = config.read(config_file)
    assert parsed, "Configuration file not parsed!"

    section = "server"
    assert config.has_section(section), "No server section found!"
    assert config.has_option(section, "port"), "No port option in config!"
    assert config.has_option(section, "authkey"), \
        "No authkey option in config!"
    port = config.getint(section, "port")
    authkey = config.get(section, "authkey")

    mgr = ProxyManager(("", port), authkey)
    mgr.get_server().serve_forever()
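# The INI file this script parses is not shown here. The assertions above
# only require a [server] section with "port" and "authkey" options, so a
# minimal config could look like the following (hypothetical values):
#
#     [server]
#     port = 5000
#     authkey = changeme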
def main():
    log = logging.getLogger(__name__)
    parser = get_cli_parser()
    args = parser.parse_args()

    config_path = args.config
    generate_config = args.generate_config
    config_overwrite = args.overwrite
    iqr_state_fp = args.iqr_state
    is_debug = args.debug

    llevel = logging.DEBUG if is_debug else logging.INFO
    initialize_logging(logging.getLogger(), llevel)
    log.debug("Showing debug messages.")

    config = get_default_config()
    config_loaded = False
    if config_path and os.path.isfile(config_path):
        with open(config_path) as f:
            log.info("Loading configuration: %s", config_path)
            config.update(json.load(f))
        config_loaded = True
    output_config(generate_config, config, log, config_overwrite, 100)

    if not config_loaded:
        log.error("No configuration provided")
        exit(101)

    if not os.path.isfile(iqr_state_fp):
        log.error("IQR Session info JSON filepath was invalid")
        exit(102)

    train_classifier_iqr(config, iqr_state_fp)
def main(): usage = "%prog [options] GLOB [ GLOB [ ... ] ]" description = "Create a file-based ingest from a set of local file paths " \ "or shell-style glob strings." parser = bin_utils.SMQTKOptParser(usage, description=description) parser.add_option('-s', '--set-label', help="Configured ingest to 'ingest' into.") parser.add_option('-l', '--list-ingests', action='store_true', default=False, help="List available ingests we can ingest new data " "into. See the system_config.json file in the etc " "directory for more details.") parser.add_option('-v', '--verbose', action='store_true', default=False, help='Add debug messaged to output logging.') opts, args = parser.parse_args() bin_utils.initialize_logging(logging.getLogger(), logging.INFO - (10*opts.verbose)) log = logging.getLogger("main") if opts.list_ingests: # Find labels for configured data sets that are of the FileSet type file_ds_labels = [ l for l, dsc in smqtk_config.SYSTEM_CONFIG['DataSets'].iteritems() if dsc['type'] == "DataFileSet" ] log.info("") log.info("Available File-based datasets:") for k in sorted(file_ds_labels): log.info("\t%s", k) log.info("") exit(0) if opts.set_label is None: log.info("") log.info("ERROR: Please provide data set configuration label.") log.info("") exit(1) fds = DataSetConfiguration.new_inst(opts.set_label) log.debug("Script arguments:\n%s" % args) def ingest_file(fp): fds.add_data(DataFileElement(fp)) for f in args: f = osp.expanduser(f) if osp.isfile(f): ingest_file(f) else: log.debug("Expanding glob: %s" % f) for g in glob.glob(f): ingest_file(g)
def main():
    bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
    log = logging.getLogger(__name__)

    # For each file in descriptor vector file tree, load from file
    # [type, uuid, vector] and insert into PSQL element.

    log.info("Setting up parallel environment")
    in_queue = multiprocessing.Queue()
    workers = []
    for i in xrange(multiprocessing.cpu_count()):
        p = multiprocessing.Process(
            target=proc_transfer,
            args=(in_queue,)
        )
        workers.append(p)
        p.start()

    try:
        log.info("Loading filename list")
        with open("descriptor_file_names.5.3mil.pickle") as f:
            fname_list = cPickle.load(f)

        log.info("Running through filename list")
        for n in fname_list:
            m = fname_re.match(n)
            assert m
            type_str = m.group(1)
            uuid_str = m.group(2)
            in_queue.put((type_str, uuid_str))

        log.info("Sending worker terminal packets")
        for w in workers:
            in_queue.put(None)
    except:
        log.info("Terminating workers")
        for w in workers:
            w.terminate()
    finally:
        log.info("Waiting for workers to complete")
        for w in workers:
            w.join()
        log.info("Workers joined")
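# ``proc_transfer`` is defined elsewhere in this script. Given the ``None``
# sentinel packets sent above, a worker of roughly this shape is implied
# (an illustrative sketch, not the actual implementation):


def proc_transfer(in_queue):
    # Consume (type_str, uuid_str) packets until the None sentinel arrives,
    # loading each vector from the file tree and inserting it into the
    # PostgreSQL-backed element (details omitted here).
    while True:
        packet = in_queue.get()
        if packet is None:
            return
        type_str, uuid_str = packet
        # ... load vector for (type_str, uuid_str) and write to PSQL ...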
def main():
    parser = cli_parser()
    args = parser.parse_args()

    logging_level = logging.INFO
    if args.verbose:
        logging_level = logging.DEBUG
    initialize_logging(logging.getLogger("smqtk"), logging_level)

    base_dir = args.base_dir
    interval_seconds = args.interval
    expiry_seconds = args.expiry
    interval_scan(interval_seconds, base_dir, expiry_seconds,
                  remove_file_action)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    bin_utils.initialize_logging(logging.getLogger(),
                                 logging.INFO - (10 * args.verbose))
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if osp.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    # Output configuration dictionary when asked for.
    bin_utils.output_config(args.output_config, config, log, True)

    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    log.debug("Script arguments:\n%s" % args)

    def iter_input_elements():
        for f in args.input_files:
            f = osp.expanduser(f)
            if osp.isfile(f):
                yield DataFileElement(f)
            else:
                log.debug("Expanding glob: %s" % f)
                for g in glob.glob(f):
                    yield DataFileElement(g)

    log.info("Adding elements to data set")
    #: :type: smqtk.representation.DataSet
    ds = plugin.from_plugin_config(config['data_set'], get_data_set_impls())
    ds.add_data(*iter_input_elements())
def main():
    from smqtk.utils.bin_utils import initialize_logging, SMQTKOptParser

    parser = SMQTKOptParser()
    parser.add_option("-d", "--base-dir",
                      help="Starting directory for scan.")
    parser.add_option("-i", "--interval", type=int,
                      help="Number of seconds between each scan (integer).")
    parser.add_option("-e", "--expiry", type=int,
                      help='Number of seconds until a file has "expired" '
                           '(integer).')
    parser.add_option("-v", "--verbose", action="store_true", default=False,
                      help="Display more messages (debugging).")
    opts, args = parser.parse_args()

    logging_level = logging.INFO
    if opts.verbose:
        logging_level = logging.DEBUG
    initialize_logging(logging.getLogger("smqtk"), logging_level)

    base_dir = opts.base_dir
    interval_seconds = opts.interval
    expiry_seconds = opts.expiry
    interval_scan(interval_seconds, base_dir, expiry_seconds,
                  remove_file_action)
def main(): usage = "%prog [options] GLOB [ GLOB [ ... ] ]" description = "Add a set of local system files to a data set via " \ "explicit paths or shell-style glob strings." parser = bin_utils.SMQTKOptParser(usage, description=description) parser.add_option('-c', '--config', help="Path to the JSON configuration file") parser.add_option('--output-config', help="Optional path to output a default configuration " "file to. This output file should be modified and " "used for this executable.") parser.add_option('-v', '--verbose', action='store_true', default=False, help='Add debug messaged to output logging.') opts, args = parser.parse_args() bin_utils.initialize_logging(logging.getLogger(), logging.INFO - (10*opts.verbose)) log = logging.getLogger("main") # output configuration dictionary when asked for. bin_utils.output_config(opts.output_config, default_config(), log) with open(opts.config, 'r') as f: config = json.load(f) #: :type: smqtk.representation.DataSet ds = plugin.from_plugin_config(config['data_set'], get_data_set_impls) log.debug("Script arguments:\n%s" % args) def ingest_file(fp): ds.add_data(DataFileElement(fp)) for f in args: f = osp.expanduser(f) if osp.isfile(f): ingest_file(f) else: log.debug("Expanding glob: %s" % f) for g in glob.glob(f): ingest_file(g)
def main():
    # Print help and exit if no arguments were passed
    if len(sys.argv) == 1:
        get_cli_parser().print_help()
        sys.exit(1)

    args = get_cli_parser().parse_args()
    llevel = logging.INFO if not args.verbose else logging.DEBUG
    initialize_logging(logging.getLogger('smqtk'), llevel)
    initialize_logging(logging.getLogger('__main__'), llevel)
    log = logging.getLogger(__name__)
    log.debug('Showing debug messages.')

    if args.file_list is not None and not os.path.exists(args.file_list):
        log.error('Invalid file list path: %s', args.file_list)
        exit(103)

    def check_image(image_path):
        if not os.path.exists(image_path):
            log.warning('Invalid image path given (does not exist): %s',
                        image_path)
            return False, False
        else:
            d = DataFileElement(image_path)
            return is_valid_element(d, check_image=True), d

    with open(args.file_list) as infile:
        checked_images = parallel.parallel_map(
            check_image, map(str.strip, infile),
            name='check-image-validity',
            use_multiprocessing=True
        )

        for is_valid, dfe in checked_images:
            if dfe:  # in the case of a non-existent file
                if (is_valid and not args.invert) or \
                        (not is_valid and args.invert):
                    # We know the callback above is creating DataFileElement
                    # instances.
                    # noinspection PyProtectedMember
                    print('%s,%s' % (dfe._filepath, dfe.uuid()))
def main():
    args = cli_parser().parse_args()

    initialize_logging(logging.getLogger('smqtk'), logging.DEBUG)
    initialize_logging(logging.getLogger('__main__'), logging.DEBUG)
    log = logging.getLogger(__name__)

    hash2uuids_fp = os.path.abspath(args.hash2uuids_fp)
    bit_len = args.bit_len
    leaf_size = args.leaf_size
    rand_seed = args.rand_seed
    balltree_model_fp = os.path.abspath(args.balltree_model_fp)

    assert os.path.isfile(hash2uuids_fp), "Bad path: '%s'" % hash2uuids_fp
    assert os.path.isdir(os.path.dirname(balltree_model_fp)), \
        "Bad path: %s" % balltree_model_fp

    log.debug("hash2uuids_fp    : %s", hash2uuids_fp)
    log.debug("bit_len          : %d", bit_len)
    log.debug("leaf_size        : %d", leaf_size)
    log.debug("rand_seed        : %d", rand_seed)
    log.debug("balltree_model_fp: %s", balltree_model_fp)

    log.info("Loading hash2uuids table")
    with open(hash2uuids_fp) as f:
        hash2uuids = cPickle.load(f)

    log.info("Computing hash-code vectors")
    hash_vectors = []
    rs = [0] * 7
    for h in hash2uuids:
        hash_vectors.append(int_to_bit_vector_large(h, bit_len))
        report_progress(log.debug, rs, 1.)

    log.info("Initializing ball tree")
    btree = SkLearnBallTreeHashIndex(balltree_model_fp, leaf_size, rand_seed)

    log.info("Building ball tree")
    btree.build_index(hash_vectors)
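# ``int_to_bit_vector_large`` is an SMQTK bit utility; the transform it
# performs is roughly the following (an illustrative reimplementation for
# reference, not the library code):


import numpy


def int_to_bit_vector(h, bits):
    # Unpack integer ``h`` into a boolean vector of width ``bits``,
    # most-significant bit first.
    return numpy.array([bool((h >> i) & 1)
                        for i in range(bits - 1, -1, -1)])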
def main():
    parser = cli_parser()
    args = parser.parse_args()

    debug_smqtk = args.debug_smqtk or args.verbose
    debug_server = args.debug_server or args.verbose

    bin_utils.initialize_logging(logging.getLogger("__main__"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("smqtk"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("werkzeug"),
                                 logging.WARN - (20 * debug_server))
    log = logging.getLogger(__name__)

    web_applications = smqtk.web.get_web_applications()

    if args.list:
        log.info("")
        log.info("Available applications:")
        log.info("")
        for l in web_applications:
            log.info("\t" + l)
        log.info("")
        exit(0)

    application_name = args.application
    if application_name is None:
        log.error("No application name given!")
        exit(1)
    elif application_name not in web_applications:
        log.error("Invalid application label '%s'", application_name)
        exit(1)
    app_class = web_applications[application_name]

    config = bin_utils.utility_main_helper(app_class.get_default_config,
                                           args, skip_logging_init=True)

    host = args.host
    port = args.port and int(args.port)
    use_reloader = args.reload
    use_threading = args.threaded
    use_basic_auth = args.use_basic_auth

    # noinspection PyUnresolvedReferences
    app = app_class.from_config(config)
    if use_basic_auth:
        app.config["BASIC_AUTH_FORCE"] = True
        BasicAuth(app)
    app.config['DEBUG'] = debug_server

    app.run(host=host, port=port, debug=debug_server,
            use_reloader=use_reloader, threaded=use_threading)
def add_descriptors_smallcodes():
    log = logging.getLogger(__name__)

    log.info("Loading descriptor UUIDs")
    with open(UUIDS_FILEPATH) as f:
        descriptor_uuids = cPickle.load(f)

    log.info("Loading ITQ components")
    r = np.load("/data/shared/memex/ht_image_cnn/itq_model/16-bit/"
                "rotation.npy")
    mv = np.load("/data/shared/memex/ht_image_cnn/itq_model/16-bit/"
                 "mean_vec.npy")

    log.info("Making small-codes")
    sc_d_pairs = async_compute_smallcodes(
        r, mv, make_elements_from_uuids(descriptor_uuids)
    )

    log.info("Loading ITQ model")
    itq_index = load_algo()

    log.info("Adding small codes")
    itq_index._code_index.add_many_descriptors(sc_d_pairs)

    return descriptor_uuids, itq_index


if __name__ == "__main__":
    initialize_logging(logging.getLogger(), logging.DEBUG)
    filenames, itq_index = add_descriptors_smallcodes()
CAFFE_LABELS = "labels.txt" # CSV file detailing [cluster_id, ad_id, image_sha1] relationships. EVAL_CLUSTERS_ADS_IMAGES_CSV = "eval.CP1_clusters_ads_images.csv" # json-lines file of clusters missing from the above file. Should be at least # composed of: {"cluster_id": <str>, ... } EVAL_MISSING_CLUSTERS = "eval.cluster_scores.missing_clusters.jl" OUTPUT_DESCR_PROB_INDEX = "cp1_img_prob_descriptors.pickle" OUTPUT_MAX_JL = "cp1_scores_max.jl" OUTPUT_AVG_JL = "cp1_scores_avg.jl" ############################################################################### # Compute classification scores initialize_logging(logging.getLogger('smqtk'), logging.DEBUG) eval_data_set = DataMemorySet(EVAL_DATASET) img_prob_descr_index = MemoryDescriptorIndex(OUTPUT_DESCR_PROB_INDEX) img_prob_gen = CaffeDescriptorGenerator(CAFFE_DEPLOY, CAFFE_MODEL, CAFFE_IMG_MEAN, 'prob', batch_size=1000, use_gpu=True, load_truncated_images=True) img_c_mem_factory = ClassificationElementFactory(MemoryClassificationElement, {}) img_prob_classifier = IndexLabelClassifier(CAFFE_LABELS)
if __name__ == "__main__": p = cli_parser() args = p.parse_args() debug = args.debug config_fp = args.config out_config_fp = args.gen_config completed_files_fp = args.completed_files filelist_fp = args.file_list batch_size = args.batch_size # Initialize logging llevel = debug and logging.DEBUG or logging.INFO if not logging.getLogger("smqtk").handlers: initialize_logging(logging.getLogger("smqtk"), llevel) if not logging.getLogger("__main__").handlers: initialize_logging(logging.getLogger("__main__"), llevel) l = logging.getLogger(__name__) # Merge loaded config with default config_loaded = False c = default_config() if config_fp: if os.path.isfile(config_fp): with open(config_fp) as f: c.update(json.loads(jsmin(f.read()))) config_loaded = True else: l.error("Config file path not valid")
def main(): usage = "%prog [OPTIONS] INPUT_FILE" description = """\ Compute a descriptor vector for a given data file, outputting the generated feature vector to standard out, or to an output file if one was specified (in numpy format). """ parser = bin_utils.SMQTKOptParser(usage, description=description) group_labels = optparse.OptionGroup(parser, "Configuration") group_labels.add_option('-c', '--config', default=None, help='Path to the JSON configuration file.') group_labels.add_option('--output-config', default=None, help='Optional path to output default JSON ' 'configuration to.') parser.add_option_group(group_labels) group_optional = optparse.OptionGroup(parser, "Optional Parameters") group_optional.add_option('--overwrite', action='store_true', default=False, help="Force descriptor computation even if an " "existing descriptor vector was discovered " "based on the given content descriptor type " "and data combination.") group_optional.add_option('-o', '--output-filepath', help='Optional path to a file to output feature ' 'vector to. Otherwise the feature vector is ' 'printed to standard out. Output is saved ' 'in numpy binary format (.npy suffix ' 'recommended).') group_optional.add_option('-v', '--verbose', action='store_true', default=False, help='Print additional debugging messages. All ' 'logging goes to standard error.') parser.add_option_group(group_optional) opts, args = parser.parse_args() output_filepath = opts.output_filepath overwrite = opts.overwrite verbose = opts.verbose llevel = logging.DEBUG if verbose else logging.INFO bin_utils.initialize_logging(logging.getLogger(), llevel) log = logging.getLogger("main") bin_utils.output_config(opts.output_config, default_config(), log) if not opts.config: log.error("No configuration provided") exit(1) elif not os.path.isfile(opts.config): log.error("Configuration file path not valid.") exit(1) if len(args) == 0: log.error("Failed to provide an input file path") exit(1) if len(args) > 1: log.warning("More than one filepath provided as an argument. Only " "computing for the first one.") with open(opts.config, 'r') as f: config = json.load(f) input_filepath = args[0] data_element = DataFileElement(input_filepath) factory = DescriptorElementFactory.from_config(config['descriptor_factory']) #: :type: smqtk.descriptor_generator.DescriptorGenerator cd = plugin.from_plugin_config(config['content_descriptor'], get_descriptor_generator_impls) descr_elem = cd.compute_descriptor(data_element, factory, overwrite) vec = descr_elem.vector() if vec is None: log.error("Failed to generate a descriptor vector for the input data!") if output_filepath: numpy.save(output_filepath, vec) else: # Construct string, because numpy s = [] # noinspection PyTypeChecker for f in vec: s.append('%15f' % f) print ' '.join(s)
def main():
    parser = bin_utils.SMQTKOptParser()
    parser.add_option('-c', '--config', default=None,
                      help='Path to an smqtk configuration extension file '
                           '(a python file).')
    parser.add_option('-a', '--application', default=None,
                      help="Name of the web application to run. Required.")
    parser.add_option('-r', '--reload', action='store_true', default=False,
                      help='Turn on server reloading.')
    parser.add_option('-t', '--threaded', action='store_true', default=False,
                      help="Turn on web searcher threading.")
    parser.add_option('--debug-server', action='store_true', default=False,
                      help='Turn on server debugging messages')
    parser.add_option('--debug-backend', action='store_true', default=False,
                      help='Turn on smqtk backend debugging messages')
    parser.add_option('--host', default=None,
                      help="Run host address specification override. This "
                           "will override all other configuration method "
                           "specifications.")
    parser.add_option('--port', default=None,
                      help="Run port specification override. This will "
                           "override all other configuration method "
                           "specifications.")
    parser.add_option("--use-basic-auth", action="store_true", default=False,
                      help="Use global basic authentication as configured.")
    parser.add_option('-l', '--list', default=False, action="store_true",
                      help="List currently available applications for "
                           "running.")
    opts, args = parser.parse_args()

    bin_utils.initialize_logging(logging.getLogger("smqtk"),
                                 logging.INFO - (10 * opts.debug_backend))
    bin_utils.initialize_logging(logging.getLogger("werkzeug"),
                                 logging.WARN - (20 * opts.debug_server))
    log = logging.getLogger("smqtk.main")

    if opts.list:
        from smqtk.web import APPLICATIONS
        log.info("")
        log.info("Available applications:")
        log.info("")
        for e in APPLICATIONS:
            log.info("\t%s" % e.__name__)
        log.info("")
        exit(0)

    host = opts.host
    port = opts.port and int(opts.port)
    debug_server = opts.debug_server
    use_reloader = opts.reload
    use_threading = opts.threaded
    application_name = opts.application
    use_basic_auth = opts.use_basic_auth

    if application_name is None:
        raise ValueError("No application name given!")

    import smqtk.web
    # noinspection PyPep8Naming
    App = getattr(smqtk.web, application_name, None)
    if App is None:
        raise ValueError("No available application by the name of '%s'"
                         % application_name)

    app = App(opts.config)
    if use_basic_auth:
        app.config["BASIC_AUTH_FORCE"] = True
        BasicAuth(app)
    app.config['DEBUG'] = debug_server

    app.run(host=host, port=port, debug=debug_server,
            use_reloader=use_reloader, threaded=use_threading)
import json
import logging

from smqtk import algorithms
from smqtk import representation
from smqtk.utils import bin_utils, jsmin, plugin

__author__ = '*****@*****.**'

#
# Setup logging
#
if not logging.getLogger().handlers:
    bin_utils.initialize_logging(logging.getLogger(), logging.INFO)

#
# Input parameters
#
# The following dictionaries are JSON configurations that are used to
# configure the various data structures and algorithms needed for the IQR
# demo application. Values here can be changed to suit your specific data
# and algorithm needs.
#
# See algorithm implementation doc-strings for more information on
# configuration parameters (see implementation class ``__init__`` method).
#

search_app_config_filepath = "/Users/purg/dev/smqtk/source/python/smqtk/web/" \
                             "search_app/config.IqrSearchApp.json"
from smqtk.utils.file_utils import safe_create_dir
from smqtk.utils.parallel import parallel_map


################################################################################
# PARAMETERS

# Confirmed there are no conflicting truth labels on a CDR and URL basis
ad_image_csv = "ad-images.source.url_ad_label.csv"
ad_phone_csv = "ad-images.source.ad_phone.csv"
image_output_dir = "ad-images"

################################################################################

initialize_logging(logging.getLogger('__main__'), logging.INFO)
initialize_logging(logging.getLogger('smqtk'), logging.INFO)
log = logging.getLogger(__name__)

if '.jfif' in mimetypes.types_map:
    del mimetypes.types_map['.jfif']
if '.jpe' in mimetypes.types_map:
    del mimetypes.types_map['.jpe']


def dl_ad_image(url, output_dir):
    """
    Returns (None, None, None) if failed, otherwise (url, filepath, sha1)
    """
    log = logging.getLogger(__name__)
import csv
import json
import logging

from matplotlib import pyplot as plt
import numpy
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, \
    roc_curve

from smqtk.algorithms import get_classifier_impls
from smqtk.representation import ClassificationElementFactory
from smqtk.representation.classification_element.memory import \
    MemoryClassificationElement
from smqtk.representation.descriptor_index.memory import \
    MemoryDescriptorIndex
from smqtk.utils.bin_utils import initialize_logging
from smqtk.utils.plugin import from_plugin_config


initialize_logging(logging.getLogger(), logging.INFO)
log = logging.getLogger(__name__)


###############################################################################
# Parameters
#
PHONE_SHA1_JSON = "eval.map.phone2shas.json"
DESCRIPTOR_INDEX_FILE_CACHE = "eval.images.descriptors.alexnet_fc7.index"
CLASSIFIER_TRAINING_CONFIG_JSON = 'ad-images.final.cmv.train.json'
PHONE2SCORE_OUTPUT_FILEPATH = "eval.results.full_model.phone2score.csv"

# Optional for ROC generation, using PHONE2SCORE_OUTPUT_FILEPATH as input, and
# outputting plots
PHONE2TRUTH = 'eval.source.phone2truth.json'
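# Given the sklearn/matplotlib imports above, the optional ROC step
# presumably pairs truth labels with classifier scores per phone roughly as
# follows (a minimal sketch with made-up data and a hypothetical output
# filename, not the script's actual plotting code):

# Hypothetical truth labels and classifier scores, one entry per phone.
y_true = [1, 0, 1, 1, 0]
y_score = [0.9, 0.2, 0.7, 0.4, 0.3]

fpr, tpr, _ = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label="ROC (AUC=%.3f)" % roc_auc)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.savefig("roc.png")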
def main(): usage = "%prog [OPTIONS] INPUT_FILE" description = "Compute a feature vector for a given data file, outputting " \ "the generated feature vector to standard out, or to an " \ "output file if one was specified.\n" \ "\n" \ "An ingest " \ "configuration must be specified for the purpose of " \ "identifying which model files to use (assuming a given " \ "descriptor has/uses model files). The ingest configuration " \ "also informs where to put temporary working files. " parser = bin_utils.SMQTKOptParser(usage, description=description) parser.add_option('-c', '--content-descriptor', help='The descriptor type to use. This must be a type ' 'available in system configuration') parser.add_option('-o', '--output-filepath', help='Optional path to a file to output feature vector ' 'to. Otherwise the feature vector is printed to ' 'standard out. Output is saved in numpy binary ' 'format (.npy suffix recommended).') parser.add_option('-l', '--list', action='store_true', default=False, help='List available descriptor types.') parser.add_option('-v', '--verbose', action='store_true', default=False, help='Print additional debugging messages. All logging ' 'goes to standard error.') opts, args = parser.parse_args() output_filepath = opts.output_filepath descriptor_type = opts.content_descriptor verbose = opts.verbose llevel = logging.DEBUG if verbose else logging.INFO bin_utils.initialize_logging(logging.getLogger(), llevel) log = logging.getLogger("main") if opts.list: log.info("") log.info("Available ContentDescriptor types:") log.info("") for dl in ContentDescriptorConfiguration.available_labels(): log.info("\t%s", dl) log.info("") exit(0) if len(args) == 0: log.error("Failed to provide an input file path") exit(1) if len(args) > 1: log.warning("More than one filepath provided as an argument. Only " "computing for the first one.") input_filepath = args[0] data_element = DataFileElement(input_filepath) fd = ContentDescriptorConfiguration.new_inst(descriptor_type) feat = fd.compute_descriptor(data_element) if output_filepath: numpy.save(output_filepath, feat) else: # Construct string, because numpy s = [] for f in feat: s.append('%15f' % f) print ' '.join(s)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    output_filepath = args.output_filepath
    overwrite = args.overwrite
    verbose = args.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    bin_utils.output_config(args.output_config, config, log, True)

    # Configuration must have been loaded at this point since we can't
    # normally trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    if not args.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(args.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    input_filepath = args.input_file
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(
        config['descriptor_factory'])
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls())
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input "
                  "data!")

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        s = []
        # noinspection PyTypeChecker
        for f in vec:
            s.append('%15f' % f)
        print ' '.join(s)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    debug_smqtk = args.debug_smqtk or args.verbose
    debug_server = args.debug_server or args.verbose

    bin_utils.initialize_logging(logging.getLogger("__main__"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("smqtk"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("werkzeug"),
                                 logging.WARN - (20 * debug_server))
    log = logging.getLogger(__name__)

    web_applications = smqtk.web.get_web_applications()

    if args.list:
        log.info("")
        log.info("Available applications:")
        log.info("")
        for l, cls in six.iteritems(web_applications):
            log.info("\t" + l)
            if debug_smqtk:
                log.info('\t' + ('^' * len(l)) + '\n' +
                         cls.__doc__ + '\n' +
                         ('*' * 80) + '\n')
        log.info("")
        exit(0)

    application_name = args.application
    if application_name is None:
        log.error("No application name given!")
        exit(1)
    elif application_name not in web_applications:
        log.error("Invalid application label '%s'", application_name)
        exit(1)
    #: :type: smqtk.web.SmqtkWebApp
    app_class = web_applications[application_name]

    config = bin_utils.utility_main_helper(app_class.get_default_config,
                                           args, skip_logging_init=True)

    host = args.host
    port = args.port and int(args.port)
    use_reloader = args.reload
    use_threading = args.threaded
    use_basic_auth = args.use_basic_auth

    # noinspection PyUnresolvedReferences
    #: :type: smqtk.web.SmqtkWebApp
    app = app_class.from_config(config)
    if use_basic_auth:
        app.config["BASIC_AUTH_FORCE"] = True
        BasicAuth(app)
    app.config['DEBUG'] = debug_server

    log.info("Starting application")
    app.run(host=host, port=port, debug=debug_server,
            use_reloader=use_reloader, threaded=use_threading)
def main():
    import optparse

    description = \
        "Generate the model for the given indexer type, using features " \
        "from the given feature descriptor type. We use configured values " \
        "in the smqtk_config module and from the system configuration JSON " \
        "file (etc/system_config.json) unless otherwise specified by " \
        "options to this script. Specific ingest used is determined by the " \
        "ingest type provided (-t/--type)."
    parser = bin_utils.SMQTKOptParser(description=description)
    group_required = optparse.OptionGroup(parser, "Required Options")
    group_optional = optparse.OptionGroup(parser, "Optional")

    group_required.add_option('-d', '--data-set',
                              help="Data set to use for model generation.")
    group_required.add_option('-c', '--content-descriptor',
                              help="Feature descriptor type for model and "
                                   "feature generation.")
    group_required.add_option('-i', '--indexer',
                              help="(Optional) Indexer type for model "
                                   "generation.")

    group_optional.add_option('--sys-json',
                              help="Custom system configuration JSON file to "
                                   "use. Otherwise we use the one specified "
                                   "in the smqtk_config module.")
    group_optional.add_option('-l', '--list', action='store_true',
                              default=False,
                              help="List available ingest configurations. If "
                                   "a valid ingest configuration has been "
                                   "specified, we list available "
                                   "FeatureDetector and Indexer "
                                   "configurations.")
    group_optional.add_option('-t', '--threads', type=int, default=None,
                              help='Number of threads/processes to use for '
                                   'processing. By default we use all '
                                   'available cores/threads.')
    group_optional.add_option('-v', '--verbose', action='store_true',
                              default=False,
                              help='Add debug messages to output logging.')

    parser.add_option_group(group_required)
    parser.add_option_group(group_optional)
    opts, args = parser.parse_args()

    bin_utils.initialize_logging(logging.getLogger(),
                                 logging.INFO - (10 * opts.verbose))
    log = logging.getLogger("main")

    dset_label = opts.data_set
    cd_label = opts.content_descriptor
    idxr_label = opts.indexer
    parallel = opts.threads

    # Prep custom JSON configuration if one was given
    if opts.sys_json:
        with open(opts.sys_json) as json_file:
            json_config = json.loads(jsmin(json_file.read()))
        ConfigurationInterface.BASE_CONFIG = json_config['Ingests']

    if opts.list:
        log.info("")
        log.info("Available Data Sets:")
        log.info("")
        for l in DataSetConfiguration.available_labels():
            log.info("\t%s" % l)
        log.info("")
        log.info("Available ContentDescriptor types:")
        log.info("")
        for l in ContentDescriptorConfiguration.available_labels():
            log.info("\t%s" % l)
        log.info("")
        log.info("Available Indexer types:")
        log.info("")
        for l in IndexerConfiguration.available_labels():
            log.info("\t%s", l)
        log.info("")
        exit(0)

    # Check given labels
    fail = False
    if dset_label and \
            dset_label not in DataSetConfiguration.available_labels():
        log.error("Given label '%s' is NOT associated to an existing "
                  "data set configuration!", dset_label)
        fail = True
    if cd_label and \
            cd_label not in ContentDescriptorConfiguration.available_labels():
        log.error("Given label '%s' is NOT associated to an existing "
                  "content descriptor configuration!", cd_label)
        fail = True
    if idxr_label and \
            idxr_label not in IndexerConfiguration.available_labels():
        log.error("Given label '%s' is NOT associated to an existing "
                  "indexer configuration!", idxr_label)
        fail = True
    if fail:
        exit(1)
    del fail

    log.info("Loading data-set instance...")
    #: :type: DataIngest or VideoIngest
    dset = DataSetConfiguration.new_inst(dset_label)

    log.info("Loading descriptor instance...")
    #: :type: smqtk.content_description.ContentDescriptor
    descriptor = ContentDescriptorConfiguration.new_inst(cd_label)
    # Generate any model files needed by the chosen descriptor
    descriptor.PARALLEL = parallel
    descriptor.generate_model(dset)

    # Don't do indexer model generation if a type was not provided
    if idxr_label:
        log.info("Loading indexer instance...")
        #: :type: smqtk.indexing.Indexer
        indexer = IndexerConfiguration.new_inst(idxr_label)

        # It is not guaranteed that the feature computation method is doing
        # anything in parallel, but if it is, request that it perform
        # serially in order to allow multiple high-level feature computation
        # jobs, else we could be overrun with threads.
        descriptor.PARALLEL = 1
        # Using NonDaemonicPool because content_description implementations
        # that do parallel processing might use multiprocessing.Pool
        # instances, too. Pools don't usually allow daemonic processes, so
        # this custom top-level pool allows worker processes to spawn pools
        # themselves.
        fmap = descriptor.compute_descriptor_async(
            dset,
            parallel=parallel,
            pool_type=NonDaemonicPool
        )
        indexer.generate_model(fmap, parallel=parallel)
def main():
    parser = bin_utils.SMQTKOptParser()
    setup_cli(parser)
    opts, args = parser.parse_args()

    debug_smqtk = opts.debug_smqtk
    debug_server = opts.debug_server

    bin_utils.initialize_logging(logging.getLogger("smqtk"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("werkzeug"),
                                 logging.WARN - (20 * debug_server))
    log = logging.getLogger("smqtk.main")

    web_applications = smqtk.web.get_web_applications()

    if opts.list:
        log.info("")
        log.info("Available applications:")
        log.info("")
        for l in web_applications:
            log.info("\t" + l)
        log.info("")
        exit(0)

    application_name = opts.application
    if application_name is None:
        log.error("No application name given!")
        exit(1)
    elif application_name not in web_applications:
        log.error("Invalid application label '%s'", application_name)
        exit(1)
    app_class = web_applications[application_name]

    # Output config and exit if requested
    bin_utils.output_config(opts.output_config,
                            app_class.get_default_config(), log,
                            opts.overwrite)

    if not opts.config:
        log.error("No configuration provided")
        exit(1)
    elif not os.path.isfile(opts.config):
        log.error("Configuration file path not valid.")
        exit(1)

    with open(opts.config, 'r') as f:
        config = json.loads(jsmin(f.read()))

    host = opts.host
    port = opts.port and int(opts.port)
    use_reloader = opts.reload
    use_threading = opts.threaded
    use_basic_auth = opts.use_basic_auth

    # noinspection PyUnresolvedReferences
    app = app_class.from_config(config)
    if use_basic_auth:
        app.config["BASIC_AUTH_FORCE"] = True
        BasicAuth(app)
    app.config['DEBUG'] = debug_server

    app.run(host=host, port=port, debug=debug_server,
            use_reloader=use_reloader, threaded=use_threading)
import json
import logging

from smqtk import algorithms
from smqtk import representation
from smqtk.utils import bin_utils, jsmin, plugin

__author__ = '*****@*****.**'

#
# Setup logging
#
if not logging.getLogger().handlers:
    bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

#
# Input parameters
#
# The following dictionaries are JSON configurations that are used to
# configure the various data structures and algorithms needed for the IQR
# demo application. Values here can be changed to suit your specific data
# and algorithm needs.
#
# See algorithm implementation doc-strings for more information on
# configuration parameters (see implementation class ``__init__`` method).
#

search_app_config_filepath = "/Users/purg/dev/smqtk/source/python/smqtk/web/" \
                             "search_app/config.IqrSearchApp.json"
        # for line in chunk_file:
        #     fpath = line.rstrip()
        #     log.debug("Async processing filepath: %s", fpath)
        #     pool.apply_async(process_file, args=(fpath,))
        file_paths = [line.rstrip() for line in chunk_file]
        pool.map(process_file, file_paths)

        pool.close()
        pool.join()
        del pool

        mark_stage(stage_label)
    else:
        log.info("'%s' already complete", stage_label)

    stage_label = osp.basename(chunk_file_path) + '-cleanup'
    if not check_stage(stage_label):
        log.info("Cleaning work tree for chunk '%s'", chunk_file_path)
        if osp.isdir(CLEAN_WORK_DIR):
            shutil.rmtree(CLEAN_WORK_DIR)
        mark_stage(stage_label)
    else:
        log.info("'%s' already complete", stage_label)


if __name__ == '__main__':
    initialize_logging(logging.getLogger(), logging.INFO)
    run()
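# ``check_stage`` and ``mark_stage`` are defined elsewhere in this script.
# The checkpointing pattern above implies helpers of roughly this shape (an
# illustrative sketch assuming a marker-file directory, here called
# STAGE_DIR, which is hypothetical):


def check_stage(label):
    # A stage is complete if its marker file exists.
    return osp.isfile(osp.join(STAGE_DIR, label))


def mark_stage(label):
    # Record stage completion by touching a marker file.
    if not osp.isdir(STAGE_DIR):
        os.makedirs(STAGE_DIR)
    open(osp.join(STAGE_DIR, label), 'w').close()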
def main(): usage = "%prog [OPTIONS] INPUT_FILE" description = """\ Compute a descriptor vector for a given data file, outputting the generated feature vector to standard out, or to an output file if one was specified (in numpy format). """ parser = bin_utils.SMQTKOptParser(usage, description=description) group_labels = optparse.OptionGroup(parser, "Configuration Labels") group_labels.add_option('-c', '--content-descriptor', help='The descriptor type to use. This must be a ' 'type available in the system configuration') group_labels.add_option('-f', '--factory-type', help='The DescriptorElementFactory configuration ' 'to use when computing the descriptor. This ' 'must be a type available in the system ' 'configuration.') parser.add_option_group(group_labels) group_optional = optparse.OptionGroup(parser, "Optional Parameters") group_optional.add_option('-l', '--list', action='store_true', default=False, help='List available descriptor types.') group_optional.add_option('--overwrite', action='store_true', default=False, help="Force descriptor computation even if an " "existing descriptor vector was discovered " "based on the given content descriptor type " "and data combination.") group_optional.add_option('-o', '--output-filepath', help='Optional path to a file to output feature ' 'vector to. Otherwise the feature vector is ' 'printed to standard out. Output is saved ' 'in numpy binary format (.npy suffix ' 'recommended).') group_optional.add_option('-v', '--verbose', action='store_true', default=False, help='Print additional debugging messages. All ' 'logging goes to standard error.') parser.add_option_group(group_optional) opts, args = parser.parse_args() output_filepath = opts.output_filepath descriptor_label = opts.content_descriptor factory_label = opts.factory_type overwrite = opts.overwrite verbose = opts.verbose llevel = logging.DEBUG if verbose else logging.INFO bin_utils.initialize_logging(logging.getLogger(), llevel) log = logging.getLogger("main") if opts.list: log.info("") log.info("Available ContentDescriptor types:") log.info("") for dl in ContentDescriptorConfiguration.available_labels(): log.info("\t%s", dl) log.info("") log.info("Available DescriptorElementFactory types:") log.info("") for df in DescriptorFactoryConfiguration.available_labels(): log.info("\t%s", df) log.info("") exit(0) if len(args) == 0: log.error("Failed to provide an input file path") exit(1) if len(args) > 1: log.warning("More than one filepath provided as an argument. Only " "computing for the first one.") input_filepath = args[0] data_element = DataFileElement(input_filepath) cd = ContentDescriptorConfiguration.new_inst(descriptor_label) factory = DescriptorFactoryConfiguration.new_inst(factory_label) descr_elem = cd.compute_descriptor(data_element, factory, overwrite) vec = descr_elem.vector() if vec is None: log.error("Failed to generate a descriptor vector for the input data!") if output_filepath: numpy.save(output_filepath, vec) else: # Construct string, because numpy s = [] # noinspection PyTypeChecker for f in vec: s.append('%15f' % f) print ' '.join(s)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    #
    # Setup logging
    #
    if not logging.getLogger().handlers:
        if args.verbose:
            bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
        else:
            bin_utils.initialize_logging(logging.getLogger(), logging.INFO)
    log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

    search_app_config = json.loads(jsmin.jsmin(open(args.config).read()))

    #
    # Input parameters
    #
    # The following dictionaries are JSON configurations that are used to
    # configure the various data structures and algorithms needed for the
    # IQR demo application. Values here can be changed to suit your specific
    # data and algorithm needs.
    #
    # See algorithm implementation doc-strings for more information on
    # configuration parameters (see implementation class ``__init__``
    # method).
    #

    # Base actions on a specific IQR tab configuration (choose index here)
    if args.tab < 0 or args.tab > (len(search_app_config["iqr_tabs"]) - 1):
        log.error("Invalid tab number provided.")
        exit(1)
    search_app_iqr_config = search_app_config["iqr_tabs"][args.tab]

    # Configure DataSet implementation and parameters
    data_set_config = search_app_iqr_config['data_set']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    descriptor_generator_config = search_app_iqr_config['descr_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters
    # and persistent model component locations (if implementation has any).
    nn_index_config = search_app_iqr_config['nn_index']

    # Configure RelevancyIndex algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    #
    # The LibSvmHikRelevancyIndex implementation doesn't actually build a
    # persistent model (or doesn't have to, that is), but we're leaving this
    # block here in anticipation of other potential implementations in the
    # future.
    #
    rel_index_config = search_app_iqr_config['rel_index_config']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = search_app_iqr_config[
        'descriptor_factory']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for
    # the IQR demo application, in preparation for model training.
    #
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)
    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(
            descriptor_generator_config,
            algorithms.get_descriptor_generator_impls())
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())
    #: :type: algorithms.RelevancyIndex
    rel_index = \
        plugin.from_plugin_config(rel_index_config,
                                  algorithms.get_relevancy_index_impls())

    #
    # Build models
    #
    # Perform the actual building of the models.
    #

    # Add data files to DataSet
    DataFileElement = representation.get_data_element_impls()[
        "DataFileElement"]
    for fp in args.input_files:
        fp = osp.expanduser(fp)
        if osp.isfile(fp):
            data_set.add_data(DataFileElement(fp))
        else:
            log.debug("Expanding glob: %s" % fp)
            for g in glob.iglob(fp):
                data_set.add_data(DataFileElement(g))

    # Generate a model if the generator defines a known generation method.
    if hasattr(descriptor_generator, "generate_model"):
        descriptor_generator.generate_model(data_set)
    # Add other if-else cases for other known implementation-specific
    # generation method stubs.

    # Generate descriptors of data for building NN index.
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory)

    try:
        nn_index.build_index(six.itervalues(data2descriptor))
    except RuntimeError:
        # Already built model, so skipping this step
        pass

    rel_index.build_index(six.itervalues(data2descriptor))
def main():
    parser = cli_parser()
    args = parser.parse_args()

    output_filepath = args.output_filepath
    overwrite = args.overwrite
    verbose = args.verbose

    llevel = logging.DEBUG if verbose else logging.INFO
    bin_utils.initialize_logging(logging.getLogger(), llevel)
    log = logging.getLogger("main")

    # Merge loaded config with default
    config_loaded = False
    config = default_config()
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    bin_utils.output_config(args.output_config, config, log, True)

    # Configuration must have been loaded at this point since we can't
    # normally trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    if not args.input_file:
        log.error("Failed to provide an input file path")
        exit(1)
    elif not os.path.isfile(args.input_file):
        log.error("Given path does not point to a file.")
        exit(1)

    input_filepath = args.input_file
    data_element = DataFileElement(input_filepath)

    factory = DescriptorElementFactory.from_config(
        config['descriptor_factory'])
    #: :type: smqtk.algorithms.descriptor_generator.DescriptorGenerator
    cd = plugin.from_plugin_config(config['content_descriptor'],
                                   get_descriptor_generator_impls)
    descr_elem = cd.compute_descriptor(data_element, factory, overwrite)
    vec = descr_elem.vector()

    if vec is None:
        log.error("Failed to generate a descriptor vector for the input "
                  "data!")

    if output_filepath:
        numpy.save(output_filepath, vec)
    else:
        # Construct string, because numpy
        s = []
        # noinspection PyTypeChecker
        for f in vec:
            s.append('%15f' % f)
        print ' '.join(s)
def main():
    parser = cli_parser()
    args = parser.parse_args()

    debug_smqtk = args.debug_smqtk
    debug_server = args.debug_server

    bin_utils.initialize_logging(logging.getLogger("smqtk"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("werkzeug"),
                                 logging.WARN - (20 * debug_server))
    log = logging.getLogger("smqtk.main")

    web_applications = smqtk.web.get_web_applications()

    if args.list:
        log.info("")
        log.info("Available applications:")
        log.info("")
        for l in web_applications:
            log.info("\t" + l)
        log.info("")
        exit(0)

    application_name = args.application
    if application_name is None:
        log.error("No application name given!")
        exit(1)
    elif application_name not in web_applications:
        log.error("Invalid application label '%s'", application_name)
        exit(1)
    app_class = web_applications[application_name]

    # Merge loaded config with default
    config_loaded = False
    config = app_class.get_default_config()
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    # Output config and exit if requested
    bin_utils.output_config(args.output_config, config, log, args.overwrite)

    # Configuration must have been loaded at this point since we can't
    # normally trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    host = args.host
    port = args.port and int(args.port)
    use_reloader = args.reload
    use_threading = args.threaded
    use_basic_auth = args.use_basic_auth

    # noinspection PyUnresolvedReferences
    app = app_class.from_config(config)
    if use_basic_auth:
        app.config["BASIC_AUTH_FORCE"] = True
        BasicAuth(app)
    app.config['DEBUG'] = debug_server

    app.run(host=host, port=port, debug=debug_server,
            use_reloader=use_reloader, threaded=use_threading)
if __name__ == "__main__": p = cli_parser() args = p.parse_args() debug = args.debug config_fp = args.config out_config_fp = args.gen_config completed_files_fp = args.completed_files filelist_fp = args.file_list batch_size = args.batch_size # Initialize logging llevel = debug and logging.DEBUG or logging.INFO if not logging.getLogger('smqtk').handlers: initialize_logging(logging.getLogger('smqtk'), llevel) if not logging.getLogger('__main__').handlers: initialize_logging(logging.getLogger('__main__'), llevel) l = logging.getLogger(__name__) # Merge loaded config with default config_loaded = False c = default_config() if config_fp: if os.path.isfile(config_fp): with open(config_fp) as f: c.update(json.loads(jsmin(f.read()))) config_loaded = True else: l.error("Config file path not valid")
def main():
    parser = cli_parser()
    args = parser.parse_args()

    #
    # Setup logging
    #
    if not logging.getLogger().handlers:
        if args.verbose:
            bin_utils.initialize_logging(logging.getLogger(), logging.DEBUG)
        else:
            bin_utils.initialize_logging(logging.getLogger(), logging.INFO)
    log = logging.getLogger("smqtk.scripts.iqr_app_model_generation")

    search_app_config = json.loads(jsmin.jsmin(open(args.config).read()))

    #
    # Input parameters
    #
    # The following dictionaries are JSON configurations that are used to
    # configure the various data structures and algorithms needed for the
    # IQR demo application. Values here can be changed to suit your specific
    # data and algorithm needs.
    #
    # See algorithm implementation doc-strings for more information on
    # configuration parameters (see implementation class ``__init__``
    # method).
    #

    # Base actions on a specific IQR tab configuration (choose index here)
    if args.tab < 0 or args.tab > (len(search_app_config["iqr_tabs"]) - 1):
        log.error("Invalid tab number provided.")
        exit(1)
    search_app_iqr_config = search_app_config["iqr_tabs"][args.tab]

    # Configure DataSet implementation and parameters
    data_set_config = search_app_iqr_config['data_set']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    descriptor_generator_config = search_app_iqr_config['descr_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters
    # and persistent model component locations (if implementation has any).
    nn_index_config = search_app_iqr_config['nn_index']

    # Configure RelevancyIndex algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    #
    # The LibSvmHikRelevancyIndex implementation doesn't actually build a
    # persistent model (or doesn't have to, that is), but we're leaving this
    # block here in anticipation of other potential implementations in the
    # future.
    #
    rel_index_config = search_app_iqr_config['rel_index_config']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = search_app_iqr_config[
        'descriptor_factory']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for
    # the IQR demo application, in preparation for model training.
    #
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)
    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls)
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(descriptor_generator_config,
                                  algorithms.get_descriptor_generator_impls)
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls)
    #: :type: algorithms.RelevancyIndex
    rel_index = \
        plugin.from_plugin_config(rel_index_config,
                                  algorithms.get_relevancy_index_impls)

    #
    # Build models
    #
    # Perform the actual building of the models.
    #

    # Add data files to DataSet
    DataFileElement = representation.get_data_element_impls()[
        "DataFileElement"]
    for fp in args.input_files:
        fp = osp.expanduser(fp)
        if osp.isfile(fp):
            data_set.add_data(DataFileElement(fp))
        else:
            log.debug("Expanding glob: %s" % fp)
            for g in glob.iglob(fp):
                data_set.add_data(DataFileElement(g))

    # Generate a model if the generator defines a known generation method.
    if hasattr(descriptor_generator, "generate_model"):
        descriptor_generator.generate_model(data_set)
    # Add other if-else cases for other known implementation-specific
    # generation method stubs.

    # Generate descriptors of data for building NN index.
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory
    )

    try:
        nn_index.build_index(data2descriptor.itervalues())
    except RuntimeError:
        # Already built model, so skipping this step
        pass

    rel_index.build_index(data2descriptor.itervalues())
def main():
    args = cli_parser().parse_args()

    ui_config_filepath, iqr_config_filepath = args.config
    llevel = logging.DEBUG if args.verbose else logging.INFO
    tab = args.tab
    input_files_globs = args.input_files

    # Not using ``bin_utils.utility_main_helper`` due to deviating from
    # single-config-with-default usage.
    bin_utils.initialize_logging(logging.getLogger('smqtk'), llevel)
    bin_utils.initialize_logging(logging.getLogger('__main__'), llevel)
    log = logging.getLogger(__name__)

    log.info("Loading UI config: '{}'".format(ui_config_filepath))
    ui_config, ui_config_loaded = bin_utils.load_config(ui_config_filepath)
    log.info("Loading IQR config: '{}'".format(iqr_config_filepath))
    iqr_config, iqr_config_loaded = \
        bin_utils.load_config(iqr_config_filepath)
    if not (ui_config_loaded and iqr_config_loaded):
        raise RuntimeError("One or both configuration files failed to "
                           "load.")

    # Ensure the given "tab" exists in UI configuration.
    if tab is None:
        log.error("No configuration tab provided to drive model "
                  "generation.")
        exit(1)
    if tab not in ui_config["iqr_tabs"]:
        log.error("Invalid tab provided: '{}'. Available tabs: {}"
                  .format(tab, list(ui_config["iqr_tabs"])))
        exit(1)

    #
    # Gather Configurations
    #
    log.info("Extracting plugin configurations")

    ui_tab_config = ui_config["iqr_tabs"][tab]
    iqr_plugins_config = iqr_config['iqr_service']['plugins']

    # Configure DataSet implementation and parameters
    data_set_config = ui_tab_config['data_set']

    # Configure DescriptorElementFactory instance, which defines what
    # implementation of DescriptorElement to use for storing generated
    # descriptor vectors below.
    descriptor_elem_factory_config = iqr_plugins_config['descriptor_factory']

    # Configure DescriptorGenerator algorithm implementation, parameters and
    # persistent model component locations (if implementation has any).
    descriptor_generator_config = iqr_plugins_config['descriptor_generator']

    # Configure NearestNeighborIndex algorithm implementation, parameters
    # and persistent model component locations (if implementation has any).
    nn_index_config = iqr_plugins_config['neighbor_index']

    #
    # Initialize data/algorithms
    #
    # Constructing appropriate data structures and algorithms, needed for
    # the IQR demo application, in preparation for model training.
    #
    log.info("Instantiating plugins")
    #: :type: representation.DataSet
    data_set = \
        plugin.from_plugin_config(data_set_config,
                                  representation.get_data_set_impls())
    descriptor_elem_factory = \
        representation.DescriptorElementFactory \
        .from_config(descriptor_elem_factory_config)
    #: :type: algorithms.DescriptorGenerator
    descriptor_generator = \
        plugin.from_plugin_config(
            descriptor_generator_config,
            algorithms.get_descriptor_generator_impls())
    #: :type: algorithms.NearestNeighborsIndex
    nn_index = \
        plugin.from_plugin_config(nn_index_config,
                                  algorithms.get_nn_index_impls())

    #
    # Build models
    #
    log.info("Adding files to dataset '{}'".format(data_set))
    for g in input_files_globs:
        g = osp.expanduser(g)
        if osp.isfile(g):
            data_set.add_data(DataFileElement(g, readonly=True))
        else:
            log.debug("Expanding glob: %s" % g)
            for fp in glob.iglob(g):
                data_set.add_data(DataFileElement(fp, readonly=True))

    # Generate a model if the generator defines a known generation method.
    try:
        log.debug("Descriptor generator has model to generate?")
        descriptor_generator.generate_model(data_set)
    except AttributeError as ex:
        log.debug("Descriptor generator has model to generate - Nope: {}"
                  .format(str(ex)))

    # Generate descriptors of data for building NN index.
    log.info("Computing descriptors for data set with {}"
             .format(descriptor_generator))
    data2descriptor = descriptor_generator.compute_descriptor_async(
        data_set, descriptor_elem_factory
    )

    # Possible additional support steps before building NNIndex
    try:
        # Fit the LSH index functor
        log.debug("Has LSH Functor to fit?")
        nn_index.lsh_functor.fit(six.itervalues(data2descriptor))
    except AttributeError as ex:
        log.debug("Has LSH Functor to fit - Nope: {}".format(str(ex)))

    log.info("Building nearest neighbors index {}".format(nn_index))
    nn_index.build_index(six.itervalues(data2descriptor))
def main():
    parser = cli_parser()
    args = parser.parse_args()

    debug_smqtk = args.debug_smqtk
    debug_server = args.debug_server

    bin_utils.initialize_logging(logging.getLogger("smqtk"),
                                 logging.INFO - (10 * debug_smqtk))
    bin_utils.initialize_logging(logging.getLogger("werkzeug"),
                                 logging.WARN - (20 * debug_server))
    log = logging.getLogger("smqtk.main")

    web_applications = smqtk.web.get_web_applications()

    if args.list:
        log.info("")
        log.info("Available applications:")
        log.info("")
        for l in web_applications:
            log.info("\t" + l)
        log.info("")
        exit(0)

    application_name = args.application
    if application_name is None:
        log.error("No application name given!")
        exit(1)
    elif application_name not in web_applications:
        log.error("Invalid application label '%s'", application_name)
        exit(1)
    app_class = web_applications[application_name]

    # Merge loaded config with default
    config_loaded = False
    config = app_class.get_default_config()
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, 'r') as f:
                config.update(json.load(f))
            config_loaded = True
        else:
            log.error("Configuration file path not valid.")
            exit(1)

    # Output config and exit if requested
    bin_utils.output_config(args.output_config, config, log, args.overwrite)

    # Configuration must have been loaded at this point since we can't
    # normally trust the default.
    if not config_loaded:
        log.error("No configuration provided")
        exit(1)

    host = args.host
    port = args.port and int(args.port)
    use_reloader = args.reload
    use_threading = args.threaded
    use_basic_auth = args.use_basic_auth

    # noinspection PyUnresolvedReferences
    app = app_class.from_config(config)
    if use_basic_auth:
        app.config["BASIC_AUTH_FORCE"] = True
        BasicAuth(app)
    app.config['DEBUG'] = debug_server

    app.run(host=host, port=port, debug=debug_server,
            use_reloader=use_reloader, threaded=use_threading)