def train_model(model: SLModel,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=5,
                batch_size=32):
    # Create the generators
    logging.info("Training model for {} epochs and {} batch size".format(
        epochs, batch_size))
    logging.info("Flowing the train and validation sets")
    traingen = trainset.flow(
        batch_size=batch_size, shuffle=True, seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=batch_size, shuffle=False)

    # Create the callbacks
    logging.info("Creating the callbacks")
    callbacks = [
        ModelCheckpoint(
            utils.get_model_path(RUN_ID),
            "val_loss",
            verbose=1,
            save_best_only=True),
        Plotter(
            "loss",
            scale='log',
            plot_during_train=True,
            save_to_file=utils.get_plot_path(RUN_ID),
            block_on_end=False),
        Plotter(
            "accuracy",
            scale='linear',
            plot_during_train=True,
            save_to_file=utils.get_plot_path(RUN_ID + "_acc"),
            block_on_end=False)
    ]

    # Create the optimizer
    logging.info("Creating the optimizer")
    params = [param for param in model.parameters() if param.requires_grad]
    # optimizer = optim.SGD(
    #     params,
    #     lr=0.01,
    #     momentum=0.9,
    #     nesterov=True)
    optimizer = optim.Adam(params)
    logging.info("Optimizer: %r" % optimizer)

    # Train the model
    logs = model.fit_generator(
        traingen,
        traingen.steps_per_epoch,
        epochs=epochs,
        optimizer=optimizer,
        validation_generator=valgen,
        validation_steps=valgen.steps_per_epoch,
        metrics=["accuracy"],
        callbacks=callbacks,
        verbose=1)
    return logs
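# Hypothetical usage (the real driver lives elsewhere in this script): `trainset`
# and `valset` would be NpDataset splits built from the HomeCreditData loader;
# the exact loading/splitting calls are omitted because they are not shown here.
# model = MODEL()
# logs = train_model(model, trainset, valset, epochs=5, batch_size=32)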
def _classify_thread_body(train_ratio_list):
    ret_list = []
    for train_ratio in train_ratio_list:
        time_start = time.time()
        logger.info('\t train_ratio = {}, evaluating ...'.format(train_ratio))
        X_train, X_test, Y_train, Y_test = train_test_split(
            features_matrix,
            labels_matrix,
            test_size=1.0 - train_ratio,
            random_state=utils.get_random_seed(),
            shuffle=True)

        # find out how many labels should be predicted
        top_k_list = [
            np.sum(Y_test[i]) for i in range(np.size(Y_test, axis=0))
        ]
        clf = TopKRanker(LogisticRegression())
        clf.fit(X_train, Y_train)
        preds = clf.predict(X_test, top_k_list)

        # averages = ["micro", "macro", "samples", "weighted"]
        # results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
        # macro = f1_score(Y_test, preds, average="macro")
        # micro = f1_score(Y_test, preds, average="micro")
        macro, micro = eval_utils.f1_scores_multilabel(Y_test, preds)
        logger.info('\t train_ratio = {}, eval completed in {}s'.format(
            train_ratio, time.time() - time_start))
        ret_list.append((train_ratio, macro, micro))
    return ret_list
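# Illustrative sketch (not from the original script): the thread body above is
# written to be mapped over chunks of train ratios, e.g. from a thread pool.
# The chunking and worker count below are assumptions for demonstration only.
# from concurrent.futures import ThreadPoolExecutor
#
# train_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
# chunks = [train_ratios[i::4] for i in range(4)]  # 4 roughly equal chunks
# with ThreadPoolExecutor(max_workers=4) as pool:
#     results = [item for chunk in pool.map(_classify_thread_body, chunks)
#                for item in chunk]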
def _classify_thread_body(train_ratio_list):
    global features_dict, true_edges_list_by_repeat, neg_edges_list_by_repeat
    ret_list = []
    for repeat, op, train_ratio in train_ratio_list:
        time_start = time.time()
        logger.info('\t repeat={}, train_ratio={}, op={}, evaluating ...'.format(
            repeat, train_ratio, op))
        edges_train, edges_test, labels_train, labels_test = train_test_split(
            true_edges_list_by_repeat[repeat] + neg_edges_list_by_repeat[repeat],
            [1] * len(true_edges_list_by_repeat[repeat]) +
            [0] * len(neg_edges_list_by_repeat[repeat]),
            test_size=1.0 - train_ratio,
            random_state=utils.get_random_seed(),
            shuffle=True)
        train1 = np.array([features_dict[e[0]] for e in edges_train],
                          dtype=np.float32)
        train2 = np.array([features_dict[e[1]] for e in edges_train],
                          dtype=np.float32)
        test1 = np.array([features_dict[e[0]] for e in edges_test],
                         dtype=np.float32)
        test2 = np.array([features_dict[e[1]] for e in edges_test],
                         dtype=np.float32)
        if op == 'average':
            X_train = (train1 + train2) / 2
            X_test = (test1 + test2) / 2
        elif op == 'hadamard':
            X_train = np.multiply(train1, train2)
            X_test = np.multiply(test1, test2)
        elif op == 'l1':
            X_train = np.absolute(train1 - train2)
            X_test = np.absolute(test1 - test2)
        elif op == 'l2':
            X_train = np.square(train1 - train2)
            X_test = np.square(test1 - test2)
        elif op == 'concat':
            X_train = np.concatenate((train1, train2), axis=1)
            X_test = np.concatenate((test1, test2), axis=1)
        else:
            logger.error("error: invalid feature operator: {}".format(op))
            continue  # skip this setting rather than reuse a stale X_train
        clf = LogisticRegression()
        clf.fit(X_train, np.asarray(labels_train))
        preds = clf.predict(X_test)
        # preds = clf.predict_proba(X_test)[:, 1]  # better choice!
        auc = roc_auc_score(np.asarray(labels_test), preds)
        logger.info(
            '\t repeat={}, train_ratio={}, op={}, eval completed in {}s.'.format(
                repeat, train_ratio, op, time.time() - time_start))
        ret_list.append((train_ratio, op, auc))
    return ret_list
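# Illustrative sketch (not part of the original script): how the binary edge
# operators above turn two node embeddings into a single edge feature vector.
# The toy vectors are made up for demonstration; `np` is the module's numpy import.
def _edge_operator_demo():
    u = np.array([1.0, 2.0, 3.0], dtype=np.float32)  # embedding of node e[0]
    v = np.array([4.0, 0.5, 3.0], dtype=np.float32)  # embedding of node e[1]
    return {
        'average': (u + v) / 2,            # [2.5, 1.25, 3.0]
        'hadamard': np.multiply(u, v),     # [4.0, 1.0, 9.0]
        'l1': np.absolute(u - v),          # [3.0, 1.5, 0.0]
        'l2': np.square(u - v),            # [9.0, 2.25, 0.0]
        'concat': np.concatenate((u, v)),  # 6-dimensional vector
    }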
def _cluster_thread_body(repeated_times):
    nmi_list = []
    X = features_matrix
    y = labels_matrix
    for _ in range(repeated_times):
        X, y = shuffle(X, y, random_state=utils.get_random_seed())
        # cluster with KMeans
        clr = KMeans(n_clusters=LABEL_SIZE)
        clr.fit(X)  # clustering
        y_pred = clr.labels_  # get clustering labels
        nmi_list.append(evalute_NMI(y, y_pred))
    return nmi_list
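# Minimal sketch of what an NMI helper like the evalute_NMI call above might
# compute, assuming it wraps scikit-learn; the real helper in this repo may
# differ (e.g. in how multi-label ground truth is collapsed to a single label).
from sklearn.metrics import normalized_mutual_info_score


def _nmi_sketch(y_true, y_pred):
    # Both arguments are flat label arrays of the same length.
    return normalized_mutual_info_score(y_true, y_pred)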
def train_model(model: Model,
                train_dataset: ImageDataset,
                val_dataset: ImageDataset,
                augmenters=(),
                epochs=100,
                batch_size=32,
                epoch_size=10000,
                plot=False,
                load_model=False,
                **kwargs):
    logging.info("Training model with run id %s" % model.run_id)
    logging.info("Using: \n\tbatch_size: {batch_size} \
                  \n\tepochs: {epochs} \
                  \n\tplot: {plot} \
                  \n\tload_model: {load_model} \
                  \n\tepoch_size: {epoch_size}".format(**locals()))
    if load_model:
        logging.info("Reloading model from weights")
        model.load_weights(utils.get_model_path(model.run_id), by_name=True)
    if model.fine_tune:
        old_run_id = model.run_id[:-len("-fine-tune")]
        logging.info(
            "Fine tuning model with weights from {}".format(old_run_id))
        model.load_weights(utils.get_model_path(old_run_id), by_name=True)

    steps = epoch_size // batch_size
    val_steps = epoch_size // 10 // batch_size
    traingen = train_dataset.flow(batch_size=batch_size,
                                  steps_per_epoch=steps,
                                  shuffle=True,
                                  replace=True,
                                  seed=utils.get_random_seed())
    valgen = val_dataset.flow(batch_size=batch_size,
                              steps_per_epoch=val_steps,
                              shuffle=True,
                              replace=True,
                              seed=utils.get_random_seed())

    # Add the augmenters to the training generator
    for augmenter in augmenters:
        traingen = augmenter(traingen)

    # Create the callbacks
    callbacks = [
        ModelCheckpoint(utils.get_model_path(model.run_id),
                        monitor="val_loss",
                        save_best_only=False,
                        save_weights_only=True,
                        mode="min",
                        verbose=1),
        ModelCheckpoint(utils.get_model_path(model.run_id + "_f1"),
                        monitor="val_f1_loss",
                        save_best_only=True,
                        save_weights_only=True,
                        mode="min",
                        verbose=1),
        Plotter(monitor="loss",
                scale="linear",
                plot_during_train=plot,
                save_to_file=utils.get_plot_path(model.run_id),
                block_on_end=False)
    ]

    # Train the model
    history = model.fit_generator(
        traingen,
        steps_per_epoch=5 if args.debug else traingen.steps_per_epoch,
        epochs=epochs,
        verbose=1,
        callbacks=callbacks,
        validation_data=valgen,
        validation_steps=5 if args.debug else valgen.steps_per_epoch)

    # Log the output
    logs = history.history
    epochs = range(len(logs["val_loss"]))
    checkpoint = min(epochs, key=lambda i: logs["val_loss"][i])
    best_val_loss, best_val_f1 = logs["val_loss"][checkpoint], logs[
        "val_f1_loss"][checkpoint]
    logging.info("LOSS CHECKPOINTED -- Loss: {} -- F1: {}".format(
        best_val_loss, best_val_f1))
    checkpoint = min(epochs, key=lambda i: logs["val_f1_loss"][i])
    best_val_loss, best_val_f1 = logs["val_loss"][checkpoint], logs[
        "val_f1_loss"][checkpoint]
    logging.info("F1 CHECKPOINTED -- Loss: {} -- F1: {}".format(
        best_val_loss, best_val_f1))
from data import HomeCreditData

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument('--data', default='../input/', help="Path to the data")
parser.add_argument(
    '--train', action='store_true', help="Runs the script in train mode")
parser.add_argument(
    '--test', action='store_true', help="Runs the script in test mode")

MODEL = models.SNNModel
RUN_ID = "snn"
SEED = 42
utils.set_random_seed(SEED)
SPLIT_SEED = utils.get_random_seed()

# TODO: Add ROC AUC stateful metric to pyjet so we don't need the validate
# function and can plot the roc-auc over time
# TODO: Add more logging to these functions so we can see what's going on


def train_model(model: SLModel,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=5,
                batch_size=32):
    # Create the generators
    logging.info("Training model for {} epochs and {} batch size".format(
        epochs, batch_size))
    logging.info("Flowing the train and validation sets")
def sample_by_nodes(self, sampled_num, rule="random", keep_consistent_nodes=False):
    """
    Sample some nodes to construct a sub-network.
    :param sampled_num: number of nodes to sample.
    :param keep_consistent_nodes: whether to make the sampled nodes consistent by re-sorting the node IDs.
    :param rule: sampling rule. random: randomly sample all sampled_num nodes;
                 extend: randomly sample one root node and then extend to sampled_num nodes.
    :return: a sub-network with the sampled nodes and corresponding edges.
    """
    logger.info('Net sampling: sample nodes to construct a sub-network ...')
    logger.info(
        "\t\t sampled_nodes = {}, sample_rule = {}, keep_consistent_nodes = {}"
        .format(sampled_num, rule, keep_consistent_nodes))
    logger.info("\t\t origin_node_size = {}".format(self.get_nodes_size()))
    assert sampled_num <= self.get_nodes_size(), "error, {} > {}".format(
        sampled_num, self.get_nodes_size())
    time_start = time.time()
    origin_nodes_list = list(self.nodes)
    if rule == "random":
        sampled_nodes_set = set(
            shuffle(origin_nodes_list,
                    random_state=utils.get_random_seed())[0:sampled_num])
        # random.shuffle(origin_nodes_list)
        # sampled_nodes_set = set(origin_nodes_list[0:sampled_num])
    elif rule == "extend":
        sampled_nodes_set = set()
        extend_nodes_list = []
        origin_nodes_set = set(origin_nodes_list)
        while len(sampled_nodes_set) < sampled_num:
            if len(extend_nodes_list) == 0:
                origin_nodes_set = origin_nodes_set - sampled_nodes_set
                root = random.choice(
                    shuffle(list(origin_nodes_set),
                            random_state=utils.get_random_seed()))
                sampled_nodes_set.add(root)
                extend_nodes_list.append(root)
                if len(sampled_nodes_set) >= sampled_num:
                    break
            root = extend_nodes_list.pop(0)
            for v in self._nodes_adjlist[root]:
                if v not in sampled_nodes_set:
                    sampled_nodes_set.add(v)
                    extend_nodes_list.append(v)
                    if len(sampled_nodes_set) >= sampled_num:
                        break
    else:
        logger.error(
            "Unknown sampling rule: '%s'. Valid rules: 'random', 'extend'." % rule)
        raise ValueError("invalid sampling rule: {}".format(rule))
    sampled_net = Graph(isdirected=self._isdirected,
                        isweighted=self._isweighted,
                        self_looped=self._self_looped)
    for node in sampled_nodes_set:
        sampled_net.add_single_node(node)
        for v in self._nodes_adjlist[node]:
            if v in sampled_nodes_set:
                sampled_net.add_single_edge(node, v)
    if keep_consistent_nodes:
        sampled_net.make_consistent()
    logger.info("\t\t sampled_net edges_size = {}".format(
        sampled_net.get_edges_size()))
    logger.info(
        'Net sampling: sample nodes completed in {}s'.format(time.time() -
                                                             time_start))
    return sampled_net
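# Hypothetical usage (not part of the original file): sampling a sub-network
# with each rule; `net` is assumed to be an already-loaded Graph instance.
# sub_random = net.sample_by_nodes(1000, rule="random")
# sub_extend = net.sample_by_nodes(1000, rule="extend", keep_consistent_nodes=True)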
def split_by_edges(self, train_ratio=0, keep_static_nodes=True, keep_consistent_nodes=False):
    """
    Split the network into two parts: one with train_ratio of the edges, one with 1 - train_ratio of the edges.
    :param train_ratio: fraction of edges assigned to the train network.
    :param keep_static_nodes: whether the two split parts keep the same node set as the original network.
    :param keep_consistent_nodes: whether to make the split nodes consistent by re-sorting the node IDs.
    :return: train_network with train_ratio of the edges, eval_network with 1 - train_ratio of the edges.
    """
    logger.info(
        'Net split: splitting edges into train_network and eval_network ...')
    logger.info(
        "\t\t train_ratio = {}, keep_static_nodes = {}, keep_consistent_nodes = {}"
        .format(train_ratio, keep_static_nodes, keep_consistent_nodes))
    logger.info("\t\t origin_edges_size = {}".format(self.get_edges_size()))
    time_start = time.time()
    edges_list = self.edges
    if not self._isdirected:
        edges_set = set()
        for source, target in edges_list:
            if (source, target) not in edges_set and (target, source) not in edges_set:
                edges_set.add((source, target))
        edges_list = list(edges_set)
    train_edges_list, test_edges_list = train_test_split(
        edges_list,
        test_size=1.0 - train_ratio,
        random_state=utils.get_random_seed(),
        shuffle=True)
    # perm = np.arange(len(edges_list))
    # random.shuffle(perm)
    # edges_list_t = [edges_list[i] for i in perm]
    # edges_list = edges_list_t
    # # split for train:
    # train_edges_size = int(np.ceil(len(edges_list)*train_ratio))
    # assert train_edges_size <= len(edges_list), "error, {} > {}".format(train_edges_size, len(edges_list))

    # train network:
    train_net = Graph(isdirected=self._isdirected,
                      isweighted=self._isweighted,
                      self_looped=self._self_looped)
    # for source, target in edges_list[0:train_edges_size]:
    for source, target in train_edges_list:
        train_net.add_single_edge(source, target)
    if keep_static_nodes:
        for v in self.nodes:
            train_net.add_single_node(v)
    elif keep_consistent_nodes:
        train_net.make_consistent()
    logger.info("\t\t train_edges_size = {}".format(
        train_net.get_edges_size()))

    # eval network:
    eval_net = Graph(isdirected=self._isdirected,
                     isweighted=self._isweighted,
                     self_looped=self._self_looped)
    # for source, target in edges_list[train_edges_size:]:
    for source, target in test_edges_list:
        eval_net.add_single_edge(source, target)
    if keep_static_nodes:
        for v in self.nodes:
            eval_net.add_single_node(v)
    elif keep_consistent_nodes:
        eval_net.make_consistent()
    logger.info("\t\t eval_edges_size = {}".format(eval_net.get_edges_size()))
    logger.info(
        'Net split: split edges completed in {}s'.format(time.time() -
                                                         time_start))
    return train_net, eval_net
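# Hypothetical usage (not part of the original file): a typical link-prediction
# split that keeps the full node set in the train part, so every node that
# appears in a held-out edge still gets an embedding during training.
# train_net, eval_net = net.split_by_edges(train_ratio=0.8, keep_static_nodes=True)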
def validate_arguments(args):
    # Initialize the logging
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG,
                            format="%(levelname)s: %(message)s")
    else:
        logging.basicConfig(level=logging.INFO,
                            format="%(levelname)s: %(message)s")

    errors = []

    # Partitioning
    if not args.assignments:
        if args.nparts is None:
            errors.append("--nparts is required when not using --assignments")
        if not args.tpwgts:
            errors.append("--tpwgts is required when not using --assignments")
    if args.random_assignments:
        if args.nparts is not None and args.nparts <= 0:
            errors.append("The --nparts value must be strictly positive")
    else:
        if args.nparts is not None and args.nparts <= 1:
            errors.append("The --nparts value must be greater than 1")
    if args.ubvec is not None and args.nparts is None:
        errors.append(
            "The --ubvec option is only available with the --nparts option")
    if args.ubvec is not None and args.ubvec <= 1.0:
        errors.append("The --ubvec value must be greater than 1.0")
    if args.tpwgts and not args.nparts:
        errors.append(
            "The --tpwgts option is only available with the --nparts option")
    if args.tpwgts and args.nparts and len(args.tpwgts) != args.nparts:
        errors.append(
            "The --tpwgts option requires a list of {} values (one value per partition)"
            .format(args.nparts))
    if args.tpwgts and not math.isclose(sum(args.tpwgts), 1.0, rel_tol=1e-5):
        errors.append(
            "The sum of --tpwgts values must be 1.0 (currently {})".format(
                sum(args.tpwgts)))

    # Clustering
    if args.scheme == 'communities':
        if args.clustering and args.clustering == 'graphviz' and args.cluster_seed:
            errors.append(
                "The --cluster-seed option is not available with the graphviz clustering method"
            )
        if args.clustering and args.clustering != 'oslom2' and args.infomap_calls:
            errors.append(
                "The --infomap-calls option is only available with the oslom2 clustering method"
            )
        if args.cut_edge_length:
            errors.append(
                "The --cut-edge-length option is only available with the cut-edges scheme"
            )
        if args.cut_edge_node_size:
            errors.append(
                "The --cut-edge-node-size option is only available with the cut-edges scheme"
            )

    # Cut edges
    if args.scheme == 'cut-edges':
        if args.cut_edge_length and (args.cut_edge_length < 0
                                     or args.cut_edge_length > 100):
            errors.append(
                "The --cut-edge-length value must be between 0 and 100")
        if args.clustering:
            errors.append(
                "The --clustering option is only available with the communities scheme"
            )
        if args.cluster_seed:
            errors.append(
                "The --cluster-seed option is only available with the communities scheme"
            )
        if args.infomap_calls:
            errors.append(
                "The --infomap-calls option is only available with the communities scheme"
            )

    # Layout
    if args.layout != 'linlog' and args.force:
        errors.append(
            "The --force option is only available with the linlog layout")
    if not args.video and args.fps:
        errors.append(
            "The --fps option is only available with the --video option")
    if not args.video and args.padding_time:
        errors.append(
            "The --padding-time option is only available with the --video option"
        )

    # Image style
    if args.node_size and args.node_size_mode != 'fixed':
        errors.append(
            "The --node-size option is only available with --node-size-mode fixed"
        )
    if args.min_node_size and args.node_size_mode == 'fixed':
        errors.append(
            "The --min-node-size option is only available with --node-size-mode centrality or highlight-new"
        )
    if args.max_node_size and args.node_size_mode == 'fixed':
        errors.append(
            "The --max-node-size option is only available with --node-size-mode centrality or highlight-new"
        )

    # Print errors and exit if any error found
    if errors:
        for error in errors:
            logging.error(error)
        sys.exit(1)

    # Set default values
    if args.layout == 'springbox':
        if not args.attraction:
            args.attraction = 0.012
        if not args.repulsion:
            args.repulsion = 0.024
    elif args.layout == 'linlog':
        if not args.attraction:
            args.attraction = 0.0
        if not args.repulsion:
            args.repulsion = -1.2
    if not args.fps:
        args.fps = 8
    if not args.padding_time:
        args.padding_time = 2.0
    if not args.node_size:
        args.node_size = 20
    if not args.min_node_size:
        args.min_node_size = 20
    if not args.max_node_size:
        args.max_node_size = 60
    if args.scheme == 'communities':
        if not args.clustering:
            args.clustering = 'oslom2'
        if not args.cluster_seed:
            args.cluster_seed = utils.get_random_seed()
        if not args.infomap_calls:
            args.infomap_calls = 0
    if args.scheme == 'cut-edges':
        if not args.cut_edge_length:
            args.cut_edge_length = 50
        if not args.cut_edge_node_size:
            args.cut_edge_node_size = 5
    if not args.cut_edge_length:
        args.cut_edge_length = 0  # to avoid passing None to Graphstream
    if not args.ubvec:
        args.ubvec = 1.0
def parse_arguments():
    parent_parser = argparse.ArgumentParser(
        description='''Create animation of network partition assignments.
        First processes the network file and assignments into DGS file format,
        then uses GraphStream to animate each frame, and finally stitches the
        frames together.''')
    parent_parser.add_argument("-v",
                               "--verbose",
                               action="store_true",
                               help="increase output verbosity")

    # Required arguments
    required_group = parent_parser.add_argument_group('required arguments')
    required_group.add_argument('-g', '--graph',
                                required=True,
                                help='input graph file')
    required_group.add_argument('-f', '--format',
                                choices=['metis', 'edgelist', 'gml'],
                                required=True,
                                help='format of the input graph file')
    required_group.add_argument('-o', '--output_dir',
                                required=True,
                                help='output directory')

    # Input/output files
    io_group = parent_parser.add_argument_group('input/output options')
    order_group = io_group.add_mutually_exclusive_group()
    order_group.add_argument('-n', '--order', help='node order list')
    order_group.add_argument('--order-seed',
                             type=int,
                             default=utils.get_random_seed(),
                             metavar='S',
                             help='seed for ordering nodes')
    io_group.add_argument('--filter',
                          help='filter node list (<= 0 to exclude node)')
    io_group.add_argument(
        '--node-weight',
        default='weight',
        metavar='W',
        help='attribute used to determine the weight of each node (default=\'weight\')')
    io_group.add_argument(
        '--edge-weight',
        default='weight',
        metavar='W',
        help='attribute used to determine the weight of each edge (default=\'weight\')')

    # Partitioning
    partitioning_group = parent_parser.add_argument_group('partitioning options')
    partitioning_type_group = partitioning_group.add_mutually_exclusive_group()
    partitioning_type_group.add_argument('-a', '--assignments',
                                         help='partition assignments list')
    partitioning_type_group.add_argument("--random-assignments",
                                         action="store_true",
                                         help="generate random assignments")
    partitioning_group.add_argument(
        '--partition-seed',
        type=int,
        default=utils.get_random_seed(),
        metavar='S',
        help='seed for random assignments partitioning')
    partitioning_group.add_argument(
        '--nparts',
        type=int,
        metavar='P',
        help='number of partitions to generate with METIS')
    partitioning_group.add_argument(
        '--ubvec',
        type=float,
        metavar='U',
        help='allowed load imbalance among partitions in METIS (default=1.001). '
        'The load imbalance must be greater than 1.0; 1.2 indicates a desired '
        'maximum load imbalance of 20 percent.')
    partitioning_group.add_argument(
        '--tpwgts',
        nargs='+',
        type=float,
        metavar='T',
        help='desired weight for each partition in METIS. '
        'The sum of tpwgts[] must be 1.0')
    partitioning_group.add_argument(
        '--show-partitions',
        nargs='+',
        type=int,
        help='partitions to be displayed (based on nparts or partition values in assignments list)')

    # Layout
    layout_group = parent_parser.add_argument_group('layout options')
    layout_group.add_argument('--layout', '-l',
                              choices=['springbox', 'linlog'],
                              default='springbox',
                              help='graph layout')
    layout_group.add_argument('--layout-seed',
                              type=int,
                              default=utils.get_random_seed(),
                              metavar='S',
                              help='seed for graph layout')
    layout_group.add_argument(
        '--force',
        type=float,
        metavar='F',
        help='force for linlog graph layout (default=3.0)')
    layout_group.add_argument(
        '--attraction',
        type=float,
        metavar='A',
        help='attraction factor for graph layout (default=0.06 for springbox, default=0.0 for linlog)')
    layout_group.add_argument(
        '--repulsion',
        type=float,
        metavar='R',
        help='repulsion factor for graph layout (default=0.024 for springbox, default=-1.2 for linlog)')

    # Coloring
    coloring_group = parent_parser.add_argument_group('coloring options')
    color_mode_group = coloring_group.add_mutually_exclusive_group()
    color_mode_group.add_argument(
        '--color-scheme',
        choices=['pastel', 'primary-colors'],
        default='pastel',
        help='color scheme used by gvmap (default=pastel)')
    color_mode_group.add_argument('--node-color',
                                  metavar='C',
                                  help='single color to use for all nodes')
    coloring_group.add_argument('--color-seed',
                                type=int,
                                default=utils.get_random_seed(),
                                metavar='S',
                                help='seed for coloring with gvmap')
    coloring_group.add_argument(
        '--shadow-color',
        metavar='C',
        help='color of the shadow to use for highlighted nodes. Use with --node-size-mode highlight-new')

    # Image style
    styling_group = parent_parser.add_argument_group('image options')
    styling_group.add_argument(
        '--node-size-mode',
        choices=['fixed', 'centrality', 'highlight-new'],
        default='fixed',
        help='node size mode')
    styling_group.add_argument(
        '--node-size',
        type=int,
        metavar='S',
        help='node size in pixels (default=20). Use with --node-size-mode fixed.')
    styling_group.add_argument(
        '--min-node-size',
        type=int,
        metavar='S',
        help='minimum node size in pixels (default=20). Use with --node-size-mode centrality or highlight-new.')
    styling_group.add_argument(
        '--max-node-size',
        type=int,
        metavar='S',
        help='maximum node size in pixels (default=60). Use with --node-size-mode centrality or highlight-new.')
    styling_group.add_argument('--edge-size',
                               type=int,
                               default=1,
                               metavar='S',
                               help='edge size in pixels (default=1)')
    styling_group.add_argument('--label-size',
                               type=int,
                               default=10,
                               metavar='S',
                               help='label size in points (default=10)')
    styling_group.add_argument(
        '--label-type',
        choices=['id', 'order'],
        default='id',
        metavar='T',
        help='type of node labels (node id or node order)')
    styling_group.add_argument('--border-size',
                               type=int,
                               default=1,
                               metavar='S',
                               help='border size between tiles (default=1)')
    styling_group.add_argument('--width',
                               type=int,
                               default=1280,
                               metavar='W',
                               help='image width (default=1280)')
    styling_group.add_argument('--height',
                               type=int,
                               default=720,
                               metavar='H',
                               help='image height (default=720)')

    # Video
    video_group = parent_parser.add_argument_group('video options')
    video_group.add_argument('--video',
                             help='output video file with tiled frames')
    video_group.add_argument('--fps',
                             type=int,
                             help='frames per second (default=8)')
    video_group.add_argument(
        '--padding-time',
        type=float,
        help='padding time in seconds to add extra frames at the end of the video (default=2.0)')

    # Pdf
    pdf_group = parent_parser.add_argument_group('pdf options')
    pdf_group.add_argument(
        '--pdf',
        type=int,
        default=20,
        metavar='P',
        help='percentage of frames to convert to pdf (default=20)')

    # Scheme
    scheme_group = parent_parser.add_argument_group('scheme option')
    scheme_group.add_argument(
        '-s', '--scheme',
        choices=['communities', 'cut-edges'],
        default='communities',
        help='scheme to highlight either communities or cut edges (default=communities)')

    # Clustering
    clustering_group = parent_parser.add_argument_group(
        'communities options (only for scheme=communities)')
    clustering_group.add_argument('--clustering', '-c',
                                  choices=['oslom2', 'infomap', 'graphviz'],
                                  help='clustering method (default=oslom2)')
    clustering_group.add_argument('--cluster-seed',
                                  type=int,
                                  metavar='S',
                                  help='seed for clustering')
    clustering_group.add_argument(
        '--infomap-calls',
        type=int,
        metavar='C',
        help='number of times infomap is called within oslom2. Good values are between 1 and 10 (default=0)')

    # Cut edges
    cut_edges_group = parent_parser.add_argument_group(
        'cut-edges options (only for scheme=cut-edges)')
    cut_edges_group.add_argument(
        '--cut-edge-length',
        type=int,
        metavar='L',
        help='length of cut edges as percentage of original length (default=50)')
    cut_edges_group.add_argument(
        '--cut-edge-node-size',
        metavar='S',
        help='size of the nodes attached to cut edges (default=10)')

    return parent_parser.parse_args()
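# Illustrative sketch (the original entry point is not shown here) of how the
# two functions above are meant to be wired together:
# if __name__ == '__main__':
#     args = parse_arguments()
#     validate_arguments(args)  # fills in defaults and exits on invalid option combinations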