def save_image(fig, name, in_tmp, step):
    if in_tmp:  # step should be an int if in_tmp is True
        path = config.tmp_loc + 'step/step_' + str(step) + '/'
        func.create_directory(path)
        plt.savefig(path + name)
    else:
        plt.savefig(config.parent_dir + 'data/images/' + name)
        with open(config.parent_dir + 'data/saved/' + name + '.pkl', 'wb') as f:
            pickle.dump(fig, f)
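# Hypothetical usage sketch (not part of the original module): assumes config.tmp_loc
# and config.parent_dir point at writable directories and that plt, pickle, config,
# and func are imported above as in the rest of this module.
def _example_save_image_usage():
    fig, ax = plt.subplots()
    ax.plot([0, 1, 2], [3, 1, 2])
    # temporary per-step image, written under config.tmp_loc + 'step/step_5/'
    save_image(fig, 'example.png', in_tmp=True, step=5)
    # final image, written under data/images/ with a pickled copy under data/saved/
    save_image(fig, 'example.png', in_tmp=False, step=0)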
def __init__(self):
    # Assign the application directory
    self.application_directory = data.application_directory
    self.resources_directory = data.resources_directory
    # Create the settings file path
    functions.create_directory(data.settings_directory)
    self.settings_filename_with_path = functions.unixify_path_join(
        data.settings_directory, self.settings_filename
    )
    # Check if the settings file exists
    if self.check_settings_file() is None:
        # Create the settings file
        self.create_settings_file(self.empty_settings_list)
    # Load the settings from the settings file
    self.load_settings()
def createCacheData(outputFile="cachedData.npz"):
    if outputFile.find('.') == -1:
        outputFile += ".npz"
    outputFile = os.path.join(DATA_FOLDER, outputFile)
    # Prevent overwriting
    assert not os.path.isfile(outputFile)
    create_directory(DATA_FOLDER)
    noteStateSeq = musicFolderToNoteStateSeq(TRAIN_MUSIC_FOLDER)
    wordIdxToNoteState, wordIdxToCount = loadVocabularyData(noteStateSeq)
    print("Creating data cache")
    print("-------------------")
    np.savez(outputFile,
             noteStateSeq=noteStateSeq,
             wordIdxToNoteState=wordIdxToNoteState,
             wordIdxToCount=wordIdxToCount)
    print("Data cache created: {}".format(outputFile))
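# Hypothetical round-trip sketch (not in the original source): shows how the cache
# written above could be read back, assuming DATA_FOLDER and the default file name
# used by createCacheData.
def _example_load_cache(cacheFile=os.path.join(DATA_FOLDER, "cachedData.npz")):
    # allow_pickle=True is needed if the stored arrays contain Python objects
    cache = np.load(cacheFile, allow_pickle=True)
    noteStateSeq = cache["noteStateSeq"]
    wordIdxToNoteState = cache["wordIdxToNoteState"]
    wordIdxToCount = cache["wordIdxToCount"]
    return noteStateSeq, wordIdxToNoteState, wordIdxToCount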
def main(run_again, with_dnn, with_prompt, override, with_collect=True):
    # with_collect = True  # whether or not to run collect after run is finished
    if with_prompt:
        print("Are you sure you have checked the following variables?")
        print(" - with_collect (run.py)")
        print(" - with_dnn (run.py)")
        print(" - already_trained (run.py)")
        print(" - has_past (pack_data.py)")
        check = input('(y/n) ')
        if check not in ['y', 'Y']:
            raise Exception("please check your variables!")
        print()

    # start runtime
    start_prog = timeit.default_timer()

    # ensure current working directory is in src folder
    if os.getcwd()[-3:] != 'src':
        # assuming we are somewhere inside the git directory
        path = s.Popen('git rev-parse --show-toplevel', shell=True,
                       stdout=s.PIPE).communicate()[0].decode("utf-8")[:-1]
        print('changing working directory from', os.getcwd(), 'to', path)
        os.chdir(path + '/src')

    print('Run Args:', sys.argv[:])

    # if user wants to pass in arguments
    if len(sys.argv) > 1 and sys.argv[1] == 'master':
        init.use_fund = sys.argv[2] == 'True'
        if len(sys.argv) == 5:
            init.switch = int(sys.argv[3])
            init.has_past = sys.argv[4] == 'True'
    else:
        if len(sys.argv) >= 5:  # config 1
            init.time_periods = int(sys.argv[1])
            init.ideas_per_time = int(sys.argv[2])
            init.N = int(sys.argv[3])
            init.time_periods_alive = int(sys.argv[4])
        if len(sys.argv) >= 8:  # config 2
            init.prop_sds = float(sys.argv[5])
            init.prop_means = float(sys.argv[6])
            init.prop_start = float(sys.argv[7])
        if len(sys.argv) == 15:  # server config
            init.true_means_lam = float(sys.argv[5])
            init.prop_sds = float(sys.argv[6])
            init.prop_means = float(sys.argv[7])
            init.prop_start = float(sys.argv[8])
            init.switch = float(sys.argv[9])
            # sys.argv[10] is empty for now
            init.all_scientists = sys.argv[11] == 'True'
            init.use_equal = sys.argv[12] == 'True'
            init.use_idea_shift = sys.argv[13] == 'True'
            init.show_step = sys.argv[14] == 'True'

    if not override:
        if run_again:
            init.switch = 2  # need bayesian stats to train neural net the first time
        else:
            init.switch = 4  # prefer neural net over bayesian stats if already trained

    # check if we are using batch runs
    if os.path.isdir('tmp_batch'):
        init.tmp_loc = 'tmp_batch/tmp_' + '_'.join([str(v) for v in sys.argv[1:]]) + '/'

    # so that config file loads after init.py is set
    import config, collect
    import model as m
    import functions as func

    func.create_directory(config.parent_dir + 'data/')
    func.create_directory(config.tmp_loc)
    with open(config.tmp_loc + 'start_prog.txt', 'w') as f:
        f.write('%d' % time.time())
    config.start = timeit.default_timer()

    # default parameters for model as a dictionary
    all_params = {"seed": config.seed,
                  "use_multiprocessing": config.use_multiprocessing,
                  "use_fund": config.use_fund,
                  "optimization": config.switch,
                  "time_periods": config.time_periods,
                  "ideas_per_time": config.ideas_per_time,
                  "N": config.N,
                  "use_store_model": config.use_store_model,
                  "time_periods_alive": config.time_periods_alive,
                  "true_means_lam": config.true_means_lam,
                  "true_sds_lam": config.true_sds_lam,
                  "start_effort_lam": config.start_effort_lam,
                  "k_lam": config.k_lam,
                  "use_multithreading": config.use_multithreading,
                  "use_equal": config.use_equal,
                  "use_idea_shift": config.use_idea_shift}

    # printing parameters into console screen
    func.f_print("\nVariables:\n", all_params)

    # write parameters to text file
    f = open('../data/parameters.txt', 'w')
    f.write(str(all_params))
    f.close()

    # initialize model object
    model = m.ScientistModel(config.seed)
    func.stop_run("time to create model object... now entering main function")
    func.gc_collect()

    for i in range(config.time_periods + 2):
        model.step()
        func.stop_run("step: " + str(i))

    func.f_print("\nTOTAL TIME TO FINISH RUNNING SIMULATION:",
                 timeit.default_timer() - start_prog, "seconds")

    if with_collect:
        s.call('python3 collect.py', shell=True)
        path = s.Popen('git rev-parse --show-toplevel', shell=True,
                       stdout=s.PIPE).communicate()[0].decode("utf-8")[:-1]
        # s.call('open ../data/pages/page_agent_vars.html', shell=True)
        # s.call("/usr/bin/open -a '/Applications/Google Chrome.app' 'file://" + path + "/data/images/scatterplot_resid.png'", shell=True)  # open image with Chrome
        # s.call("/usr/bin/open -a '/Applications/Google Chrome.app' 'file://" + path + "/data/images/1-var_bar_graph_prop_idea_phase.png'", shell=True)  # open image with Chrome
        # collect.init()

    if not override:
        if with_dnn:
            s.call('python3 ../ai/neural_net.py', shell=True)
        if run_again:
            s.call('python3 run.py False False False', shell=True)
def main():
    # define supported models
    allowed_models = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']

    # Set up argument parser for console input
    parser = argparse.ArgumentParser(description='Train NN')
    parser.add_argument('data_dir', help='directory containing sub-folders with data')
    parser.add_argument('--save_dir', help='directory for saving checkpoint', default='checkpoints')
    parser.add_argument('--arch', help='pre-trained model architecture', default='resnet18', choices=allowed_models)
    parser.add_argument('--learning_rate', help='learning rate during learning', type=float, default=0.01)
    parser.add_argument('--dropout', help='dropout during learning', type=float, default=0.05)
    parser.add_argument('--hidden_units', help='List of number of nodes in hidden layers', nargs='+', type=int, default=[256, 128])
    parser.add_argument('--epochs', help='Number of epochs for training', default=3, type=int)
    parser.add_argument('--gpu', help='Enable GPU', action='store_true')
    args = parser.parse_args()

    # Describe directories relative to working directory
    data_dir = args.data_dir
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'
    save_dir = args.save_dir

    # Set variables for console input arguments
    model_arch = args.arch
    model_hidden_units = args.hidden_units
    learning_rate = args.learning_rate
    drop = args.dropout

    # Testing area
    print('Data directory: ' + data_dir)
    print('hidden units: ' + str(args.hidden_units))
    print('Save directory: ' + save_dir)
    print('Architecture: ' + args.arch)

    # create save directory if not existing
    fu.create_directory(save_dir)

    # Loading Pre-Trained model dependent on console input arch
    model = models.__getattribute__(model_arch)(pretrained=True)

    # Freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False

    # Create the network, define the criterion and optimizer
    model.fc = fu.Network(model.fc.in_features, 102, model_hidden_units, drop)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)

    device = torch.device('cuda' if torch.cuda.is_available() and args.gpu else 'cpu')
    print('Device is: ', device)

    epochs = args.epochs
    print_every = 50
    running_loss = 0
    steps = 0

    train_loader, test_loader, valid_loader, train_data, test_data, valid_data = load_transform.load_transform(
        data_dir, train_dir, valid_dir, test_dir)

    fu.train(device, model, epochs, criterion, optimizer, print_every,
             train_loader, test_loader, valid_loader)
    fu.save_checkpoint(model, model_arch, epochs, criterion, optimizer, train_data, save_dir)

    return model, test_loader, criterion
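# Hypothetical command-line sketch (the script name and dataset layout are assumptions,
# not from the original): the parser above expects a data_dir containing train/valid/test
# sub-folders, plus optional architecture and hyperparameter flags, e.g.
#
#   python train.py flowers --arch resnet50 --learning_rate 0.003 \
#       --hidden_units 512 256 --epochs 5 --gpu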
def run_bigscape_hmmscan(input_dir, output_folder, pfam_dir, bigscape_path, biopython_path, parallel=False):
    sys.path.append(bigscape_path)
    sys.path.append(biopython_path)
    import bigscape as bs
    import functions as f

    class bgc_data:
        def __init__(self, accession_id, description, product, records, max_width,
                     organism, taxonomy, biosynthetic_genes, contig_edge):
            # These two properties come from the genbank file:
            self.accession_id = accession_id
            self.description = description
            # AntiSMASH predicted class of compound:
            self.product = product
            # number of records in the genbank file (think of multi-locus BGCs):
            self.records = records
            # length of largest record (it will be used for ArrowerSVG):
            self.max_width = int(max_width)
            # organism
            self.organism = organism
            # taxonomy as a string (of comma-separated values)
            self.taxonomy = taxonomy
            # Internal set of tags corresponding to genes that AntiSMASH marked
            # as "Kind: Biosynthetic". It is formed as
            # clusterName + "_ORF" + cds_number + ":gid:" + gene_id + ":pid:" + protein_id + ":loc:" + gene_start + ":" + gene_end + ":strand:" + {+,-}
            self.biosynthetic_genes = biosynthetic_genes
            # AntiSMASH 4+ marks BGCs that sit on the edge of a contig
            self.contig_edge = contig_edge

    f.create_directory(output_folder, "Output", False)
    bgc_fasta_folder = os.path.join(output_folder, "fasta")
    f.create_directory(bgc_fasta_folder, "BGC fastas", False)

    bs.bgc_data = bgc_data
    bs.mode = 'global'

    bgc_info = {}  # Stores, per BGC: predicted type, gbk Description,
                   # number of records, width of longest record,
                   # GenBank's accession, Biosynthetic Genes' ids
    min_bgc_size = 0  # Provide the minimum size of a BGC to be included in the analysis. Default is 0 base pairs
    exclude_gbk_str = ''  # If this string occurs in the gbk filename, this file will not be used for the analysis

    # genbankDict: {cluster_name: [genbank_path_to_1st_instance, [sample_1, sample_2, ...]]}
    genbankDict = bs.get_gbk_files(input_dir, output_folder, bgc_fasta_folder,
                                   min_bgc_size, exclude_gbk_str, bgc_info)

    # clusters and sampleDict contain the necessary structure for all-vs-all and sample analysis
    clusters = genbankDict.keys()
    clusterNames = tuple(sorted(clusters))
    sampleDict = {}  # {sampleName: set(bgc1, bgc2, ...)}
    gbk_files = []  # raw list of gbk file locations
    for (cluster, (path, clusterSample)) in genbankDict.items():
        gbk_files.append(path)
        for sample in clusterSample:
            clustersInSample = sampleDict.get(sample, set())
            clustersInSample.add(cluster)
            sampleDict[sample] = clustersInSample

    baseNames = set(clusters)

    allFastaFiles = set(glob(os.path.join(bgc_fasta_folder, "*.fasta")))
    fastaFiles = set()
    for name in baseNames:
        fastaFiles.add(os.path.join(bgc_fasta_folder, name + ".fasta"))
    fastaBases = allFastaFiles.intersection(fastaFiles)
    task_set = fastaFiles
    verbose = False

    domtable_folder = os.path.join(output_folder, "domtable")
    f.create_directory(domtable_folder, "Domtable", False)

    if parallel:
        cores = cpu_count()
        pool = Pool(cores, maxtasksperchild=1)
        for fasta_file in task_set:
            pool.apply_async(bs.runHmmScan, args=(fasta_file, pfam_dir, domtable_folder, verbose))
        pool.close()
        pool.join()
    else:
        i = 1
        for fasta_file in task_set:
            print('Processing %d/%d' % (i, len(task_set)))
            bs.runHmmScan(fasta_file, pfam_dir, domtable_folder, verbose)
            i += 1

    print("Processing domtable files")
    pfs_folder = os.path.join(output_folder, "pfs")
    pfd_folder = os.path.join(output_folder, "pfd")
    f.create_directory(pfs_folder, "pfs", False)
    f.create_directory(pfd_folder, "pfd", False)

    allDomtableFiles = set(glob(os.path.join(domtable_folder, "*.domtable")))
    domtableFiles = set()
    for name in baseNames:
        domtableFiles.add(os.path.join(domtable_folder, name + ".domtable"))
    domtableBases = allDomtableFiles.intersection(domtableFiles)
    alreadyDone = set()

    bs.gbk_files = gbk_files
    bs.genbankDict = genbankDict
    bs.clusters = clusters
    bs.baseNames = baseNames
    bs.sampleDict = sampleDict

    # Specify at which overlap percentage domains are considered to overlap.
    # Domain with the best score is kept (default=0.1).
    domain_overlap_cutoff = 0.1
    for domtableFile in domtableFiles - alreadyDone:
        try:
            bs.parseHmmScan(domtableFile, pfd_folder, pfs_folder, domain_overlap_cutoff)
        except IndexError:
            continue
        except ValueError:
            continue

    return baseNames