def train(self, appliances, num_states_dict=None, **load_kwargs):
    """Train a 1-D factorial HMM from per-appliance power data.

    One ``GaussianHMM`` is fitted per appliance, its parameters are passed
    through ``sort_learnt_parameters``, and the per-appliance models are
    merged with ``create_combined_hmm``.

    Each appliance is fitted on the whole of ``app.power_data`` as supplied
    (missing values replaced by 0); no further pre-processing is done here.

    Parameters
    ----------
    appliances : sequence
        Objects exposing ``name`` and ``power_data`` (the latter must
        support ``fillna`` and ``.values``).
    num_states_dict : dict, optional
        Maps an appliance name to the number of hidden states to use.
        Appliances that are absent (or mapped to ``None``) get their state
        count from ``cluster``.
    **load_kwargs
        Accepted for interface compatibility; not used by this method.

    Side effects
    ------------
    Sets ``self.X`` (observations of the last appliance processed),
    ``self.meters`` (appliance names in training order),
    ``self.individual`` (sorted per-appliance models) and ``self.model``
    (the combined model).
    """
    # A mutable default would be shared between calls; build it per call.
    if num_states_dict is None:
        num_states_dict = {}

    # With many meters the combined state space grows quickly, so allow
    # fewer clusters per appliance.
    max_num_clusters = 2 if len(appliances) > 12 else 3

    learnt_model = OrderedDict()
    for app in appliances:
        power_data = app.power_data.fillna(value=0, inplace=False)
        X = power_data.values.reshape((-1, 1))
        self.X = X

        num_total_states = num_states_dict.get(app.name)
        if num_total_states is None:
            # No user-specified state count: derive it by clustering.
            print("Identifying number of hidden states for appliance {}".format(
                app.name))
            states = cluster(X, max_num_clusters)
            num_total_states = len(states)
            print("Number of hidden states for appliance {}: {}".format(
                app.name, num_total_states))

        print("Training model for appliance {} with {} hidden states".format(
            app.name, num_total_states))
        learnt_model[app.name] = GaussianHMM(num_total_states, "full")
        learnt_model[app.name].fit(X)

    # Rebuild every model from its sorted parameters before combining.
    self.meters = []
    new_learnt_models = OrderedDict()
    for app_name, fitted in learnt_model.items():
        startprob, means, covars, transmat = sort_learnt_parameters(
            fitted.startprob_, fitted.means_,
            fitted.covars_, fitted.transmat_)

        sorted_hmm = GaussianHMM(startprob.size, "full")
        sorted_hmm.means_ = means
        sorted_hmm.covars_ = covars
        sorted_hmm.startprob_ = startprob
        sorted_hmm.transmat_ = transmat

        new_learnt_models[app_name] = sorted_hmm
        self.meters.append(app_name)

    self.individual = new_learnt_models
    self.model = create_combined_hmm(new_learnt_models)
def predict_one(filename, company, dt1, dt2, num_of_states, days_future, tr_prob):
    """Simulate one future price path starting from the most likely current state.

    A fitted HMM is loaded from ``filename`` and the posterior state
    probabilities of the last observed close-price difference are used to
    pick the start state. A copy of the model pinned to that start state is
    then sampled, and the sampled differences are accumulated on top of the
    first real price.

    Parameters
    ----------
    filename : str
        Path of the model persisted with ``joblib``.
    company : str
        Ticker passed to the quote and real-price helpers.
    dt1, dt2 :
        Start and end of the historical window passed to
        ``quotes_historical_yahoo_ochl``.
    num_of_states : int
        Number of hidden states of the stored model.
    days_future :
        Horizon passed to ``getrealprice_series``.
    tr_prob :
        Unused; kept for signature compatibility.

    Returns
    -------
    numpy.ndarray
        One predicted price per entry of the real-price series; element 0
        equals the first real price.
    """
    model = joblib.load(filename)

    rp = getrealprice_series(company, dt2, days_future)
    days = rp.size

    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)
    close_v = np.array([q[2] for q in quotes])
    # The model works on day-to-day close differences, one feature per row.
    X = np.column_stack([np.diff(close_v)])

    # Posterior over hidden states for the last observation.
    lstate_prob = model.predict_proba(X)[-1]

    # Degenerate start distribution on the most likely current state.
    # With a single state argmax() is 0, so no special case is needed.
    startprob = np.zeros(num_of_states)
    startprob[lstate_prob.argmax()] = 1.0

    model_2_sample = GaussianHMM(n_components=num_of_states,
                                 covariance_type="full")
    model_2_sample.startprob_ = startprob
    model_2_sample.transmat_ = model.transmat_
    model_2_sample.means_ = model.means_
    model_2_sample.covars_ = model.covars_

    # Fresh seed per call so repeated calls give different paths.
    random.seed()
    rseed = random.randrange(0, max_int_value)
    samples, _ = model_2_sample.sample(days, random_state=rseed)

    predictions = np.zeros(days)
    final_price = rp[0]  # day 0 of the real prices
    predictions[0] = final_price
    # samples[0] is deliberately skipped: day 0 is the known real price.
    for i in range(1, days):
        final_price += samples[i][0]
        predictions[i] = final_price
    return predictions
def predictions_rand(filename, company, dt1, dt2, num_of_states, test_num, days_future):
    """Simulate ``test_num`` final prices starting from a uniformly random state.

    A fitted HMM is loaded from ``filename`` and copied into a model whose
    start distribution is uniform over ``num_of_states``. For every trial a
    fresh path of close-price differences is sampled and accumulated on the
    last historical close; a step is skipped when it would make the price
    non-positive.

    Parameters
    ----------
    filename : str
        Path of the model persisted with ``joblib``.
    company : str
        Ticker passed to ``quotes_historical_yahoo_ochl``.
    dt1, dt2 :
        Start and end of the historical window.
    num_of_states : int
        Number of hidden states of the stored model.
    test_num : int
        Number of simulated paths.
    days_future : int
        Horizon in calendar days; converted to trading days with the
        251/364 ratio.

    Returns
    -------
    list
        Final simulated price of each trial.
    """
    model = joblib.load(filename)

    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)
    close_v = np.array([q[2] for q in quotes])
    last_close = close_v[-1]

    # 251 open-market days per 364 calendar days. Integer arithmetic keeps
    # `days` an int on both Python 2 and 3 (the former `364 / 251` truncated
    # to 1 on Python 2 and produced a float day count on Python 3).
    days = int(days_future * 251 // 364)
    print(days)

    # Uniform start distribution.
    startprob = np.ones(num_of_states) / num_of_states

    model_2_sample = GaussianHMM(n_components=num_of_states,
                                 covariance_type="full")
    model_2_sample.startprob_ = startprob
    model_2_sample.transmat_ = model.transmat_
    model_2_sample.means_ = model.means_
    model_2_sample.covars_ = model.covars_

    # NumPy seeds must fit in 32 bits; `sys.maxint` is Python 2 only and is
    # far above that limit on 64-bit builds.
    seed_limit = 2 ** 32

    random.seed()
    predictions = []
    for _ in range(test_num):
        if days > 0:
            rseed = random.randrange(0, seed_limit)
            samples, _states = model_2_sample.sample(days, random_state=rseed)
            steps = samples[:, 0]
        else:
            steps = ()

        final_price = last_close
        for step in steps:
            # Never let the simulated price drop to zero or below.
            if final_price + step > 0:
                final_price += step
        predictions.append(final_price)
    return predictions
def create_combined_hmm(model):
    """Merge per-appliance HMMs into one ``GaussianHMM``.

    Parameters
    ----------
    model : mapping
        Maps an appliance key to a fitted model exposing ``startprob_``,
        ``transmat_`` and ``means_``. Iteration order determines the order
        in which the parameters are handed to the ``compute_*_fhmm`` helpers.

    Returns
    -------
    GaussianHMM
        Full-covariance model whose number of components is the length of
        the combined start-probability vector.
    """
    start_probs = []
    transition_mats = []
    flat_means = []
    for appliance in model:
        hmm = model[appliance]
        start_probs.append(hmm.startprob_)
        transition_mats.append(hmm.transmat_)
        flat_means.append(hmm.means_.flatten().tolist())

    pi_combined = compute_pi_fhmm(start_probs)
    A_combined = compute_A_fhmm(transition_mats)
    mean_combined, cov_combined = compute_means_fhmm(flat_means)

    combined = GaussianHMM(n_components=len(pi_combined),
                           covariance_type='full')
    combined.startprob_ = pi_combined
    combined.transmat_ = A_combined
    combined.covars_ = cov_combined
    combined.means_ = mean_combined
    return combined
def getFinalState(self, globalstatenumber, data, pa, transport, means, convars):
    """Refit a diagonal-covariance HMM from given parameters and decode ``data``.

    The model is seeded with the supplied start probabilities (``pa``),
    transition matrix (``transport``), ``means`` and covariances
    (``convars``). ``init_params=''`` asks ``fit`` not to re-initialise any
    of them, while ``params='stcm'`` lets training update all four.

    Returns the hidden-state sequence predicted for ``data`` after fitting.
    """
    hmm_settings = dict(
        n_components=globalstatenumber,
        n_iter=1000,
        covariance_type='diag',
        params='stcm',
        init_params='',
        random_state=1,
    )
    hmm = GaussianHMM(**hmm_settings)

    hmm.startprob_ = pa
    hmm.transmat_ = transport
    hmm.means_ = means
    hmm.covars_ = convars

    hmm.fit(data)
    return hmm.predict(data)
def calculate_hmm_g(training_set, test_set, taxonomy, cursor, connection, settings):
    """Build a Gaussian HMM from training statistics and run predictions on the test set.

    Start and transition probabilities, per-state means and per-state
    covariances are all computed by project helpers from ``training_set``;
    they are assigned directly to the model (no ``fit`` call). The test
    sequence is then handed to ``da_predictions`` together with the model.

    Returns ``None``; results are produced by ``da_predictions``.
    """
    da_id_taxonomy = find_da_id(taxonomy, cursor)

    extraction = start_transition_probability_extraction(training_set, taxonomy)
    states, start_probability, transition_probability = extraction

    feature_list = extract_features_training_set_gaus(training_set, taxonomy,
                                                      settings)
    # Feature dimensionality is read off the first sample of the first state.
    first_state_samples = feature_list[states[0]]
    n_features = len(first_state_samples[0])

    mean = calculate_means(states, feature_list, n_features)
    covariance = calculate_covariance(states, feature_list, n_features)

    hmm = GaussianHMM(n_components=len(states), covariance_type='full')
    hmm.startprob_ = start_probability
    hmm.transmat_ = transition_probability
    hmm.means_ = mean
    hmm.covars_ = covariance

    test_seq, con_pathes = extract_features_test_set_gaus(test_set, taxonomy,
                                                          settings)
    da_predictions(test_seq, hmm, con_pathes, states, da_id_taxonomy,
                   taxonomy, cursor, connection)
def create_left_right_hmm(num_states, random_state):
    """Create an untrained left-to-right ``GaussianHMM``.

    The chain always starts in state 0. Every state either stays where it is
    or advances to the next state, each with probability 0.5; the last state
    is absorbing.

    ``init_params='cm'`` restricts initialisation in ``fit`` to covariances
    and means, so the start/transition values set here are kept;
    ``params='mct'`` lets training update means, covariances and the
    transition matrix but not the start probabilities.
    """
    # 0.5 on the diagonal (stay) plus 0.5 on the first super-diagonal (advance).
    transmat = 0.5 * np.eye(num_states) + 0.5 * np.eye(num_states, k=1)
    # The final state has nowhere to advance to, so it keeps all the mass.
    transmat[num_states - 1, num_states - 1] = 1.0

    startprob = np.zeros(num_states)
    startprob[0] = 1.0

    hmm = GaussianHMM(
        n_components=num_states,
        covariance_type="diag",
        n_iter=1000,
        random_state=random_state,
        verbose=False,
        params='mct',
        init_params='cm',
    )
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    return hmm
covars_prior=0.01, covars_weight=1, init_params='mc', means_prior=0, means_weight=0, min_covar=0.001, n_components=3, n_iter=1000, params='mc', random_state=None, startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=False) model.startprob_ = numpy.array([1., 0, 0]) model.startprob_prior = model.startprob_ model.transmat_ = numpy.array([[0.9, 0.1, 0], [0, 0.9, 0.1], [0, 0, 1]]) model.transmat_prior = model.transmat_ model.fit(obs) pi = model.startprob_ A = model.transmat_ w = numpy.ones((n, m), dtype=numpy.double) hmm_means = numpy.ones((n, m, d), dtype=numpy.double) hmm_means[0][0] = model.means_[0] hmm_means[1][0] = model.means_[1] hmm_means[2][0] = model.means_[2] hmm_covars = numpy.array(
# variables. model_gaussian = GaussianHMM(n_components=3, covariance_type='full') # Transition probability as specified above transition_matrix = np.array([[0.2, 0.6, 0.2], [0.4, 0.3, 0.3], [0.05, 0.05, 0.9]]) # Setting the transition probability model_gaussian.transmat_ = transition_matrix # Initial state probability initial_state_prob = np.array([0.1, 0.4, 0.5]) # Setting initial state probability model_gaussian.startprob_ = initial_state_prob # As we want to have a 2-D gaussian distribution the mean has to # be in the shape of (n_components, 2) mean = np.array([[0.0, 0.0], [0.0, 10.0], [10.0, 0.0]]) # Setting the mean model_gaussian.means_ = mean # As emission probability is a 2-D gaussian distribution, thus # covariance matrix for each state would be a 2-D matrix, thus # overall the covariance matrix for all the states would be in the # form of (n_components, 2, 2) covariance = 0.5 * np.tile(np.identity(2), (3, 1, 1))
# Here n_components correspond to number of states in the hidden # variables. model_gaussian = GaussianHMM(n_components=3, covariance_type='full') # Transition probability as specified above transition_matrix = np.array([[0.2, 0.6, 0.2], [0.4, 0.3, 0.3], [0.05, 0.05, 0.9]]) # Setting the transition probability model_gaussian.transmat_ = transition_matrix # Initial state probability initial_state_prob = np.array([0.1, 0.4, 0.5]) # Setting initial state probability model_gaussian.startprob_ = initial_state_prob # As we want to have a 2-D gaussian distribution the mean has to # be in the shape of (n_components, 2) mean = np.array([[0.0, 0.0], [0.0, 10.0], [10.0, 0.0]]) # Setting the mean model_gaussian.means_ = mean # As emission probability is a 2-D gaussian distribution, thus # covariance matrix for each state would be a 2-D matrix, thus # overall the covariance matrix for all the states would be in the # form of (n_components, 2, 2) covariance = 0.5 * np.tile(np.identity(2), (3, 1, 1)) model_gaussian.covars_ = covariance
def fit_and_predict(self, dataset):
    """Fit an HMM on a sliding window and predict the next row for each window.

    For every calibration step ``idx`` in ``range(self.num_calib)`` a
    ``GaussianHMM`` with ``self.states`` components is fitted on
    ``dataset[idx:idx + self.time_step]``. The first window uses hmmlearn's
    own initialisation; every later window is warm-started from the
    parameters learnt on the previous one. The prediction for a window is
    the window's last row plus the row-to-row change found at the position
    whose prefix likelihood is closest to the likelihood of the full window.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array, one observation per row.

    Side effects
    ------------
    Prints the transition matrix of every fit, the MAPE returned by
    ``calc_mape`` and the stacked predictions. Calls ``sys.exit(1)`` when a
    fit uses up all EM iterations.
    """
    max_iter = 100  # EM iteration budget; also used for the convergence check

    predicted_stock_data = np.empty([0, dataset.shape[1]])
    for idx in range(self.num_calib):
        train_dataset = dataset[idx:idx + self.time_step:]

        if idx == 0:
            model = GaussianHMM(n_components=self.states,
                                covariance_type='full', verbose=True,
                                n_iter=max_iter, init_params='stmc')
        else:
            # Warm start: reuse the previous window's parameters and tell
            # fit() not to re-initialise anything.
            model = GaussianHMM(n_components=self.states,
                                covariance_type='full', verbose=True,
                                n_iter=max_iter, init_params='')
            model.transmat_ = transmat_retune_prior
            model.startprob_ = startprob_retune_prior
            model.means_ = means_retune_prior
            model.covars_ = covars_retune_prior

        model.fit(train_dataset)
        print(model.transmat_)

        transmat_retune_prior = model.transmat_
        startprob_retune_prior = model.startprob_
        means_retune_prior = model.means_
        covars_retune_prior = model.covars_

        # Hitting the iteration budget means EM did not converge in time.
        if model.monitor_.iter == max_iter:
            print('Increase number of iterations')
            sys.exit(1)

        curr_likelihood = model.score(train_dataset[0:self.time_step, :])

        # Log-likelihood of every prefix of the window, longest first.
        num_examples = train_dataset.shape[0]
        past_likelihood = np.array(
            [model.score(train_dataset[0:end, :])
             for end in range(num_examples, 0, -1)])

        # NOTE(review): the window has at most `time_step` rows, so the first
        # prefix is the full window and its score equals `curr_likelihood`;
        # the argmin below therefore always selects index 0. Kept as in the
        # original - confirm the intended matching scheme.
        likelihood_diff_idx = np.argmin(
            np.absolute(past_likelihood - curr_likelihood))
        predicted_change = (train_dataset[likelihood_diff_idx, :]
                            - train_dataset[likelihood_diff_idx + 1, :])
        predicted_stock_data = np.vstack(
            (predicted_stock_data,
             dataset[idx + self.time_step - 1, :] + predicted_change))

    # NOTE(review): the reference slice is hard-coded to the first 100 rows;
    # presumably num_calib is expected to be 100 - confirm against callers.
    mape = calc_mape(predicted_stock_data, np.flipud(dataset[range(100), :]))
    print('MAPE is ', mape)
    print(predicted_stock_data)
def _load_scikit_hmm(hmm_file_name):
    """Load ``hmm_file_name`` through ``HMM.load_hmm`` and return it as a ``GaussianHMM``."""
    hmm_scaffold = HMM()
    hmm_scaffold.load_hmm(hmm_file_name)

    version_fields = hmm_ver.split(".")
    if int(version_fields[0]) <= 0 and int(version_fields[1]) <= 1:
        # For versions up to 0.1 the start/transition probabilities are
        # passed to the constructor.
        scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full",
                                 transmat=array(hmm_scaffold.A), startprob=array(hmm_scaffold.pi))
    else:
        scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states, covariance_type="full")
        scikit_hmm.startprob_ = array(hmm_scaffold.pi)
        scikit_hmm.transmat_ = array(hmm_scaffold.A)
    scikit_hmm.means_ = array(hmm_scaffold.means)
    scikit_hmm.covars_ = array(hmm_scaffold.covs)
    return scikit_hmm


def _collect_footprints(footprints, region, posterior_list, fp_state_nb, fp_limit_size):
    """Add to ``footprints`` every run of ``fp_state_nb`` found in ``posterior_list``.

    A run that ends inside the region is kept only when it is shorter than
    ``fp_limit_size``; a run still open at the end of the region is always kept.
    """
    start_pos = 0
    flag_start = False
    for k in range(region.initial, region.final):
        state = posterior_list[k - region.initial]
        if flag_start:
            if state != fp_state_nb:
                if k - start_pos < fp_limit_size:
                    footprints.add(GenomicRegion(region.chrom, start_pos, k))
                flag_start = False
        elif state == fp_state_nb:
            flag_start = True
            start_pos = k
    if flag_start:
        footprints.add(GenomicRegion(region.chrom, start_pos, region.final))


def main():
    """
    Main function that performs footprint analysis.

    Parses the command line, reads the experimental matrix, builds one (or
    several) HMMs per input group, applies them to the DNase and/or histone
    signal of every region, post-processes the resulting footprints and
    writes them as ``<output_location><group name>.bed`` (plus an optional
    bigbed file).

    Keyword arguments: None

    Return: None
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()

    # Parameters
    current_version = "0.0.1"
    usage_message = ("\n--------------------------------------------------\n"
                     "The 'hint' program predicts TFBSs given open chromatin data.\n"
                     "In order to use this tools, please type: \n\n"
                     "%prog [options] <experiment_matrix>\n\n"
                     "The <experiment matrix> should contain:\n"
                     "- One region file representing the regions in which the HMM\n"
                     " will be applied. It should contain 'regions' in the type field\n"
                     "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
                     "- One to Three histone modification aligned reads file (bam).\n\n"
                     "For more information, please refer to:\n"
                     "http://www.regulatory-genomics.org/dnasefootprints/\n"
                     "--------------------------------------------------")
    version_message = "HINT - Regulatory Analysis Toolbox (RGT). Version: "+str(current_version)

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage = usage_message, version = version_message)

    # Optional Input Options
    parser.add_option("--hmm-file", dest = "hmm_file", type = "string",
                      metavar="FILE_1_1[[,...,FILE_N_1];...;FILE_1_M[,...,FILE_N_M]]", default = None,
                      help = ("List of HMM files separated by comma. If one file only, then this HMM will be "
                              "applied for all histone signals, otherwise, the list must have the same number "
                              "of histone files given. The order of the list should be the order of the "
                              "histones in the input_matrix file. If the argument is not given, then a default HMM "
                              "will be used. In case multiple input groups are used, then "
                              "other lists can be passed using semicolon. The number of group of lists should "
                              "equals the number of input groups."))
    parser.add_option("--bias-table", dest = "bias_table", type = "string",
                      metavar="FILE1_F,FILE1_R[;...;FILEM_F,FILEM_R]", default = None,
                      help = ("List of files (for each input group; separated by semicolon) with all "
                              "possible k-mers (for any k) and their bias estimates. Each input group"
                              "should have two files: one for the forward and one for the negative strand."
                              "Each line should contain a kmer and the bias estimate separated by tab. "
                              "Leave an empty set for histone-only groups. Eg. FILE1;;FILE3."))

    # Parameters Options
    parser.add_option("--organism", dest = "organism", type = "string", metavar="STRING", default = "hg19",
                      help = ("Organism considered on the analysis. Check our full documentation for all available "
                              "options. All default files such as genomes will be based on the chosen organism "
                              "and the data.config file. This option is used only if a bigbed output is asked."))
    parser.add_option("--estimate-bias-correction", dest = "estimate_bias_correction",
                      action = "store_true", default = False,
                      help = ("Applies DNase-seq cleavage bias correction with k-mer bias estimated "
                              "from the given DNase-seq data (SLOW HINT-BC)."))
    parser.add_option("--default-bias-correction", dest = "default_bias_correction",
                      action = "store_true", default = False,
                      help = ("Applies DNase-seq cleavage bias correction with default "
                              "k-mer bias estimates (FAST HINT-BC)."))

    # Hidden tuning options
    parser.add_option("--dnase-norm-per", dest = "dnase_norm_per", type = "float", metavar="INT",
                      default = 98, help = SUPPRESS_HELP)
    parser.add_option("--dnase-slope-per", dest = "dnase_slope_per", type = "float", metavar="INT",
                      default = 98, help = SUPPRESS_HELP)
    parser.add_option("--dnase-frag-ext", dest = "dnase_frag_ext", type = "int", metavar="INT",
                      default = 1, help = SUPPRESS_HELP)
    parser.add_option("--ext-both-directions", dest = "ext_both_directions", action = "store_true",
                      default = False, help = SUPPRESS_HELP)
    parser.add_option("--histone-norm-per", dest = "histone_norm_per", type = "float", metavar="INT",
                      default = 98, help = SUPPRESS_HELP)
    parser.add_option("--histone-slope-per", dest = "histone_slope_per", type = "float", metavar="INT",
                      default = 98, help = SUPPRESS_HELP)

    # Output Options
    parser.add_option("--output-location", dest = "output_location", type = "string", metavar="PATH",
                      default = getcwd(),
                      help = ("Path where the output files will be written."))
    parser.add_option("--print-bb", dest = "print_bb", action = "store_true", default = False,
                      help = ("If used, the output will be a bigbed (.bb) file."))
    parser.add_option("--print-wig", dest = "print_wig", type = "string", metavar="PATH",
                      default = None, help = SUPPRESS_HELP)

    # Processing Options: exactly one positional argument (the matrix) is expected
    options, arguments = parser.parse_args()
    if(len(arguments) != 1): error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters ################
    region_total_ext = 10000
    fp_limit_size = 50
    fp_limit_size_histone = 2000
    fp_limit_size_ext = 10
    fp_limit_size_ext_histone = 200
    fp_ext = 5
    fp_ext_histone = 50
    tc_ext = 50
    tc_ext_histone = 500
    ###
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = options.dnase_norm_per
    dnase_slope_per = options.dnase_slope_per
    dnase_frag_ext = options.dnase_frag_ext
    dnase_ext_both_directions = options.ext_both_directions
    ###
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = options.histone_norm_per
    histone_slope_per = options.histone_slope_per
    histone_frag_ext = 200
    ###################################

    # Output wig signal: create/truncate the three wig files directly rather
    # than through a shell command built from a user-supplied path.
    if(options.print_wig):
        for wig_suffix in ("signal.wig", "norm.wig", "slope.wig"):
            with open(options.print_wig+wig_suffix, "w"):
                pass

    # Global class initialization
    genome_data = GenomeData(options.organism)
    hmm_data = HmmData()

    def region_label(r):
        # "chrom,initial,final" as used in every warning message
        return ",".join([r.chrom, str(r.initial), str(r.final)])

    def fetch_dnase_signal(group, r):
        # Normalized and slope DNase signal of region r for this group
        return group.dnase_file.get_signal(r.chrom, r.initial, r.final, dnase_frag_ext,
                                           dnase_initial_clip, dnase_norm_per, dnase_slope_per,
                                           group.bias_table, genome_data.get_genome(),
                                           dnase_ext_both_directions, options.print_wig)

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception: error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Input
    ###################################################################################################

    # Group class
    class Group:
        def __init__(self):
            self.name = None
            self.original_regions = None
            self.regions = None
            self.dnase_file = None
            self.histone_file_list = []
            self.dnase_only = True
            self.histone_only = True
            self.hmm = []
            self.flag_multiple_hmms = False
            self.bias_table = None

    # Initialization
    name_list = exp_matrix.names
    file_dict = exp_matrix.files
    fields_dict = exp_matrix.fieldsDict
    objects_dict = exp_matrix.objectsDict

    # Populating fields dict data: make sure every data category has an entry
    for e in ["HS", "DNASE", "HISTONE"]:
        fields_dict["data"].setdefault(e, [])

    # Fetching files per group
    half_region_ext = int(region_total_ext/2)
    group_list = []
    for g in fields_dict["group"].keys():
        group = Group()
        group.name = g
        # NOTE(review): this walks the first len(group members) entries of
        # name_list instead of the members of group g; with more than one
        # group that looks wrong - confirm against ExperimentalMatrix.
        for i in range(0,len(fields_dict["group"][g])):
            if(name_list[i] in fields_dict["data"]["HS"]):
                group.original_regions = objects_dict[name_list[i]]
                group.regions = deepcopy(group.original_regions)
                group.regions.extend(half_region_ext,half_region_ext) # Extending
                group.regions.merge() # Sort & Merge
            elif(name_list[i] in fields_dict["data"]["DNASE"]):
                group.dnase_file = GenomicSignal(file_dict[name_list[i]])
                group.dnase_file.load_sg_coefs(dnase_sg_window_size)
            elif(name_list[i] in fields_dict["data"]["HISTONE"]):
                group.histone_file_list.append(GenomicSignal(file_dict[name_list[i]]))
                group.histone_file_list[-1].load_sg_coefs(histone_sg_window_size)
            else: pass # TODO Error (Category of data outside "HS, DNASE, HISTONE")
        if(group.dnase_file): group.histone_only = False
        if(group.histone_file_list): group.dnase_only = False
        if(group.histone_only and group.dnase_only): pass # TODO ERROR (There is no DNase or histone data)
        if(not group.original_regions): pass # TODO ERROR (There is no HS regions)
        group_list.append(group)

    ###################################################################################################
    # Fetching Bias Table
    ###################################################################################################

    bias_correction = False
    if(options.bias_table):

        bias_table_group_list = options.bias_table.split(";")
        if(len(bias_table_group_list) != len(group_list)): pass # TODO ERROR
        for g in range(0,len(group_list)):
            group = group_list[g]
            bias_table_list = bias_table_group_list[g].split(",")
            if(group.histone_only): continue
            group.bias_table = BiasTable(table_file_F=bias_table_list[0], table_file_R=bias_table_list[1])
        bias_correction = True

    elif(options.estimate_bias_correction):

        for group in group_list:
            if(group.histone_only): continue
            group.bias_table = BiasTable(regions=group.original_regions,dnase_file_name=group.dnase_file.file_name,
                                         genome_file_name=genome_data.get_genome())
        bias_correction = True

    elif(options.default_bias_correction):

        for group in group_list:
            if(group.histone_only): continue
            group.bias_table = BiasTable(table_file_F=hmm_data.get_default_bias_table_F(),
                                         table_file_R=hmm_data.get_default_bias_table_R())
        bias_correction = True

    ###################################################################################################
    # Creating HMMs
    ###################################################################################################

    # Fetching HMM input
    if(options.hmm_file): # Argument is passed

        hmm_group_list = options.hmm_file.split(";")
        if(len(hmm_group_list) != len(group_list)): pass # TODO ERROR
        for g in range(0,len(group_list)):

            group = group_list[g]

            # Fetching list of HMM files
            group.hmm = hmm_group_list[g].split(",")

            # Verifying HMM application mode (one HMM or multiple HMM files).
            # The decision is recorded on the group itself so that both the
            # HMM creation and the main pipeline below see the same flag.
            if(len(group.hmm) == 1):
                group.flag_multiple_hmms = False
                group.hmm = group.hmm[0]
            elif(len(group.hmm) == len(group.histone_file_list)):
                group.flag_multiple_hmms = True
            else: error_handler.throw_error("FP_NB_HMMS")

    else: # Argument was not passed

        for group in group_list:

            group.flag_multiple_hmms = False
            if(group.dnase_only):
                if(bias_correction): group.hmm = hmm_data.get_default_hmm_dnase_bc()
                else: group.hmm = hmm_data.get_default_hmm_dnase()
            elif(group.histone_only):
                group.hmm = hmm_data.get_default_hmm_histone()
            else:
                if(bias_correction): group.hmm = hmm_data.get_default_hmm_dnase_histone_bc()
                else: group.hmm = hmm_data.get_default_hmm_dnase_histone()

    # Creating scikit HMM list
    for group in group_list:

        if(group.flag_multiple_hmms):

            hmm_list = []
            for hmm_file_name in group.hmm:
                try:
                    scikit_hmm = _load_scikit_hmm(hmm_file_name)
                except Exception: error_handler.throw_error("FP_HMM_FILES")
                hmm_list.append(scikit_hmm)
            group.hmm = hmm_list

        else:

            scikit_hmm = None
            try:
                scikit_hmm = _load_scikit_hmm(group.hmm)
            except Exception: error_handler.throw_error("FP_HMM_FILES")
            group.hmm = scikit_hmm

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    # Iterating over groups
    for group in group_list:

        # Initializing result set
        footprints = GenomicRegionSet(group.name)

        # Iterating over regions
        for r in group.regions.sequences:

            ###################################################################################################
            # DNASE ONLY
            ###################################################################################################

            if(group.dnase_only):

                # Fetching DNase signal
                try:
                    dnase_norm, dnase_slope = fetch_dnase_signal(group, r)
                except Exception:
                    error_handler.throw_warning("FP_DNASE_PROC", add_msg="for region ("+region_label(r)+"). This iteration will be skipped.")
                    continue

                # Formatting sequence
                try:
                    input_sequence = array([dnase_norm,dnase_slope]).T
                except Exception:
                    error_handler.throw_warning("FP_SEQ_FORMAT",add_msg="for region ("+region_label(r)+"). This iteration will be skipped.")
                    continue

                # Applying HMM
                if(isinstance(group.hmm,list)): continue # TODO Error
                try:
                    posterior_list = group.hmm.predict(input_sequence)
                except Exception:
                    error_handler.throw_warning("FP_HMM_APPLIC",add_msg="in region ("+region_label(r)+"). This iteration will be skipped.")
                    continue

                # Formatting results
                _collect_footprints(footprints, r, posterior_list, 4, fp_limit_size)

            ###################################################################################################
            # HISTONES
            ###################################################################################################

            else:

                # Fetching DNase signal
                if(not group.histone_only):
                    try:
                        dnase_norm, dnase_slope = fetch_dnase_signal(group, r)
                    except Exception:
                        error_handler.throw_warning("FP_DNASE_PROC", add_msg="for region ("+region_label(r)+"). This iteration will be skipped.")
                        continue

                # Histone-only groups use their own size limit and footprint
                # state; the shared defaults are left untouched so that they
                # do not leak into groups processed later.
                if(group.histone_only):
                    group_fp_limit_size = fp_limit_size_histone
                    fp_state_nb = 4
                else:
                    group_fp_limit_size = fp_limit_size
                    fp_state_nb = 7

                # Iterating over histone modifications
                for i in range(0,len(group.histone_file_list)):

                    # Fetching histone signal
                    try:
                        histone_file = group.histone_file_list[i]
                        histone_norm, histone_slope = histone_file.get_signal(r.chrom, r.initial, r.final, histone_frag_ext,
                                                                              histone_initial_clip, histone_norm_per,
                                                                              histone_slope_per, options.print_wig)
                    except Exception:
                        error_handler.throw_warning("FP_HISTONE_PROC",add_msg="for region ("+region_label(r)+") and histone modification "+histone_file.file_name+". This iteration will be skipped for this histone.")
                        continue

                    # Formatting sequence
                    try:
                        if(group.histone_only): input_sequence = array([histone_norm,histone_slope]).T
                        else: input_sequence = array([dnase_norm,dnase_slope,histone_norm,histone_slope]).T
                    except Exception:
                        error_handler.throw_warning("FP_SEQ_FORMAT",add_msg="for region ("+region_label(r)+") and histone modification "+histone_file.file_name+". This iteration will be skipped.")
                        continue

                    # Applying HMM
                    if(group.flag_multiple_hmms): current_hmm = group.hmm[i]
                    else: current_hmm = group.hmm
                    try:
                        posterior_list = current_hmm.predict(input_sequence)
                    except Exception:
                        error_handler.throw_warning("FP_HMM_APPLIC",add_msg="in region ("+region_label(r)+") and histone modification "+histone_file.file_name+". This iteration will be skipped.")
                        continue

                    # Formatting results
                    _collect_footprints(footprints, r, posterior_list, fp_state_nb, group_fp_limit_size)

        ###################################################################################################
        # Post-processing
        ###################################################################################################

        # Parameters (group-local, for the same no-leak reason as above)
        if(group.histone_only):
            fp_limit = fp_limit_size_ext_histone
            group_fp_ext = fp_ext_histone
            group_tc_ext = tc_ext_histone
            tcsignal = group.histone_file_list[0]
            tcfragext = 1
            tcinitialclip = histone_initial_clip
            tcextboth = False
        else:
            fp_limit = fp_limit_size_ext
            group_fp_ext = fp_ext
            group_tc_ext = tc_ext
            tcsignal = group.dnase_file
            tcfragext = 1
            tcinitialclip = dnase_initial_clip
            tcextboth = dnase_ext_both_directions

        # Sorting and Merging
        footprints.merge()

        # Overlapping results with original regions
        footprints = footprints.intersect(group.original_regions,mode=OverlapType.ORIGINAL)

        # Extending footprints
        for f in footprints.sequences:
            if(f.final - f.initial < fp_limit):
                f.initial = max(0,f.initial-group_fp_ext)
                f.final = f.final+group_fp_ext

        # Fetching chromosome sizes (tab-separated: chromosome name, size)
        chrom_sizes_file_name = genome_data.get_chromosome_sizes()
        chrom_sizes_dict = dict()
        with open(chrom_sizes_file_name,"r") as chrom_sizes_file:
            for chrom_sizes_entry_line in chrom_sizes_file:
                chrom_sizes_entry_vec = chrom_sizes_entry_line.strip().split("\t")
                chrom_sizes_dict[chrom_sizes_entry_vec[0]] = int(chrom_sizes_entry_vec[1])

        # Evaluating TC (tag count in a window centred on each footprint)
        for f in footprints.sequences:
            mid = (f.initial+f.final)//2 # floor division keeps coordinates integral
            p1 = max(mid - group_tc_ext,0)
            p2 = min(mid + group_tc_ext,chrom_sizes_dict[f.chrom])
            try: tag_count = tcsignal.get_tag_count(f.chrom, p1, p2, tcfragext, tcinitialclip, tcextboth)
            except Exception: tag_count = 0
            f.data = str(int(tag_count))

        ###################################################################################################
        # Writing output
        ###################################################################################################

        # Creating output file
        output_file_name = options.output_location+group.name+".bed"
        footprints.write_bed(output_file_name)

        # Verifying condition to write bb
        if(options.print_bb):

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed. No "footprint_name" option is declared
            # above, so fall back to the group name used for the bed file.
            bb_base_name = getattr(options, "footprint_name", None) or group.name
            output_bb_name = options.output_location+bb_base_name+".bb"
            system(" ".join(["bedToBigBed",output_file_name,chrom_sizes_file,output_bb_name]))
def MyGaussianHMM():
    """Walk through the three classic HMM tasks on the SZ index data set.

    Part 1 fits a 6-state diagonal-covariance Gaussian HMM to the first five
    CSV columns, prints the learnt parameters and plots column 6 against the
    dates in column 5, one marker series per decoded hidden state.
    Part 2 reuses that model to score the first observation and to
    Viterbi-decode the first ten observations, then builds a second model
    whose parameters are set by hand.

    Relies on the module-level names ``pd``, ``np`` and ``plt``.
    Returns nothing; output goes to stdout and a matplotlib window.
    """
    from hmmlearn.hmm import GaussianHMM

    # The file has no header row. ``header=None`` is the supported spelling;
    # negative header values are rejected by current pandas.
    df = pd.read_csv(
        "/home/ray/Documents/suibe/2017/建模/Modeling_Preparation/dataset/SZIndex.csv",
        header=None)
    X = np.array(df.iloc[:, 0:5])

    # Part 1: model unknown -- the learning problem (problem 3).
    # covariance_type choices:
    #   "spherical": one shared variance per state (isotropic); the fewest
    #                parameters to estimate
    #   "diag":      a diagonal covariance matrix per state (usual compromise)
    #   "full":      an unrestricted covariance matrix per state (needs
    #                enough data to estimate every entry)
    model = GaussianHMM(n_components=6, covariance_type="diag", n_iter=1000)
    model.fit(X)

    # Decode once and reuse the result for both the report and the plot.
    hidden_states = model.predict(X)
    print("隐含状态为: ", hidden_states)  # hidden state of every day
    print("特征数目 %s" % model.n_features)
    print("隐状态数目 %s" % model.n_components)
    print("起始概率 :", model.startprob_)
    print("隐状态转移矩阵", model.transmat_)
    # Emissions are Gaussian per state, so the means form an
    # n_components x n_features matrix.
    print("混淆矩阵:均值部分", model.means_)
    print("混淆矩阵:方差部分", model.covars_)

    # Plot the index, one marker series per hidden state.
    tradeDate = df.iloc[:, 5].values
    closeIndex = df.iloc[:, 6].values
    plt.figure(figsize=(15, 8))
    for i in range(model.n_components):
        idx = (hidden_states == i)
        plt.plot_date(pd.to_datetime(tradeDate[idx]), closeIndex[idx], '.',
                      label='%dth hidden state' % i, lw=1)
    plt.legend()
    plt.grid(True)
    plt.show()

    # Part 2: model known -- evaluation (problem 1) and decoding (problem 2).
    # Reuse the model fitted above.
    # Problem 1: hmmlearn needs a 2-D (n_samples, n_features) array, so take
    # a one-row slice rather than the 1-D row X[0].
    print("某天出现该观测的概率为: %s" % np.exp(model.score(X[:1])))
    # Problem 2: most likely state sequence for the first ten days only.
    log_prob, state = model.decode(X[:10], algorithm="viterbi")
    print("只根据前十天,推断出最有可能的隐含状态序列为:", state)

    # Hand-specified model: 2 features, 4 hidden states.
    startprob = np.array([0.6, 0.3, 0.1, 0.0])
    # The transition matrix, note that there are no transitions possible
    # between component 1 and 3
    transmat = np.array([[0.7, 0.2, 0.0, 0.1],
                         [0.3, 0.5, 0.2, 0.0],
                         [0.0, 0.3, 0.5, 0.2],
                         [0.2, 0.0, 0.2, 0.6]])
    # The means of each component
    means = np.array([[0.0, 0.0],
                      [0.0, 11.0],
                      [9.0, 10.0],
                      [11.0, -1.0]])
    # The covariance of each component
    covars = .5 * np.tile(np.identity(2), (4, 1, 1))
    model2 = GaussianHMM(n_components=4, covariance_type="full", n_iter=1000)
    model2.startprob_ = startprob
    model2.transmat_ = transmat
    model2.means_ = means
    model2.covars_ = covars
def predictions_mls(filename, company, dt1, dt2, num_of_states, test_num,
                    days_future, tr_prob):
    """Simulate ``test_num`` price paths from a saved HMM and save the statistics.

    The model stored in ``filename`` is used to find the most likely hidden
    state at the end of the ``dt1``..``dt2`` history of ``company``. A copy of
    the model that starts in that state is then sampled ``test_num`` times,
    and every sampled sequence of price differences is accumulated on top of
    the first real price to form one simulated path.

    Args:
        filename: path of the joblib-serialised trained model.
        company: ticker passed to the quote and real-price helpers.
        dt1, dt2: start and end of the historical window.
        num_of_states: number of hidden states of the stored model.
        test_num: number of simulated paths to average.
        days_future: horizon forwarded to ``getrealprice_series``.
        tr_prob: value written as a constant column next to the predictions.

    Returns:
        None. Six series (mean prediction, real prices, errors, ``tr_prob``,
        variance, median) are written to
        ``./sims_final/Predictions_<company>_States_<n>_stats.csv``.

    Relies on module-level ``max_int_value`` (upper bound for the random
    seed) and on ``getrealprice_series`` returning an array-like with
    ``.size`` -- presumably a NumPy array; confirm against its definition.
    """
    model = joblib.load(filename)
    rp = getrealprice_series(company, dt2, days_future)
    days = rp.size

    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)
    close_v = np.array([q[2] for q in quotes])
    # The model is evaluated on day-to-day differences of the close value.
    X = np.column_stack([np.diff(close_v)])

    # Posterior over hidden states for the last observed day.
    lstate_prob = model.predict_proba(X)[-1]

    # If more than one state, make sure we start at the most likely current state
    if num_of_states > 1:
        startprob = np.zeros(num_of_states)
        startprob[lstate_prob.argmax()] = 1.0
    else:
        startprob = [1.]

    # Sampling model: same dynamics, deterministic start state.
    model_2_sample = GaussianHMM(n_components=num_of_states,
                                 covariance_type="full")
    model_2_sample.startprob_ = startprob
    model_2_sample.transmat_ = model.transmat_
    model_2_sample.means_ = model.means_
    model_2_sample.covars_ = model.covars_

    # Re-seed from system entropy so every run draws different paths.
    random.seed()

    allpredictions = np.zeros((test_num, days))
    for test in range(test_num):
        rseed = random.randrange(0, max_int_value)
        steps, _ = model_2_sample.sample(days, random_state=rseed)
        # Day 0 is the current real price; day i adds the i-th sampled
        # difference. cumsum accumulates left to right, i.e. in the same
        # order as an explicit running sum.
        allpredictions[test] = np.cumsum(
            np.concatenate(([rp[0]], steps[1:, 0])))

    predictions = allpredictions.mean(axis=0)
    predictions_var = allpredictions.var(axis=0)
    predictions_median = np.median(allpredictions, axis=0)
    errors = predictions - rp
    tr_prob_vector = np.full((predictions.size), tr_prob)
    data = [predictions, rp, errors, tr_prob_vector, predictions_var,
            predictions_median]
    err_final = errors[-1]

    print ("Start Price: ",rp[0],"Avg. Prediction: ",str(num_of_states),"states:" , predictions[-1]," Real Price:", rp[-1])
    print (" Error end of predictions:", err_final,"Delta Start-End:", rp[0]-rp[-1],"\n")

    # np.savetxt does not create missing directories.
    out_dir = './sims_final'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    fname = "Predictions_"+str(company)+"_States_"+str(num_of_states)+"_stats.csv"
    fname = os.path.join(out_dir, fname)
    np.savetxt(fname, data, delimiter=",")
    return
def predictions_mls(filename, company, refcompany, dt1, dt2, num_of_states,
                    test_num):
    """Simulate a year of prices for ``company`` with a two-series HMM.

    The stored model is evaluated on the paired daily close differences of
    ``company`` and ``refcompany`` to find the most likely current hidden
    state. A copy of the model that starts in that state is sampled
    ``test_num`` times; each sample is turned into a calendar-day price path,
    and the mean path is written to
    ``./sims3/Year_of_predictions_<company>_States_<n>_adv.csv``.

    Args:
        filename: path of the joblib-serialised trained model.
        company: ticker whose price is simulated.
        refcompany: reference ticker used as the second feature.
        dt1, dt2: start and end of the historical window.
        num_of_states: number of hidden states of the stored model.
        test_num: number of simulated paths.

    Returns:
        Three 1-D arrays with one value per simulated path: the prices at
        column ``days_future - 2`` and at that index floor-divided by 4 and
        by 36.

    Relies on module-level ``total2active``, ``yr`` and ``max_int_value``;
    ``yr`` is presumably the number of calendar days per path -- confirm.
    """
    days_future = 365
    model = joblib.load(filename)

    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)
    close_v = np.array([q[2] for q in quotes])
    # Differencing drops one element, so the close series is shifted by one
    # to stay aligned with the differences.
    diff = np.diff(close_v)
    close_v = close_v[1:]

    # Reference company: only its differences are needed.
    quotes2 = quotes_historical_yahoo_ochl(refcompany, dt1, dt2)
    diff2 = np.diff(np.array([q[2] for q in quotes2]))

    # Left-pad the company's series with zeros up to the reference length.
    # NOTE(review): only `diff` is padded, so this assumes the reference
    # series is at least as long -- confirm against the callers.
    delta = abs(diff2.shape[0] - diff.shape[0])
    diff0 = np.pad(diff, (delta, 0), mode='constant', constant_values=0)
    close_v = np.pad(close_v, (delta, 0), mode='constant', constant_values=0)

    X = np.column_stack([diff0, diff2])

    # Posterior over hidden states for the last observed day.
    lstate_prob = model.predict_proba(X)[-1]

    # Number of sampled (trading) days for the calendar horizon.
    days = int(days_future // total2active)
    print(days, strftime("%Y-%m-%d %H:%M:%S", gmtime()))  # debugging purposes

    if num_of_states > 1:
        startprob = np.zeros(num_of_states)
        startprob[lstate_prob.argmax()] = 1.0
    else:
        startprob = [1.]

    # Sampling model: same dynamics, deterministic start state.
    model_2_sample = GaussianHMM(n_components=num_of_states,
                                 covariance_type="full")
    model_2_sample.startprob_ = startprob
    model_2_sample.transmat_ = model.transmat_
    model_2_sample.means_ = model.means_
    model_2_sample.covars_ = model.covars_

    # Re-seed from system entropy so every run draws different paths.
    random.seed()
    rseed = random.randrange(0, max_int_value)
    X, Z = model_2_sample.sample(days, random_state=rseed)

    allpredictions = np.zeros((test_num, yr))
    for test in range(test_num):
        final_price = close_v[-1]
        j = 0
        for i in range(days):
            # Skip any step that would drive the price to zero or below.
            if (final_price + X[i][0]) > 0:
                final_price += X[i][0]
            if j > 1 and i % 5 == 0:
                # Every fifth sampled day the price is written three times
                # -- presumably to cover the two weekend days; confirm.
                allpredictions[test][j] = final_price
                allpredictions[test][j + 1] = final_price
                allpredictions[test][j + 2] = final_price
                j = j + 3
            else:
                allpredictions[test][j] = final_price
                j = j + 1
        # Hold the last price for the rest of the path.
        allpredictions[test][j:] = final_price

        rseed = random.randrange(0, max_int_value)
        X, Z = model_2_sample.sample(days, random_state=rseed)

    predictions_year = allpredictions.mean(axis=0)
    print("Avg. Prediction: ", predictions_year[-1])

    fname = "Year_of_predictions_" + str(company) + "_States_" + str(
        num_of_states) + "_adv.csv"
    fname = os.path.join('./sims3', fname)
    np.savetxt(fname, predictions_year, delimiter=",")

    # Floor division keeps the column indices integral under true division.
    last_col = days_future - 2
    return (allpredictions[:, last_col],
            allpredictions[:, last_col // 4],
            allpredictions[:, last_col // 36])
# Load every appliance's HMM parameters as arrays, keeping the iteration
# order of `model_appliance`.
startprob = OrderedDict(
    (name, np.array(model_appliance[name]['startprob']))
    for name in model_appliance)
transmat = OrderedDict(
    (name, np.array(model_appliance[name]['transmat']))
    for name in model_appliance)
means = OrderedDict(
    (name, np.array(model_appliance[name]['means']))
    for name in model_appliance)
covars = OrderedDict(
    (name, np.array(model_appliance[name]['covars']))
    for name in model_appliance)

# One GaussianHMM per appliance, using the parameters exactly as loaded.
model = OrderedDict()
for appliance in model_appliance:
    hmm = GaussianHMM(n_components=state_appliances[appliance],
                      covariance_type="full")
    hmm.startprob_ = startprob[appliance]
    hmm.transmat_ = transmat[appliance]
    hmm.means_ = means[appliance]
    hmm.covars_ = covars[appliance]
    model[appliance] = hmm

# A second set of models built from the parameters as reordered by
# sort_learnt_parameters.
new_model = OrderedDict()
for appliance in model:
    startprob_new, means_new, covars_new, transmat_new = sort_learnt_parameters(
        startprob[appliance], means[appliance],
        covars[appliance], transmat[appliance])
    sorted_hmm = GaussianHMM(n_components=startprob_new.size,
                             covariance_type="full")
    new_model[appliance] = sorted_hmm
    sorted_hmm.startprob_ = startprob_new
    sorted_hmm.transmat_ = transmat_new
    sorted_hmm.means_ = means_new
    # NOTE(review): covars_new is not assigned to the model in this
    # section -- confirm it is set further down.
def chromas_from_midi(midi):
    """Return one chroma per consecutive 16-element slice of ``midi``.

    Each slice is passed to ``chroma_from_slice``; the last slice may be
    shorter than 16 elements.
    """
    return [chroma_from_slice(midi[start:start + 16])
            for start in range(0, len(midi), 16)]


#%%

#%%
# 24-state Gaussian HMM whose parameters are all supplied by hand; it is
# only used for decoding here.
start_probs, transition_matrix = get_hmm_parameters()

markov_model = GaussianHMM(
    n_components=24,
    covariance_type="full",
    init_params="stmc",
)
markov_model.startprob_ = start_probs
markov_model.transmat_ = transition_matrix
markov_model.n_features = 12
markov_model.means_ = Chroma_Templates
markov_model.covars_ = covariance_matrix

# Decode all emissions as a single sequence and map each state index to a
# chord.
path = markov_model.predict(emissions, [len(emissions)])
chords = [index_to_chord(chord) for chord in path]

probable_chords = most_likely_from_midi(notes)
#%%
# Refit the model several times and keep the parameters of the run with the
# highest log-likelihood on A.
best_transmat = model.transmat_
best_means_ = model.means_
best_covars = model.covars_
# Start from -inf, not a fixed sentinel: log-likelihoods of long sequences
# can be far below any hand-picked constant, and then no run would ever be
# recorded as the best one.
max_prop = float("-inf")
for i in range(5):
    model.fit(A)
    temp_prop = model.score(A)
    if temp_prop > max_prop:
        max_prop = temp_prop
        best_startprob_ = model.startprob_
        best_transmat = model.transmat_
        best_means_ = model.means_
        best_covars = model.covars_

model.startprob_ = best_startprob_
model.transmat_ = best_transmat
model.means_ = best_means_
model.covars_ = best_covars

# With the model parameters known, decode the hidden state sequence from
# the observations.
hidden_states = model.predict(A)
print(hidden_states)

# Mark every decoded state in its own colour on the index curve. (Original
# author's reading of the figure: the green state corresponds to a strong
# rise of the index, the light-blue and yellow states to declines.)
plt.figure(figsize=(25, 18))
for i in range(model.n_components):  # one series per hidden state
    pos = (hidden_states == i)
    plt.plot_date(Date[pos], close[pos], 'o', label='hidden state %d' % i, lw=2)
# "left" is not a valid legend location string; "best" lets matplotlib
# choose the placement.
plt.legend(loc="best")
def train_models(model, x, y):
    """
    Train one left-right Gaussian HMM per class found in the labels.

    Every HMM starts in state 0 with probability 1. Each state either stays
    (0.8) or advances to the next state (0.2), and the last state is
    absorbing. The topology is sized from ``model.states`` and is preserved
    by training: only means and covariances are initialised from the data
    (``init_params="mc"``), so ``fit`` does not replace the start and
    transition probabilities with its default initialisation.

    Args:
        model: HMM class object; ``states`` and ``diag`` are read, and each
            trained HMM is appended to ``model.hmms``
        x: input image feature vector
        y: labels for images; class labels are taken to be
            ``0 .. n_classes - 1``

    Return:
        Trained Model (the same ``model`` object)

    Raises:
        ValueError: if ``y`` contains no class at all.
    """
    classes = len(np.unique(y))
    if classes < 1:
        raise ValueError("Need at least 1 class to train HMMs with.")

    n_states = model.states
    covar = "diag" if model.diag else "full"

    # Left-right chain: start in the first state; stay with 0.8, advance
    # with 0.2.
    startprob = np.zeros(n_states)
    startprob[0] = 1.0
    transmat = 0.8 * np.eye(n_states) + 0.2 * np.eye(n_states, k=1)
    # The last state has no successor, so it must be absorbing for its row
    # to sum to 1.
    transmat[-1, -1] = 1.0

    for image_class in range(classes):
        # Train a separate Hidden Markov Model on the data of each class.
        hmm = GaussianHMM(n_states, covar, n_iter=20, tol=0.00001,
                          random_state=np.random.RandomState(),
                          init_params="mc")
        # Copies keep the per-class models independent of one another.
        hmm.startprob_ = startprob.copy()
        hmm.transmat_ = transmat.copy()

        # Loop iterator corresponds to label of data
        hmm.label = image_class
        features = functions.get_labelled_train_data(x, y, image_class)

        # features is indexed as (image, observation block, feature).
        length, old_block_num, features_size = features.shape[:3]

        # hmmlearn takes all sequences stacked into one 2-D array plus the
        # length of each sequence; every image contributes old_block_num
        # observations.
        lengths = [old_block_num] * length
        feat = features.reshape((length * old_block_num, features_size))

        hmm.fit(feat, lengths=lengths)  # Train the model for 1 class.
        model.hmms.append(hmm)

    return model
[3797.02038489, 1235931.58385831], [13182.17439996, 87137.57994132], [1759475.42864922, 36552747.45908708], [2724.71340548, 296602.83220848], [63837.66522882, 2867629.16600791], [20513.28086561, 19980338.31462503], [28962.97633114, 520482.13848515], [4315.55389006, 3128607.93648248], [1790.20488976, 123237.84834907]]) # Build an HMM instance and set parameters test_model = GaussianHMM(n_components=10, covariance_type="diag") # Instead of fitting it from the data, we directly set the estimated # parameters, the means and covariance of the components test_model.startprob_ = startprob test_model.transmat_ = transmat test_model.means_ = means test_model.covars_ = covars test_hidden_states = test_model.predict(X) print(test_hidden_states) test_result = test_hidden_states for i in range(len(test_close_v_real)): if test_result[i] == 0: test_result[i] = -1 elif test_result[i] == 1: test_result[i] = -1 elif test_result[i] == 2: test_result[i] = -1
for analysis in b.analysis.lowlevel.mfcc: if analysis is not None: try: obs = numpy.array(analysis) obs = obs.T obs = obs[1:] obs = obs.T obs = scale(obs) model = GaussianHMM(algorithm='viterbi', covariance_type='diag', covars_prior=0.01, covars_weight=1, init_params='mc', means_prior=0, means_weight=0, min_covar=0.001, n_components=3, n_iter=1000, params='mc', random_state=None, startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=False) model.startprob_ = numpy.array([1., 0, 0]) model.startprob_prior = model.startprob_ model.transmat_ = numpy.array([[0.9, 0.1, 0], [0, 0.9, 0.1], [0, 0, 1]]) model.transmat_prior = model.transmat_ model.fit(obs) pi = model.startprob_ A = model.transmat_ w = numpy.ones((n, m), dtype=numpy.double) hmm_means = numpy.ones((n, m, d), dtype=numpy.double) hmm_means[0][0] = model.means_[0] hmm_means[1][0] = model.means_[1] hmm_means[2][0] = model.means_[2] hmm_covars = numpy.array([[ numpy.matrix(numpy.eye(d,d)) for j in xrange(m)] for i in xrange(n)]) hmm_covars[0][0] = model.covars_[0]
# Fixed 3-state left-to-right chain: always start in state 0, move forward
# with probability 0.01, and never leave the final state.
startprob = np.zeros(3)
startprob[0] = 1.
transmat = np.array([
    [0.99, 0.01, 0.],
    [0., 0.99, 0.01],
    [0., 0., 1.],
])

NUM_ATTEMPTS = 10
all_histories = []
improvement_histories = []
for attempt in range(NUM_ATTEMPTS):
    # init_params='mc' leaves the start/transition probabilities set below
    # untouched when fit() initialises the model.
    model = GaussianHMM(n_components=3, covariance_type='diag',
                        params='tmc', init_params='mc', verbose=True)
    model.startprob_, model.transmat_ = startprob, transmat
    model.fit(all_videos)

    # NOTE(review): `custom_history` is not an attribute of stock hmmlearn;
    # presumably it comes from a locally patched copy -- confirm.
    all_histories.append(model.custom_history)
    improvement_history = model.monitor_.custom_history
    improvement_histories.append(np.array(improvement_history))

all_histories = np.array(all_histories)

# Both files are written with pickle despite the .npy extension.
for path, payload in (
        ('/storage/jalverio/hmmlearn/training_utils/all_histories.npy',
         all_histories),
        ('/storage/jalverio/hmmlearn/training_utils/improvement_history.npy',
         improvement_histories)):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)
print('saved.')
import warnings

# Silence the warning categories before hmmlearn is imported.
for _category in (DeprecationWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_category)

from hmmlearn.hmm import GaussianHMM
import numpy as np

# Samples: ten observations with two features each.
X = np.array([
    [-1.03573482, -1.03573482],
    [6.62721065, 11.62721065],
    [3.19196949, 8.19196949],
    [0.38798214, 0.38798214],
    [2.56845104, 7.56845104],
    [5.03699793, 10.03699793],
    [5.87873937, 10.87873937],
    [4.27000819, -1.72999181],
    [4.02692237, -1.97307763],
    [5.7222677, 10.7222677],
])

# Train a new model on the samples.
model = GaussianHMM(n_components=3, covariance_type="diag")
model.fit(X)

# Build an untrained model and copy the learnt parameters into it.
new_model = GaussianHMM(n_components=3, covariance_type="diag")
for _name in ("startprob_", "transmat_", "means_"):
    setattr(new_model, _name, getattr(model, _name))

m = model._covars_        # covariances as stored internally by the model
n = model.covars_         # covariances as exposed by the public property
p = model.get_params()    # constructor hyper-parameters
# The internally stored form is what gets copied into the new model.
new_model.covars_ = m

# Predict the hidden states of X with the copied model.
X_N = new_model.predict(X)
print(X_N)