def load(self, file_path):
    """Restore previously saved model components from a pickle file.

    The pickle is expected to hold a dict with a 'transformer' entry
    (or a legacy 'lda' entry) and an 'hmm' entry containing a
    JSON-serialized HiddenMarkovModel.
    """
    # NOTE(review): pickle.load on untrusted files can execute arbitrary
    # code -- only load files produced by this application.
    with open(file_path, 'rb') as f:
        objects = pickle.load(f)
    try:
        self.transformer = objects['transformer']
    except KeyError:
        # for backwards compatibility
        self.transformer = objects['lda']
    self.hmm = HiddenMarkovModel.from_json(objects['hmm'])
def init(base_dir): print base_dir cluster_directories = \ glob.glob(base_dir + '/*') initial_clusterings = {} clusterings = {} clusterings_models = {} for cluster_dir in cluster_directories: try: clustering_id = cluster_dir.split('/')[-1:][0] # read initial clusters initial_clusters = {} filepath = '/'.join(cluster_dir.split('/') + ['init_assignments.txt']) lines = (open(filepath, 'r').read().splitlines()) l = 0 while l < len(lines): cluster_name = lines[l] cluster_members = lines[l + 1].split('\t') initial_clusters[cluster_name] = cluster_members l += 4 initial_clusterings[clustering_id] = initial_clusters # read final clusters clusters = {} filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt']) lines = (open(filepath, 'r').read().splitlines()) l = 0 while l < len(lines): cluster_name = lines[l] cluster_members = lines[l + 1].split('\t') clusters[cluster_name] = cluster_members l += 4 clusterings[clustering_id] = clusters # load models models = {} model_files = glob.glob(cluster_dir + '/*') for model_file in model_files: try: model_id = model_file.split('/')[-1:][0] json = open(model_file).read() models[model_id] = HiddenMarkovModel.from_json(json) print 'model loaded from: ', model_file except: pass clusterings_models[clustering_id] = models except: pass return initial_clusterings, clusterings
def lambda_handler(event, context):
    """AWS Lambda entry point: POS-tag the space-separated sentence found
    in event['body'] with the HMM stored on S3, returned as plain text."""
    # Fetch the serialized model from S3 and rebuild the tagger.
    response = s3.get_object(Bucket='bhargav-ml-trained-models', Key='pos_model.txt')
    raw_model = json.loads(response['Body'].read().decode())
    tagger = HiddenMarkovModel.from_json(raw_model)

    # Split the request body on single spaces and decode the tag sequence.
    tokens = event['body'].split(' ')
    decoded = simplify_decoding(tokens, tagger)

    headers = {'Content-Type': 'text/plain', 'Access-Control-Allow-Origin': '*'}
    return {'statusCode': 200, 'headers': headers, 'body': decoded}
def load(cls, path):
    """Load a saved model.

    Arguments:
        path: path to the saved .pkl file

    Returns:
        The reconstructed HMM wrapper instance.
    """
    with open(path, "rb") as fh:
        saved = pickle.load(fh)
    # 'model' holds the JSON-serialized HMM; every remaining entry is a
    # constructor keyword argument.
    serialized_hmm = saved.pop('model')
    instance = cls(**saved)
    instance.model = HiddenMarkovModel.from_json(serialized_hmm)
    instance.fitted = True
    return instance
def get_vntr_matcher_hmm(self, read_length):
    """Return the HMM used to match reads against this VNTR.

    Tries to load a previously trained HMM for this VNTR and read length;
    if none is stored, builds one, caches it to disk, and returns it.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logging.info('Using read length %s', read_length)
    copies = self.get_copies_for_hmm(read_length)
    base_name = str(self.reference_vntr.id) + '_' + str(read_length) + '.json'
    stored_hmm_file = settings.TRAINED_HMMS_DIR + base_name
    if settings.USE_TRAINED_HMMS and os.path.isfile(stored_hmm_file):
        # NOTE(review): from_json is given a file *path* here, while other
        # call sites pass a JSON string -- confirm the Model API accepts paths.
        return Model().from_json(stored_hmm_file)
    # No stored model: train one, persist it for later runs, and return it.
    flanking_region_size = read_length
    vntr_matcher = self.build_vntr_matcher_hmm(copies, flanking_region_size)
    with open(stored_hmm_file, 'w') as outfile:
        outfile.write(vntr_matcher.to_json())
    return vntr_matcher
subseqs.append(seq[cut[0]:cut[1]]) return subseqs def predict_path(model, seq): logp, path = model.viterbi(seq) return [p[1].name for p in path] with open('coding_model_base_poly.json') as base_model_file: coding_model_json = base_model_file.read() with open('utr_model_base.json') as promoter_model_file: promoter_utr_model_json = promoter_model_file.read() coding_model = HiddenMarkovModel.from_json(coding_model_json) promoter_utr_model = HiddenMarkovModel.from_json(promoter_utr_model_json) def predict_all_old(seq, string): path_names = predict_path(coding_model, seq) print([(string[i + 1], name, i - len(path_names) + 1) for i, name in enumerate(path_names) if i + 1 < len(string)]) starts = find_gene_cut_index(path_names, ['start zone7']) ends = find_gene_cut_index( path_names, ['stop zone taa9', 'stop zone tag9', 'stop zone tga9']) ext_subseq = find_intercoding_region(starts, ends, seq)
from nltk.stem import WordNetLemmatizer, SnowballStemmer snowball_stemmer = SnowballStemmer("english") porter_stemmer = PorterStemmer() # Import LDA corpus data lda_data = pd.read_csv('abcnews-date-text.csv') lda_data_text = lda_data[:300000][['headline_text']] # We only need the Headlines text column from the data lda_data_text['index'] = lda_data_text.index documents = lda_data_text # Import HMM Tagger corpus data and model hmm_data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8) hmm_model = HiddenMarkovModel.from_json( r'C:\Users\Marco\Desktop\Gits\eg-texttools\hmm-models\model.json') # Output dictionary. To export data to visualizer. output_dict = {} def lemmatize_stemming(text): return snowball_stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) # Tokenize and lemmatize def preprocess(text): result = [] for token in gensim.utils.simple_preprocess(text): if token not in gensim.parsing.preprocessing.STOPWORDS and len( token) > 3:
def filter_repeat(genes):
    """Return genes with duplicate 'name' entries removed.

    Keeps the first occurrence of each name and preserves input order.
    """
    seen_names = set()  # set membership is O(1) vs O(n) for a list
    filtered = []
    for gene in genes:
        if gene['name'] not in seen_names:
            filtered.append(gene)
            seen_names.add(gene['name'])
    return filtered


if __name__ == '__main__':
    # Load the base coding-region HMM.
    with open('coding_model_base.json') as base_model_file:
        model_json = base_model_file.read()
    hmmodel = HiddenMarkovModel.from_json(model_json)
    genes = extract(folder_path='/run/media/jose/BE96A68C96A6452D/Asi/Data/',
                    lookfor='CDS', before=500, after=30)
    # States considered valid in a predicted path.
    # NOTE(review): 'start one15' looks like a typo for 'start zone15' --
    # confirm against the model's state names before changing it.
    valid_st = ['start zone8', 'start zone9', 'start zone10', 'start zone11',
                'start zone12', 'start zone13', 'start zone14', 'start one15',
                'start zone16',
                'stop zone0', 'stop zone1', 'stop zone2', 'stop zone3',
                'stop zone4', 'stop zone5',
                'coding',
                'acceptor016', 'acceptor017', 'acceptor018',
                'acceptor116', 'acceptor117', 'acceptor118', 'acceptor119',
                'acceptor120',
                'acceptor216', 'acceptor217', 'acceptor218', 'acceptor219',
                'donor00', 'donor01', 'donor02',
                'donor10', 'donor11', 'donor12', 'donor13',
                'donor20', 'donor21', 'donor22', 'donor23', 'donor24',
                ]
    unique_genes = filter_repeat(genes)
    predicted = map(test(hmmodel, valid_st), unique_genes)
def load_model(self, path, **kwargs):
    """Read a JSON-serialized HMM from `path` and attach it as self.model."""
    with open(path, 'r') as model_file:
        serialized = json.load(model_file)
    self.model = HiddenMarkovModel.from_json(serialized)
def __init__(self, n_trials=3, leave_one_out=1):
    """Variable initialization.

    Reads ROS parameters, selects the decoding algorithm, sets up the
    HMM state machinery, initializes the ROS node, and either loads a
    previously trained HMM (plus per-state Gaussian distributions) for
    the configured patient or trains one from local data.
    """
    self.patient = rospy.get_param("gait_phase_det/patient")
    self.verbose = rospy.get_param("gait_phase_det/verbose")
    self.n_trials = n_trials
    self.n_features = 2  # Raw data and 1st-derivative
    self.leave_one_out = leave_one_out
    self.rec_data = 0.0  # Number of recorded IMU data
    self.proc_data = 0.0  # Number of extracted features
    self.win_size = 3
    self.raw_win = [None] * self.win_size
    # self.fder_win = [0] * self.win_size
    self.ff = [[] for x in range(self.n_trials)]  # Training and test dataset
    self.labels = [[] for x in range(self.n_trials)]  # Reference labels from local data
    self.first_eval = True
    self.model_loaded = False
    algorithm = rospy.get_param("gait_phase_det/algorithm")
    rospy.loginfo('Decoding algorithm: {}'.format(algorithm))
    if algorithm not in DECODER_ALGORITHMS:
        raise ValueError("Unknown decoder {!r}".format(algorithm))
    # Dispatch tables: pick decode routine and IMU callback per algorithm.
    self.decode = {
        "fov": self._run_fov,
        "bvsw": self._run_bvsw
    }[algorithm]
    self.imu_callback = {
        "fov": self._fov_callback,
        "bvsw": self._bvsw_callback
    }[algorithm]
    """HMM variables"""
    ''' State list: s1: Heel Strike (HS) s2: Flat Foot (FF) s3: Heel Off (HO) s4: Swing Phase (SP)'''
    self.model_name = "Gait"
    self.has_model = False
    self.must_train = False
    self.states = ['s1', 's2', 's3', 's4']
    self.n_states = len(self.states)
    self.state2phase = {"s1": "hs", "s2": "ff", "s3": "ho", "s4": "sp"}
    self.train_data = []
    self.mgds = {}
    self.dis_means = [[] for x in range(self.n_states)]
    self.dis_covars = [[] for x in range(self.n_states)]
    # Uniform initial state distribution.
    self.start_prob = [1.0/self.n_states]*self.n_states
    self.trans_mat = np.array([(0.9, 0.1, 0, 0),
                               (0, 0.9, 0.1, 0),
                               (0, 0, 0.9, 0.1),
                               (0.1, 0, 0, 0.9)])  # Left-right model
    # self.trans_mat = np.array([0.8, 0.1, 0, 0.1], [0.1, 0.8, 0.1, 0], [0, 0.1, 0.8, 0.1], [0.1, 0, 0.1, 0.8])  # Left-right-left model
    self.log_startprob = []
    self.log_transmat = np.empty((self.n_states, self.n_states))
    self.max_win_len = 11  # ms (120 ms: mean IC duration for healthy subjects walking at comfortable speed)
    self.viterbi_path = np.empty((self.max_win_len+1, self.n_states))
    self.backtrack = [[None for x in range(self.n_states)] for y in range(self.max_win_len+1)]
    self.global_path = []
    self.work_buffer = np.empty(self.n_states)
    self.boundary = 1
    self.buff_len = 0
    # Map state name -> index for indexing the probability matrices.
    self.states_pos = {}
    for i in range(len(self.states)):
        self.states_pos[self.states[i]] = i
    self.last_state = -1
    self.curr_state = -1
    self.conv_point = 0
    self.conv_found = False
    self.smp_freq = 100.0  # Hz
    self.fp_thresh = 1/self.smp_freq*4  # Threshold corresponds to 8 samples
    self.time_passed = 0.0
    self.obs = [[None for x in range(self.n_features)] for y in range(self.max_win_len)]
    self.model = HMM(name=self.model_name)
    """ROS init"""
    rospy.init_node('real_time_HMM', anonymous=True)
    rospack = rospkg.RosPack()
    self.packpath = rospack.get_path('hmm_gait_phase_classifier')
    self.init_subs()
    self.init_pubs()
    """HMM-training (if no model exists)"""
    try:
        '''HMM-model loading'''
        with open(self.packpath+'/log/HMM_models/'+self.patient+'.txt') as infile:
            json_model = json.load(infile)
            self.model = HMM.from_json(json_model)
        rospy.logwarn(self.patient + "'s HMM model was loaded.")
        self.has_model = True
    except IOError:
        # No stored model -- decide which local data source to train from.
        if os.path.isfile(self.packpath + "/log/mat_files/" + self.patient + "_proc_data1.mat"):
            """Training with data collected with FSR-based reference system"""
            self.data_ext = 'mat'
            self.must_train = True
        elif os.path.isfile(self.packpath + "/log/IMU_data/" + self.patient + "_labels.csv"):
            """Training with data collected with offline threshold-based gait phase detection method"""
            self.data_ext = 'csv'
            self.must_train = True
        else:
            rospy.logerr("Please collect data for training ({})!".format(self.patient))
    if self.must_train:
        rospy.logwarn("HMM model not trained yet for {}!".format(self.patient))
        rospy.logwarn("Training HMM with local data...")
        self.load_data()
        self.init_hmm()
        self.train_hmm()
        self.has_model = True
    if self.has_model:
        try:
            '''MGDs loading if model exists'''
            for st in self.states:
                with open(self.packpath+'/log/HMM_models/'+self.patient+'_'+self.state2phase[st]+'.txt') as infile:
                    yaml_dis = yaml.safe_load(infile)
                    dis = MGD.from_yaml(yaml_dis)
                    self.mgds[st] = dis
                rospy.logwarn(self.patient + "'s " + self.state2phase[st] + " MGC was loaded.")
                '''Loading means and covariance matrix'''
                self.dis_means[self.states_pos[st]] = self.mgds[st].parameters[0]
                self.dis_covars[self.states_pos[st]] = self.mgds[st].parameters[1]
        except yaml.YAMLError as exc:
            # NOTE(review): concatenating the exception object to a str will
            # raise TypeError if this branch fires -- should be str(exc).
            rospy.logwarn("Not able to load distributions: " + exc)
        """Transition and initial (log) probabilities matrices upon training"""
        trans_mat = self.model.dense_transition_matrix()[:self.n_states, :self.n_states]
        if self.verbose:
            print '**TRANSITION MATRIX (post-training)**\n' + str(trans_mat)
        for i in range(self.n_states):
            self.log_startprob.append(ln(self.start_prob[i]))
            for j in range(self.n_states):
                self.log_transmat[i, j] = ln(trans_mat[i][j])
        self.model_loaded = True
def run(cluster_directory_root, depth, plottype):
    """Load clusterings and their HMMs from a directory tree, build random
    clusterings of the same sizes, score them with Dunn and Davies-Bouldin
    indices, optionally plot heat maps, and return the loaded clusterings.

    plottype: 'none', 'kn_grid' (k-by-n grid heat maps) or 'row'.
    """
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])
    labels = data.index.values
    # Signed copies of every row: '+' and '-' suffixed labels share the data.
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])
    # Glob pattern 'root/*/.../*' with `depth` wildcard levels.
    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = \
        glob.glob(generic_dir)
    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        # NOTE(review): the bare excepts below silently skip directories and
        # files that do not match the expected layout.
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read final clusters (4-line records: name, tab-separated members)
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters
            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass
    # Background = union of all cluster members across clusterings.
    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))
    background = list(background)
    # data = data.loc[background, :]
    # generate random clusterings of the same size k as our models
    random_clusterings = {}
    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters
    # run dunn and davies_bouldin for clusterings and random permutations
    rand_dunn = report_dunn(random_clusterings, clusterings_models, data)
    savename = cluster_directory_root + 'dunn_index_random'
    dump(rand_dunn, open(savename, 'w'))
    rand_davies = report_davies_bouldin(random_clusterings,
                                        clusterings_models, data)
    savename = cluster_directory_root + 'davies_bouldin_index_random'
    dump(rand_davies, open(savename, 'w'))
    if plottype == 'none':
        pass
    elif plottype == 'kn_grid':
        # One heat-map cell per (k, n) parsed from ids like '<m>k<k>n<n>'.
        rand_dunn_df = pd.DataFrame()
        rand_davies_df = pd.DataFrame()
        for clustering_id, clustering in clusterings.iteritems():
            cid = clustering_id.replace('k', '_'). \
                replace('n', '_').split('_')
            m = cid[0]
            k = int(cid[1])
            n = int(cid[2])
            rand_dunn_df.loc[k, n] = rand_dunn[clustering_id]
            rand_davies_df.loc[k, n] = rand_davies[clustering_id]
        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)
        rand_dunn_df = rand_dunn_df.sort_index().sort_index(1)
        rand_davies_df = rand_davies_df.sort_index().sort_index(1)
        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix(), rand_dunn_df.index.values,
                rand_dunn_df.columns.values, title=title, odir=odir)
        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix(), rand_davies_df.index.values,
                rand_davies_df.columns.values, title=title, odir=odir)
    elif plottype == 'row':
        # Single-column heat maps keyed by clustering id.
        rand_dunn_df = pd.Series()
        rand_davies_df = pd.Series()
        for clustering_id, clustering in clusterings.iteritems():
            rand_dunn_df.loc[clustering_id] = rand_dunn[clustering_id]
            rand_davies_df.loc[clustering_id] = rand_davies[clustering_id]
        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)
        rand_dunn_df = rand_dunn_df.sort_index()
        rand_davies_df = rand_davies_df.sort_index()
        odir = cluster_directory_root
        title = 'RANDOM' + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix().reshape(-1, 1),
                rand_dunn_df.index.values, [' '], title=title, odir=odir,
                cmin=0, cmax=.5)
        odir = cluster_directory_root
        title = 'RANDOM' + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix().reshape(-1, 1),
                rand_davies_df.index.values, [' '], title=title, odir=odir,
                cmin=5, cmax=10)
    return clusterings, clusterings_models
from pomegranate import HiddenMarkovModel from converter_to import converter_to import numpy with open('coding_model_base.json') as base_model_file: model_json = base_model_file.read() promoter_utr_model = HiddenMarkovModel.from_json(model_json) string = 'ggccctggtgtgtgatgttccccgccctgtgtccaagtgttctcattgttcagttcccacccatgagtgagaacatgctcgcaccgccgcttctaaatgttttaaaaacaaagacaccaatgcccttcattggggaaatgaaagacttttaagtaaaacgattttgagtgaaataatatttgttgttttaaaaagttaatattaaccactctccatcatatattgaaattaacttaagatgtgaaagttaaattagaaaccttgtaaaggaaaaataggaaatagtttcatgaacttgacacaggaaaatatttcttagactagatactgtagcactcaccacaataagaaatcaagcgaattgcacttcatttttaaaaagcttctccttattatgttgttgtttaacaacttaaacgctatctctagaccaggaataattatttgctatataatacagcaaaaaatatgtatgtataaatggactcattcaaaatatataaagaactcctattacaaagaaattgacaaacagcccagtatatcaatgaatataaaaatttgagaagatattttccataagaagatatctaaatgaacattaggcatgagaaaaccaaattttaggatatcactacacacctggtgtagtttaaaagactgaaaatattaagtgtgtgggaatgtagagcaactggaaatggcctacatctttcatagaaatgtaaaacaatacaaatactttgcaaaactctgtccaacattttctacccattcaccaagcaactccatccctagctatagatacccaggaaaataagtatgtatcttcacagaaataattgtatgagaatattcatagttacttatgcacagtagttaccaagtaaacctgtctcccatcagaaaaatggatatcaaattgtgtgataatcatacaatcaataggatattacttggccaaaacaaaatgaaacaagggaaaaacacaatcaaacaaattagtggcatatatacccacctgagtaaagagaagtcggccgggtgcggtggctcatgcctgtaatcccagcactttgggaggccgaggcggcagatcacgaggtcaggagatcgagaccatcctggctaacacagtgaaaccccgtctctactaaaaatacaaaaaaaaaaagaaaaagaaaaattagccgggcgtggtggcgggcgcctgtattcccaactacttgggagtctgaggcaggagaatagcgtgaacctgggaggcagagcttgcagtgagcctagatcgcgtcactgcactccagcctgggcgacagagtgagactctgtctcaaacaaaaaaaaaaaaaaaaaagtcaaaacaagagaacatactaaatgattctatttttttatttatgatttcatgactaccattaagaaaatataacctgttgggaaactgtttctgccttgatgatgttgtacagacaagagataaacagtgaggaatatgcttagatgtattgggaaagacacgggtctgtggcattgtcacaagggtacacgaatactgagagtgaatgctgaaggaatgatccccattggtggtgaccctcaggtgagactagggtgcctgtgtttcagcaaagcctgggcaattggaatgcagggctcctaagattccatgacacccccaccttctaattctgttttgcaactgcagacggttacctggcacgctggccacaatctacctcac
tcttatcagagtctgcgctactgacagtgctttcagctctgagttgaggcacctcgaaccttgtttttgtggtgaaggatcctaaagtgctgtggggagtgatcacatttttgacaacagtaagttaagaatttcagttacttacatccctcagtcctgattaaacctatttgatttcaccagtttttaacccatcatatgtttgggtttcttctccccagtccctgactccacctcttctgccacaaacgtcagcatggtggtatcagccggccctttgtccagcgagaaggcagagatgaacattctagaaatcaatgagaaattgcgcccccagttggcagagaagaaacagcagttcagaaacctcaaagagaaatgttttctaactcaactggccggcttcctggccaaccgacagaagaaatacagtaagatctataggctcaccgtcatgaaagtgatgaatgatgtcctgtcttctctctgagacactaaatgctctctccatcaaaaataatttcatccttcctgtacttctaggaaaacagaaatgggtattttaacattttgttaaagttggaagacagaggtaccaaagtatttagcaactttccatgtttgcaatcaggtgggggtgggactagagttaaactgccatttattgatttctgacacaggcacagaatgacctgttttctccaagaggctcaatcatgttttcaagaatcctctctgtaccatataagatcctgcagacaaataacatctagtctgttgttctaaatgtctgagactagtgaacttttattcagttcaagtttctgtggaggcccaacaggcaaagctctgttctagtgactctgagggaaacttggtgatagtagccagtacctgctctgaggggcttcaagaggagtctactcctaatagaacctgtgctgtctataagtgacagcatcaagagcagggagtaggggccgtgcatggtggctcactcctgtaatcccagcactttgggaggctgaggcgggcagatcatgaggtcaggagtttgagaccagcctgggcaacatggagaaaccccatctccactaaaaatacaaaaagtagatgggcgtggtggcaggtgactgtaatcacccctgcctaggaggctgaggcaggagaatcctttgaacccaggaggctgaggttgcagtgagccaagattttgccattgcactccagcctgggcgacagggcaagactgttaaaaaaaaaaataataataatgataaataaaaataagaataaaaagcagagagtagcttggtgagagtgaagtcctgcttcctggggcacagagtcttgttgctaaagaggaagaaagatcgcatccgagaatgtgtggagatagcagtgcagtgtacagagcagggactgtgggcctgtctactgggctccatccaagttgcttgtcttgtctgtccctcagtttcctcacctgttcagagggtactacaataatacctacctctgtaaattgctgcagtgaattacatgagctatttcttgtcaatctcctagaacatttattggcacacagtaaacactatctattagttgttcattctgctgtttctaaattaacacaaactttattagcatttgggcatatttccttcatggccttatggtgttatgtgtcactctttatgcttcagatatgattcttaaaatcataactgaagatatgatttaaaaatcaaagattttaaaaatctttcacatacttgtccttgaaattcccagtaaaagggaaaccatcagtcccatagtcctaggggccttcccgactgtacgagaaatcactacttcatgccccagtgcagtgttttagaggagaggctgcaaggcttgggaaagtggccccgcattcagagtcagacctcagggactgtgaattctgactccacttcgttgtggttgaatcatcttgtcaacttccttgatgtgcc
cttgaggttctctttcttcatctctaaattttggaggatcagatgccagaaagtcaggagactgaagagtaaagatgtggaaatccctgtctagaccctggtactggggagagttttgtccttgggatggacctggctcctgtcctgtaggcaatgaccacagcagcatgtccagccttccactgaggcaggcgtgtctgtcttttctcagaatatgaagagtgcaaagatctcataaaatttatgctgaggaatgagcgacagttcaaggaggagaagcttgcagagcagctcaagcaagctgaggagctcaggtgaggggaccccatgggggcaggcaggggtcaggtgtgtaaatctctgaagtacagcagctcggtggggagatgtaagagctaagctgggccaggggaagggcaggaattgccatggcaggctcgctacacacaaatatttatcaaacagagaagaaggataataaaaatgtatgggttgcagttgtttctcagagccttgttttctctttttcaaacaagtaattgttgatgtgaaatttacataacacaaaattaaccaaaggagtgtgaaccacacagcagcattcagtatactcaaaatggtgtgccatcaccaccccacttacccttagtgagaatcacctcctgactgactgcggcttctcattctttcactcaatcaatgttgccttctcgaccctgtcattcttttcttctttcgtcttttcaattcgccccatctgcacctggcctcatttctgtacatggctttgtatctagtggccgcaagatgcactatgtgtattttcacatggaaatgtccatggccagagtgaggaactgaaaggatgtctttttgaaacggaattaggaagacacctacttttgtttacagaagagaaagatgaatggaacatcatcgaggatcttgcaggagccctctctgatacagaggaagcctgtaaaccattttctattctttctcttggccacagacattcctttaaacatgtgctgaccttctgcttcgaggtctccttgaggacattgtctcagaaatctctgttgcaatatttgagcggatcactcaaccctttccactcttaaattttctctaccgtctcaccttaggcaatataaagtcctggttcacgctcaggaacgagagctgacccagttaagggagaagttacgggaagggagagatgcctcccgctcattgaatgagcatctccaggccctcctcactccggatgagccggacaagtcccaggggcaggacctccaagaacagctggctgaggggtgtagactgacacagcaccttgtccaaaagctcagcccaggtaaggtggccataggccctgatgacccaaaatcccaggcttatgagagactccagacctccatactttcacaatgacagttgtatcaatggtgtttttttccactaagcttatgtggccatgacatgaccaggacttcttgggtaagagcggagatgggaaacccatggggttggaggtcacagtattgcaagtgtccctcctcccttgatggaaggtggtctttggagtatgaggcagcatctgtctagttttaaaggacaggaaggaggctgcgatgggagcaggcttgttagagtgaaaagagctctggactaagaatgaagattcccaggctgtcttttcggcaatgttcttagtaactgtcagagagtgaatgacttgtccttcctgaatttctctctctccgtggcagacaaattgtctcttgcaagggtctgaagcattcaaatgtgggaacacttacaactgctttccaaaatgagatgaaggccctcgccgtgtgatgttggagaaggcactttatgtgggggcgttttgtggtaggaagtgcttcagactggagcactccccatggatagaatgtccctgaataacacagcagaagccacttggaggcttgaaatcttctgatgca
tagaggactgtgggacaagtttgtctgcttctaagagaaagaattaggtttgaaatgcaaactgtgacgggacaccaagcctgtgcctgggaatcagatctggcaggatgggggacacagctgccaatgtccagagagaggctgcacaagcctccagtgatatgggaagcaaaaggtcttttcaatatttggccacatcttgatggtggccctccagatcagaaatgcattgcctgatggatcaggaaaccatgccagggcattctgttaaagataaaacatgagagttttcagttgaacggtgacccatgcctagatgttcatgtctctgttgcacattgggctgactgtgcttgcagactgtgaagtgggaaatatctgaacgaacacttctgtatttacagaaaatgacaacgatgacgatgaagatgttcaagttgaggtggctgagaaagtgcagaaatcgtctgcccccaggtaacactgaatactcaggaacaattaatggatggtaacatatgaggaatatctaggaggcacaccctctctggcatctatgatgggccaaaaacccgcattcgcttggccacagtatgtgaaatataacccagcttagacacagggtgcggcagctgtcatgtttctctgtgtgtgccgagtgtcatgtctgcaccgtacagggatagctgagtcttcatcctcctcagctcctatctgtccagtgcaatgaacagcagctgctctcttcctctctggttcccatggcagccatgctctgttgcagagagaacaggattgcatgttccctcttaatgggaacgtccattttgctttctgggaccactctcttaatgccgcctgtcaaaaccagctaggactccctggggtccaatccctctgtgtttaatcttctgtcatctctgtcccacctggctcatcagggagatgcagaaggctgaagaaaaggaagtccctgaggactcactggaggaatgtgccatcacttgttcaaatagccatggcccttatgactccaaccagccacataagaaaaccaaaatcacatttgaggaagacaaagtcgactcaactctcattggctcatcctctcatgttgaatgggaggatgctgtacacattattccaggtagcctctgttttccttgtgtctcgtacctctctctaggctgaggaagataaactctgaagacaggctctatcaacacaaattcatttgaataaaaaactgtgatgggtttctaaacagatatcagggagtttttttgtccttctcagctaatgtcatgcctttgtctgccagtccccagtatcaagttactcaaccccaggcaagtgtgacaatctcatagtcacctgagtgcaggaggtgcacaggccatatctgtcaggcctcctagcttcgattcagtatttcttgtcatctgtgattaagtcatctgtccctgaacaatgcccatggagtttctatgcctgtttaaggaagctggcagccttgcctttgtatttggaaatatcgttccccaggcttcactgctctcagctttcatctggatctcctttaagtcagcttgcttagctgcacagtcaccctgaaatcaggacggaaacttttcttctttactttgctgatatatttccataaagcaaggctggaccctggttctccaccctgtcaatgcaatggctgatccaatgtttctttgtagcatcgtggattttttttttttttttttttttgcgatggagtcttgctctgtcacccaggctggagtgcagtggcaccatcttggcttggtgcaacctctccctcccagattcaagtgattctcctgcctcagcctcctgagttgctgggaccacaggtgcacaacatcacatctggctaatttttgtatttttagtagagacagggtttccccatattggccagggtagtcctgaactcatgacctcaaatgattcacctgtcttggcctccc
aaatcacagattctttttaaagcaagagttgttcaaatttatctatcagtcgtgtttcatgtatagatgcctctaaacatttaatgtccatgttacctggtgatataagtccgtattgcagcaacactcttagaaaattgtttgaccaatttttggagatttttttggggaaaaaattttgtttaactttgactcaggcagggaatatggcattatggtctacacgtagagggagattttggcctgtgggtctggaaagcagggtcatctaattctcaccaaagttaatctaggacaccctagaatattcctgtcagaatccttattcttgcactgagaatagttatgtccttgtgctatgactggacagtgatttgttcctatgtgaagtatgaattgcttaatgtgacctgcttctctgaatttatttacagaaaatgaaagtgatgatgaggaagaggaagaaaaagggccagtgtctcccaggtaatgttgtggaattgttggctgttaattcagtagtgacatctggagattgtagatttagggaaaatgaggaagtgatgaatagaactatttcttccattcacccagctacaaattgtgctgatttacaatgttgtatgttatttgtggcacttgtattggttttaatttcatagtcctctcaagataggaacttgccatcagatgagccaggtgaactagccaaacagggttttcttgttgatcttttcaaaaaaccagccctggattcattgattttttgaagggttttttgtgtctctatctcctttagttctgctctgatcttagttacttcttgtcttctgctagcttttgaatttgtttgctttgcttctctcgttattttaattgtgatgttaggatgtcaattttagatcttttctgctttctcttgtgggcatttagtgctataattttccctctacacattgctttaaatgtgtctcagagattctggtatgttgtgtctttgttctcattggtttcaaagaacatctttatttctgccttcattttgttattttcccagtagtcattcaggagcaggttgttgagtttccatgtagttgtgcggttttgagtgagtttcttaatcctgggttctaatttgatggcactgtggtctgacagtttgttgtgatttccattcttttacatttgctgacgagtgctttacctccaactatgtggtcaattttggaataagtgtgatgtggtgctgagaagaatgtatattctgttgatttggggtggagagttctgtagatgtcttttaggtctgcttggtggagagctgagttcaagtcctggatatccttgttaagcttctgtctcattgatctgtctaatattgacagtggggtgttaaagtctcccattatgattgtgtggagtctaaatctctttgtaggtctctcagacttgctttatgaatctgggtgctcctgtatagggtgcatatatatttaggatagttaactcttgttgaattgatccctttaccattatgtagtggccttctttgtctcttttgatctttgttggtttaaagtctgttttatcagagactaggattgcaacccctgcctttttttgttttccatttgcttggtagatcttcctccatccctttattttgagcctatgtgtgtctctgcatgtgagatgggtttcctgagtacagcacactgatgggtcttgactctttatccaatttgccattctgtgttttttaactggggcatttagcccatttacatttaaggttaatatcgttatgtgtgaatttgatcctgtcattatgatattagctggttatttcgcccgttagttgatgcagtttcttcctagcgtcaatggtctttacagtttggcatgtttttgtagtggctggtaccggttgttcctttccatgtttagtgcttcctttaggagctcttgtaaggcaggcctggtggtgacaaaatctctca
gcatttgcttctctgtaaaggatttatttctccttcacttatgaagctttgtttggctggatatgaaattctgggttgaaaattcttttctttaagaaggttgaagatgctggagaggatgtggagaaataggaacacttttacactgttggtgggactgtaaactagttcaatgattgtggaaggcagtgtggcaattcctcagggatctagaactagaaatactatttgacccagccatcccattactgggtgtgtacccaaatgattataaatcatgctgctgtaaagacacatgcacacatatgtttattgtggcactattcacaatagcaaagacttggaaccaagccaaatatccagcaatgatagactggattaagaaaatgtggcacatatacaccatggaatactatgcagctataaaaaatgatgagttcatgtcctttgtaggggcatggatgaagctggaaaccatcattctcagcaaactattgcaaggacaaaaaaccaaataccgcatgttcttactcacaggtgggaattgaacaatgagaacacatggacacagaaaggggaacatcacacactggggcctgttgtagggtggggggagggaggaggggtagcattaggagatatacctaatgttaaatgatgagttaatgggtgaagcacaccaatgtggacatgtatacatatgtaactaacctgcacgttgtgcacatgtaccctaagacttaaagtattaaaatatatatatctgtatatatatatatacatacacacaaaaaataataaaggaaaactatacatatggaaaaaaaaagaatgttgaatattgctcccactctcttctggcttgtagggtttgtgccaagagatctgctgctagtctgatgggcttccctttgtgggtaatccgacctttctctctggctgcccttagcattttttccttcatttcaaccttggtgaatctgacaattaagtgttttggggttgctcttctcgaggagtatctttatggtgttctctgtgtttcctgaatttgaatgttggccttccttgctaggttggggaagtcctcctggataatatcctgaagaatgtttcccagcttggttccattctccccgtcactttcagtacaccaatcaaacgtagatttggtctttccacatagtcccatatttattggaggcttgttcatttctttttactcttttttctctaaacttctcttctcgcttcatttcattaatttgatcttgaatcactgataccatttcttgcacttgatcgaattggctactgaagcttgtgcatgcaccacgtagttctcgtgccatggttttcagctccatcaggtcatttaaggtcttctctacactgttcattctggttagccattcgtctaatcttttttcctttagctcagagaagtttgttattaccgactttctgaagcctacttctgtcagctcatcaaagtcattctccatcctgctttgttccattgctggcgaggagctgcgatcctttggaggagaagggatgtcaggtttttggaattttcagcttttgtgctctggtttctccccacctttgtggttttatctacccttggtctttgatgatggtgacctacagatggggttttggggtggatgtcttttttgttgatgttgatgctattcctttctgtgtgttagttttccttctaacagtcaggtccctcagcttcaggtctgttggagtttgctggaagtccactccagaccctcaaacagggatttctttgtgttgcctattctctcccatgtgtttaaatccagggagaggtgtatacatgctttcttcctatttgttggtagtatgttggctagtatttttgcaagaaaagaaattgaaaaagtaaatatattatatcaaaatattgggaaaatggggcccttaatacacaagatctgtgtctgcactgcgtcaagaactctctt
cacttgaatgctgcatgtaaaattcaacccaatttatgcaaagtagttgaagccctgtgtcagttctctgtgctgcaagtcatgatggtagtttacagggagagtctgggtgccctgagttggctcatctgtggcaaatgtactgagcacatgctgcccatttttgctgtgtccccagagcagtcaccctccaccctgtatttagaaggatagttttatttctcttgaaggaaaaatgcctttggtttctgtgaccactccattctgtctcccatcagatcatctgggaggttttgttgtctaatgtctgttggttaaatcttctatcatccctgtcctgcctggctcatcaggaatctgcaggagtctgaagaggaggaagtcccccaggagtcctgggatgaaggttattcgactctctcaattcctcctgaaatgttggcctcgtaccagtcttacagcagcacatttcactcattagaggaacagcaagtctgcatggctgttgacataggcagtgagtactccattgtgaaggtgataaagctccagttcatggcccaggtagaccccataatctttgggccttgtgccccttgttgggctgagatttgccatcaccgtgggctgaacctatatatcaatgtagatttcaatcactctggagtcgagtctgaagcacaggcatggggtgggtcagtgagctttgctctcttcctagtctcaggccatgcccgtgccaacctggactgactgtcacgacattgaactcaaggcaggtgtggcaaactcacaccaaactatgcagcacatgcccaggagctgtctgtcagctcagctcatctgaattaaatgtctcttgccagctacaaaattccttatgagttttgttcccaaagcatgtctgtgtggttctttacctgccgaaggccagtgtcacccttgtctacctctcagtgaaagatgtgacccaggtttcactgaatttattcccattttctgtgtcttctaagttcgcttgttttagctcatctgtccatcatgttcctggtacgttttctagataaacagctgacttttcacccacaaaagccataatagctgatgcttctgtgtagaaccaagtttcattttgactcaagagctggtacattgcaccccttcatcaaatctctgtgtccacaatctcataaactatcaaattctgggtatttaatgagagaaagcttaatattgaagtatctctcctatgaggtgttagaactatttgcctacaatttattggggaaaaaattgctcatttgtgtacataaacctaggacagagcacatagggaagataacattccaaaacaggggaattttgcccaaggctcatgaaagaacccaagccagttttctcaagacttgacctcaggcctactggaatatttctctcaaagtctcctgttctcacactgacaagactgatgtccctgtgttaggattggacagaggaatgtttctgtgtgcaaggaagaactgcttaatgtaagagggcccatctgaatttatttgcaggacatcggtgggatcaagtgaaaaaggaggaccaagaggcaacaggtcccaggtgagtctgagaaattgtggagagttaatttgatgttgacacctggagatgccaagtccagggaaaacagtacatgctgaaaataatgattttgtcttgtcagacaagtctgaattatgcctactacattgctttttggttctcattagagtaaatgtttaggtttccatttcttcctacacttatcatttactaacctagtgaaggttgaccatacctcaaaagctgtattctcatggtaactgcagggaaacttgagcacattttatgcaaaattattgaggacatgcttttcatgatcactgttcactgtgtgtcctgagagcacaaatacagagtgtcctttgattccctcatcagtgtgtcacctgaccaattcactgagctcg
ctctgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtctttctctttcatccttttctacctggccctagtctatcccaacataaaggcaataatttgttacctcattaatggatctgtcctttttcttttcaaactcttccttatgttagccatgaaatctagctgggactgtgtggtttctgattccccctggcttattctttactttttcccacttttccaggctcagcagggagctgctggatgagaaagagcctgaagtcttgcaggactcactggatagatgttattcaactccttcaggttgtcttgaactgactgactcatgccagccctacagaagtgccttttacgtattggagcaacagcgtgttggcttggctattgacatggatggtgagtacctttctatgaaggtgataaggatccactgagtcttctggttagggtcatattcctactgcaagtggcccttactgagctgagagatgtcattgccacagggaggacctataggcacatgtaggttgaatgaaactctagttccacttggaagcccagacaagggatgggtcagtgagcaaggctctcttcctagtctcaggccatgcctgtggcgccctaatcctactctcatgacgttggacctgggcagatgtgacaaattcacacaactctgattttgtctcaattttgtagatcttgtagatttcatccttcactctaatttcagcgtctaaaatccttgctaccatgaacaatctgagtatttgatgagacagggctgaatagtgcagtttttctcctagcaaccatttgggggcatttgctttaaatcgattggaaaaatatggcataaccatttgcacaaacttgggacaaatgatattgggataacgatctaccagaatagggaattttacccacagtttctgggacaaaaaccaaggaatctctatcgtgatcagccttcaggcctcttgaagaatatctctcacagtgtcctattctcatgctgaggagcctgaagtccctgtgtgaggattagacagtggattgttatgtgtgtaggagaaccagcttaatatgtctgtccatgtctgaacttattgcagaaattgaaaagtaccaagaagtggaagaagaccaagacccatcatgccccaggtaactttgagcaattatggatgcttaattgtgtgttgacacctggagatgccaggtccagggaaaacaagagtgtgttcaatttcatgttttcaacgaaggttgaattactcctcctgacattgctgttggttttcattgcagtagatgtttaggtttccatttcttcctccccttatcatttactaacttactataggttgaccatacctcaaaggctgtatggcaactgcatggaatcttgagcaagtttatggaaaattattgagcccactcttttcatgaccactgttcgctgtgtgtcccgagcgcactaactcagagtgtcctttgaccccttcatcagtgtgtcacccggccaattcgctgagctcactttctcctctctctctctctccctctccctgtctttctctttcattcttttctacctggccctggtctatcccaacataaaggcaataattcattacctcattaatggatccgtcctttttctttttaaacagttccttatgttagccatgaaatctagctggggctgtgtggtttctgattctccctggcttattctttactttttcctacttttccaggctcagcagggagctgctggatgagaaagagcctgaagtcttgcaggactcactggatagatgttattcgactccttcagattatcttgaactgcctgacttaggccagccctacagcagtgctgtttactcattggaggaacagtaccttggcttggctcttgacgtggacagtgagtaccttactgtgaaggtgataagcctccacctggtcttccagataggggtga
tattcctgttccaagtggcccttactgacccgagagatgtcattgccgcaggcaggacctatgggcgcatataggttgtaatgaaactgttgtctcagttggaagcctagacatgaaatgggtcagtgagcaaggctctattcctagtctccagccatgcctgtggcaagctgagcccgctctcagcacattggacccaggcagatgtaaaaaattcacagaactatgatttggactcaagggtttgtagatttcctccttcattctaatttcagtgtctaaaattcttgcatccatgaacgagctgggcatttgatgagacagggctgaatactgcagttttcctcctagaaatcatctggggcattttctttgaactgatgggaacaataaggcataactgtttgcacaaacttgggataaatgattttgggataacgatctaccagaatggggatatttcacccttggttctgagatgcaaaccaaagaatatcatgaccagctttcaggcctcctgaagtatctctctcacattgtcctgttctcatgctgagaagcctgagatccctgtgtggggattagacagtggactgttatgggtgtaggtgaattggcttattttgtctgtccctgtctgaatgtattgcaggaattaaaaaggaccaagaagaggaagaagaccaaggcccaccatgccccaggtaactgagcaattgtgaacagctacttctgtgttgacatctggagactcctggttcagggaaaacagagcgggctgacattatcgattacatcttttccagcaagcctgaattattcctactaacattgctgttggttttcattgcagtagatatttaggtttccatttcttcctccccttatcatttactaacctactgtaggtggaccagacttcaaaaactgtattctcatggcgactgcatggaaacttgagcacattttatggaaaattattgagcacagtcttttcatgatcactgtatgctgtgtgtcctgagggcactaactcagagtgtcctgttactccctcatcagtgtgtcacctggacaattcactgagctcattctctctctctctctctctgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtctatctgtctttctctttcattcttttccatttggccctgttctgtcccaacatgaaggcaataatttgttacctcattaatggatctatccttttacttttttaaccacttccttatgctacccatgaaacctagttggggctctgttgtgtctgatttcccctggcttattctttactttttcctccttttccaggctcagcagggagctgctggaggtagtagagcctgaagtcttgcaggactcactggatagatattattcaactccttccagttgtcttgaacagcctgactcctgccagccctatggaagttccttttatgcattggaggaaaaacatgttggcttttctcttgacgtgggaggtgagtacctttctatgaaggtgataaggatccactgagtcttccatataaagatcatattcctgctccaattggccattactgagctgagagatgtcattgccgcagtgaggacctataggcacatgtaggttgaatgaaactctagttctacctggaagcccagacaagggatgggtcagtgagcaagactctcttcctagtctcaggccatgcctgtggcactctgattctactctcatgacattggacctgggcagatgtgacaaattcagagaactatgattttgactcaagggtttgtagatttcctttttcactctaatttcagtgtctaaagtcctcacaaccatgaacaatctgagtatttgatgagacagggctaaatattgcagtttttctcctagaaatcatttgagggtatttgctttaaattgattggaaaaatatggcataactgtttgcacaaac
ttgggacaaatgttattgggataacgatctactagaatagggacactttacccacagtttttgggagaaaaactgaggaatttatatcatgaccagccttcagacctcctgaaatatatctctcatggtgtcgtattcttatgctgaggagcctgaggtccctgtgtgaggattagacagtggattgttatatgtgtaggggaatcagcttaatgtgtctgtccatgtctgaatttattgcagaaattgaaaagaaggggaaggggaagaaaagaaggggaagaagatcaaagaagaaaagaagaaggggaagaaaagaaggggaagatgacaacccaccatgccccaggtaactttcagcaattgtggatgcttaattctgtgttaacacctggaggcaacagattcagggaaaccagagtgtgtttgatgtcatgttttcaacgaaggctgaattactcctactgtcattgctgttggttttcattgcagtagatgtttaggtttccatttcttcctccccttatcatttactaacgtaccataggttgaccatacttcaaaagctgtactctgatggccactgcatcaaattttgagcatattttatgggaaactattgagctcactctttttgtgatcacagtttgctgtgtgtcatgagggcactaactcagagtgtccttttactcccttaccagtatgtcacctgggcaattcactagctcactttctctctgtctctgtctctgtctctctctctctgtctttctctttcattgttttctacctggccctgttctatcccaacataaaggcaataaattttttttttttacctcattaatgaatctatcctttttcttttctaaccacttccttatattacttctgaaatctagtggggctctgtggtgtctgattttccctggctgcttctttagttttgtctccttttccaggctcaacggcgtgctgatggaagtggaagagcctgaagtcttacaggactcactggatagatgttattcgactcagtcaatgtactttgaactacctgactcattccagcactacagaagtgtgttttactcatttgaggaagagcatatcagcttcgccctttacgtggacaataggttttttactttgacggtgacaagtctccacctggtgttccagatgggagtcatattcccacaataagcagctcttactaagccgagagatgtcatt' string = string.lower().replace('\n', '') print(len(string)) lists = list(string) two = converter_to(lists, 2) print(two) seq = numpy.array(two, numpy.unicode_) logp, path = promoter_utr_model.viterbi(seq) path_names = [p[1].name for p in path] count = 0 print([(string[i + 1], name, i - len(path_names) + 1) for i, name in enumerate(path_names) if i + 1 < len(string)])
def prep(cluster_directory_root, depth, genefile):
    """Load expression data plus saved clusterings/models and build random controls.

    Parameters:
        cluster_directory_root: root directory of clustering results.
        depth: number of '*' path levels below the root to glob for
            clustering directories.
        genefile: path to a gene-list file understood by load().

    Returns:
        (clusterings, random_clusterings, random_signed_clusterings,
         clusterings_models, data, original_labels)
    """
    # Load expression data and restrict the gene-expression table to the
    # requested genes.
    gc, mt, track = load_data(None, 0)
    with open(genefile, 'r') as gene_fh:  # close the handle (was leaked)
        genes = load(gene_fh)
    gc.data = gc.data.loc[genes, :]
    data = pd.concat([gc.data, mt.data])

    labels = data.index.values
    original_labels = labels
    # Append signed copies of every row: '<label>+' keeps the values,
    # '<label>-' holds the negated values.
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    # .values replaces the deprecated as_matrix() (removed in pandas 1.0).
    pos_data = pd.DataFrame(data=data.values, index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=(data.values * -1), index=neg_labels,
                            columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])
    print(data.index.values)

    # Glob for clustering directories `depth` levels below the root.
    generic_dir = '/'.join(cluster_directory_root.split('/') + (['*'] * depth))
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]

            # Read final cluster assignments: records are 4 lines apart --
            # name on line l, tab-separated members on line l + 1.
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            with open(filepath, 'r') as assignments_fh:
                lines = assignments_fh.read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                if cluster_members == ['']:  # empty cluster line
                    cluster_members = []
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters

            # Load every parseable HMM in the directory; skip the rest.
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    # was `json = ...`, shadowing the json module
                    with open(model_file) as model_fh:
                        model_json = model_fh.read()
                    models[model_id] = HiddenMarkovModel.from_json(model_json)
                    print('model loaded from: %s' % model_file)
                except Exception:
                    # Best effort: non-model files in the directory are expected.
                    pass
            clusterings_models[clustering_id] = models
        except Exception:
            # Best effort: ignore directories without a readable assignments.txt.
            pass

    background = list(original_labels)

    # Generate random clusterings with the same number of clusters (k) as
    # each real clustering, assigning every background label uniformly.
    random_clusterings = {}
    np.random.seed(int(time.time()))
    for clustering_id, clustering in clusterings.items():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.keys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # Same again, but each member also gets a random '+' or '-' suffix.
    random_signed_clusterings = {}
    pn = np.array(['+', '-'])
    for clustering_id, clustering in clusterings.items():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.keys()):
            members = source[np.where(random_assignments == i)[0]].tolist()
            signed_members = []
            for member in members:
                sign = np.random.choice(pn, 1)[0]
                signed_members.append(member + sign)
            random_clusters[cluster_id] = signed_members
        random_signed_clusterings[clustering_id] = random_clusters

    return clusterings, random_clusterings, random_signed_clusterings,\
        clusterings_models, data, original_labels
with open('partial_model_start_model.json') as start_model_file: start_model_json = start_model_file.read() with open('partial_model_coding_to_stop_model0.json' ) as coding_to_stop_model_file0: coding_to_stop_model_json0 = coding_to_stop_model_file0.read() with open('partial_model_coding_to_stop_model1.json' ) as coding_to_stop_model_file1: coding_to_stop_model_json1 = coding_to_stop_model_file1.read() with open('partial_model_coding_to_stop_model2.json' ) as coding_to_stop_model_file2: coding_to_stop_model_json2 = coding_to_stop_model_file2.read() start_model = HiddenMarkovModel.from_json(start_model_json) coding_to_donor_model0 = HiddenMarkovModel.from_json( coding_to_donor_model_json0) coding_to_donor_model1 = HiddenMarkovModel.from_json( coding_to_donor_model_json1) coding_to_donor_model2 = HiddenMarkovModel.from_json( coding_to_donor_model_json2) intron_acceptor_model = HiddenMarkovModel.from_json(intron_acceptor_model_json) coding_to_stop_model0 = HiddenMarkovModel.from_json(coding_to_stop_model_json0) coding_to_stop_model1 = HiddenMarkovModel.from_json(coding_to_stop_model_json1) coding_to_stop_model2 = HiddenMarkovModel.from_json(coding_to_stop_model_json2)
def load(fileLocation): with open(fileLocation) as modelFile: model = HiddenMarkovModel.from_json(modelFile.read()) return model raise RuntimeError("can't load the model")
def gen_cluster_plots(cluster_directory_root, depth):
    """Generate line, autocorrelation and lag plots for every saved cluster.

    Parameters:
        cluster_directory_root: root directory of clustering results; plots
            are written under '<root>/plots/{line,autocorr,lag}/'.
        depth: number of '*' path levels below the root to glob for
            clustering directories.
    """
    # Load expression data and append signed copies of every row.
    # NOTE(review): unlike prep(), the '-' rows are NOT negated here --
    # confirm whether that is intentional.
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])
    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    # .values replaces the deprecated as_matrix() (removed in pandas 1.0).
    pos_data = pd.DataFrame(data=data.values, index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.values, index=neg_labels,
                            columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])

    # Glob for clustering directories `depth` levels below the root.
    generic_dir = '/'.join(cluster_directory_root.split('/') + (['*'] * depth))
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]

            # Read final cluster assignments: records are 4 lines apart --
            # name on line l, tab-separated members on line l + 1.
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            with open(filepath, 'r') as assignments_fh:
                lines = assignments_fh.read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters

            # Load every parseable HMM in the directory; skip other files.
            models = {}
            for model_file in glob.glob(cluster_dir + '/*'):
                try:
                    model_id = model_file.split('/')[-1:][0]
                    # was `json = ...`, shadowing the json module
                    with open(model_file) as model_fh:
                        model_json = model_fh.read()
                    models[model_id] = HiddenMarkovModel.from_json(model_json)
                    print('model loaded from: %s' % model_file)
                except Exception:
                    pass  # best effort: non-model files are expected
            clusterings_models[clustering_id] = models
        except Exception:
            pass  # best effort: skip directories without assignments.txt

    # Background = union of all cluster members.
    background = set()
    for clustering in clusterings.values():
        for cid, members in clustering.items():
            background.update(set(members))
    background = list(background)
    # data = data.loc[background, :]

    for clustering_id, clustering in clusterings.items():
        for model_id, members in clustering.items():
            sequences = data.loc[members, :]
            pltdir = '/'.join(cluster_directory_root.split('/') + ['plots'])

            # --- line plots ---
            if not os.path.isdir(pltdir + '/line'):
                print('Creating directory... %s' % pltdir)
                # makedirs (was os.mkdir): also creates 'plots/' if missing
                os.makedirs(pltdir + '/line')
            savename = pltdir + '/line/' + model_id + '_lineplot'
            ax = sequences.T.plot(legend=False, rot=2)
            ax.set_title(model_id + ' Line Plot')
            ax.set_xlabel('Timepoint')
            ax.set_ylabel('Normalized Expression')
            print('Saving: %s' % savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # --- autocorrelation plots (all members drawn on one figure) ---
            if not os.path.isdir(pltdir + '/autocorr'):
                print('Creating directory... %s' % pltdir)
                os.makedirs(pltdir + '/autocorr')
            savename = pltdir + '/autocorr/' + model_id + '_autocorr'
            plt_title = model_id + ' Autocorr Plot'
            for seq in sequences.index:
                ax = autocorrelation_plot(sequences.loc[seq])
                ax.set_title(plt_title)
            # NOTE(review): original indentation was ambiguous; the figure is
            # saved once after the loop (same filename either way).
            print('Saving: %s' % savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # --- lag plots, one distinct rainbow colour per member ---
            if not os.path.isdir(pltdir + '/lag'):
                print('Creating directory... %s' % pltdir)
                os.makedirs(pltdir + '/lag')
            # Replaces the original's function-level `from pylab import *`,
            # which is a SyntaxError on Python 3.
            from matplotlib.cm import get_cmap
            num_colors = len(members)
            cmap = get_cmap('gist_rainbow')
            colors = [cmap(1. * i / num_colors) for i in range(num_colors)]
            savename = pltdir + '/lag/' + model_id + '_lagplot'
            plt_title = model_id + ' Lag Plot'
            for i, seq in enumerate(sequences.index):
                ax = lag_plot(sequences.loc[seq], c=colors[i])
                ax.set_title(plt_title)
            print('Saving: %s' % savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()
    # NOTE(review): the original snippet ended with a stray, unmatched '"""'
    # token (apparently truncated commented-out code); it was dropped because
    # it would leave an unterminated string literal.
def _model(self): return HiddenMarkovModel.from_json(self.json_model)