예제 #1
0
def synthetic_sequences_experiment(exp_params, path_dict, sample_retries=100, reuse_sequences=None):
    """Run a single 'synthetic sequences' experiment.

    The experiment parameters are parsed and saved, the train/test sequences
    and ground-truth matrices are either sampled via
    ``_sample_sequences_from_gt_hmm`` or taken from ``reuse_sequences``, and
    everything is handed to ``_standard_vs_dense``.

    Args:
        exp_params: experiment parameters; passed through
            ``_parse_syntheticgt_dirichlet_parameters`` and
            ``_parse_standard_and_dense``.
        path_dict: mapping of paths forwarded to the parsing, saving and
            sampling helpers.
        sample_retries: forwarded unchanged to ``_sample_sequences_from_gt_hmm``.
        reuse_sequences: optional ``(train_X, test_X, gt_AB)`` tuple. Any
            value that is not exactly a ``tuple`` of length 3 is ignored and
            fresh sequences are sampled instead.

    Returns:
        None.
    """
    experiment_timer = Timer()
    experiment_timer.tic("Starting a 'synthetic sequences' experiment.")

    # Parameter parsing gets its own timer, separate from the whole run.
    parse_timer = Timer()
    parse_timer.tic("Parsing parameters ...")
    exp_params = _parse_syntheticgt_dirichlet_parameters(exp_params, path_dict)
    exp_params = _parse_standard_and_dense(exp_params, path_dict, exp_params['n_emissions'])
    saved_params = _save_experiment_parameters(exp_params, path_dict)
    parse_timer.toc("Parameters parsed. Using parameters: %s" % str(saved_params))

    # Exact type match: tuple subclasses do not count as reusable sequences.
    can_reuse = (reuse_sequences is not None
                 and type(reuse_sequences) is tuple
                 and len(reuse_sequences) == 3)
    if can_reuse:
        train_X, test_X, gt_AB = reuse_sequences
        timestamp_msg("Reusing sequences")
    else:
        train_X, test_X, gt_AB = _sample_sequences_from_gt_hmm(
            exp_params, path_dict, sample_retries=sample_retries)

    # With 'fair_standard_params' present, both standard parameter sets are
    # passed together as a pair in the third argument.
    use_fair = 'fair_standard_params' in exp_params
    standard = exp_params['standard_params']
    if use_fair:
        standard = (standard, exp_params['fair_standard_params'])
    _standard_vs_dense(train_X, test_X, standard, exp_params['dense_params'], gt_AB)

    experiment_timer.toc("Finished a 'synthetic sequences' experiment.")
예제 #2
0
def _load_protein():
    """Load protein sequences from ``pdb_seqres.txt.gz``, caching them globally.

    The parsed data is kept in the module-level ``protein_seqs`` and
    ``protein_meta``. The file is parsed only when that cache is missing or
    inconsistent (either global is ``None`` or their lengths differ);
    otherwise the cached lists are returned as they are.

    Lines starting with ``'>'`` are stored (stripped) as metadata; every other
    non-blank line is stored as a list of its single characters.

    Returns:
        tuple: ``(protein_seqs, protein_meta)`` - a list of character lists
        and a list of header strings.
    """
    global protein_seqs, protein_meta

    cache_valid = (protein_seqs is not None
                   and protein_meta is not None
                   and len(protein_seqs) == len(protein_meta))
    if cache_valid:
        timestamp_msg('Using already loaded sequences ...')
        return protein_seqs, protein_meta

    # Parse into locals first so that a failure half-way through the file
    # cannot leave a partially filled cache behind.
    seqs, meta = [], []
    filename = os.path.join(DATA_DIR, 'pdb_seqres.txt.gz')

    # gzip.open defaults to binary mode, which yields bytes lines; comparing
    # those against the str '>' raises TypeError on Python 3. Text mode gives
    # str lines and str characters in the sequences.
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines carry no data and would otherwise add an empty
                # sequence, breaking the header/sequence pairing.
                continue
            if line.startswith('>'):
                meta.append(record)
            else:
                seqs.append(list(record))

    protein_seqs, protein_meta = seqs, meta
    return protein_seqs, protein_meta
예제 #3
0
def _save_data(path_dict, train_X, test_X=None, gt_AB=None):
    if 'data_dir' in path_dict:
        data_dir = str(path_dict['data_dir'])
        check_dir(data_dir)
        np.save(data_dir + '/train_X', train_X)
        if test_X is not None:
            np.save(data_dir + '/test_X', test_X)
        if gt_AB is not None:
            np.save(data_dir + '/gt_A', gt_AB[0])
            np.save(data_dir + '/gt_B', gt_AB[1])
        timestamp_msg("Saved data in %s" % data_dir)
예제 #4
0
def _load_penntreebank():
    """Return the Penn Treebank tagged sentences and words, loading them once.

    The results are cached in the module-level ``penntree_tagged_sents`` and
    ``penntree_tagged_words``. When either is still ``None``, the NLTK
    'treebank' corpus is downloaded, its tagged sentences are read, and the
    word list is built by flattening those sentences.

    Returns:
        tuple: ``(penntree_tagged_sents, penntree_tagged_words)`` - note that
        sentences come first.
    """
    global penntree_tagged_words, penntree_tagged_sents

    already_loaded = (penntree_tagged_words is not None
                      and penntree_tagged_sents is not None)
    if already_loaded:
        timestamp_msg('Using already loaded sequences ...')
        return penntree_tagged_sents, penntree_tagged_words

    nltk.download('treebank')
    from nltk.corpus import treebank

    # Sentence-level view first, then the same items flattened into one list.
    penntree_tagged_sents = treebank.tagged_sents()
    flattened = []
    for sentence in penntree_tagged_sents:
        for tagged_word in sentence:
            flattened.append(tagged_word)
    penntree_tagged_words = flattened

    return penntree_tagged_sents, penntree_tagged_words
예제 #5
0
def dataset_sequences_experiment(exp_params, path_dict, reuse_sequences=None):
    """Run a single 'dataset sequences' experiment.

    Sequences are obtained from ``get_dataset_sequences`` and split with
    ``train_test_split`` (or taken from ``reuse_sequences``), checked with
    ``check_sequences``, saved via ``_save_data``, and finally handed to
    ``_standard_vs_dense`` together with the parsed parameters.

    Args:
        exp_params: experiment parameters; a shallow ``dict`` copy is made
            before ``'n_emissions'`` is written into it.
        path_dict: mapping of paths; ``'gt_dir'`` is read here and the mapping
            is forwarded to the parsing and saving helpers.
        reuse_sequences: optional ``(train_X, test_X)`` tuple. Any value that
            is not exactly a ``tuple`` of length 2 is ignored and the dataset
            is loaded and split instead.

    Returns:
        None.

    Raises:
        Exception: if the emission counts reported by ``check_sequences`` for
            the train and test sequences differ.
    """
    experiment_timer = Timer()
    exp_params = dict(exp_params)
    ident = dict_get(exp_params, 'dataset_ident', default='', cast=str)
    experiment_timer.tic("Starting a 'dataset sequences' experiment. (%s)" % str(ident))

    # Parameter parsing gets its own timer, separate from the whole run.
    parse_timer = Timer()
    parse_timer.tic("Parsing parameters ...")

    train_perc = dict_get(exp_params, 'train_perc', default=1., cast=float)
    gt_dir = dict_get(path_dict, 'gt_dir', default=None)
    # NOTE(review): gt_dir is presumably None when 'gt_dir' is absent from
    # path_dict - confirm that check_dir tolerates None.
    check_dir(gt_dir)
    ds_params = dict_get(exp_params, 'dataset_params', default=dict(), cast=dict)

    # Exact type match: tuple subclasses do not count as reusable sequences.
    can_reuse = (reuse_sequences is not None
                 and type(reuse_sequences) is tuple
                 and len(reuse_sequences) == 2)
    if can_reuse:
        train_X, test_X = reuse_sequences
        timestamp_msg("Reusing sequences ...")
    else:
        gt_sequences, _, _ = get_dataset_sequences(ident, ds_params, gt_dir)
        train_X, test_X = train_test_split(gt_sequences, train_perc)

    # The third value returned by check_sequences is used as the emission count.
    _, _, train_emission_count = check_sequences(train_X)
    test_emission_count = None
    has_test_data = test_X is not None and len(test_X) > 0
    if has_test_data:
        _, _, test_emission_count = check_sequences(test_X)

    # The data is written out before the train/test consistency check runs.
    _save_data(path_dict, train_X, test_X)
    if test_emission_count is not None and train_emission_count != test_emission_count:
        raise Exception("Number of emissions in train and test sequence differs")
    exp_params['n_emissions'] = train_emission_count

    exp_params = _parse_base_parameters(exp_params, path_dict)
    exp_params = _parse_standard_and_dense(exp_params, path_dict, exp_params['n_emissions'])
    saved_params = _save_experiment_parameters(exp_params, path_dict)
    parse_timer.toc("Parameters parsed. Using parameters: %s" % str(saved_params))

    # With 'fair_standard_params' present, both standard parameter sets are
    # passed together as a pair in the third argument.
    use_fair = 'fair_standard_params' in exp_params
    standard = exp_params['standard_params']
    if use_fair:
        standard = (standard, exp_params['fair_standard_params'])
    _standard_vs_dense(train_X, test_X, standard, exp_params['dense_params'])

    experiment_timer.toc("Finished a 'dataset sequences' experiment.")
예제 #6
0
def _save_experiment_parameters(exp_params, path_dict):
    
    if 'experiment_directory' in path_dict:
        exp_dir = str(path_dict['experiment_directory'])
        check_dir(exp_dir)
    
        _exp_params = copy.deepcopy(exp_params)

        gt_params = dict_get(_exp_params, 'gt_params', default=None, cast=dict)
        if gt_params is not None:
            _exp_params['gt_params'] = gt_params
            init_params = dict_get(gt_params, 'init_params', default=None)
            if callable(init_params):
                _exp_params['gt_params']['init_params'] = str(init_params.__name__)
            gt_logmon = dict_get(gt_params, 'logging_monitor', default=None)
            if gt_logmon is not None and isinstance(gt_logmon, HMMLoggingMonitor):
                _exp_params['gt_params']['logging_monitor'] = dict(gt_logmon.log_config)

        standard_params = dict_get(_exp_params, 'standard_params', default=None, cast=dict)
        standard_logmon = dict_get(standard_params, 'logging_monitor', default=None)
        if standard_logmon is not None and isinstance(standard_logmon, HMMLoggingMonitor):
            _exp_params['standard_params']['logging_monitor'] = dict(standard_logmon.log_config)

        dense_params = dict_get(_exp_params, 'dense_params', default=None, cast=dict)
        dense_logmon = dict_get(standard_params, 'logging_monitor', default=None)
        if dense_logmon is not None and isinstance(dense_logmon, HMMLoggingMonitor):
            _exp_params['dense_params']['logging_monitor'] = dict(dense_logmon.log_config)

        fair_standard_params = dict_get(_exp_params, 'fair_standard_params', default=None, cast=dict)
        fair_standard_logmon = dict_get(fair_standard_params, 'logging_monitor', default=None)
        if fair_standard_logmon is not None and isinstance(fair_standard_logmon, HMMLoggingMonitor):
            _exp_params['fair_standard_params']['logging_monitor'] = dict(fair_standard_logmon.log_config)
            
        np.save(exp_dir + '/exp_params', _exp_params)
        timestamp_msg("Saved experiment parameters in %s" % exp_dir)
        return _exp_params