Example #1
def main_script(model_initializer=None,
                description=None,
                epilog=None,
                prog_name=None,
                logger=None):
    parser = argparse_setup(model_initializer, description, epilog)
    args = parser.parse_args()

    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger,
                          verbosity=args.verbosity,
                          filename=prog_name + '.log')

    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('Called with parameters:\n %s' %
                 serialize_dict(args.__dict__))

    start_time = time()
    try:
        main(model_initializer, args)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
    finally:
        end_time = time()
        logger.info('Elapsed time: %.1f sec', end_time - start_time)
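Every example on this page calls configure_logging from eden.util with a logger, an integer verbosity, and an optional filename. A minimal sketch of a compatible helper, inferred purely from these call sites (an assumption, not the actual eden.util implementation):

import logging


def configure_logging(logger, verbosity=0, filename=None):
    # Map the integer verbosity used throughout these examples to a level.
    if verbosity >= 2:
        level = logging.DEBUG
    elif verbosity == 1:
        level = logging.INFO
    else:
        level = logging.WARNING
    logger.setLevel(level)
    logger.handlers = []  # avoid stacking handlers on repeated calls
    logger.addHandler(logging.StreamHandler())
    if filename is not None:
        # Mirror the log to a file when a filename is supplied.
        logger.addHandler(logging.FileHandler(filename, mode='w'))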
Example #2
def main_script(prog_name=None, logger=None):

    parser = argparse_setup()
    args = parser.parse_args()

    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger,
                          verbosity=args.verbosity,
                          filename=args.logging_dir +
                          'logs_gc%.2f_len%d_num%d' %
                          (args.gc_content, args.length, args.num) + '.log')

    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('\n')
    logger.debug('Called with parameters:\n\n %s \n\n' %
                 serialize_dict(args.__dict__))

    start_time = time.asctime(time.localtime(time.time()))
    logger.info('Initializing program execution %s \n\n' % (start_time))
    try:
        main(args, logger)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
        exit(1)
    finally:
        end_time = time.asctime(time.localtime(time.time()))
        logger.info('Finished program execution %s' % (end_time))
        logger.info('-' * 80)
Example #3
def load_PUBCHEM_data(assay_id, max_size=20):
    configure_logging(logger, verbosity=2)
    logger.debug('_' * 80)
    logger.debug('Dataset %s info:' % assay_id)
    desc = get_assay_description(assay_id)
    logger.debug('\n%s' % desc)
    # extract pos and neg graphs
    all_pos_graphs, all_neg_graphs = get_pos_graphs(assay_id), get_neg_graphs(assay_id)
    # remove too large and too small graphs and outliers
    initial_max_size = 2000
    initial_max_size = max(initial_max_size, max_size)
    args = dict(initial_max_size=initial_max_size,
                fraction_to_remove=.1,
                n_neighbors_for_outliers=9,
                remove_similar=False,
                max_size=max_size)
    logger.debug('\nPositive graphs')
    pos_graphs = pre_process(all_pos_graphs, **args)
    logger.debug('\nNegative graphs')
    neg_graphs = pre_process(all_neg_graphs, **args)
    logger.debug('-' * 80)
    configure_logging(logger, verbosity=1)
    return pos_graphs, neg_graphs
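Example #3 raises the verbosity to 2 (DEBUG) while the dataset is loaded and drops it back to 1 before returning. A hypothetical context manager that makes this bump-and-restore pattern explicit (it reuses the configure_logging sketch above and is not part of eden.util):

from contextlib import contextmanager


@contextmanager
def temporary_verbosity(logger, verbosity, restore_verbosity=1):
    # Raise verbosity on entry, restore it on exit even if an error occurs.
    configure_logging(logger, verbosity=verbosity)
    try:
        yield logger
    finally:
        configure_logging(logger, verbosity=restore_verbosity)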
Example #4
def do():
    # special case: bursi
    pos_dataset_fname = 'RF00005.fa'
    neg_dataset_fname = None  # None will permute the first dataset
    dataset = '%s_vs_%s' % (pos_dataset_fname, neg_dataset_fname)
    # logging

    if True:
        logger_fname = '%s_predictive_performance_of_samples.log' % dataset
    else:
        logger_fname = None
    configure_logging(logger, verbosity=1, filename=logger_fname)

    # main
    start = time()
    print('Working with dataset: %s' % dataset)

    logger.info('Working with dataset: %s' % dataset)

    percentages = [.08, .2, .4, .6, .8, .95]
    percentages = [.07, 0.1, 0.15, 0.2]  # overrides the coarser grid above

    # set size to 900 in production
    original_repetitions, original_sample_repetitions, sample_repetitions = evaluate(
        pos_dataset_fname,
        neg_dataset_fname,
        size=100,
        percentages=percentages,
        n_repetitions=3,
        train_test_split=0.7)
    # save and display results
    result_fname = '%s_predictive_performance_of_samples.data' % dataset
    save_results(result_fname, percentages, original_repetitions,
                 original_sample_repetitions, sample_repetitions)

    percentages_l, original_repetitions_l, original_sample_repetitions_l, sample_repetitions_l = load_results(
        result_fname)
    plot(dataset, percentages_l, original_sample_repetitions_l,
         original_repetitions_l, sample_repetitions_l)

    print('Time elapsed: %s' % (datetime.timedelta(seconds=(time() - start))))
Example #5
def main_script(model_initializer=None, description=None, epilog=None, prog_name=None, logger=None):
    parser = argparse_setup(model_initializer, description, epilog)
    args = parser.parse_args()

    if args.no_logging:
        configure_logging(logger, verbosity=args.verbosity)
    else:
        configure_logging(logger, verbosity=args.verbosity, filename=prog_name + '.log')

    logger.debug('-' * 80)
    logger.debug('Program: %s' % prog_name)
    logger.debug('Called with parameters:\n %s' % serialize_dict(args.__dict__))

    start_time = time()
    try:
        main(model_initializer, args)
    except Exception:
        import datetime
        curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
        logger.exception("Program run failed on %s" % curr_time)
    finally:
        end_time = time()
        logger.info('Elapsed time: %.1f sec', end_time - start_time)
Example #6
def test_pareto():
    configure_logging(logging.getLogger(), verbosity=2)
    graphs = rg.make_graphs_static(
        100,  # how many graphs to generate
        5,  # graph size
        4,  # node label count
        2,  # edge label count
        labeldistribution='uniform',
        allow_cycles=False)

    im = InstanceMaker(n_landmarks=5, n_neighbors=50).fit(graphs, ntargets=2)

    optimizer = pareto.LocalLandmarksDistanceOptimizer(n_iter=7,
                                                       context_size=1,
                                                       multiproc=True)
    landmark_graphs, desired_distances, ranked_graphs, target_graph = im.get()
    optimizer.optimize(
        landmark_graphs,
        desired_distances,
        ranked_graphs,
        # start_graph_list=[landmark_graphs[0]])
        start_graph_list=landmark_graphs)
    return None


def do():
    # special case: bursi
    pos_dataset_fname = 'RF00005.fa'
    neg_dataset_fname = None  # None will permute the first dataset
    dataset = '%s_vs_%s' % (pos_dataset_fname, neg_dataset_fname)
    # logging

    if True:
        logger_fname = '%s_predictive_performance_of_samples.log' % dataset
    else:
        logger_fname = None
    configure_logging(logger, verbosity=1, filename=logger_fname)

    # main
    start = time()
    print('Working with dataset: %s' % dataset)

    logger.info('Working with dataset: %s' % dataset)

    percentages = [.08, .2, .4, .6, .8, .95]
    percentages = [.07, 0.1, 0.15, 0.2]  # overrides the coarser grid above

    # set size to 900 in production
    original_repetitions, original_sample_repetitions, sample_repetitions = evaluate(
        pos_dataset_fname,
        neg_dataset_fname,
        size=100,
        percentages=percentages,
        n_repetitions=3,
        train_test_split=0.7)
    # save and display results
    result_fname = '%s_predictive_performance_of_samples.data' % dataset
    save_results(result_fname, percentages, original_repetitions,
                 original_sample_repetitions, sample_repetitions)

    percentages_l, original_repetitions_l, original_sample_repetitions_l, sample_repetitions_l = load_results(
        result_fname)
    plot(dataset, percentages_l, original_sample_repetitions_l,
         original_repetitions_l, sample_repetitions_l)

    print('Time elapsed: %s' % (datetime.timedelta(seconds=(time() - start))))
Example #8
    args = vars(parser.parse_args())
    import os.path
    if not os.path.isfile(args['start_graphs']):
        parser.print_usage()
        print('please provide a path to the input')
        exit()

    print("*raw args")
    print("*" * 80)
    print(args)

    # verbosity
    from eden.util import configure_logging
    import logging
    configure_logging(logging.getLogger(), verbosity=args['verbose'])
    args.pop('verbose')

    # graphs
    from eden.io.gspan import gspan_to_eden
    from itertools import islice
    args['graph_iter'] = islice(gspan_to_eden(args.pop('start_graphs')),
                                args.pop('num_graphs'))

    # output
    OUTFILE = args.pop('out')
    MODEL = args.pop('model')

    # CREATE SAMPLER
    from graphlearn01.graphlearn import Sampler
    s = Sampler()
Example #9
    args = vars(parser.parse_args())
    import os.path
    if not os.path.isfile(args['start_graphs']):
        parser.print_usage()
        print('please provide a path to the input')
        exit()

    print "*raw args"
    print "*" * 80
    print args

    # verbosity
    from eden.util import configure_logging
    import logging
    configure_logging(logging.getLogger(), verbosity=args['verbose'])
    args.pop('verbose')

    # graphs
    from eden.io.gspan import gspan_to_eden
    from itertools import islice
    args['graph_iter'] = islice(gspan_to_eden(args.pop('start_graphs')),
                                args.pop('num_graphs'))

    # output
    OUTFILE = args.pop('out')
    MODEL = args.pop('model')

    # CREATE SAMPLER
    from graphlearn01.graphlearn import Sampler
    s = Sampler()
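Examples #8 and #9 wrap the lazy gSpan reader in itertools.islice so that at most num_graphs graphs are ever parsed, without materializing the whole file. A self-contained illustration of that capping pattern:

from itertools import islice


def count_forever(start=0):
    # Stand-in for a lazy graph iterator such as gspan_to_eden(...).
    n = start
    while True:
        yield n
        n += 1


first_five = list(islice(count_forever(), 5))  # [0, 1, 2, 3, 4]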
Example #10
    import os.path
    if not os.path.isfile(args['input']):
        parser.print_usage()
        print('please provide a path to the input')
        exit()

    print("*raw args")
    print("*" * 80)
    print(args)

    # verbosity
    from eden.util import configure_logging
    import logging
    configure_logging(logging.getLogger(), verbosity=args.pop('verbose'))

    # handle Vectorizer:
    from eden.graph import Vectorizer
    args['vectorizer'] = Vectorizer(args.pop('vectorizer_complexity'))

    # estimator: if the user provides a negative graph set, use the
    # two-class estimator, otherwise the one-class estimator
    import graphlearn01.estimate as estimate
    if args['negative_input'] is None:
        args['estimator'] = estimate.OneClassEstimator(nu=.5, cv=2, n_jobs=-1)
    else:
        args['estimator'] = estimate.TwoClassEstimator(cv=2, n_jobs=-1)
Example #11
import logging

from toolz import curry, pipe  # assumed source of the curry/pipe helpers used below
from eden.util import configure_logging
from eden_chem.io.pubchem import download  # assumed location of the PUBCHEM download helper
from eden_chem.io.rdkitutils import sdf_to_nx  # assumed location of sdf_to_nx

from ego.decomposition.paired_neighborhoods import decompose_paired_neighborhoods, decompose_neighborhood
from ego.vectorize import hash_graph
from ego.vectorize import set_feature_size, vectorize
from ego.encode import make_encoder

from utils_oracle_with_target import oracle_setup as oracle_setup_target
from utils_oracle_from_dataset import oracle_setup as oracle_setup_dataset
from eden_chem.io.rdkitutils import nx_to_inchi
from eden_chem.io.rdkitutils import nx_to_smi
from datetime import datetime



logger = logging.getLogger()
configure_logging(logger, verbosity=1)

download_active = curry(download)(active=True)
download_inactive = curry(download)(active=False)


def get_pos_graphs(assay_id):
    return pipe(assay_id, download_active, sdf_to_nx, list)


def get_neg_graphs(assay_id):
    return pipe(assay_id, download_inactive, sdf_to_nx, list)

colormap = 'tab20c'

#assay_ids = ['624466','492992','463230','651741','743219','588350','492952','624249','463213','2631','651610']

def rank_and_persist_molecules(graphs, scores, name='', plot=True):
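Example #11 partially applies download with curry and threads an assay id through a chain of transformations with pipe; the names match the toolz API, which is assumed here. A self-contained illustration of both idioms:

from toolz import curry, pipe


@curry
def scale(factor, x):
    return factor * x


double = scale(2)              # partial application via curry
result = pipe(5, double, str)  # threads 5 through double, then str: '10'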
Example #12
    default="",
    help="Prefix to use for output filenames")
parser.add_argument(
    "--chromosome_limits",
    help="Path to file containing chromosome limits as required by bedtools. Using this parameter disables automatic lookup via the genome id.")
parser.add_argument(
    "--negative_site_candidate_regions_fn",
    help="Path to regions considered for placement of negatives in bed format")
parser.add_argument(
    "-v", "--verbosity",
    action="count",
    help="Increase output verbosity")
args = parser.parse_args()

logger = logging.getLogger()
configure_logging(logger, verbosity=args.verbosity)

# fixed global variables
npeek = 2

# check chromsizes retrieval
if args.chromosome_limits is None:
    # check if genome_id can be found,
    chromsizes = get_chromsizes_from_ucsc(args.genome_id)
    logger.debug("Number of chromosomes: {}.".format(len(chromsizes)))
    # otherwise request manual definition of chromosome limits
    if len(chromsizes) == 0:
        logger.error("Error: retrieving chromosome sizes from UCSC failed. Please specify manually using parameter --chromosome_limits")
        exit(1)

# output file arguments
Example #13
                        value = int(round(value))
                    # parameter_setting[key] = value
                    break
                value = np.random.normal(mu, 2 * sigma)
                n_iter += 1
                success, value = check_validity(key, value, noise)
            parameter_setting[key] = value
    return parameter_setting


# In[ ]:

# %%time

logger = logging.getLogger()
configure_logging(logger, verbosity=1)

filename = "Result_at_" + str(noise_level) + ".txt"

best_config = {'min_score': 6,  # at least motif_length/2
               'min_freq': 0.1,  # cannot be more than (1 - noise level)
               'min_cluster_size': 3,  # at least 3
               'p_value': 0.1,  # at least 0.1
               'similarity_th': 0.8,  # 0.8
               'regex_th': 0.3,  # max 0.3
               'freq_th': 0.05,  # 0.05
               'std_th': 0.2}  # 0.2


results_dic = {}
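The loop fragment at the top of Example #13 keeps redrawing a parameter from a widened normal distribution until check_validity accepts it. A self-contained version of that rejection-sampling step, with is_valid as a stand-in for check_validity:

import numpy as np


def sample_valid(mu, sigma, is_valid, max_iter=1000):
    # Redraw from N(mu, (2 * sigma)**2) until the predicate accepts a value.
    value = np.random.normal(mu, 2 * sigma)
    n_iter = 0
    while not is_valid(value) and n_iter < max_iter:
        value = np.random.normal(mu, 2 * sigma)
        n_iter += 1
    return value


positive_value = sample_valid(1.0, 0.5, lambda v: v > 0)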
Example #14
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(), verbosity=1)


'''
GET RNA DATA
'''
from eden.converter.fasta import fasta_to_sequence
import itertools
from eden.util import random_bipartition_iter
import random
import numpy


def rfam_uri(family_id):
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)


def rfam_uri(family_id):  # overrides the URL version above: read a local .fa file instead
    return '%s.fa' % (family_id)


RFAM = "RF01725"
# cutoff 162 (44.0)
# cutoff 1725 (38.0)
# cutoff rest (29)
sizes = [50, 100, 200, 400]
repeats = 3


Example #15
parser.add_argument(
    "--chromosome_limits",
    help="Path to file containing chromosome limits as required by bedtools. "
    "Using this parameter disables automatic lookup via the genome id.")
parser.add_argument(
    "--negative_site_candidate_regions_fn",
    help="Path to regions considered for placement of negatives in bed format")
parser.add_argument("-v",
                    "--verbosity",
                    action="count",
                    help="Increase output verbosity")
args = parser.parse_args()

logger = logging.getLogger()
configure_logging(logger, verbosity=args.verbosity)

# fixed global variables
npeek = 2

# check chromsizes retrieval
if args.chromosome_limits is None:
    # check if genome_id can be found,
    chromsizes = get_chromsizes_from_ucsc(args.genome_id)
    logger.debug("Number of chromosomes: {}.".format(len(chromsizes)))
    # otherwise request manual definition of chromosome limits
    if len(chromsizes) == 0:
        logger.error(
            "Error: retrieving chromosome sizes from UCSC failed. "
            "Please specify manually using parameter --chromosome_limits")
        exit(1)
Example #16
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(), verbosity=1)
'''
GET RNA DATA
'''
from eden.converter.fasta import fasta_to_sequence
import itertools
from eden.util import random_bipartition_iter
import random
import numpy


def rfam_uri(family_id):
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (
        family_id, family_id)


def rfam_uri(family_id):  # overrides the URL version above: read a local .fa file instead
    return '%s.fa' % (family_id)


RFAM = "RF01725"
# cutoff 162 (44.0)
# cutoff 1725 (38.0)
# cutoff rest (29)
sizes = [50, 100, 200, 400]
repeats = 3


def get_sequences(size=9999, rand=False):
Example #17
def main(args):
    """Main."""
    # read variables
    # if no -i is given then read from stdin
    seq = args['-i']
    seq = (sys.stdin.readline().strip() if args['-i'] == 'stdin' else seq)
    k = int(args['-k'])
    complexity = int(args['--complexity'][0])
    nbits = int(args['--nbits'][0])
    window_size = int(args['--window_size'][0])
    window_size = min(len(seq), window_size)
    max_bp_span = int(args['--max_bp_span'][0])
    max_bp_span = min(len(seq), max_bp_span)
    avg_bp_prob_cutoff = float(args['--avg_bp_prob_cutoff'][0])
    hard_threshold = float(args['--hard_threshold'][0])
    max_num_edges = int(args['--max_num_edges'][0])
    no_lonely_bps = args['--no_lonely_bps']
    no_nesting = args['--no_nesting']
    draw = args['--draw']
    jpg = args['--jpg']
    svg = args['--svg']
    png = args['--png']
    pdf = args['--pdf']

    if no_nesting is True:
        nesting = False
    else:
        nesting = True
    # setup logger
    if args['--verbose']:
        verbosity = 2
    else:
        verbosity = 1
    configure_logging(logger, verbosity=verbosity, filename='log')
    logger.debug(serialize_dict(args))

    # setup folding algorithm
    rase = StructuralStabilityEstimator(seq,
                                        alphabet='ACGU',
                                        k=k,
                                        complexity=complexity,
                                        nbits=nbits,
                                        window_size=window_size,
                                        max_bp_span=max_bp_span,
                                        avg_bp_prob_cutoff=avg_bp_prob_cutoff,
                                        hard_threshold=hard_threshold,
                                        max_num_edges=max_num_edges,
                                        no_lonely_bps=no_lonely_bps,
                                        nesting=nesting)
    # print: nt pos, original nt, most de-stabilizing nt, dotbracket, score
    for line in rase.transform(seq):
        print(line)

    # if drawing is required use the folding algorithm to compute the graph
    if draw:
        suffix = 'pdf'
        if jpg:
            suffix = 'jpg'
        if svg:
            suffix = 'svg'
        if png:
            suffix = 'png'
        if pdf:
            suffix = 'pdf'
        structure_fname = 'structure.' + suffix
        score_fname = 'score.' + suffix
        all_plots_fname = 'structures.' + suffix
        rase.draw(file_name=structure_fname)
        rase.plot(file_name=score_fname)
        rase.draw_all(file_name=all_plots_fname)
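Example #17 reads args like a docopt-style dictionary: flags map to booleans and value options arrive as single-element lists. A hypothetical invocation under that assumption (values are illustrative, and main is the function defined above):

args = {
    '-i': 'GGGAAACCCUUU', '-k': '3',
    '--complexity': ['3'], '--nbits': ['15'],
    '--window_size': ['100'], '--max_bp_span': ['60'],
    '--avg_bp_prob_cutoff': ['0.4'], '--hard_threshold': ['0.5'],
    '--max_num_edges': ['2'],
    '--no_lonely_bps': True, '--no_nesting': False,
    '--draw': False, '--jpg': False, '--svg': False,
    '--png': False, '--pdf': False, '--verbose': False,
}
main(args)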