def setUpClass(cls):
    # Parse the test BAM once and cache the crunched results as class attributes.
    log.info("Loading alignments from {c}".format(c=cls.BAM_PATH))
    movie_names, unrolled, datum, columns = from_alignment_file(
        alignment_info_from_bam(cls.BAM_PATH)[cls.MOVIE])
    cls.datum = datum
    cls.unrolled = unrolled
    cls.movie_names = movie_names
    cls.columns = columns
def analyze_movies(movies, alignment_file_names, stats_models):
    log.info("collecting data from {n} BAM files...".format(
        n=len(alignment_file_names)))
    for file_name in alignment_file_names:
        log.info("reading {f}.pbi".format(f=file_name))
        results = alignment_info_from_bam(file_name)
        for movie, aln_info in results.iteritems():
            log.info("Analyzing Movie {n} in {f}".format(n=movie, f=file_name))
            args = from_alignment_file(aln_info)
            _process_movie_data(movie, file_name, stats_models, *args)
    log.info("Completed analyzing {n} movies.".format(n=len(movies)))
def analyze_movie(movie, alignment_file, stats_models):
    """
    The regions should only correspond to a single Movie

    :type movie: Movie
    :type stats_models: list
    """
    started_at = time.time()
    log.info("Analyzing Movie {n}".format(n=movie))

    movie_names, unrolled, data_, columns = from_alignment_file(
        movie, alignment_file)

    if len(data_) == 0:
        msg = "Movie '{n}' produced no alignments.".format(n=movie)
        log.warn(msg)
        return

    crunched = CrunchedAlignments(movie_names, unrolled, data_, columns)
    log.debug("Movie names from crunched {m}.".format(m=movie_names))

    # reads recarray
    reads = crunched.reads()

    # subreads recarray with columns
    # ["Length", "Accuracy", "isFirst", "modStart", "isFullSubread", "isMaxSubread"]
    subreads = crunched.subreads()

    log.info("Movie {m}".format(m=movie))
    log.info(('Number of reads', len(reads)))
    log.info(('Number of subreads', len(subreads)))

    for model in stats_models:
        if model.filter_func(movie):
            for aggregator in model.aggregators:
                if aggregator.DATA_TYPE == READ_TYPE:
                    aggregator.apply(reads)
                if aggregator.DATA_TYPE == SUBREAD_TYPE:
                    aggregator.apply(subreads)
        else:
            log.warn("Model {m} does not apply to movie {r}. Skipping.".format(
                m=repr(model), r=movie))

    run_time = time.time() - started_at
    _d = dict(n=movie, s=run_time)
    log.info("Completed analyzing Movie {n} in {s:.2f} sec.".format(**_d))
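# NOTE: _process_movie_data is called above and below but is not defined in this
# section. The following is a hypothetical sketch, inferred from the inlined body
# of analyze_movie above; the real implementation (and its exact signature) may
# differ.
def _process_movie_data(movie, alignment_file, stats_models,
                        movie_names, unrolled, data_, columns):
    """Apply each stats model's aggregators to a movie's reads and subreads."""
    if len(data_) == 0:
        log.warn("Movie '{n}' produced no alignments.".format(n=movie))
        return

    crunched = CrunchedAlignments(movie_names, unrolled, data_, columns)
    reads = crunched.reads()
    subreads = crunched.subreads()

    for model in stats_models:
        if model.filter_func(movie):
            for aggregator in model.aggregators:
                if aggregator.DATA_TYPE == READ_TYPE:
                    aggregator.apply(reads)
                if aggregator.DATA_TYPE == SUBREAD_TYPE:
                    aggregator.apply(subreads)
        else:
            log.warn("Model {m} does not apply to movie {r}. Skipping.".format(
                m=repr(model), r=movie))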
def analyze_movie(movie, alignment_file, stats_models):
    """
    The regions should only correspond to a single Movie

    :type movie: Movie
    :type stats_models: list
    """
    started_at = time.time()
    movie_names, unrolled, data_, columns = from_alignment_file(
        movie, alignment_file)
    _process_movie_data(movie, alignment_file, stats_models,
                        movie_names, unrolled, data_, columns)
    run_time = time.time() - started_at
    _d = dict(n=movie, s=run_time)
    log.info("Completed analyzing Movie {n} in {s:.2f} sec.".format(**_d))
def analyze_movies(movies, alignment_file_names, stats_models, nproc=1):
    #pool = None
    #if nproc >= 1:
    #    # XXX I use nproc-1 here because the callback in the main process
    #    # actually takes up a lot of time
    #    log.info("Starting pool of {n} processes".format(n=max(1, nproc-1)))
    #    pool = multiprocessing.Pool(processes=nproc)
    for movie in movies:
        for file_name in alignment_file_names:
            log.info("Analyzing Movie {n}".format(n=movie))
            results = from_alignment_file(movie, file_name)
            _process_movie_data(movie, file_name, stats_models, *results)

            # FIXME need to re-think this
            #def __analyze_movie(args):
            #    return from_alignment_file(*args)
            #__callback = functools.partial(_process_movie_data, movie,
            #                               file_name, stats_models)
            #pool.apply_async(from_alignment_file, (movie, file_name),
            #                 callback=__callback)
    #pool.close()
    #pool.join()
    log.info("Completed analyzing {n} movies.".format(n=len(movies)))
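# Usage sketch (assumptions): the stats models handed to analyze_movies are only
# duck-typed here. Each model needs a filter_func(movie) predicate and a list of
# aggregators exposing DATA_TYPE and apply(recarray). The class names below are
# hypothetical illustrations, not part of this module.
class _ReadCountAggregator(object):
    DATA_TYPE = READ_TYPE

    def __init__(self):
        self.n_reads = 0

    def apply(self, reads):
        # 'reads' is the recarray produced by CrunchedAlignments.reads()
        self.n_reads += len(reads)


class _StatsModel(object):
    def __init__(self, aggregators, filter_func=lambda movie: True):
        self.aggregators = aggregators
        self.filter_func = filter_func


#model = _StatsModel([_ReadCountAggregator()])
#analyze_movies(movies, alignment_file_names, [model])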