def generate_region_dfs(parameterObj, entityCollection):
    logging.info("[#] Splitting BED file into chunks for downstream processing (this might take a while) ...")
    df = pd.read_csv(
        parameterObj.bed_file,
        sep="\t",
        usecols=[0, 1, 2, 4],
        names=['sequence_id', 'start', 'end', 'samples'],
        skiprows=1,
        header=None,
        dtype={
            'sequence_id': 'category',
            'start': np.int64,
            'end': np.int64,
            'samples': 'category'
        })
    # filter rows on known sequence_ids, then sort by sequence_id and start
    df = df[df['sequence_id'].isin(entityCollection.sequence_idx_by_sequence_id)] \
        .sort_values(['sequence_id', 'start'], ascending=[True, True])
    df['length'] = df['end'] - df['start']  # compute interval length
    parameterObj.bed_length_total = int(df['length'].sum())
    genome_bases_percentage = format_percentage(
        parameterObj.bed_length_total / entityCollection.count('bases'))
    logging.info("[+] Found %s BED intervals adding up to %s (%s of genome) ..." % (
        format_count(len(df)),
        format_bases(parameterObj.bed_length_total),
        genome_bases_percentage))
    # filter intervals shorter than min_interval_length
    df = df[df['length'] >= parameterObj.min_interval_length]
    # map samples column to frozensets of pair indices
    df['pair_idxs'] = df['samples'].apply(entityCollection.sample_string_to_pair_idxs)
    logging.debug(df)
    df['pair_count'] = df['pair_idxs'].apply(entityCollection.count_pair_idxs)
    # filter intervals with fewer than min_samples**2 pairs, i.e. fewer than
    # min_samples in each of the two populations
    df = df[df['pair_count'] >= parameterObj.min_samples ** 2]
    df = df.dropna()  # drop intervals that do not affect any pair
    # distance to the next interval on the same sequence; a sentinel larger than
    # max_interval_distance marks the last interval of each sequence
    df['distance'] = np.where(
        df['sequence_id'] == df['sequence_id'].shift(-1),
        df['start'].shift(-1) - df['end'],
        parameterObj.max_interval_distance + 1)
    # generate indices for splitting into regions: a new region starts wherever
    # the gap from the previous interval exceeds max_interval_distance
    region_ids = (df['distance'].shift() > float(parameterObj.max_interval_distance)).cumsum()
    region_dfs = []
    for _idx, region_df in df.groupby(region_ids):
        if region_df['length'].sum() > parameterObj.block_length:  # drop regions below block_length
            region_df = region_df.drop(columns=['distance', 'samples', 'pair_count'])
            region_dfs.append(region_df)
    return region_dfs
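# Sketch (not part of the pipeline): a minimal, self-contained illustration of the
# shift/cumsum trick used above to split sorted intervals into regions. The toy
# DataFrame, column values, and the max_interval_distance of 10 are assumptions
# for demonstration only, not the real inputs.
def _sketch_region_splitting():
    import numpy as np
    import pandas as pd
    max_interval_distance = 10
    df = pd.DataFrame({
        'sequence_id': ['chr1', 'chr1', 'chr1', 'chr2'],
        'start': [0, 15, 100, 0],
        'end': [10, 25, 110, 50],
    })
    # distance to the next interval on the same sequence; a sentinel larger than
    # max_interval_distance marks the last interval of each sequence
    df['distance'] = np.where(
        df['sequence_id'] == df['sequence_id'].shift(-1),
        df['start'].shift(-1) - df['end'],
        max_interval_distance + 1)
    # a new region id is minted wherever the gap from the previous interval is too large
    region_ids = (df['distance'].shift() > float(max_interval_distance)).cumsum()
    # -> three regions: [0,10]+[15,25] on chr1 (gap of 5), [100,110] on chr1, [0,50] on chr2
    return [region_df for _, region_df in df.groupby(region_ids)]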
def task_load_blockObjs(parameterObj, entityCollection):
    print("[#] Loading blocks ...")
    entityCollection.load_blocks(parameterObj)
    block_count = format_count(entityCollection.count('blocks'))
    blocked_bases = entityCollection.count('blocks') * parameterObj.block_length
    total_bases_percentage = format_percentage(blocked_bases / entityCollection.count('bases'))
    print("[+] Read %s blocks covering %s (%s of genome) (%.2fMB)" % (
        block_count,
        format_bases(blocked_bases),
        total_bases_percentage,
        memory_usage_psutil()))
def task_generate_entityCollection(parameterObj):
    start = timer()
    print("[#] Building entities based on samples and sequences ...")
    entityCollection = EntityCollection()
    entityCollection.parse_sample_file(parameterObj)
    print("[+] Read %s samples from %s populations and generated %s pairs in %.3fs." % (
        entityCollection.count('samples'),
        entityCollection.count('populations'),
        entityCollection.count('pairs'),
        timer() - start))
    entityCollection.parse_genome_file(parameterObj)
    print("[+] Read %s sequences with a total length of %s b in %.3fs." % (
        entityCollection.count('sequences'),
        format_count(entityCollection.count('bases')),
        timer() - start))
    return entityCollection
def task_make_blocks(parameterObj, region_dfs, entityCollection):
    logging.info("[#] Generating blocks ...")
    make_blocks(parameterObj, region_dfs, entityCollection)
    block_count = format_count(entityCollection.count('blocks'))
    blocked_bases = entityCollection.count('blocks') * parameterObj.block_length
    bed_bases_percentage = format_percentage(blocked_bases / parameterObj.bed_length_total)
    total_bases_percentage = format_percentage(blocked_bases / entityCollection.count('bases'))
    logging.info("[+] Made %s blocks covering %s (%s of BED intervals, %s of genome) (%.2fMB)" % (
        block_count,
        format_bases(blocked_bases),
        bed_bases_percentage,
        total_bases_percentage,
        memory_usage_psutil()))
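# Sketch (not part of the pipeline): the order in which the task functions above
# appear designed to be chained, inferred from their data dependencies
# (generate_region_dfs needs an EntityCollection and sets bed_length_total, which
# task_make_blocks reads). How parameterObj is constructed is assumed to happen
# elsewhere, e.g. from CLI arguments; the name here is a placeholder.
def _sketch_block_pipeline(parameterObj):
    entityCollection = task_generate_entityCollection(parameterObj)
    region_dfs = generate_region_dfs(parameterObj, entityCollection)
    task_make_blocks(parameterObj, region_dfs, entityCollection)
    return entityCollection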
def get_mutuple_counters(self, entityCollection):
    if self.mode == 'variants':
        infile = self.variants_file
    elif self.mode == 'windows':
        infile = self.windows_file
    else:
        raise ValueError("[X] Mode must be 'variants' or 'windows', not %r" % self.mode)
    mutype_hdf5_store = pd.HDFStore(infile)
    mutype_df = pd.read_hdf(mutype_hdf5_store, key='mutypes')
    mutype_hdf5_store.close()
    shape = tuple(self.max_by_mutype[mutype] + 2 for mutype in MUTYPES)
    mutuple_count_matrix = np.zeros(shape, np.float64)
    if self.ancestor_population_id == entityCollection.populationObjs[0].id:
        # mutuples do not have to be flipped
        print("[+] Ancestor is %s ..." % self.ancestor_population_id)
    elif self.ancestor_population_id == entityCollection.populationObjs[1].id:
        mutype_df.rename(columns={'hetA': 'hetB', 'hetB': 'hetA'}, inplace=True)
        print("[+] Ancestor is %s (hetA and hetB will be flipped) ..." % self.ancestor_population_id)
    # the unpacking below has to be changed if the order of MUTYPES changes
    FGV_count = 0
    kmax_binned_count = 0
    total_count = mutype_df['count'].sum()
    for count, hetA, fixed, hetB, hetAB in tqdm(
            mutype_df[['count'] + MUTYPES].values,
            total=len(mutype_df.index), desc="[%] ", ncols=100):
        mutuple = (hetA, fixed, hetB, hetAB)
        if mutuple[1] > 0 and mutuple[3] > 0:  # fixed and hetAB both present: four-gamete violation
            FGV_count += count
        if any(mutype_count > self.kmax for mutype_count in mutuple):
            kmax_binned_count += count
        # clip each mutype count at max_by_mutype[mutype] + 1 (the "binned" cell)
        mutuple_vector = tuple(
            mutype_count if mutype_count <= self.max_by_mutype[mutype] else self.max_by_mutype[mutype] + 1
            for mutype_count, mutype in zip(mutuple, MUTYPES))
        mutuple_count_matrix[mutuple_vector] += count
    print("[=] Total mutuple count = %s" % format_count(total_count))
    print("[=] Counts excluded due to four-gamete-violations = %s (%s)" % (
        format_count(FGV_count), format_percentage(FGV_count / total_count)))
    print("[=] Counts binned due to kmax = %s (%s)" % (
        format_count(kmax_binned_count), format_percentage(kmax_binned_count / total_count)))
    return mutuple_count_matrix
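# Sketch (not part of the pipeline): how a single mutuple is clipped into an index
# of the count matrix, as done in the loop above. The MUTYPES order matches the
# module's, but the max values and the example mutuple are toy assumptions.
def _sketch_mutuple_clipping():
    import numpy as np
    MUTYPES = ['hetA', 'fixed', 'hetB', 'hetAB']
    max_by_mutype = {'hetA': 2, 'fixed': 2, 'hetB': 2, 'hetAB': 2}
    # one axis per mutype: counts 0..max, plus one overflow ("binned") cell
    shape = tuple(max_by_mutype[mutype] + 2 for mutype in MUTYPES)
    matrix = np.zeros(shape, np.float64)
    mutuple = (1, 0, 5, 0)  # hetB=5 exceeds its max of 2 ...
    vector = tuple(
        c if c <= max_by_mutype[m] else max_by_mutype[m] + 1
        for c, m in zip(mutuple, MUTYPES))
    matrix[vector] += 1  # ... so it lands in the overflow cell at index (1, 0, 3, 0)
    return matrix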