Example #1
def generate_region_dfs(parameterObj, entityCollection):
    logging.info("[#] Splitting bed file into chunks for downstream processing (this might take a while) ...")
    df = read_csv(
        parameterObj.bed_file,
        sep="\t",
        usecols=[0, 1, 2, 4],
        names=['sequence_id', 'start', 'end', 'samples'],
        skiprows=1,
        header=None,
        dtype={
            'sequence_id': 'category',
            'start': np.int64,  # np.int is deprecated (removed in NumPy >= 1.24)
            'end': np.int64,
            'samples': 'category'
        })
    # filter rows based on sequence_ids, sort by sequence_id, start
    df = df[df['sequence_id'].isin(entityCollection.sequence_idx_by_sequence_id)]
    df = df.sort_values(['sequence_id', 'start'], ascending=[True, True])
    df['length'] = df['end'] - df['start']  # compute length column
    parameterObj.bed_length_total = int(df['length'].sum())
    genome_bases_percentage = format_percentage(
        parameterObj.bed_length_total / entityCollection.count('bases'))
    logging.info("[+] Found %s BED intervals adding up to %s (%s of genome) ..." % (
        format_count(len(df)),
        format_bases(parameterObj.bed_length_total),
        genome_bases_percentage))
    # filter out intervals shorter than min_interval_length
    df = df[df['length'] >= parameterObj.min_interval_length]
    # df['samples_ids'] = df['samples'].apply(entityCollection.sample_string_to_sample_ids)  # samples to frozenset
    df['pair_idxs'] = df['samples'].apply(
        entityCollection.sample_string_to_pair_idxs)  # samples to frozenset of pair idxs
    logging.debug(df)
    df['pair_count'] = df['pair_idxs'].apply(entityCollection.count_pair_idxs)
    # filter out intervals with fewer than min_samples in both populations
    df = df[df['pair_count'] >= parameterObj.min_samples ** 2]
    df = df.dropna()  # drop intervals that don't affect pairs
    # distance to the next interval (treated as infinite across sequence boundaries)
    df['distance'] = np.where(
        df['sequence_id'] == df['sequence_id'].shift(-1),
        df['start'].shift(-1) - df['end'],
        parameterObj.max_interval_distance + 1)
    # generate indices for splitting into region_dfs
    region_ids = (df['distance'].shift() > float(parameterObj.max_interval_distance)).cumsum()
    region_dfs = []
    for idx, region_df in df.groupby(region_ids):
        if region_df.length.sum() > parameterObj.block_length:  # skip regions below block_length
            # drop helper columns (distance/samples/pair_count)
            region_df = region_df.drop(columns=['distance', 'samples', 'pair_count'])
            region_dfs.append(region_df)
    return region_dfs
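
The splitting itself hinges on one pandas idiom: a boolean mask marking where the previous gap exceeded max_interval_distance, whose cumsum() assigns a region index to every interval. A minimal self-contained sketch of just that technique (column names and the threshold mirror the function above; the toy data is made up):

import numpy as np
import pandas as pd

max_interval_distance = 10  # made-up threshold for this sketch
df = pd.DataFrame({
    'sequence_id': ['chr1', 'chr1', 'chr1', 'chr2'],
    'start': [0, 120, 130, 0],
    'end': [100, 125, 200, 50],
})
# distance to the next interval; "infinite" across sequence boundaries
df['distance'] = np.where(
    df['sequence_id'] == df['sequence_id'].shift(-1),
    df['start'].shift(-1) - df['end'],
    max_interval_distance + 1)
# a new region starts wherever the PREVIOUS gap exceeded the threshold
region_ids = (df['distance'].shift() > float(max_interval_distance)).cumsum()
for idx, region_df in df.groupby(region_ids):
    print("region %s -> %s interval(s)" % (idx, len(region_df)))
# region 0 -> 1 interval(s), region 1 -> 2 interval(s), region 2 -> 1 interval(s)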
Example #2
def task_load_blockObjs(parameterObj, entityCollection):
    print("[#] Loading blocks ...")
    entityCollection.load_blocks(parameterObj)
    block_count = format_count(entityCollection.count('blocks'))
    blocked_bases = entityCollection.count('blocks') * parameterObj.block_length
    total_bases_percentage = format_percentage(
        blocked_bases / entityCollection.count('bases'))
    print("[+] Read %s blocks covering %s (%s of genome) (%.2fMB)" % (
        block_count,
        format_bases(blocked_bases),
        total_bases_percentage,
        memory_usage_psutil()))
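
The format_count, format_percentage, format_bases and memory_usage_psutil helpers are project-internal and never shown in these examples; judging only by how they are called, a plausible stand-in (an assumption, not the project's actual code) could look like this:

import os
import psutil

def format_count(count):
    return "{:,}".format(int(count))  # e.g. 1,234,567

def format_percentage(fraction, precision=2):
    return "{:.{}%}".format(fraction, precision)  # e.g. 12.34%

def format_bases(bases):
    return "{:,} b".format(int(bases))  # e.g. 1,000,000 b

def memory_usage_psutil():
    # resident set size of the current process, in MB
    return psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2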
Example #3
def task_generate_entityCollection(parameterObj):
    start = timer()
    print("[#] Building entities based on samples and sequences ...")
    entityCollection = EntityCollection()
    entityCollection.parse_sample_file(parameterObj)
    print("[+] Read %s samples from %s populations and generated %s pairs in %.3fs." % (
        entityCollection.count('samples'),
        entityCollection.count('populations'),
        entityCollection.count('pairs'),
        timer() - start))
    entityCollection.parse_genome_file(parameterObj)
    print("[+] Read %s sequences with a total length of %s b in %.3fs" % (
        entityCollection.count('sequences'),
        format_count(entityCollection.count('bases')),
        timer() - start))
    return entityCollection
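
timer here is most likely timeit.default_timer (an assumption; the import sits outside the snippet). The timing pattern in isolation:

from timeit import default_timer as timer

start = timer()
total = sum(range(10_000_000))  # stand-in for parse_sample_file / parse_genome_file
print("[+] Done in %.3fs" % (timer() - start))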
Example #4
def task_make_blocks(parameterObj, region_dfs, entityCollection):
    logging.info("[#] Generating blocks ...")
    make_blocks(parameterObj, region_dfs, entityCollection)
    block_count = format_count(entityCollection.count('blocks'))
    blocked_bases = entityCollection.count('blocks') * parameterObj.block_length
    bed_bases_percentage = format_percentage(
        blocked_bases / parameterObj.bed_length_total)
    total_bases_percentage = format_percentage(
        blocked_bases / entityCollection.count('bases'))
    logging.info("[+] Made %s blocks covering %s (%s of BED intervals, %s of genome) (%.2fMB)" % (
        block_count,
        format_bases(blocked_bases),
        bed_bases_percentage,
        total_bases_percentage,
        memory_usage_psutil()))
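
Examples #1 and #4 report progress via logging rather than print, so their messages only appear once the root logger is configured somewhere in the application. A minimal setup (assumed; it is not part of the snippets) would be:

import logging

logging.basicConfig(
    level=logging.INFO,  # use logging.DEBUG to also see the logging.debug(df) output
    format="%(asctime)s %(message)s")
logging.info("[#] Generating blocks ...")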
Example #5
    def get_mutuple_counters(self, entityCollection):
        if self.mode == 'variants':
            infile = self.variants_file
        elif self.mode == 'windows':
            infile = self.windows_file
        else:
            raise ValueError("[X] Unknown mode: %r" % self.mode)  # don't fall through with infile unbound
        mutype_hdf5_store = pd.HDFStore(infile)
        mutype_df = pd.read_hdf(mutype_hdf5_store, key='mutypes')
        mutype_hdf5_store.close()
        # one cell per count 0..max, plus one overflow bin per mutype
        shape = tuple(self.max_by_mutype[mutype] + 2 for mutype in MUTYPES)
        mutuple_count_matrix = np.zeros(shape, np.float64)
        if self.ancestor_population_id == entityCollection.populationObjs[0].id:
            # mutuples do not have to be flipped
            print("[+] Ancestor is %s ..." % self.ancestor_population_id)
        elif self.ancestor_population_id == entityCollection.populationObjs[1].id:
            # rename() maps simultaneously, so this swaps the two columns
            mutype_df.rename(columns={'hetA': 'hetB', 'hetB': 'hetA'}, inplace=True)
            print("[+] Ancestor is %s (hetA and hetB will be flipped) ..." %
                  self.ancestor_population_id)
        # this has to be changed if order of mutypes changes
        FGV_count = 0
        kmax_binned_count = 0
        total_count = mutype_df['count'].sum()
        for count, hetA, fixed, hetB, hetAB in tqdm(
                mutype_df[['count'] + MUTYPES].values,
                total=len(mutype_df.index),
                desc="[%] ",
                ncols=100):
            mutuple = (hetA, fixed, hetB, hetAB)
            if mutuple[1] > 0 and mutuple[3] > 0:  # fixed and hetAB both present => four-gamete violation
                FGV_count += count
            if any(m > self.kmax for m in mutuple):
                kmax_binned_count += count
            # clip each mutype count at max_by_mutype + 1 (the overflow bin)
            mutuple_vector = tuple(
                m if m <= self.max_by_mutype[mutype] else self.max_by_mutype[mutype] + 1
                for m, mutype in zip(mutuple, MUTYPES))
            mutuple_count_matrix[mutuple_vector] += count
        print("[=] Total mutuple count = %s" % format_count(total_count))
        print("[=] Counts excluded due to four-gamete-violations = %s (%s)" %
              (format_count(FGV_count), format_percentage(FGV_count / total_count)))
        print("[=] Counts binned due to kmax = %s (%s)" %
              (format_count(kmax_binned_count),
               format_percentage(kmax_binned_count / total_count)))
        return mutuple_count_matrix
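
The kmax binning is the heart of this method: every mutype count above its configured maximum is collapsed into one overflow bin per dimension, so the count matrix keeps a fixed shape. A self-contained toy version of just that step (mutype names and maxima are invented to match the structure above):

import numpy as np

MUTYPES = ['hetA', 'fixed', 'hetB', 'hetAB']
max_by_mutype = {'hetA': 2, 'fixed': 2, 'hetB': 2, 'hetAB': 2}

# one cell per count 0..max, plus one overflow bin per mutype
shape = tuple(max_by_mutype[m] + 2 for m in MUTYPES)  # (4, 4, 4, 4)
matrix = np.zeros(shape, np.float64)

for count, mutuple in [(5.0, (1, 0, 2, 0)), (3.0, (4, 0, 1, 7))]:
    # clip each entry at max + 1, i.e. the overflow bin
    vector = tuple(min(c, max_by_mutype[m] + 1) for c, m in zip(mutuple, MUTYPES))
    matrix[vector] += count

print(matrix[1, 0, 2, 0])  # 5.0
print(matrix[3, 0, 1, 3])  # 3.0 -> hetA=4 and hetAB=7 both landed in overflow bins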