Example #1
 def compute_deflation_ratios(self, sumstats_prefix, sumstats_prefix_chr, pve, N, mean_l2, M_base):
             
     #read diagGRM files
     df_diagGRM = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'diagGRM.gz', 'diagGRM', 'diagGRM', join_axis=0, index_col=['fid', 'iid'], use_tqdm=False)
     df_diagGRM = df_diagGRM.groupby(['fid', 'iid']).sum()                
             
     #compute trace ratio
     deflate_columns   = df_diagGRM.columns.str.startswith('diag_G_deflate')
     nodeflate_columns = df_diagGRM.columns.str.startswith('diag_G_nodeflate')
     sum_diag_deflate = df_diagGRM.loc[:,deflate_columns].sum(axis=0).values
     sum_diag_nodeflate = df_diagGRM.loc[:,nodeflate_columns].sum(axis=0).values
     trace_ratios = sum_diag_nodeflate / sum_diag_deflate
     
     #compute deflation ratios        
     if (len(pve) == 0):
         if np.isclose(trace_ratios[0], 1):
             deflation_ratio = 1.0
         else:
             raise ValueError('trace deflation reported, but no pve values found!')
     else:
         var_diag_deflate = df_diagGRM.loc[:,deflate_columns].iloc[:,0].var(ddof=0) / M_base**2
         var_diag_nodeflate = df_diagGRM.loc[:,nodeflate_columns].iloc[:,0].var(ddof=0) / M_base**2
         trace_G = sum_diag_nodeflate[0] / M_base
         deflation_ratio = self.compute_PCS_deflation_ratio(pve, N, M_base, mean_l2,
                                                        trace_G,
                                                        var_diag_nodeflate,
                                                        var_diag_deflate)
     return trace_ratios, deflation_ratio
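The trace-ratio step above is a groupby-sum over the (fid, iid) index followed by a column-wise ratio. A minimal, self-contained sketch with toy values (the column names and index only mimic the real diagGRM files):

    import pandas as pd

    # toy diagGRM rows; the duplicated (fid, iid) pair mimics per-chromosome files
    idx = pd.MultiIndex.from_tuples([('f1', 'i1'), ('f1', 'i1'), ('f2', 'i2')],
                                    names=['fid', 'iid'])
    df_diagGRM = pd.DataFrame({'diag_G_deflate': [0.9, 1.0, 1.1],
                               'diag_G_nodeflate': [1.0, 1.1, 1.2]}, index=idx)
    df_diagGRM = df_diagGRM.groupby(['fid', 'iid']).sum()

    deflate_cols = df_diagGRM.columns.str.startswith('diag_G_deflate')
    nodeflate_cols = df_diagGRM.columns.str.startswith('diag_G_nodeflate')
    trace_ratios = (df_diagGRM.loc[:, nodeflate_cols].sum(axis=0).values
                    / df_diagGRM.loc[:, deflate_cols].sum(axis=0).values)
    print(trace_ratios)  # [1.1]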
Example #2
    def read_Gty_files(self, args, sumstats_prefix, sumstats_prefix_chr, category_names, N, mean_Q):
    
        #read otherstats files
        df_otherstats_list = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'otherstats', 'otherstats', 'otherstats', join_axis=None, use_tqdm=False, allow_duplicates=True)

        #sum M2 (sum of squares of annotations) of each annotation
        M_annot_sumstats2 = np.zeros(len(category_names))
        for df_otherstats_chr in df_otherstats_list:            
            for c_i, c in enumerate(category_names):
                if len(category_names)==1:
                    df_M_annot_sumstats2 = df_otherstats_chr.loc[df_otherstats_chr['Property'].str.startswith('M2_'), 'Value']
                else:
                    df_M_annot_sumstats2 = df_otherstats_chr.query('Property == "M2_%s"'%(c))['Value']
                if df_M_annot_sumstats2.shape[0] == 0:
                    raise ValueError('M2_%s not found in otherstats file'%(c))
                if df_M_annot_sumstats2.shape[0] > 1:
                    raise ValueError('Multiple M2_%s values found in otherstats file'%(c))
                M_annot_sumstats2[c_i] += df_M_annot_sumstats2.values[0]

                
        #load PCGC Gty files and aggregate them (across chromosomes)
        if args.no_Gty:                    
            df_Gty = self.create_synthetic_Gty(N, mean_Q, category_names)
        else:
            df_Gty = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'Gty.gz', 'Gty', 'Gty', join_axis=0, index_col=['fid', 'iid'], use_tqdm=False, allow_duplicates=True)
            df_Gty = np.sqrt((df_Gty**2).groupby(['fid', 'iid']).sum())
            
            #synchronize df_Gty columns to match category_names
            if len(category_names) > 1:
                columns_intersect = df_Gty.columns.intersection(category_names)
                if len(columns_intersect) < len(category_names):
                    raise ValueError('Gty files and prodr2 files must have the same annotations')
                if len(columns_intersect) < df_Gty.shape[1]:
                    logging.warning('Gty file has unused annotations')
                    df_Gty = df_Gty.loc[:, category_names]
                if not np.all(df_Gty.columns == category_names):
                    df_Gty = df_Gty.loc[:, category_names]
                
            #normalize Gty by the total number of SNPs in the genome
            for anno_i in range(df_Gty.shape[1]):
                df_Gty.iloc[:,anno_i] /= np.sqrt(M_annot_sumstats2[anno_i])                
              
        M_base = M_annot_sumstats2[0]
        return df_Gty, M_base
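Per-chromosome Gty entries are combined as the square root of the sum of squares, not a plain sum. A minimal sketch of just that aggregation (toy data, illustrative column name):

    import numpy as np
    import pandas as pd

    # two per-chromosome Gty rows for the same (fid, iid) individual
    idx = pd.MultiIndex.from_tuples([('f1', 'i1'), ('f1', 'i1')],
                                    names=['fid', 'iid'])
    df_Gty = pd.DataFrame({'base': [3.0, 4.0]}, index=idx)

    # aggregate across chromosomes: sqrt(3^2 + 4^2) == 5
    df_Gty = np.sqrt((df_Gty**2).groupby(['fid', 'iid']).sum())
    print(df_Gty.loc[('f1', 'i1'), 'base'])  # 5.0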
Example #3
    def read_sumstats(self, args, sumstats_prefix, sumstats_prefix_chr):

        #load summary statistics
        if args.he:        
            try:
                df_sumstats = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, '', 'sumstats', 'sumstats', index_col='SNP', allow_duplicates=True)
            except IOError:            
                df_sumstats = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'sumstats.gz', 'sumstats', 'sumstats', index_col='SNP', allow_duplicates=True)
        else:
            df_sumstats = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'sumstats.gz', 'sumstats', 'sumstats', index_col='SNP', allow_duplicates=True)
        
    #transform Z column if it wasn't created specifically for PCGC
        if args.he:
            if 'pcgc_sumstat' not in df_sumstats.columns:
                if 'Z' not in df_sumstats.columns:
                    raise ValueError('cannot find a Z column in summary statistics file')
                else:
                    df_sumstats['pcgc_sumstat'] = df_sumstats['Z'] * np.sqrt(df_sumstats['N'])
        
    #if HE regression is used, create default naive values for PCGC-relevant fields
        if args.he:
            var_t = 0
            mean_Q = 1
            pve = np.array([])
            N = df_sumstats['N'].mean()
        #load PCGC other stats files
        else:            
            df_otherstats_list = pcgc_utils.load_dfs(sumstats_prefix, sumstats_prefix_chr, 'otherstats', 'otherstats', 'otherstats', join_axis=None, use_tqdm=False, allow_duplicates=True)
            df_otherstats = df_otherstats_list[0]
            var_t = df_otherstats.query('Property == "var_t"')['Value'].values[0]
            pve = df_otherstats.loc[df_otherstats['Property'].str.startswith('pve'), 'Value'].values
            mean_Q = df_otherstats.query('Property == "mean_Q"')['Value'].values[0]
            N = df_otherstats.query('Property == "N"')['Value'].values[0]
        
        #filter out SNPs with very large summary statistics
        if args.chisq_max is None:
            chisq_max = max(0.001 * df_sumstats['N'].max(), 80)
        else:
            chisq_max = args.chisq_max
        df_sumstats['Z2'] = df_sumstats['pcgc_sumstat']**2 / df_sumstats['N'] / mean_Q
        is_large_z = (df_sumstats['Z2'] > chisq_max)
        if is_large_z.any():
            logging.warning('Removing %d summary statistics with Z^2 > %0.1f'%(is_large_z.sum(), chisq_max))
            df_sumstats = df_sumstats.loc[~is_large_z]
            
        return df_sumstats, var_t, pve, mean_Q, N
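The outlier filter keeps a SNP only if its chi-squared statistic is below max(0.001*N, 80), the same default LDSC uses. A minimal sketch, assuming pcgc_sumstat = Z*sqrt(N) as above (toy values):

    import pandas as pd

    df_sumstats = pd.DataFrame({'N': [1000.0, 1000.0],
                                'pcgc_sumstat': [60.0, 400.0]})
    mean_Q = 1.0  # the HE default used above

    chisq_max = max(0.001 * df_sumstats['N'].max(), 80)  # == 80 here
    df_sumstats['Z2'] = df_sumstats['pcgc_sumstat']**2 / df_sumstats['N'] / mean_Q
    df_sumstats = df_sumstats.loc[df_sumstats['Z2'] <= chisq_max]
    # the second SNP (Z2 == 160) is removed; the first (Z2 == 3.6) survives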
Example #4
    def load_annotations(self, anno, anno_chr, sync_prefix):

        #extract SNP names
        if self.genetic_format == 'plink':
            index_snpnames = self.bfile['df_bim'].index
        elif self.genetic_format == 'bgen':
            raise NotImplementedError()
        else:
            raise ValueError('illegal option')

        #load annotations (or create baseline annotation only)
        if (anno is None) and (anno_chr is None):
            df_annotations = pd.DataFrame(np.ones(len(index_snpnames)),
                                          index=index_snpnames,
                                          columns=['base'])
        else:
            df_annotations = pcgc_utils.load_dfs(anno,
                                                 anno_chr,
                                                 'annot.gz',
                                                 'annot',
                                                 'annot',
                                                 index_col='SNP')
            df_annotations.drop(columns=['CHR', 'CM', 'BP'], inplace=True)
            if not np.allclose(df_annotations.iloc[:, 0], 1):
                raise ValueError(
                    'The first annotation must be the base annotation (all SNPs must have value 1.0)'
                )

            #apply min_annot correction to ensure no negative values
            category_names = df_annotations.columns
            if sync_prefix is None:
                raise ValueError(
                    '--annot and --annot-chr must be used together with --sync'
                )
            df_sync = pd.read_table(sync_prefix + 'sync', index_col='Category')
            if df_sync.shape[0] != len(category_names) or not np.all(
                    df_sync.index == category_names):
                raise ValueError(
                    'Annotations in sync file do not match those in annotations/prodr2 files'
                )
            min_annot = df_sync['min_annot'].values
            df_annotations -= min_annot

        #remove annotations for unused SNPs
        is_same = (df_annotations.shape[0] == len(index_snpnames)) and \
                  (df_annotations.index == index_snpnames).all()
        if not is_same:
            has_anno = (index_snpnames.isin(df_annotations.index)).all()
            if not has_anno:
                raise ValueError('not all SNPs have annotations')
            df_annotations = df_annotations.loc[index_snpnames]

        #save the df in a class member
        self.df_annotations_noneg = df_annotations
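The min_annot correction shifts each annotation so no value is negative (continuous annotations can dip below zero; binary ones are untouched). A minimal sketch of the shift and the reindexing to the genotype file's SNP order (toy names):

    import numpy as np
    import pandas as pd

    df_annot = pd.DataFrame({'base': [1.0, 1.0, 1.0],
                             'anno1': [-0.5, 0.0, 2.0]},
                            index=pd.Index(['rs1', 'rs2', 'rs3'], name='SNP'))

    # per-annotation minimum, clipped at zero (as stored in the sync file)
    min_annot = np.minimum(df_annot.min(axis=0).values, 0)
    df_annot -= min_annot  # 'anno1' now spans [0.0, 2.5]; 'base' is unchanged

    # reorder rows to match the genotype file
    index_snpnames = pd.Index(['rs3', 'rs1', 'rs2'], name='SNP')
    df_annot = df_annot.loc[index_snpnames]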
Example #5
 def load_all_snp_indices(self, args):
     index_list = []
 
     if args.annot is not None or args.annot_chr is not None:
         df_annot_index = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'annot.gz', 'annot', 'annot', index_col='SNP', usecols=['SNP'], allow_duplicates=True)
         index_list.append(df_annot_index.index)
     if args.fit_intercept:
         df_l2_index = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'l2.ldscore.gz', 'l2.ldscore', 'annot', index_col='SNP', usecols=['SNP'], allow_duplicates=True)
         index_list.append(df_l2_index.index)
     if args.w_ld is not None or args.w_ld_chr is not None:
         df_w_ld_index = pcgc_utils.load_dfs(args.w_ld, args.w_ld_chr, 'l2.ldscore.gz', 'l2.ldscore', 'w-ld', index_col='SNP', usecols=['SNP'], allow_duplicates=True)
         index_list.append(df_w_ld_index.index)
         
     if len(index_list) == 0:
         index_intersect = None
     elif len(index_list) == 1:
         index_intersect = index_list[0]
     else:
         index_intersect = reduce(lambda i1,i2: i1.intersection(i2), index_list)
         
     if index_intersect is not None:
         assert not index_intersect.duplicated().any()
     return index_intersect
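The pairwise intersection folds a list of pandas Index objects into one; note that reduce lives in functools on Python 3. A minimal sketch:

    from functools import reduce
    import pandas as pd

    index_list = [pd.Index(['rs1', 'rs2', 'rs3']),
                  pd.Index(['rs2', 'rs3', 'rs4']),
                  pd.Index(['rs3', 'rs2'])]
    index_intersect = reduce(lambda i1, i2: i1.intersection(i2), index_list)
    print(sorted(index_intersect))  # ['rs2', 'rs3']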
Example #6
    def load_prodr2(self, args):
        df_prodr2 = pcgc_utils.load_dfs(args.prodr2, args.prodr2_chr, 'prodr2', 'prodr2', 'prodr2', use_tqdm=False)
        df_prodr2.index.name = 'Category'        
        df_prodr2 = df_prodr2.groupby(by=['Category']).sum()
        assert df_prodr2.shape[0] == len(df_prodr2.columns.intersection(df_prodr2.index))
        df_prodr2 = df_prodr2.loc[df_prodr2.columns, df_prodr2.columns]
        assert (df_prodr2.columns == df_prodr2.index).all()

        if args.annot is None and args.annot_chr is None:
            if df_prodr2.shape[1] > 1:
                logging.warning('Using only the first annotation in the prodr2 file!')
                df_prodr2 = df_prodr2.iloc[:1,:1]
        
        return df_prodr2
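groupby('Category').sum() collapses the per-chromosome rows, but it also sorts the index alphabetically, which is why the rows are realigned to the column order afterwards. A minimal sketch with a toy 2x2 table:

    import pandas as pd

    # per-chromosome prodr2 rows; each chromosome contributes one row per annotation
    df_prodr2 = pd.DataFrame([[1.0, 0.5], [0.5, 2.0], [1.0, 0.5], [0.5, 2.0]],
                             index=['base', 'anno1', 'base', 'anno1'],
                             columns=['base', 'anno1'])
    df_prodr2.index.name = 'Category'
    df_prodr2 = df_prodr2.groupby(by=['Category']).sum()  # index is now sorted
    df_prodr2 = df_prodr2.loc[df_prodr2.columns, df_prodr2.columns]  # realign rows
    assert (df_prodr2.columns == df_prodr2.index).all()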
Example #7
def compute_r2_prod(args):

    # read bim/snp
    array_snps = ps.PlinkBIMFile(args.bfile + '.bim')
    snpnames = array_snps.df['SNP']

    #read annotations file
    if args.annot is None and args.annot_chr is None:
        df_annotations = pd.DataFrame(np.ones(snpnames.shape[0]),
                                      index=snpnames,
                                      columns=['base'])
    else:
        df_annotations = pcgc_utils.load_dfs(args.annot,
                                             args.annot_chr,
                                             'annot.gz',
                                             'annot',
                                             'annot',
                                             join_axis=0,
                                             index_col='SNP')
        df_annotations.drop(columns=['CHR', 'CM', 'BP'], inplace=True)

        #apply min_annot correction to ensure no negative values
        category_names = df_annotations.columns
        if args.sync is None:
            raise ValueError('--annot must be used together with --sync')
        df_sync = pd.read_table(args.sync + 'sync', index_col='Category')
        if df_sync.shape[0] != len(category_names) or not np.all(
                df_sync.index == category_names):
            raise ValueError(
                'Annotations in sync file do not match those in annotations/prodr2 files'
            )
        min_annot = df_sync['min_annot'].values
        df_annotations -= min_annot

    #mark which SNPs to keep
    is_good_snp = np.ones(len(snpnames), dtype=bool)  #np.bool was removed from NumPy
    if args.exclude is not None:
        df_exclude = pd.read_table(args.exclude, header=None).squeeze('columns')
        is_good_snp = is_good_snp & (~snpnames.isin(df_exclude))
        logging.info('Excluding %d SNPs' %
                     (np.sum(~snpnames.isin(df_exclude))))
    if args.extract is not None:
        df_extract = pd.read_table(args.extract, header=None).squeeze('columns')
        is_good_snp = is_good_snp & (snpnames.isin(df_extract))
        logging.info('Extracting %d SNPs' %
                     (np.sum(snpnames.isin(df_extract))))

    if args.ld_all:
        keep_snps = None
        is_r2_snp = is_good_snp
    else:
        keep_snps = np.where(is_good_snp)[0]
        is_r2_snp = np.ones(len(keep_snps), dtype=bool)
        snpnames = snpnames.iloc[keep_snps]

    #keep only annotations of SNPs in plink file
    if not snpnames.isin(df_annotations.index).all():
        raise ValueError('not all SNPs have annotations')
    df_annotations = df_annotations.loc[snpnames]

    logging.info('Computing r^2 products for %d SNPs' % (len(snpnames)))

    #find #individuals in bfile
    df_fam = pd.read_table(args.bfile + '.fam', header=None)
    n = df_fam.shape[0]

    #read plink file
    keep_indivs = None
    mafMin = None
    from importlib import reload  #'reload' is a builtin only on Python 2
    reload(ldscore_r2)
    logging.info('Loading SNP file...')
    geno_array = ldscore_r2.PlinkBEDFile(args.bfile + '.bed',
                                         n,
                                         array_snps,
                                         is_r2_snp,
                                         keep_snps=keep_snps,
                                         keep_indivs=keep_indivs,
                                         mafMin=mafMin)

    #compute r2_prod_table
    logging.info('Computing r2 prod...')
    coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
    block_left = ldscore_r2.getBlockLefts(coords, args.ld_wind_cm)
    if block_left[-1] == 0:
        error_msg = 'Only a single block selected - this is probably a mistake'
        raise ValueError(error_msg)
    t0 = time.time()
    geno_array._currentSNP = 0
    r2prod_table = geno_array.ldScoreVarBlocks(block_left,
                                               args.chunk_size,
                                               annot=df_annotations.values)

    df_r2prod_table = pd.DataFrame(r2prod_table,
                                   index=df_annotations.columns,
                                   columns=df_annotations.columns)
    return df_r2prod_table
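The extract/exclude logic is plain boolean masking over the SNP names. A minimal sketch (toy names; dtype=bool is the portable spelling now that np.bool is gone):

    import numpy as np
    import pandas as pd

    snpnames = pd.Series(['rs1', 'rs2', 'rs3', 'rs4'])
    df_exclude = pd.Series(['rs2'])
    df_extract = pd.Series(['rs1', 'rs2', 'rs3'])

    is_good_snp = np.ones(len(snpnames), dtype=bool)
    is_good_snp &= ~snpnames.isin(df_exclude).values
    is_good_snp &= snpnames.isin(df_extract).values
    keep_snps = np.where(is_good_snp)[0]
    print(snpnames.iloc[keep_snps].tolist())  # ['rs1', 'rs3']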
Example #8
    def __init__(self, args):

        #read phenotypes file
        df_pheno = self.read_pheno_file(args)

        #read covariates
        if args.covar is not None:
            df_covar = pd.read_table(args.covar, delim_whitespace=True)
            df_covar = self.add_fid_iid_index(df_covar)

            #merge individuals across phenotypes and covariates
            index_intersect = df_pheno.index.intersection(df_covar.index)
            if len(index_intersect) < len(df_pheno.index):
                if len(index_intersect) == 0:
                    raise ValueError(
                        'no individuals have both phenotype and covariates data'
                    )
                df_pheno = df_pheno.loc[index_intersect]
                df_covar = df_covar.loc[index_intersect]
                logging.info(
                    '%d individuals have both phenotypes and covariates data' %
                    (df_covar.shape[0]))

        #read mafs file if it exists
        if args.frqfile is not None or args.frqfile_chr is not None:
            df_maf = pcgc_utils.load_dfs(args.frqfile,
                                         args.frqfile_chr,
                                         'frq',
                                         'frq',
                                         'frqfile',
                                         index_col='SNP')
        else:
            df_maf = None
            logging.warning(
                'MAF file not provided! We will use the in-sample MAF estimates (this is highly unrecommended!)'
            )

        #read bfile or bgen file
        if args.bfile is not None:
            assert args.bgen is None, '--bfile and --bgen cannot both be specified'
            self.bfile, df_pheno, df_maf, self.num_snps, self.sample_size = self.read_plink(
                args, df_pheno, df_maf)
            self.genetic_format = 'plink'
        elif args.bgen is not None:
            assert args.bfile is None, '--bfile and --bgen cannot both be specified'
            self.genetic_format = 'bgen'
            raise NotImplementedError('bgen functionality not yet implemented')
        else:
            raise ValueError('either --bfile or --bgen must be specified')

        #save MAFs
        if df_maf is not None:
            maf_col = self.find_df_column(df_maf,
                                          ['MAF', 'FRQ', 'FREQ', 'A1Freq'],
                                          'MAF')
            self.mafs = df_maf[maf_col]
        else:
            self.mafs = None

        #Extract relevant covariates and compute the Cholesky factorization of the hat matrix
        if args.covar is None:
            C = None
            self.C_regress = None
            self.L_CTC = None
            covars_regress_cols = []
        else:
            #reorder individuals to make sure that the covariates match the other files
            is_same = (df_covar.shape[0]
                       == df_pheno.shape[0]) and (df_covar.index
                                                  == df_pheno.index).all()
            if not is_same:
                df_covar = df_covar.loc[df_pheno.index]
            C = df_covar.iloc[:, 2:]

            #extract relevant covariates
            if args.covars_regress is None:
                self.C_regress = None
                self.L_CTC = None
                covars_regress_cols = []
            else:
                assert args.covar is not None
                covars_regress_cols = args.covars_regress.split(',')
                for c in covars_regress_cols:
                    if c not in df_covar.columns:
                        raise ValueError('%s is not in the covariates file' %
                                         (c))
                self.C_regress = df_covar[covars_regress_cols].values

                #compute the Cholesky factorization of the hat matrix
                self.L_CTC = la.cho_factor(self.C_regress.T.dot(
                    self.C_regress))

        #load pve file if it exists
        if args.pve is None:
            if args.covars_regress is not None:
                raise ValueError('--covars_regress must be used with --pve')
            self.pve = []
        else:
            df_pve = pd.read_table(args.pve, header=None)
            if df_pve.shape[1] > 1:
                raise ValueError('pve file must include only a single column')
            if df_pve.shape[0] < len(covars_regress_cols):
                raise ValueError(
                    'There are fewer pve values than covariates to regress')
            df_pve.columns = ['pve']
            if df_pve.shape[0] > len(covars_regress_cols):
                logging.warning(
                    'There are more pve values than covariates to regress. Using only top %d pve values'
                    % (len(covars_regress_cols)))
                df_pve = df_pve.sort_values('pve', ascending=False).head(
                    len(covars_regress_cols))
            self.pve = df_pve.iloc[:, 0].values

        #transform phenotypes to 0/1
        y = df_pheno.iloc[:, -1]
        num_pheno = len(np.unique(y))
        if num_pheno == 1:
            raise ValueError('only one phenotype value found!')
        elif num_pheno > 2:
            raise ValueError(
                'phenotype file must include only cases and controls')
        y = (y > y.mean()).astype(int).values  #np.int was removed from NumPy; plain int is equivalent

        #compute PCGC statistics
        P = y.mean()
        u0, u1, ty, self.var_t = self.compute_pcgc_stats(args.prev, P, y, C)
        y_norm = (y - P) / np.sqrt(P * (1 - P))
        self.z_coeff = y_norm * (u0 + u1)
        self.mean_Q = np.mean((u0 + u1)**2)
        self.prev = args.prev

        #load annotations
        self.load_annotations(args.annot, args.annot_chr, args.sync)
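The phenotype step at the end is the usual PCGC standardization of a 0/1 trait: subtract the in-sample case proportion P and divide by sqrt(P*(1-P)). A minimal sketch of just that step (toy labels):

    import numpy as np

    y = np.array([0, 0, 0, 1])   # toy case/control labels
    P = y.mean()                 # in-sample case proportion (0.25)
    y_norm = (y - P) / np.sqrt(P * (1 - P))
    print(y_norm.mean(), y_norm.var())  # mean ~0, variance ~1

The z_coeff used above additionally multiplies y_norm by (u0 + u1), which comes from the prevalence-based computations in compute_pcgc_stats.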
Example #9
def pcgc_sync2(args):
    
    #define the list of annotation file names
    if args.annot is None and args.annot_chr is None:
        raise ValueError('you must use either --annot or --annot-chr')
    if args.annot is not None and args.annot_chr is not None:
        raise ValueError('you cannot use both --annot and --annot-chr')
    if args.annot is not None:
        annot_fname_list = [args.annot+'annot.gz']
    else:
        annot_fname_list = [args.annot_chr+'%d.annot.gz'%(chr_num) for chr_num in range(1,23)]
    
    #round 1: Collect min_annot fields
    logging.info('Computing annotation minimum values')
    min_annot_list = []
    for annot_fname in tqdm(annot_fname_list, disable=len(annot_fname_list)==1):
        min_annot_list.append(collect_annotations_chr_info(annot_fname).values[:,0])
        
    #compute min_annot across all annotations
    min_annot = np.min(np.array(min_annot_list), axis=0)
    min_annot[min_annot>0] = 0
    
    #collect MAFs
    df_frq = pcgc_utils.load_dfs(args.frqfile, args.frqfile_chr, 'frq', 'frq', 'frqfile', join_axis=0, index_col='SNP', usecols=['SNP', 'MAF'])
    df_frq = df_frq['MAF']
    
    #round 2: collect all fields
    overlap_matrix = 0
    overlap_matrix_common = 0
    df_sync_list = []
    logging.info('Collecting annotation details')
    for annot_fname in tqdm(annot_fname_list, disable=len(annot_fname_list)==1):
        df_sync, df_overlap, df_overlap_common = collect_annotations_chr_info(annot_fname, df_frq, min_annot)
        df_sync_list.append(df_sync)
        overlap_matrix += df_overlap.values
        overlap_matrix_common += df_overlap_common.values
        
    #create df_overlap and df_overlap_common (index/columns reused from the last per-chromosome dfs)
    df_overlap = pd.DataFrame(overlap_matrix, index=df_overlap.index, columns=df_overlap.columns)
    df_overlap_common = pd.DataFrame(overlap_matrix_common, index=df_overlap_common.index, columns=df_overlap_common.columns)
        
    #Group df_sync results
    func_dict = {}
    func_dict['min_annot'] = np.min
    func_dict['is_continuous'] = np.any
    func_dict['M'] = np.sum
    func_dict['M2'] = np.sum
    func_dict['M_noneg'] = np.sum
    func_dict['M2_noneg'] = np.sum
    func_dict['M_5_50'] = np.sum
    func_dict['M2_5_50'] = np.sum
    func_dict['M_5_50_noneg'] = np.sum
    func_dict['M2_5_50_noneg'] = np.sum    
    assert np.all([c in func_dict for c in df_sync.columns])    
    df_sync_concat = pd.concat(df_sync_list, axis=0)
    df_sync = df_sync_concat.groupby(df_sync_concat.index).agg(func_dict)
    df_sync = df_sync.loc[df_sync_list[0].index]

    #add df index names
    df_sync.index.name = 'Category'
    df_overlap.index.name = 'Category'
    df_overlap_common.index.name = 'Category'
    
    return df_sync, df_overlap, df_overlap_common
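The final aggregation passes a per-column function dictionary to groupby().agg(), so each sync field is combined with its own rule (minimum for min_annot, logical-or for is_continuous, sums for the M* counts). A minimal sketch:

    import numpy as np
    import pandas as pd

    # two per-chromosome sync rows for the same annotation
    df_sync_concat = pd.DataFrame({'min_annot': [0.0, -0.5],
                                   'is_continuous': [False, True],
                                   'M': [100, 200]},
                                  index=['anno1', 'anno1'])

    func_dict = {'min_annot': np.min, 'is_continuous': np.any, 'M': np.sum}
    df_sync = df_sync_concat.groupby(df_sync_concat.index).agg(func_dict)
    print(df_sync)  # min_annot == -0.5, is_continuous == True, M == 300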
Example #10
 def load_annotations_data(self, args, df_prodr2, index_intersect):
 
     if args.annot is None and args.annot_chr is None:
         #create relevant data for a single annotation
         
         if args.M is None:
             raise ValueError('--M must be used when not using --annot or --annot-chr')
         if args.not_M_5_50 is not None:
             raise ValueError('--not-M-5-50 cannot be used without using --annot or --annot-chr')
         if args.fit_intercept:
             raise ValueError('--fit-intercept cannot be used without using --annot or --annot-chr')
         M_annot = np.ones(1) * args.M
         df_annotations_sumstats_noneg = pd.DataFrame(np.ones(len(index_intersect)), index=index_intersect, columns=['base'])
         df_sync = pd.DataFrame(index=['base'])
         df_sync['min_annot'] = 0
         df_sync['M2_5_50'] = args.M
         df_sync['is_continuous'] = False
         df_overlap = pd.DataFrame(index=['base'])
         df_overlap['base'] = args.M
         df_l2 = None
         df_w_ld = None
         
     else:
     
         #load M_annot
         if args.not_M_5_50:
             M_suffix = 'l2.M'
         else:
             M_suffix = 'l2.M_5_50'
         df_M_annot = pcgc_utils.load_dfs(args.annot, args.annot_chr, M_suffix, 'M', 'annot', header=None,use_tqdm=False)
         M_annot = df_M_annot.sum(axis=0).values
         if M_annot.shape[0] != df_prodr2.shape[1]:
             raise ValueError('.M files have a different number of columns than .prodr2 files')            
     
         #read df_sync and overlap matrix
         if args.sync is None:
             raise ValueError('--sync not provided')
         df_sync = pd.read_table(args.sync+'sync', index_col='Category')
         overlap_suffix = 'overlap' if args.not_M_5_50 else 'overlap_5_50'
         df_overlap = pd.read_table(args.sync+overlap_suffix, sep='\s+', index_col='Category')
         if df_sync.shape[0] != df_prodr2.shape[1] or not np.all(df_sync.index == df_prodr2.columns):
             raise ValueError('sync and prodr2 files must have the same annotations')
         if df_overlap.shape[0] != df_prodr2.shape[1] or not np.all(df_overlap.index == df_prodr2.columns):
             raise ValueError('overlap_matrix and prodr2 files must have the same annotations')
         
         #read SNP data files
         df_annotations_sumstats_noneg = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'annot.gz', 'annot', 'annot', index_col='SNP', index_intersect=index_intersect, use_tqdm=True)
         df_annotations_sumstats_noneg.drop(columns=['CHR', 'CM', 'BP'], inplace=True)
         if df_annotations_sumstats_noneg.shape[1] != df_prodr2.shape[1] or not np.all(df_annotations_sumstats_noneg.columns == df_prodr2.columns):
             raise ValueError('annotation and prodr2 files must have the same annotations')
         df_annotations_sumstats_noneg -= df_sync['min_annot'].values
         df_list = [df_annotations_sumstats_noneg]
         if args.fit_intercept:
             df_l2 = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'l2.ldscore.gz', 'l2.ldscore', 'annot', index_col='SNP', usecols=['SNP', 'baseL2'], index_intersect=index_intersect)
             df_w_ld = pcgc_utils.load_dfs(args.w_ld, args.w_ld_chr, 'l2.ldscore.gz', 'l2.ldscore', 'w-ld', index_col='SNP', usecols=['SNP', 'L2'], index_intersect=index_intersect)
             df_l2 = df_l2['baseL2']
             df_w_ld = df_w_ld['L2']
             df_list += [df_l2, df_w_ld]
         else:
             df_l2, df_w_ld = None, None
             
         #make sure that all the dfs are perfectly aligned with index_intersect
         for df in df_list:
             assert not df.index.duplicated().any()
             index_intersect_df = index_intersect.intersection(df.index)
             if len(index_intersect_df) < len(index_intersect):
                 raise ValueError("not all SNPs found in the annotation or LD score files - this shouldn't happen")
         #rebinding the loop variable would not modify the underlying dfs, so reindex each by name
         df_annotations_sumstats_noneg = df_annotations_sumstats_noneg.loc[index_intersect]
         if df_l2 is not None:
             df_l2 = df_l2.loc[index_intersect]
             df_w_ld = df_w_ld.loc[index_intersect]
                 
     #restrict annotations if requested
     if args.keep_anno is not None or args.remove_anno is not None:
         #Find the set of annotations to select
         category_names = df_prodr2.columns
         remove_anno = set([] if (args.remove_anno is None) else args.remove_anno.split(','))
         keep_anno = set(category_names if (args.keep_anno is None) else args.keep_anno.split(','))
         if len(keep_anno.intersection(set(category_names))) < len(keep_anno):
             raise ValueError('--keep-anno includes non-existing annotations')
         if len(remove_anno.intersection(set(category_names))) < len(remove_anno):
             raise ValueError('--remove-anno includes non-existing annotations')
         
         #keep only the selected annotations
         anno_arr = [c for c in category_names if ((c in keep_anno) and (c not in remove_anno))]
         if len(anno_arr) < len(category_names):
             selected_anno = np.isin(category_names, anno_arr)
             M_annot = M_annot[selected_anno]
             df_prodr2 = df_prodr2.loc[anno_arr, anno_arr]
             df_annotations_sumstats_noneg = df_annotations_sumstats_noneg[anno_arr]
             df_sync = df_sync.loc[anno_arr]
             df_overlap = df_overlap.loc[anno_arr, anno_arr]
             logging.info('%d annotations remained after applying --keep-anno and --remove-anno'%(df_prodr2.shape[1]))
                         
     return M_annot, df_prodr2, df_annotations_sumstats_noneg, df_sync, df_overlap, df_l2, df_w_ld
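The keep/remove filtering reduces to set membership over the annotation names, with np.isin producing the matching boolean mask for the numpy-side arrays. A minimal sketch (toy names):

    import numpy as np

    category_names = ['base', 'anno1', 'anno2']
    keep_anno = set(category_names)   # --keep-anno not given: keep everything
    remove_anno = {'anno2'}           # e.g. --remove-anno anno2

    anno_arr = [c for c in category_names
                if (c in keep_anno) and (c not in remove_anno)]
    selected_anno = np.isin(category_names, anno_arr)
    print(anno_arr)        # ['base', 'anno1']
    print(selected_anno)   # [ True  True False]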