def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
    '''Read files split across 22 chromosomes (annot, ref_ld, w_ld).

    Exactly one of chr_arg / not_chr_arg is expected to be truthy:
    not_chr_arg names a whole-genome file set, chr_arg a per-chromosome
    prefix that parsefunc expands over _N_CHR chromosomes.

    Returns whatever parsefunc returns; re-raises ValueError from parsing
    after logging a short error line.
    '''
    try:
        if not_chr_arg:
            log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
            out = parsefunc(_splitp(not_chr_arg), **kwargs)
        elif chr_arg:
            # '[1-22]' is substituted for the chromosome wildcard only so the
            # log line shows a readable pattern; parsefunc does the real expansion.
            f = ps.sub_chr(chr_arg, '[1-22]')
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            out = parsefunc(_splitp(chr_arg), _N_CHR, **kwargs)
        else:
            # Previously fell through and died with UnboundLocalError on 'out';
            # fail with an explicit message instead.
            raise ValueError('No {N} files specified.'.format(N=noun))
    except ValueError as e:
        log.log('Error parsing {N}.'.format(N=noun))
        raise e
    return out
def _read_header(args):
    '''Read header from .expscore files.

    Returns (indices, cnames, cgroups) as produced by ps.filter_columns,
    with the positional/metadata columns SNP, CHR, BP, CM removed from
    cnames.
    '''
    if args.exp_chr:
        # Per-chromosome files share one header; peek at chromosome 1 only.
        f = ps.sub_chr(args.exp_chr, 1) + '.expscore'
    else:
        f = args.exp + '.expscore'
    # Detect on-disk compression (suffix + pandas 'compression' arg), then
    # select the requested score columns. Shared by both branches — the
    # original duplicated this block verbatim.
    comp = ps.which_compression(f)
    indices, cnames, cgroups = ps.filter_columns(f + comp[0], comp[1], fsuffix='expscore', args=args)
    cnames = [x for x in cnames if x not in ['SNP', 'CHR', 'BP', 'CM']]
    return indices, cnames, cgroups
def _read_multiple_chr_split_files(chr_arg_array, not_chr_arg_array, log, noun, parsefunc, **kwargs):
    '''Read files split across 22 chromosomes (annot, ref_ld, w_ld).

    Like _read_chr_split_files, but accepts arrays of arguments and
    concatenates all resulting file handles into one list before calling
    parsefunc. Re-raises ValueError from parsing after logging.
    '''
    try:
        if not_chr_arg_array:
            fh_list = []
            for not_chr_arg in not_chr_arg_array:
                log.log('Reading {N} from {F} ...'.format(F=not_chr_arg, N=noun))
                fh_list += _splitp(not_chr_arg)
            # BUG FIX: original read 'parsefunc(fh_lit, ...)' — undefined name
            # 'fh_lit' raised NameError whenever this branch was taken.
            out = parsefunc(fh_list, **kwargs)
        elif chr_arg_array:
            fh_list = []
            for chr_arg in chr_arg_array:
                # '[1-22]' is substituted only for a readable log message.
                f = ps.sub_chr(chr_arg, '[1-22]')
                log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
                fh_list += _splitp(chr_arg)
            out = parsefunc(fh_list, _N_CHR, **kwargs)
    except ValueError as e:
        log.log('Error parsing {N}.'.format(N=noun))
        raise e
    return out
def _read_nonneg_constraints(args, log, snp_names, ref_ld_cnames):
    '''Read annotation matrices used as non-negativity constraints.

    Reads the .annot(.small).gz files matching --ref_ld / --ref_ld_chr,
    restricts to --anno columns if given, merges all annotation files on
    their (first four) index columns, checks that the resulting columns
    match ref_ld_cnames (with any 'L2' suffix stripped), optionally adds a
    zero 'intercept' column, and returns the matrix as float.

    Raises ValueError (re-raised after logging) on parse failure; fails an
    assert if annotation columns are duplicated across files or do not
    match ref_ld_cnames.
    '''
    assert args.ref_ld_chr or args.ref_ld is not None, 'non-negative constraints must be used with --ref_ld_chr or --ref_ld'
    noun = 'annot matrix'
    try:
        #read annotation files
        if args.ref_ld_chr is not None:
            # '[1-22]' is substituted only to produce a readable log message.
            f = ps.sub_chr(args.ref_ld_chr, '[1-22]')
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            fh_list = _splitp(args.ref_ld_chr)
        else:
            f = args.ref_ld
            log.log('Reading {N} from {F} ...'.format(F=f, N=noun))
            fh_list = _splitp(args.ref_ld)
        df_annotations_list = []
        columns_list = []
        for fh in fh_list:
            df_annotations_list_fh = []
            # Candidate file names: prefer the '.annot_small.gz' variant,
            # fall back to '.annot.gz' when the small file does not exist.
            if args.ref_ld_chr is not None:
                flist_small = [ps.sub_chr(fh, chr) + '.annot_small.gz' for chr in xrange(1, _N_CHR + 1)]
                flist_notsmall = [ps.sub_chr(fh, chr) + '.annot.gz' for chr in xrange(1, _N_CHR + 1)]
            else:
                flist_small = [fh + '.annot_small.gz']
                flist_notsmall = [fh + '.annot.gz']
            for f_i in xrange(len(flist_small)):
                fname = flist_small[f_i]
                if (not os.path.exists(fname)):
                    fname = flist_notsmall[f_i]
                df_annotations_chr = pd.read_csv(fname, delim_whitespace=True)
                # if not args.constrain_all_snps:
                #     df_annotations_chr = df_annotations_chr.loc[df_annotations_chr['SNP'].isin(snp_names)]
                # Index on the first four columns (presumably CHR/BP/SNP/CM —
                # TODO confirm against the .annot file layout) so files merge
                # on SNP identity rather than row position.
                df_annotations_chr.set_index(df_annotations_chr.columns[:4].tolist(), drop=True, inplace=True)
                if (args.anno is not None):
                    # Keep only the user-requested annotation columns.
                    annotations = args.anno.split(',')
                    #for a in annotations:
                    #    assert a in df_annotations_chr.columns
                    df_annotations_chr = df_annotations_chr.loc[:, [c for c in df_annotations_chr.columns if (c in annotations)]]
                df_annotations_list_fh.append(df_annotations_chr)
            # Stack all chromosomes of this file set into one frame.
            df_fh = pd.concat(df_annotations_list_fh, axis=0)
            columns_list += df_fh.columns.tolist()
            df_annotations_list.append(df_fh)
        assert len(np.unique(columns_list)) == len(columns_list), 'duplicate columns exist in different annotation files'
        # Inner-merge all file sets on the shared index (SNP identity).
        df_annotations = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), df_annotations_list)
        del df_annotations_list
        df_annotations.drop_duplicates(inplace=True)
        log.log('%d constraints remained after removing duplicate constraints'%(df_annotations.shape[0]))
        # if not args.constrain_all_snps:
        #     assert (df_annotations.index.get_level_values(2) == snp_names).all() #SNP Names
        #define tau constraints
        # ref_ld column names carry an 'L2' suffix (e.g. 'baseL2'); strip it
        # to recover the raw annotation names before comparing.
        annotation_names = [(c[:c.index('L2')] if ('L2' in c) else c) for c in ref_ld_cnames]
        assert (df_annotations.columns == annotation_names).all()
        #add intercept column if required
        if (args.intercept_h2 is None and not args.no_intercept):
            df_annotations['intercept'] = np.zeros(df_annotations.shape[0])
        return df_annotations.astype(np.float)
    except ValueError as e:
        log.log('Error parsing {N}.'.format(N=noun))
        raise e