Exemplo n.º 1
0
def compare_samename_files(path1, path2, show=False):
    """ From two folders, compare whether identically named files are identical.

    path1, path2: directories to scan; show is passed through to compare_files.
    Files present in path1 but missing from path2 are silently skipped.
    """
    # only path1's listing is needed; the original also listed path2 into an
    # unused fl2, doing a second directory scan for nothing
    fl1 = gt.get_flist(path1)
    for file in fl1:
        shn = gt.get_shn(file)
        other = os.path.join(path2, shn)
        if os.path.isfile(other):
            compare_files(file, other, show=show)
Exemplo n.º 2
0
Arquivo: qc.py Projeto: wrbutton/foo
def check_data(path='dflt'):
    """ a better final map checker: for each .gct file in path, locate the
    matching .txt/.xlsx map and verify every data well is accounted for.

    path: folder of final data files; 'dflt' resolves to the default
    finaldata output folder. Prints one status line per plate. """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')

    flist = gt.get_flist(path, ext='.gct')
    maplist = gt.get_flist(path, ext='.txt')
    maplist.extend(gt.get_flist(path, ext='.xlsx'))

    for f in flist:
        shn = gt.get_shn(f).split('.')[0]
        try:
            mapfile = [x for x in maplist if shn in x][0]
        except IndexError:
            # no matching map: skip this plate rather than proceeding with an
            # undefined (or stale, from a prior iteration) mapfile
            print(f'error with map file {shn}')
            continue

        g = gct.Gct(f)
        g.get_headers()
        g.get_wells()
        datwells = g.wells

        mymap = gct.extractmap(mapfile)

        # wells the map says should contain data (anything not 'failed'/'empty')
        mapwells = gt.hsub(mymap, {'type':['vehicle', 'poscon', 'test']})['well'].values

        # data wells with no corresponding map entry
        res = set(datwells) - set(mapwells)

        if len(res) == 0:
            print(f'{shn} ok, {380-len(datwells)} failed wells')
        else:
            print(f'error with map/data {shn}, {len(datwells)}/{len(mapwells)}')
Exemplo n.º 3
0
def compare_samename_files(path1, path2, show=False):
    """ from two folders, compare to see whether all identically named maps are identical

    path1, path2: directories to scan; show is passed through to compare_files.
    Files present in path1 but missing from path2 are silently skipped. """
    # only path1's listing is needed; the original also listed path2 into an
    # unused fl2, doing a second directory scan for nothing
    fl1 = gt.get_flist(path1)
    for file in fl1:
        shn = gt.get_shn(file)
        other = os.path.join(path2, shn)
        if os.path.isfile(other):
            compare_files(file, other, show=show)
Exemplo n.º 4
0
def grab_ranks(path, feat, hilo=1, t=40):
    """ survey folder of rank gct files for the wells in which the given feat
    is top ranked (rank below threshold t), writing a summary table to
    '_rank_summary.txt' in that folder.

    default rank output is in descending order, highest zscore = 1
    hilo: 1 = high, upregulated genes (default rank order)
          0 = low, downregulated genes """
    outpath = os.path.join(path, '_rank_summary.txt')
    flist = gt.get_flist(path, 'ranks.gct')
    # set dummy starting point for low rank
    lowest = 500
    # create blank template dataframe
    summary = pd.DataFrame()
    for f in flist:
        d, h = gct.extractgct(f)
        # flip rank order for the downregulated case (978 = total feature count).
        # the original tested `hilo > 1`, which never fires for the documented
        # hilo=0 case and left the ranks unflipped
        if hilo == 0:
            d = 978 - d
        # get column ids for ranks below threshold (.loc replaces removed .ix)
        wells = d.columns[d.loc[feat] < t]
        # extract portion of dataframe
        ranks = d.loc[feat, wells]
        ranks = pd.DataFrame(ranks)
        # assign plate column to each well id entry, re-order cols
        ranks['plate'] = gt.get_shn(f).split('-')[0]
        # concat portion to overall dataframe
        summary = pd.concat([summary, ranks])
        # check and store the lowest rank
        newlow = min(d.loc[feat])
        if newlow < lowest:
            lowest = newlow
    # re-shuffle the column order
    summary['well'] = summary.index.values
    summary = summary[['plate', 'well', feat]]
    print('\n', feat, int(lowest))
    summary.to_csv(outpath, sep='\t', index=None)
Exemplo n.º 5
0
def extractgct(pathlist, split=True):
    """ automatically extract and concat dataframe and header files
    CARE MUST BE TAKEN THE FILES ARE OF THE SAME HEADER/MAP TYPE!
    the break argument will parse a single string of run-on gct paths and
    separate into a list of separate paths"""

    # a run-on string of gct paths is first separated into individual paths
    paths = gt.splitpaths(pathlist, ext='.gct')
    if not isinstance(paths, list):
        paths = [paths]

    # a single entry is either a directory (expand to its gcts) or one file
    if len(paths) == 1:
        if not os.path.isdir(paths[0]):
            return builddframegct(paths[0])
        print('directory, getting all gcts')
        paths = gt.get_flist(paths[0], ext='.gct')

    frames, headers = [], []
    for p in paths:
        print(p)
        frame, hdr = builddframegct(p)
        frames.append(frame)
        headers.append(hdr)
    # data concatenates column-wise, headers row-wise
    d = pd.concat(frames, axis=1)
    h = pd.concat(headers, axis=0)
    print('samples (d/h): ', len(d.columns), len(h.index))
    # tag the combined frame's name to mark it as a concatenation
    d.name = frames[0].name + '+'

    return d, h
Exemplo n.º 6
0
Arquivo: qc.py Projeto: wrbutton/foo
def check_final(path='dflt'):
    """ check numbers of row/columns, number of fails and decimal places of final data files

    path: folder of final .gct files; 'dflt' resolves to the default finaldata folder.
    Prints one summary line per file. """
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='finaldata')

    f_list = gt.get_flist(path, ext='.gct')

    for file in f_list:
        g = gct.Gct(file)
        g.get_headers()
        # locate the companion map next to the gct: prefer .txt, fall back to
        # .xlsx. (the original wrapped the string splits in try/except, but
        # str.split never raises, so the .xlsx branch was unreachable)
        base = g.file.split('.')[0]
        txt = base + '.txt'
        if not os.path.isfile(txt):
            txt = base + '.xlsx'

        try:
            # call once and reuse; the original invoked sub_check_failed twice
            fails, fail_result = sub_check_failed(g, txt)
            print((fails, fail_result))
            result = sub_check_lines(g) and sub_check_columns(g) and fail_result
            dplaces = sub_check_decimal(g)
        except FileNotFoundError:
            result = False
            fails = 'no map!!'
            # must be set here too, else the print below raises NameError
            dplaces = 'na'

        print('{} - {} - {} failed wells - {} dplaces'.format(g.shortname, result, fails, dplaces))
Exemplo n.º 7
0
def openmap(path, ext='all'):
    """ bulk map opening, flexible by type, but watch out for mismatched
    dimensions of different maps.

    path: a directory of maps, a single map path, or a run-on string of paths.
    ext: 'all' for the standard map extensions, or a specific extension /
    list of extensions. Returns a single (possibly concatenated) map frame. """
    if isinstance(ext, str) and ext == 'all':
        #exts = ['.gct', '.txt', '.xlsx', '.xls']
        exts = ['.gct', '.txt', '.xlsx']
    elif isinstance(ext, str):
        # a specific extension; the original left exts undefined in this case
        exts = [ext]
    else:
        exts = list(ext)

    if os.path.isdir(path):
        # gather every matching file in the directory
        pathlist = []
        for extsn in exts:
            pathlist.extend(gt.get_flist(path, ext=extsn))
    else:
        # split a run-on path string into individual paths
        pathlist = path
        for extsn in exts:
            pathlist = gt.splitpaths(pathlist, ext=extsn)

    # NOTE: the original only reached the code below for non-directory input,
    # so a directory argument silently returned None
    if isinstance(pathlist, str):
        print('only one map')
        combined = extractmap(path)
        return combined

    combined = []
    plates = [os.path.basename(x) for x in pathlist]
    print(plates)
    for file in pathlist:
        combined.append(extractmap(file))
    combined = pd.concat(combined, axis=0, sort=False)
    if 'plate' not in combined.columns:
        # derive plate from the 'PLATE:WELL' index when no plate column exists
        combined['plate'] = combined.index
        combined.plate = combined.plate.apply(
            lambda x: x.split(':')[0])
    return combined
Exemplo n.º 8
0
def summary_v0():
    """ Summarize the function definitions of every .py file in the directory
    given as argv[1], writing one line per def to a desktop text file. """
    try:
        path = sys.argv[1]
    except IndexError:
        sys.exit('arg1 is code directory to summarize')

    opath = '/Users/WRB/Desktop/codesummary.txt'

    results = []

    flist = gt.get_flist(path, '.py')

    for file in flist:
        fn = file.split('/')[-1]
        # skip private/dunder modules
        if fn[0] == '_':
            continue
        # mode 'rU' was removed in Python 3.11; universal newlines are the
        # default for text mode anyway
        with open(file, 'r') as df:
            results.append(fn)
            # assemble 'def ' indirectly so these lines don't match themselves
            # when the summarizer is run over its own source
            a = 'de'
            b = 'f '
            c = a + b
            for line in df:
                if c in line and 'main' not in line:
                    line = line.replace(c, '\t').rstrip().rstrip(':')
                    results.append(line)
    with open(opath, 'w') as of:
        for entry in results:
            print(entry, file=of)
Exemplo n.º 9
0
def gather_wells(path, wellfile):
    """ Assemble a dataframe from a collection of wells across plates.

    wellfile is a text file with one comma-separated group per line, entries
    formatted like 'MKD031:A03'. path points to the directory of gct files.
    Returns (data, header) dataframes. """
    gcts = gt.get_flist(path, '.gct')
    data, h = pd.DataFrame(), pd.DataFrame()
    with open(wellfile, 'r') as f:
        for line in f:
            entries = line.strip().split(',')
            plates = list(set([e.split(':')[0] for e in entries]))
            if len(plates) <= 1:
                # single plate on the line: grab all its wells in one subset
                matches = [s for s in gcts if plates[0] in s]
                if len(matches) > 1:
                    print('more than one plate match', matches)
                d, sh = dfsubset(matches[0], entries)
                data = pd.concat([data, d.sort_index()], axis=1)
                h = pd.concat([h, sh], axis=0)
            else:
                # multiple plates: fetch each well individually from its plate
                for entry in entries:
                    plate = entry.split(':')[0]
                    matches = [s for s in gcts if plate in s]
                    if len(matches) > 1:
                        print('more than one plate match', matches)
                    d, sh = dfsubset(matches[0], entry)
                    data = pd.concat([data, d.sort_index()], axis=1)
                    h = pd.concat([h, sh], axis=0)
    return data, h
Exemplo n.º 10
0
def convert_to_txt(path):
    """ Convert every .xlsx map in path to a tab-delimited .txt file saved to
    the desktop newmaps folder. """
    fl = gt.get_flist(path, '.xlsx')
    opath = '/Users/WRB/Desktop/newmaps/'
    for f in fl:
        m = pd.read_excel(f)
        # the original used .strip('.xlsx'), which strips any of those
        # CHARACTERS from both ends and mangles names ending in s/l/x;
        # splitting off the extension is the correct operation
        shn = os.path.splitext(gt.get_shn(f))[0]
        outpath = os.path.join(opath, shn + '.txt')
        m.to_csv(outpath, sep='\t', index=False)
Exemplo n.º 11
0
Arquivo: sigs.py Projeto: wrbutton/foo
def run_granularity(path):
    """ Tally granularity survey results across every gct file in path and
    save the counts greater than one to an excel summary. """
    counter = cll.Counter()
    for fpath in gt.get_flist(path, '.gct'):
        d, h = gct.extractgct(fpath)
        survey_granularity(d, counter)
    # keep only repeated observations, highest counts first
    counts = pd.Series(counter, name='count')
    counts.sort_values(ascending=False, inplace=True)
    counts = counts[counts > 1]
    counts.to_excel(os.path.join(path, 'counter.xlsx'))
Exemplo n.º 12
0
def list_fails(path):
    """ Print the failed wells recorded in each .txt/.xlsx map file in path. """
    fl = gt.get_flist(path)
    for f in fl:
        if f.endswith('.txt'):
            m = pd.read_table(f, index_col=False)
        elif f.endswith('.xlsx'):
            m = pd.read_excel(f)
        else:
            # any other file type would otherwise reuse a stale 'm' from a
            # previous iteration (or NameError if it is the first file)
            continue
        fw = m[m['type'] == 'failed']['well'].values
        shn = gt.get_shn(f)
        print(shn, fw)
Exemplo n.º 13
0
def rank_files(path):
    """ reads a zscore gct file (probably consensus) and saves file of ranks
            of each gene within each sample - from highest to lowest values"""
    for fpath in gt.get_flist(path, '.gct'):
        gobj = gct.Gct(fpath)
        print(fpath)
        frame = gobj.build_dframe()
        # rank 1 = highest zscore within each sample column
        ranked = frame.rank(ascending=False)
        parent = os.path.split(path)[0]
        outpath = os.path.join(parent, gobj.shortname + '-ranks.gct')
        gct.save_headergct(ranked, outpath, gobj.fh)
Exemplo n.º 14
0
Arquivo: gcsv.py Projeto: wrbutton/foo
def summarize_csvs(path=None):
    """ provide path containing csv files to generate output summarizing levels 1 and 10
    for the plate as well as the posamp and ref

    path: folder of csv files; None (now the default, matching the existing
    None check) resolves to the default csv folder. Writes csv_summary.txt. """
    if path is None:
        path = gt.dflt_outpath(fldr_name='csv')
    results = cll.defaultdict(dict)
    f_list = gt.get_flist(path, '.csv')
    for file in f_list:
        try:
            c = Gcsv(file)
            d = c.build_dframe()
            # plate-wide means plus the positive-control (B1) and reference
            # (A2/B2) wells; .loc replaces the removed .ix indexer
            results[c.shortname]['plate-L10'] = d['Analyte 10'].mean(axis=0)
            results[c.shortname]['Pos-L10'] = d.loc['B1']['Analyte 10']
            results[c.shortname]['Ref-L10'] = d.loc[['A2', 'B2']]['Analyte 10'].mean()
            results[c.shortname]['plate-L1'] = d['Analyte 1'].mean(axis=0)
        except Exception:
            # narrowed from a bare except so KeyboardInterrupt etc. still propagate
            print('error with ' + file)
    res = pd.DataFrame(results)
    res = res.T
    outpath = os.path.join(path, 'csv_summary.txt')
    res.to_csv(outpath, sep='\t', float_format='%.0f')
Exemplo n.º 15
0
Arquivo: gcsv.py Projeto: wrbutton/foo
def bulk_open_as_gct(path, drop_inv=False):
    """ Pair up DP52/DP53 csv files by plate short name and combine each pair
    into a single gct file (no current support for repeat wells files). """
    pdict = cll.defaultdict(list)
    for fpath in gt.get_flist(path, '.csv'):
        pdict[gt.get_shn(fpath)].append(fpath)
    for k, v in pdict.items():
        bsets = [x.split('_')[1] for x in v]
        # require exactly the DP52 + DP53 pair and nothing extra
        if 'DP52' not in bsets or 'DP53' not in bsets or len(bsets) >= 3:
            print('error with: ', k)
            continue
        print(k, ' ok')
        df1 = open_as_gct(v[0], log=True)
        df2 = open_as_gct(v[1], log=True)
        combined = pd.concat([df1, df2], axis=0)
        combined.sort_index(inplace=True)
        if drop_inv is True:
            # drop invariant-gene rows when requested
            combined = combined[~combined.index.str.contains('INV')]
        else:
            print('watch out, invariant genes included')
        outpath = os.path.join(path, k + '_pkexp.gct')
        gct.save_simplegct(combined, outpath)
Exemplo n.º 16
0
Arquivo: qc.py Projeto: wrbutton/foo
def dl_data(sc='q', src='dflt', dest='dflt', search=None, excl=None, ext=None, p=False):
    """ download data from s3. 'sc' is shortcut, can be 'q' for qc, 'g' for gct, 'z' for zscore,
    'e' for enrichment, 'f' for final

    src/dest: 'dflt' resolves to the latest S3 '_review/' folder and the
    default local output folder respectively; a bare dest name becomes a
    subfolder of the default output path.
    search/excl: extra include/exclude glob patterns for the aws cp command.
    ext: extension(s) used to flatten downloaded files out of subdirectories.
    p: when True, run process_qc() on the downloaded data afterwards. """
    if dest == 'dflt':
        dest = gt.dflt_outpath(fldr_name='newQC')
    elif dest == 'foo':
        dest = gt.dflt_outpath(fldr_name='foo')
    else:
        if '/' in dest or '\\' in dest:
            # a path was given: create it if missing, ignore "already exists"
            try:
                os.mkdir(dest)
            except:
                pass
        else:
             dest = gt.dflt_outpath(fldr_name=dest)

    # files are first downloaded into a temp folder, then moved into dest at the end
    tempdest = gt.dflt_outpath(fldr_name='temp_copy_transfer')

    s3c = boto3.client('s3')
    pref = '_review/'
    if src == 'dflt':
        # pick the most recent (lexicographically last) folder under _review/
        items = s3c.list_objects(Bucket='genometry', Prefix=pref, Delimiter='/')
        folds = sorted([list(x.values())[0].replace(pref, '').strip('/') for x in items['CommonPrefixes']])
        if len(folds) == 0:
            print('hm, zero files in list')
        if len(folds) == 1:
            fold = folds[0]
        if len(folds) > 1:
            fold = folds[-1]
        print('downloading from ', fold)
        src = 's3://genometry/' + pref + fold
        # grab PCA ppt and latest coordinates txt
        s3c.download_file('genometry', 'PCA_analysis/PCA2.pptx', os.path.join(dest, 'PCA.pptx'))
        coords = s3c.list_objects(Bucket='genometry', Prefix='PCA_analysis/')
        coord = sorted([x['Key'] for x in coords['Contents']])[-1]
        s3c.download_file('genometry', coord, os.path.join(tempdest, 'PCA_coords.txt'))
    else:
        src = 's3://genometry/' + src

    # (include-pattern(s), exclude-pattern) pairs used to build the aws cp command
    search_args = []

    # parse shortcut
    if 'q' in sc:
        search_args.append(('*_qc/*', ''))
    if 'g' in sc:
        search_args.append((['*_fullqnorm_*', '*_QNORM_sorted*', '*_ref_n*', '*_Qctrl_n*'], ''))
        ext = '.gct'
    if 'z' in sc:
        search_args.append(('*_ZSVCQNORM_*', ''))
        ext = '.gct'
    if 'e' in sc:
        search_args.append(('*_escore/*', '*.gct'))
    if 'f' in sc:
        search_args.append(('*_final/*', ''))
        dest = gt.dflt_outpath(fldr_name='finaldata')
        ext = ['.gct', '.txt']
    if 'i' in sc:
        search_args.append(('*_ZSVCINF_*', ''))
        ext = '.gct'
    # NOTE(review): when excl is None this appends ('', None), and the loop
    # below then emits '--exclude None' -- confirm this is intended
    search_args.append(('', excl))

    if search is not None:
        if '*' not in search:
            search = '*' + search + '*'
        search_args.append((search, ''))

    if excl is not None:
        if '*' not in excl:
            excl = '*' + excl + '*'
        search_args.append(('', excl))

    # assemble the aws s3 cp command: exclude everything, then add includes
    for search, excl in search_args:
        cmd_str = f'aws s3 cp --recursive {src} {tempdest} --exclude "*"'
        if isinstance(search, str):
            search = [search]
        for st in search:
            cmd_str += f' --include {st}'
        if excl != '':
            cmd_str += f' --exclude {excl}'

        print(cmd_str)

    # NOTE(review): this runs OUTSIDE the loop above, so only the last cmd_str
    # is executed even though every one is printed -- confirm intent
    subprocess.run(cmd_str, shell=True)

    # flatten files with the requested extension(s) out of any subfolders,
    # then remove the now-empty subfolders
    if ext is not None:
        if isinstance(ext, str):
            ext = [ext]
        for ex in ext:
            fl = gt.globit(tempdest, f'*{ex}')
            for f in fl:
                file_dest = os.path.join(tempdest, os.path.basename(f))
                shutil.move(f, file_dest)
            subdirs = [x[0] for x in os.walk(tempdest)][1:]
        # NOTE(review): subdirs is only bound inside the loop above; an empty
        # ext list would leave it undefined here
        for sd in subdirs:
            try:
                shutil.rmtree(sd)
            except:
                pass
    # do final copy from temp to main destination, erase temp
    for f in gt.get_flist(tempdest):
        file_dest = os.path.join(dest, os.path.basename(f))
        shutil.move(f, file_dest)
    shutil.rmtree(tempdest)

    if p is True:
        # best-effort post-processing of the downloaded qc data
        try:
            process_qc()
        except:
            print('error processing qc')
Exemplo n.º 17
0
def check_maps(path, compare=True, img=True, v=True, filt=True):
    """ looks through .txt and .xlsx maps in a directory and summarizes their content and relationship with each other,
    as well as generating plate visualizations of type and batch for each plate. V for verbose, lists names and doses per plate
     if filter is true, just observe 6character name excel files, otherwise consider all files"""

    # plot_fields = ['type', 'batch']
    plot_fields = ['type']
    # add checks to add batch and dose if present

    # pert_dict: plate-batch entry -> test pert names (for name-overlap matrix)
    # wellpert_dict: plate-batch entry -> 'well-name' addresses (same-plate matrix)
    # NOTE(review): map_list is assigned here but never used below
    pert_dict, map_list = {}, {}
    wellpert_dict = {}

    # prefer excel maps; optionally keep only 6-char plate names (11 = name + '.xlsx')
    flist = gt.get_flist(path, ext='.xlsx')
    if filt is True:
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 11]
    if len(flist) == 0:
        # fall back to text maps (10 = 6-char name + '.txt')
        flist = gt.get_flist(path, ext='.txt')
        flist = [x for x in flist if len(os.path.split(x)[-1]) == 10]

    if v is True:
        print('flist = ', flist)

    # full set of expected well addresses, for the plate-completeness check
    awells = gt.get_awells()
    composition = pd.DataFrame(columns=[
        'wells #', 'test #', 'doses', 'dose/trt', '# names', 'vehicle #',
        'poscon #', 'poscons'
    ])

    for f in flist:
        pname = gt.get_shn(f).split('.')[0]
        print(pname)
        if f.endswith('.xlsx'):
            m = pd.read_excel(f)
        elif f.endswith('.txt'):
            m = pd.read_table(f, index_col=False)
        m.sort_index(inplace=True)
        batches = m['batch'].dropna().unique()

        # locate the dose column, whatever it is called ('dose' or 'dilution')
        if any([('dose' in x) or ('dilution' in x) for x in m.columns]):
            dose_field = [
                x for x in m.columns if (('dose' in x) or ('dilution' in x))
            ][0]
        else:
            dose_field = None

        # summary column -> function applied to each per-batch subset below
        headers = {
            'wells #': lambda x: len(x.index),
            'test #': lambda x: ctup(x, {'type': 'test'}, 'well')
        }
        if dose_field is not None:
            headers.update({
                'doses':
                lambda x: ctup(x, {'type': 'test'}, dose_field),
                # NOTE(review): this lambda ignores its argument and computes over
                # the whole map m rather than the batch subset -- confirm intended
                'dose/trt':
                lambda x: gt.hsub(m, {'type': 'test'})[dose_field].groupby(m[
                    'name']).unique().apply(lambda x: len(x)).mean()
            })
        elif dose_field is None:
            # no dose column: plain string placeholders (not callables)
            headers.update({'doses': 'na', 'dose/trt': 'na'})
        headers.update({
            '# names':
            lambda x: ctup(x, {'type': 'test'}, 'name'),
            'vehicle #':
            lambda x: ctup(x, {'type': 'vehicle'}, 'well'),
            'poscon #':
            lambda x: ctup(x, {'type': 'poscon'}, 'well'),
            'poscons':
            lambda x: ctup(x, {'type': 'poscon'}, 'name', lst=True)
        })

        summary = pd.DataFrame(columns=headers)

        # check wells for full plate
        well_result = set(awells) - set(m['well'].values)

        if len(well_result) != 0:
            print('{} wells error, {} entries'.format(pname, len(m.index)))

        if v is True:
            # verbose: list the test pert names and doses found on this plate
            print(gt.hsub(m, {'type': 'test'})['name'].dropna().unique())
            try:
                doselist = gt.hsub(
                    m, {'type': 'test'})[dose_field].dropna().unique()
                print(doselist)
            except:
                print('error with dose col, ', dose_field)
                pass

        # summarize the header info per batch, and assemble pert-lists
        # for the overlap comparisons
        for b in batches:
            entry = pname + '-' + b
            ms = gt.hsub(m, {'batch': b})
            # gather pert names for overlap comparison
            pert_dict[entry] = ctup(m, {
                'batch': b,
                'type': 'test'
            },
                                    'name',
                                    lst=True)
            # get the well-pert identities for same plate comparison
            ms.loc[:, 'addr'] = ms['well'] + '-' + ms['name'].apply(
                lambda x: str(x))
            wellpert_dict[entry] = ms['addr'].values
            for k in headers.keys():
                try:
                    summary.loc[entry, k] = headers[k](ms)
                except (KeyError, TypeError):
                    # the 'na' string placeholders are not callable -> TypeError
                    summary.loc[entry, k] = 'na'

        composition = pd.concat([composition, summary])

        if img is True:
            # render a plate visualization per plotted field with >1 distinct value
            for pf in plot_fields:
                plot_series = m[pf]
                if len(plot_series.dropna().unique()) > 1:
                    plot_series.name = pname + ' ' + pf
                    plate_map_vis(plot_series, path=path)

    composition.to_excel(os.path.join(path, 'batch_composition.xlsx'))

    if compare is True:
        # cross-plate overlap matrices: identical well layouts and shared pert names
        same_plates = gt.overlap_matrix(wellpert_dict.values(),
                                        wellpert_dict.keys())
        name_overlap = gt.overlap_matrix(pert_dict.values(), pert_dict.keys())
        name_overlap.to_excel(os.path.join(path, 'name_overlaps.xlsx'))
        same_plates.to_excel(os.path.join(path, 'well-name_overlaps.xlsx'))