Пример #1
0
def setCVflagByGroup(args, wide, dat):
    """
    Compute the coefficient of variation (CV) of every feature within each
    treatment group, plot the CV distributions to a multi-page PDF and write
    one "big CV" flag column per group.

    :Arguments:
        :param args: Parsed command line arguments. Reads ``args.group``
                     (design column used to split samples), ``args.CVcutoff``
                     (optional user cutoff), ``args.CVplot`` (output PDF path)
                     and ``args.CVflag`` (output flag file path).

        :type wide: pandas.DataFrame
        :param wide: Wide data; its columns are selected with the index of
                     each design group.

        :param dat: Object exposing ``design`` (a DataFrame that is grouped
                    by ``args.group``) and ``sampleIDs``.
                    NOTE: ``dat.sampleIDs`` is overwritten with each group's
                    index and is left pointing at the last group on return.

    :Returns:
        Nothing. The plots are written to ``args.CVplot`` and the flags to
        ``args.CVflag`` (tab separated).
    """
    # One CV column per group, indexed like the wide data
    CV = pd.DataFrame(index=wide.index)
    for title, group in dat.design.groupby(args.group):

        # Filter the wide file down to the samples of the current group
        currentFrame = wide[group.index]

        # Change dat.sampleIDs to match the current slice of the design file
        dat.sampleIDs = group.index

        # The per-group cutoff returned by setCVflag is not used: a single
        # cutoff shared by all groups is derived below.
        CV['cv_' + title], _ = setCVflag(args, currentFrame, dat, groupName=title)

    # Row-wise maximum over the per-group CV columns
    CV['cv'] = CV.apply(np.max, axis=1)

    if not args.CVcutoff:
        # Default cutoff: 90th percentile of the row-wise maximum CV, rounded
        # to 3 significant digits.
        CVcutoff = np.nanpercentile(CV['cv'].values, q=90)
        # log() is undefined for 0 and int() cannot convert NaN, so the
        # rounding is skipped for those degenerate cutoffs instead of crashing.
        if CVcutoff and np.isfinite(CVcutoff):
            CVcutoff = round(CVcutoff, -int(floor(log(abs(CVcutoff), 10))) + 2)
    else:
        CVcutoff = float(args.CVcutoff)

    cutoffLabel = "Cutoff at: {0}".format(CVcutoff)

    # NOTE(review): `normed` was superseded by `density` in newer matplotlib
    # releases; it is left untouched here because the matplotlib version this
    # tool targets is not visible from this function.
    pdfOut = PdfPages(args.CVplot)
    try:
        # One page per group: histogram + density of that group's CVs
        for title, _ in dat.design.groupby(args.group):
            fig, ax = plt.subplots()
            p99 = np.nanpercentile(CV['cv_' + title].values, 99)
            xmin = -p99 * 0.2
            xmax = p99 * 1.5
            ax.set_xlim(xmin, xmax)
            CV['cv_' + title].plot(kind='hist', range=(xmin, xmax), bins=15, normed=1, color='grey', ax=ax, label="CV histogram")
            CV['cv_' + title].plot(kind='kde', title="Density Plot of Coefficients of Variation in " + args.group + " " + title, ax=ax, label="CV density")
            ax.axvline(x=CVcutoff, color='red', linestyle='dashed', label=cutoffLabel)
            ax.legend()
            pdfOut.savefig(fig, bbox_inches='tight')
            plt.close(fig)

        # Last page: the densities of all groups overlaid on one axis
        fig, ax = plt.subplots()
        p99 = np.nanpercentile(CV['cv'].values, 99)
        ax.set_xlim(-p99 * 0.2, p99 * 1.5)
        for title, _ in dat.design.groupby(args.group):
            CV['cv_' + title].plot(kind='kde', title="Density Plot of Coefficients of Variation by " + args.group, ax=ax, label="CV density in group " + title)
        ax.axvline(x=CVcutoff, color='red', linestyle='dashed', label=cutoffLabel)
        ax.legend()
        pdfOut.savefig(fig, bbox_inches='tight')
        plt.close(fig)
    finally:
        # Always release the PDF handle, even if a plot fails
        pdfOut.close()

    # Create flag file instance
    CVflag = Flags(index=CV['cv'].index)
    for title, _ in dat.design.groupby(args.group):
        groupCV = CV['cv_' + title]

        # A feature is flagged in a group when its CV exceeds the cutoff or
        # could not be computed (NaN). `.values` replaces `get_values()`,
        # which no longer exists in pandas >= 1.0.
        CVflag.addColumn(column='flag_feature_big_CV_' + title,
                         mask=((groupCV.values > CVcutoff) | groupCV.isnull()))

    # Write flag file
    CVflag.df_flags.to_csv(args.CVflag, sep='\t')
Пример #2
0
def mergeFlags(args):
    """
    Read several tab separated flag files, merge them with ``Flags.merge``
    and write the merged table to a tab separated file.

    :Arguments:
        :param args: Parsed command line arguments. Reads ``args.flagFiles``
                     (list of input paths; a single comma separated string in
                     the first element is also accepted) and
                     ``args.mergedFile`` (output path).
                     NOTE: when the comma separated form is used,
                     ``args.flagFiles`` is replaced in place by the split list.

    :Returns:
        Nothing. The merged flags are written to ``args.mergedFile``.
    """
    logger.info("Importing data")

    # Check for commas, commas are used in galaxy. If there are commas separate
    # the list by commas
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')

    # Convert files into dataframes and populate into new list.
    # DataFrame.from_csv was removed from pandas; this read_csv call is its
    # documented equivalent (first column as index, index parsed as dates
    # where possible).
    flagDataFrameList = [
        pd.read_csv(flagFile, sep='\t', index_col=0, parse_dates=True)
        for flagFile in args.flagFiles
    ]

    logger.info("Checking all indexes are the same")

    # Merge flags using Flags class
    mergedFlags = Flags.merge(flagDataFrameList)

    # Export merged flags
    # NOTE: Pandas cannot store NANs as an int. If there are NANs from the
    # merge, then the column becomes a float. The float format below makes the
    # float output look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
Пример #3
0
def main(args):
    """
    Flag features that are "off" (below the cutoff) in each treatment group.

    For every group of the design file a column
    ``flag_feature_<group>_off`` is added; it is set for a feature when more
    than half of the group's samples have a value below ``args.cutoff``.
    Two summary columns follow: ``flag_feature_off`` (any flag set so far) and
    ``flag_feature_all_off`` (all flags set so far).

    :Arguments:
        :param args: Parsed command line arguments. Reads ``args.fname``,
                     ``args.dname``, ``args.uniqID``, ``args.group``,
                     ``args.cutoff`` and ``args.output``.

    :Returns:
        Nothing. The flags are written to ``args.output`` (tab separated).
    """
    # Load the wide data together with its design
    dat = wideToDesign(args.fname, args.dname, args.uniqID)
    offFlags = Flags(index=dat.wide.index)

    for groupName, members in dat.design.groupby(args.group):
        # True wherever a sample of this group is below the cutoff
        belowCutoff = dat.wide[members.index] < args.cutoff

        # Row-wise mean of the booleans = fraction of samples below the cutoff
        fractionOff = belowCutoff.mean(axis=1)

        # Flag the feature when that fraction exceeds one half
        offFlags.addColumn(column='flag_feature_' + groupName + '_off',
                           mask=fractionOff > 0.5)

    # Feature is off in at least one group
    offFlags.addColumn('flag_feature_off', offFlags.df_flags.any(axis=1))

    # Every flag column present at this point (including the one just added)
    # is set
    offFlags.addColumn('flag_feature_all_off', offFlags.df_flags.all(axis=1))

    offFlags.df_flags.to_csv(args.output, sep="\t")
Пример #4
0
def setCVflag(args, wide, dat, groupName = ''):
    """
    Compute the coefficient of variation (CV = |std / mean|) of every row of
    ``wide`` and derive a CV cutoff.

    :Arguments:
        :param args: Parsed command line arguments. Reads ``args.CVcutoff``
                     (optional user cutoff) and, only when ``groupName`` is
                     empty, ``args.CVplot`` and ``args.CVflag`` (output paths).

        :type wide: pandas.DataFrame
        :param wide: Data whose rows are summarised.

        :param dat: Not used by this function; kept for interface
                    compatibility.

        :type groupName: str
        :param groupName: Empty string to run in stand-alone mode (plot and
                          flag file are written); any other value to only
                          compute and return the CVs.

    :Returns:
        ``None`` when ``groupName`` is empty (results are written to disk);
        otherwise the tuple ``(cv, CVcutoff)`` where ``cv`` is the per-row CV
        Series.
    """
    # NOTE: this is an identity map, i.e. an element-wise copy of `wide`;
    # no rounding is applied here.
    DATround = wide.applymap(lambda x: x)

    # Get std, mean and calculate CV
    DATstat = pd.DataFrame(index=DATround.index)

    DATstat['std']  = DATround.apply(np.std, axis=1)
    DATstat['mean'] = DATround.apply(np.mean, axis=1)
    DATstat['cv']   = abs(DATstat['std'] / DATstat['mean'])

    if not args.CVcutoff:
        # Default cutoff: 90th percentile of the CVs, rounded to 3
        # significant digits.
        CVcutoff = np.nanpercentile(DATstat['cv'].values, q=90)
        # log() is undefined for 0 and int() cannot convert NaN, so the
        # rounding is skipped for those degenerate cutoffs instead of crashing.
        if CVcutoff and np.isfinite(CVcutoff):
            CVcutoff = round(CVcutoff, -int(floor(log(abs(CVcutoff), 10))) + 2)
    else:
        CVcutoff = float(args.CVcutoff)

    # Group mode: hand the numbers back, the caller plots and flags
    if groupName != '':
        return DATstat['cv'], CVcutoff

    # Plot CVs
    # NOTE(review): `normed` was superseded by `density` in newer matplotlib
    # releases; it is left untouched here because the matplotlib version this
    # tool targets is not visible from this function.
    fig, ax = plt.subplots()
    p99 = np.nanpercentile(DATstat['cv'].values, 99)
    xmin = -p99 * 0.2
    xmax = p99 * 1.5
    ax.set_xlim(xmin, xmax)
    DATstat['cv'].plot(kind='hist', range=(xmin, xmax), bins=15, normed=1, color='grey', ax=ax, label="CV histogram")
    DATstat['cv'].plot(kind='kde', title="Density Plot of Coefficients of Variation", ax=ax, label="CV density")
    ax.axvline(x=CVcutoff, color='red', linestyle='dashed', label="Cutoff at: {0}".format(CVcutoff))
    ax.legend()

    # Export the figure as pdf
    fig.savefig(args.CVplot, format='pdf')
    plt.close(fig)

    # Create flag instance
    CVflag = Flags(index=DATstat.index)

    # A feature is flagged when its CV exceeds the cutoff or could not be
    # computed (NaN). `.values` replaces `get_values()`, which no longer
    # exists in pandas >= 1.0.
    CVflag.addColumn(column='flag_feature_big_CV',
                     mask=((DATstat['cv'].values > CVcutoff) | DATstat['cv'].isnull()))

    # Write output
    CVflag.df_flags.to_csv(args.CVflag, sep='\t')
Пример #5
0
    # Append the "Output" header banner and open the list that will hold the
    # result entries.
    # NOTE(review): the inline style below has no ';' between
    # 'margin-bottom:5%' and 'padding:4px' -- looks like a typo; confirm and
    # fix in the string literal.
    htmlContents.append('<div style=\"background-color:black; color:white; text-align:center; margin-bottom:5% padding:4px;\">'
                        '<h1>Output</h1>'
                        '</div>')
    htmlContents.append('<ul style=\"text-align:left; margin-left:5%;\">')

    # Import data
    logger.info(u'html system path: {}'.format(args.htmlPath))
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}'.format(args.fname, args.dname, args.uniqID))
    dat = wideToDesign(args.fname, args.dname, args.uniqID)

    # Only interested in samples: keep the columns listed in dat.sampleIDs
    wide = dat.wide[dat.sampleIDs]

    # Module-level flag object (declared global so it is rebound at module
    # scope); it starts with a 'flag_feature_count_digits' column added
    # without a mask.
    global flag
    flag = Flags(index=wide.index)
    flag.addColumn(column='flag_feature_count_digits')

    # Use group separation or not depending on user input
    if args.group:
        countDigitsByGroups(args, wide, dat, dir=directory)
    else:
        countDigits(wide, dat, dir=directory)

    # Unless args.noZip is set, zip the contents of `directory` into
    # <directory>/Archive_of_Results.zip
    if args.noZip:
        pass
    else:
        shutil.make_archive(directory + '/Archive_of_Results', 'zip', directory)

    # Add zip of all the files to the list
Пример #6
0
def setRTflag(args, wide, dat, dir):
    """
    Summarise the retention time (RT) of every row of ``wide``, write a set of
    RT outlier flags and plot the distribution of the RT coefficients of
    variation (CV).

    :Arguments:
        :param args: Parsed command line arguments. Reads ``args.p90p10``
                     (use the 90th-10th instead of the 95th-5th percentile
                     range), ``args.minutes`` (allowed RT spread),
                     ``args.CVcutoff`` (optional user cutoff),
                     ``args.RTflag`` (flag file path) and ``args.RTplot``
                     (plot path).

        :type wide: pandas.DataFrame
        :param wide: Retention times; statistics are computed per row.

        :param dat: Not used by this function; kept for interface
                    compatibility.

        :param dir: Not used by this function; kept for interface
                    compatibility.

    :Returns:
        Nothing. Flags are written to ``args.RTflag`` (tab separated) and the
        figure is handed to ``galaxySavefig`` together with ``args.RTplot``.
    """
    # Pass every value through ifZero, then round retention time to 2 decimals
    RTround = wide.applymap(ifZero)
    RTround = RTround.applymap(lambda x: round(x, 2))

    # Get percentiles, min, max, mean, median
    RTstat = pd.DataFrame(index=RTround.index)
    RTstat['min']    = RTround.apply(np.min, axis=1)
    RTstat['max']    = RTround.apply(np.max, axis=1)
    RTstat['p95']    = RTround.apply(np.nanpercentile, q=95, axis=1)
    RTstat['p90']    = RTround.apply(np.nanpercentile, q=90, axis=1)
    RTstat['p10']    = RTround.apply(np.nanpercentile, q=10, axis=1)
    RTstat['p05']    = RTround.apply(np.nanpercentile, q= 5, axis=1)
    RTstat['std']    = RTround.apply(np.std, axis=1)
    RTstat['mean']   = RTround.apply(np.mean, axis=1)
    RTstat['median'] = RTround.apply(np.median, axis=1)
    RTstat['cv']     = RTstat['std'] / RTstat['mean']
    RTstat['p95p05'] = RTstat['p95'] - RTstat['p05']
    RTstat['p90p10'] = RTstat['p90'] - RTstat['p10']

    # Dividing by 2.0 keeps the half-window exact even when args.minutes is an
    # int under Python 2 integer division (where 1 / 2 == 0 and -1 / 2 == -1).
    halfWindow = args.minutes / 2.0

    # Set RT flags
    RTflag = Flags(index=RTround.index)

    # Percentile range wider than the allowed number of minutes
    if args.p90p10:
        RTflag.addColumn(column = 'flag_RT_Q90Q10_outlier',
                        mask   = (RTstat['p90p10'] > args.minutes))
    else:
        RTflag.addColumn(column = 'flag_RT_Q95Q05_outlier',
                        mask   = (RTstat['p95p05'] > args.minutes))

    # Max / min further than half the allowed minutes from the median
    RTflag.addColumn(column = 'flag_RT_max_gt_threshold',
                    mask   = (RTstat['max'] - RTstat['median'] > halfWindow))

    RTflag.addColumn(column = 'flag_RT_min_lt_threshold',
                    mask   = (RTstat['min'] - RTstat['median'] < -halfWindow))

    # Max or min further than 3 standard deviations from the mean
    RTflag.addColumn(column = 'flag_RT_min_max_outlier',
                    mask   = ((RTstat['max']-RTstat['mean'] > 3 * RTstat['std']) |
                              (RTstat['min']-RTstat['mean'] < -3 * RTstat['std'])))

    if not args.CVcutoff:
        # Default cutoff: 90th percentile of the CVs, rounded to 3
        # significant digits.
        CVcutoff = np.nanpercentile(RTstat['cv'].values, q=90)
        # log() needs a positive argument and int() cannot convert NaN: use
        # the magnitude and skip the rounding for a zero / NaN cutoff.
        if CVcutoff and np.isfinite(CVcutoff):
            CVcutoff = round(CVcutoff, -int(floor(log(abs(CVcutoff), 10))) + 2)
    else:
        # Convert explicitly so a cutoff supplied as text still compares
        # numerically (same handling as the other CV flag functions).
        CVcutoff = float(args.CVcutoff)
    RTflag.addColumn(column = 'flag_RT_big_CV',
                    mask   = (RTstat['cv'] > CVcutoff))

    # Output flags
    RTflag.df_flags.to_csv(args.RTflag, sep="\t")

    # Plot RT CVs
    # NOTE(review): `normed` was superseded by `density` in newer matplotlib
    # releases; it is left untouched here because the matplotlib version this
    # tool targets is not visible from this function.
    fig, ax = plt.subplots()

    p99 = np.nanpercentile(RTstat['cv'].values, 99)
    xmin = -p99 * 0.2
    xmax = p99 * 1.5
    ax.set_xlim(xmin, xmax)

    RTstat['cv'].plot(kind='hist', range = (xmin, xmax), bins = 15, normed = 1, color = 'grey', ax=ax, label = "CV histogram")
    RTstat['cv'].plot(kind='kde', title="Density Plot of Coefficients of Variation of the Retention Time", ax=ax, label = "CV density")

    plt.axvline(x=CVcutoff, color = 'red', linestyle = 'dashed', label = "Cutoff at: {0}".format(CVcutoff))
    plt.legend()

    galaxySavefig(fig, args.RTplot)