Example #1
def main(args):
    # Import data
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group)
    # Coerce wide values to numeric (convert_objects is deprecated and its
    # return value was being discarded; assign the converted frame back)
    dat.wide = dat.wide.apply(pd.to_numeric, errors='coerce')

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 20))
    plt.subplots_adjust(hspace=0.3)

    # If there is group information, color by group.
    if hasattr(dat, 'group'):
        logger.info('Plotting sample distributions by group')
        legend1 = pltByTrt(dat, ax1)
    else:
        logger.info('Plotting sample distributions')
        pltBySample(dat, ax1)

    # Create Legend
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles, labels, ncol=5, loc='upper right', fontsize=10)

    # Create second legend if there is group information
    if hasattr(dat, 'group'):
        ax1.add_artist(legend1)

    # Plot boxplot of samples
    pltBoxplot(dat, ax2)

    plt.savefig(args.ofig, format='pdf')
    mpld3.save_html(fig, args.ofig2, template_type='simple')
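All of these examples are driven by an argparse namespace. A minimal invocation sketch for this one, with hypothetical file paths and column names (the attribute names are exactly the ones main() reads from args):

from argparse import Namespace

# Hypothetical file names and column names, for illustration only
args = Namespace(fname='wide.tsv', dname='design.tsv', uniqID='rowID',
                 group='treatment', ofig='distributions.pdf',
                 ofig2='distributions.html')
main(args)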
Example #2
def main(args):

    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)

    df_offFlags = Flags(index=dat.wide.index)

    # Iterate through each group and flag features where over half of the
    # group's values fall below the cutoff
    for title, group in dat.design.groupby(args.group):
        # Boolean mask of the current group's frame: True where the value is
        # below the cutoff
        mask = (dat.wide[group.index] < args.cutoff)

        # Proportion of samples in the group that fall below the cutoff
        meanOn = mask.mean(axis=1)

        # Flag the feature as "off" for this group when more than half of its
        # values are below the cutoff
        df_offFlags.addColumn(column='flag_feature_' + title + '_off', mask=meanOn > 0.5)

    # flag_feature_off column: feature is off in at least one group
    maskFlagMetOff = df_offFlags.df_flags.any(axis=1)
    df_offFlags.addColumn('flag_feature_off', maskFlagMetOff)

    # flag_feature_all_off column: feature is off in every group
    maskFlagMetAllOff = df_offFlags.df_flags.all(axis=1)
    df_offFlags.addColumn('flag_feature_all_off', maskFlagMetAllOff)

    df_offFlags.df_flags.to_csv(args.output, sep="\t")
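The Flags helper is project-specific, but the flagging rule itself reduces to a per-feature proportion test; a self-contained pandas sketch of the same logic:

import pandas as pd

wide = pd.DataFrame({'s1': [1, 50], 's2': [2, 60], 's3': [40, 70]},
                    index=['featA', 'featB'])
cutoff = 10
# Proportion of samples per feature that fall below the cutoff
prop_below = (wide < cutoff).mean(axis=1)
flag_off = (prop_below > 0.5).astype(int)
print(flag_off)  # featA -> 1 (2 of 3 below), featB -> 0 (none below)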
Example #3
def main(args):

    # Import data with clean string as true
    df_cleanedData = wideToDesign(wide=args.fname, design=args.dname,
                                  uniqID=args.uniqID, clean_string=True)

    # Export cleaned data
    df_cleanedData.wide.to_csv(args.wideOut, sep="\t")
    df_cleanedData.design.to_csv(args.designOut, sep="\t")
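clean_string=True presumably sanitizes identifiers so they survive downstream tools (the revertStr calls in other examples undo it). A purely hypothetical illustration of that kind of cleaning, not the actual implementation:

import re

def clean_string_sketch(s):
    # Hypothetical: replace characters that are awkward in downstream tools
    return re.sub(r'[^0-9a-zA-Z_]', '_', str(s))

print(clean_string_sketch('m/z 123.4'))  # m_z_123_4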
Example #4
def main(args):
    """ """
    directory = args.RTflag

    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)

    # Only interested in samples
    wide = dat.wide[dat.sampleIDs]

    # Set RT flags
    setRTflag(args, wide, dat, dir=directory)
Example #5
def main(args):
    # Import data
    logger.info('Importing Data')
    dat = wideToDesign(args.fname, args.dname, args.uniqID)
    df_wide = dat.wide[dat.sampleIDs].copy()

    # Drop Missing
    if np.isnan(df_wide.values).any():
        nRows = df_wide.shape[0]        # Number of rows before dropping missing
        df_wide.dropna(inplace=True)    # Drop missing rows in place
        nRowsNoMiss = df_wide.shape[0]  # Number of rows after dropping missing
        logger.warning('{} rows were dropped because of missing values.'.format(nRows - nRowsNoMiss))

    # Run PCA
    # Initialize PCA class with default values
    pca = PCA()

    # Fit PCA
    scores = pca.fit_transform(df_wide)

    # Get loadings
    loadings = pca.components_

    # Get additional information
    sd = scores.std(axis=0)
    propVar = pca.explained_variance_ratio_
    cumPropVar = propVar.cumsum()

    # Create header block for output. I am replicating output from R, which
    # includes additional information (above) at the top of the output file.
    labels = np.array(['#Std. deviation', '#Proportion of variance explained', '#Cumulative proportion of variance explained'])
    blockDat = np.vstack([sd, propVar, cumPropVar])
    block = np.column_stack([labels, blockDat])

    # Create header for output
    header = np.array(['PC{}'.format(x + 1) for x in range(loadings.shape[1])])
    compoundIndex = np.hstack([df_wide.index.name, df_wide.index])
    sampleIndex = np.hstack(['sampleID', df_wide.columns])

    # Create loadings output
    loadHead = np.vstack([header, loadings])
    loadIndex = np.column_stack([sampleIndex, loadHead])
    loadOut = np.vstack([block, loadIndex])

    # Create scores output
    scoreHead = np.vstack([header, scores])
    scoreIndex = np.column_stack([compoundIndex, scoreHead])
    scoreOut = np.vstack([block, scoreIndex])

    # Save output
    np.savetxt(args.lname, loadOut, fmt='%s', delimiter='\t')
    np.savetxt(args.sname, scoreOut, fmt='%s', delimiter='\t')
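A toy scikit-learn sketch of the PCA pieces the script stacks into its output, using random data just to show the shapes involved:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(6, 4))       # 6 rows (features) x 4 columns (samples)
pca = PCA()
scores = pca.fit_transform(X)     # one row of scores per input row
loadings = pca.components_        # one row per principal component
print(scores.shape, loadings.shape)            # (6, 4) (4, 4)
print(pca.explained_variance_ratio_.cumsum())  # monotone, ends at 1.0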
Example #6
def main(args):
    # Import data
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group, clean_string=True)
    results = initResults(dat)

    # Transpose the data
    dat.trans = dat.transpose()

    # Group by Treatment
    grp = dat.trans.groupby(dat.group)
    grpMeans = grp.mean().T
    combo = createCbn(dat)

    resids = list()
    fitted = list()
    # Iterate over compound
    logger.info('Running row-by-row analysis.')
    for compound in dat.wide.index.tolist():
        # Get Overall Mean (DataFrame.ix was removed from pandas; use .loc)
        results.loc[compound, 'GrandMean'] = dat.trans[compound].mean()

        # run one-way ANOVA
        resid, fit = oneWay(dat, compound, results)
        resids.append(resid)
        fitted.append(fit)

        # Calculate mean differences
        calcDiff(dat, compound, grpMeans, combo, results)

        # Calculate SE of difference between means
        calcDiffSE(dat, compound, combo, results)

        # Calculate T-test
        tTest(compound, combo, results)

    residDat = pd.concat(resids, axis=1)
    fitDat = pd.concat(fitted, axis=1)

    # Generate qqplots
    logger.info('Generating q-q plots.')
    qqPlot(residDat, fitDat, args.ofig)

    # Generate Volcano plots
    logger.info('Generating volcano plots.')
    volcano(combo, results, args.ofig2)

    # write results table
    results = results.apply(pd.to_numeric, errors='coerce')  # convert_objects is deprecated
    results.index = pd.Series([dat.revertStr(x) for x in results.index])
    results = results.apply(lambda x: x.round(4))
    results.to_csv(args.oname, sep="\t")
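The oneWay helper is project-specific, but the per-compound statistic is a one-way ANOVA; a self-contained sketch of that step with scipy on toy data:

import pandas as pd
from scipy import stats

# Toy measurements of one compound across three treatment groups
values = pd.Series([1.0, 1.2, 0.9, 2.1, 2.3, 2.0, 3.0, 3.1, 2.8])
groups = pd.Series(['A'] * 3 + ['B'] * 3 + ['C'] * 3)
samples = [values[groups == g] for g in groups.unique()]
f_stat, p_val = stats.f_oneway(*samples)
print(f_stat, p_val)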
Example #7
def main(args):

    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)

    # Only interested in samples
    wide = dat.wide[dat.sampleIDs]

    # Use group separation or not depending on user input
    if not args.group:
        setCVflag(args, wide, dat)
    else:
        setCVflagByGroup(args, wide, dat)
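setCVflag and setCVflagByGroup are not shown here; assuming CV stands for the coefficient of variation, the underlying per-feature statistic is a one-liner in pandas:

import pandas as pd

wide = pd.DataFrame({'s1': [1.0, 10.0], 's2': [1.1, 30.0], 's3': [0.9, 20.0]},
                    index=['featA', 'featB'])
# Coefficient of variation per feature: std / mean across samples
cv = wide.std(axis=1) / wide.mean(axis=1)
flag_big_cv = (cv > 0.3).astype(int)
print(flag_big_cv)  # featB varies far more than featA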
Example #8
def main(args):
    # Execute wideToDesign to make all data uniform
    formatted_data = wideToDesign(wide=args.fname, design=args.dname, uniqID=args.uniqID, group=args.group)

    # Convert flag file to DataFrame
    df_flags = pd.read_csv(args.flagFile, sep='\t', index_col=0)  # DataFrame.from_csv was removed from pandas

    # If the user specified rows, run dropRows
    if args.dropRow:
        dropRows(df_wide=formatted_data.wide, df_flags=df_flags, cutoffValue=args.cutoff, args=args)

    # If the user specified columns, run dropColumns
    else:  # (if args.dropColumn:)
        dropColumns(df_wide=formatted_data.wide, df_design=formatted_data.design, df_flags=df_flags,
                    cutoffValue=args.cutoff, args=args)
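dropRows and dropColumns are project helpers; a plain-pandas sketch of the row-dropping idea, assuming a flag value above the cutoff marks a feature for removal:

import pandas as pd

wide = pd.DataFrame({'s1': [1, 2, 3]}, index=['f1', 'f2', 'f3'])
flags = pd.DataFrame({'flag_feature_off': [1, 0, 1]}, index=['f1', 'f2', 'f3'])
cutoff = 0.5
# Keep only the features whose flag does not exceed the cutoff
kept = wide.loc[flags['flag_feature_off'] <= cutoff]
print(kept.index.tolist())  # ['f2']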
Example #9
def main(args):
    """ Main Script """

    # Import data
    dat = wideToDesign(args.fname, args.dname, args.uniqID)
    
    # Only interested in samples
    wide = dat.wide[dat.sampleIDs]
    
    # Warn about and drop rows with missing values
    if wide.isnull().sum().sum():
        nOriginal = wide.shape[0]
        print("Missing values detected. All rows with missing values removed.")
        wide = wide.dropna()
        print("Original rows: {0}; # of rows after drop: {1}".format(nOriginal, wide.shape[0]))
    
    # Calculate SED by group or not
    SEDbyGroup(dat, wide, args)
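SEDbyGroup is not shown; assuming SED stands for standardized Euclidean distance, scipy computes it directly between samples (the columns of wide):

import pandas as pd
from scipy.spatial.distance import pdist, squareform

wide = pd.DataFrame({'s1': [1.0, 2.0], 's2': [1.2, 2.1], 's3': [5.0, 9.0]},
                    index=['featA', 'featB'])
# Pairwise standardized Euclidean distance between samples (columns)
dist = squareform(pdist(wide.T, metric='seuclidean'))
print(dist.round(2))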
Example #10
def main(args):
    # Import data and transpose
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}\n\tGroup Column: {3}'.format(args.fname, args.dname, args.uniqID, args.group))
    dat = wideToDesign(args.fname, args.dname, args.uniqID, args.group, clean_string=True)
    data = dat.transpose()
    data.dropna(axis=1, inplace=True)

    # Pull classifications out of dataset
    classes = data[dat.group].copy()
    data.drop(dat.group, axis=1, inplace=True)
    #TODO: Random forest does not handle NaNs, need to figure out the proper way to impute values.

    # Build Random Forest classifier
    logger.info('Creating classifier')
    model = RandomForestClassifier(n_estimators=args.num)
    model.fit(data, classes)

    # Identify features
    importance = pd.DataFrame([data.columns, model.feature_importances_]).T.sort_values(by=1, ascending=False)

    # Export features ranked by importance
    logger.info('Exporting features')
    rev = importance.applymap(lambda x: dat.revertStr(x))
    rev.columns = ('feature', 'ranked_importance')
    rev.to_csv(args.oname2, index=False, sep='\t')

    # Select data based on features, ordered by importance.
    # RandomForestClassifier.transform was removed from scikit-learn; with a
    # threshold of 0 every feature is kept, so reorder the columns directly.
    data = data[importance.iloc[:, 0].tolist()]
    selected_data = data.copy().reset_index(drop=True)
    selected_data.columns = [dat.revertStr(x) for x in data.columns]

    # Merge on classes and export
    logger.info('Exporting transformed data')
    clDf = pd.DataFrame(classes)
    clDf.reset_index(inplace=True)
    out = clDf.join(selected_data)
    out.to_csv(args.oname, index=False, sep='\t', float_format="%.4f")
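A self-contained sketch of the feature-ranking core on toy data; feature m1 separates the classes, so it should dominate the importances:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X = pd.DataFrame({'m1': [0.1, 0.2, 0.9, 1.1],
                  'm2': [1.0, 1.1, 1.0, 0.9]})
y = pd.Series(['ctrl', 'ctrl', 'case', 'case'])
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X, y)
importance = (pd.Series(model.feature_importances_, index=X.columns)
              .sort_values(ascending=False))
print(importance)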
Example #11
parser.add_argument("--train_wide", dest="train_wide", action='store', required=True, help="wide part of the train dataset.")
parser.add_argument("--train_design", dest="train_design", action='store', required=True, help="design part of the train dataset.")
parser.add_argument("--test_wide", dest="test_wide", action='store', required=True, help="wide part of the test dataset.")
parser.add_argument("--test_design", dest="test_design", action='store', required=True, help="design part of the test dataset.")
parser.add_argument("--class_column_name", dest="class_column_name", action='store', required=True, help="Name of column in design file with Group/treatment information.")
parser.add_argument("--ID", dest="uniqID", action='store', required=True, help="Name of the column with unique identifiers.")
parser.add_argument("--kernel", dest="kernel", action='store', required=True, help="choice of kernel function: rbf, linear, poly, sigmoid.")
parser.add_argument("--degree", dest="degree", action='store', required=True, help="(integer) degree for the polynomial kernel, default 3.")
parser.add_argument("--C", dest="C", action='store', required=True, help="positive regularization parameter.")
parser.add_argument("--a", dest="a", action='store', required=True, help=" positive coefficient in kernel function.")
parser.add_argument("--b", dest="b", action='store', required=True, help=" independent term coefficient in kernel function.")
parser.add_argument("--outfile1", dest="outfile1", action='store', required=True, help="Output traget set with predicted_class labels.")
parser.add_argument("--accuracy_on_training", dest="accuracy_on_training", action='store', required=True, help="Output accuracy value on the training set.")

args = parser.parse_args()
train = wideToDesign(wide=args.train_wide, design=args.train_design, uniqID=args.uniqID, group=args.class_column_name).transpose()

test_design = read_table(args.test_design)
if args.class_column_name in test_design.columns:
    target = wideToDesign(wide=args.test_wide, design=args.test_design, uniqID=args.uniqID, group=args.class_column_name).transpose()
else:
    target = wideToDesign(wide=args.test_wide, design=args.test_design, uniqID=args.uniqID).transpose()

#target=target.loc[:, (target.dtypes== np.int) | (target.dtypes== np.float)]

# make sure test and train have the same features
for i in target.columns:
    if i not in train.columns:
        del target[i]

######################  training the SVM  #######################################
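The snippet cuts off at the training banner; a hedged sketch of how the parsed options could map onto scikit-learn's SVC (treating --a as gamma and --b as coef0 is an assumption based on the help text above):

from sklearn.svm import SVC

# Assumed mapping of the CLI options onto SVC parameters
features = train.drop(args.class_column_name, axis=1)
clf = SVC(kernel=args.kernel, degree=int(args.degree), C=float(args.C),
          gamma=float(args.a), coef0=float(args.b))
clf.fit(features, train[args.class_column_name])
print(clf.score(features, train[args.class_column_name]))  # accuracy on the training set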
Example #12
        logger.error("Error. {}".format(e))

    htmlFile = open(args.html, 'w')  # the Python 2 file() builtin no longer exists

    global htmlContents
    # universe_wsgi.ini file's html_sanitizing must be false to allow for styling
    htmlContents = ["<html><head><title>Count Digits Results List</title></head><body>"]
    htmlContents.append('<div style=\"background-color:black; color:white; text-align:center; margin-bottom:5% padding:4px;\">'
                        '<h1>Output</h1>'
                        '</div>')
    htmlContents.append('<ul style=\"text-align:left; margin-left:5%;\">')

    # Import data
    logger.info(u'html system path: {}'.format(args.htmlPath))
    logger.info(u'Importing data with following parameters: \n\tWide: {0}\n\tDesign: {1}\n\tUnique ID: {2}'.format(args.fname, args.dname, args.uniqID))
    dat = wideToDesign(args.fname, args.dname, args.uniqID)

    # Only interested in samples
    wide = dat.wide[dat.sampleIDs]

    # Global flag file
    global flag
    flag = Flags(index=wide.index)
    flag.addColumn(column='flag_feature_count_digits')

    # Use group separation or not depending on user input
    if args.group:
        countDigitsByGroups(args, wide, dat, dir=directory)
    else:
        countDigits(wide, dat, dir=directory)
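countDigits and countDigitsByGroups are not shown; one plausible reading of the flag, sketched with plain pandas (the real helper may define "digits" differently):

import pandas as pd

wide = pd.DataFrame({'s1': [123.45, 0.001], 's2': [7.0, 1234.5]},
                    index=['featA', 'featB'])
# Count the digit characters in each value's string representation
digit_counts = wide.apply(lambda col: col.map(lambda v: sum(c.isdigit() for c in str(v))))
print(digit_counts)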