Exemplo n.º 1
0
def main():
    """Run the barcode demultiplexing and read-mapping pipeline.

    Reads user-defined options, splits sequence data into barcode-specific
    FASTQ files in parallel (one pool worker per sample), then maps the
    reads for all samples.

    Returns:
        None
    """

    # f.1. read user defined variables
    fileLocations, genomeAnnotation = optionReader.main()

    # f.2. read sequence data
    sampleNames = dataReader.main(fileLocations)

    # f.3. generate barcode-specific FASTQ files

    # working in a parallel environment: one worker per sample
    numberOfThreads = len(sampleNames)
    print('Initialized parallel analysis using {} threads...'.format(
        numberOfThreads))
    instances = [[fileLocations, sampleName] for sampleName in sampleNames]
    # context manager closes and joins the pool even if a worker raises
    # (the original left the pool open)
    with multiprocessing.pool.Pool(numberOfThreads) as hydra:
        hydra.map(barcodeWorkerSingleWithBarcode.main, instances)
    print('... completed.')

    # working in serial (kept for debugging)
    #for sampleName in sampleNames:
    #    print(sampleName)
    #    barcodeWorkerSingleWithBarcode.main(fileLocations,sampleName)

    # f.4. run quantification pipelines for each barcode-specific FASTQ file

    # f.5. map reads
    #readMapper.main(fileLocations,sampleNames)
    #readMapperSingle.main(fileLocations,genomeAnnotation,sampleNames)
    readMapperSingleWithBarcode.main(fileLocations, genomeAnnotation,
                                     sampleNames)

    # f.6. generate histograms of read mapping for highest-abundance transcripts

    return None
Exemplo n.º 2
0
def task(args):
  """Train one encoder/decoder algorithm on one dataset/seed and record MSE.

  Args:
    args: ((data_type, repr_dim), seed, (algName, _, makeAlg)) — a single
      work item, suitable for a parallel map over configurations.

  Side effects:
    Writes per-callback relative MSE to res/progress-encdec-mse-*.txt,
    optionally saves predictions under pred/ (if the module-level
    save_pred flag is set), and writes final metrics to
    res/final-encdec-mse-*.txt.
  """
  (data_type, repr_dim), seed, (algName, _, makeAlg) = args
  logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)
  # read the data sets
  logging.info("Reading data...")
  y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
  data_dim = x_train.shape[1]
  logging.info(" * training set: %d x %d" % x_train.shape)
  logging.info(" * testing set: %d x %d" % x_test.shape)
  # init rng so each seed is reproducible
  np.random.seed(seed)

  logging.info("Running and evaluating the algorithm...")
  logging.info(" * using representation with dimension = %d", repr_dim)

  # init the algorithm
  alg = makeAlg(data_dim, repr_dim)

  # create output dir if does not exist
  ensure_dir_exists('res')

  # progress-saving callback; the file is closed in the finally-block
  # below (the original leaked the handle)
  progress_filename = 'res/progress-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  progress_file = open(progress_filename, 'w', encoding='utf-8')
  def save_progress():
    x_test_pred = alg.decode(alg.encode(x_test))
    rel_mse = relative_mean_squared_error(x_test, x_test_pred)
    progress_file.write("%g\n" % rel_mse)

  # fit to the training data; the callbacks only fire during learn(),
  # so the progress file can be closed right after
  try:
    alg.learn(x_train,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              callbacks=[save_progress])
  finally:
    progress_file.close()

  # TODO: remove? NOTE(review): this makes the "test" evaluation below run
  # on the *training* data — confirm this is intentional
  x_test = x_train

  # test with the testing data
  x_test_pred = alg.decode(alg.encode(x_test))
  ensure_dir_exists('pred')
  pred_filename = 'pred/final-encdec-%s-%d-%s' % (data_type, seed, algName)
  if save_pred:  # NOTE(review): save_pred is a module-level flag defined elsewhere
    np.save(pred_filename, x_test_pred)
  mse = mean_squared_error(x_test, x_test_pred)
  rel_mse = relative_mean_squared_error(x_test, x_test_pred)

  logging.info("Result: rel_mse = %g", rel_mse)
  logging.info("Writing results to a file...")
  res_filename = 'res/final-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  with open(res_filename, 'w', encoding='utf-8') as f:
    f.write("data = %-16s seed = %-4d alg = %-10s " % (data_type, seed, algName))
    f.write("mse = %.6f  " % mse)
    f.write("rel_mse = %.6f  " % rel_mse)
    f.write("\n")
Exemplo n.º 3
0
def task(args):
  """Train one encoder on one dataset/seed against PCA target projections.

  Args:
    args: (data_type, seed, (algName, _, makeAlg)) — a single work item.

  Side effects:
    Writes per-callback relative MSE to res/progress-enc-mse-*.txt and
    training logs under log/.

  NOTE(review): repr_dim is used below but is not unpacked from args, so
  it must resolve to a module-level variable — confirm it is defined at
  module scope.
  """
  data_type, seed, (algName, _, makeAlg) = args
  logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)
  # read the data sets
  logging.info("Reading data...")
  y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
  data_dim = x_train.shape[1]
  logging.info(" * training set: %d x %d" % x_train.shape)
  logging.info(" * testing set: %d x %d" % x_test.shape)
  # init rng so each seed is reproducible
  np.random.seed(seed)

  # NOTE(review): evaluation data is replaced by the training data here —
  # confirm this is intentional
  x_test = x_train

  logging.info("Running and evaluating the algorithm...")

  # init the algorithm
  alg = makeAlg(data_dim, repr_dim)

  # create output dir if does not exist
  ensure_dir_exists('res')

  # the regression targets are PCA projections of the inputs; this
  # overwrites the y_train/y_test loaded from dataReader above
  from sklearn.decomposition import PCA as sk_PCA
  pca = sk_PCA(n_components=repr_dim)
  pca.fit(x_train)
  y_train = pca.transform(x_train)
  y_test = pca.transform(x_test)

  # progress-saving callback; the file is closed in the finally-block
  # below (the original leaked the handle)
  progress_filename = 'res/progress-enc-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  progress_file = open(progress_filename, 'w', encoding='utf-8')
  def save_progress():
    y_test_pred = alg.encode(x_test)
    rel_mse = relative_mean_squared_error(y_test, y_test_pred)
    progress_file.write("%g\n" % rel_mse)

  # fit; the callbacks only fire during learn(), so the progress file
  # can be closed right after
  try:
    alg.learn(x_train, y_train,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              callbacks=[save_progress])
  finally:
    progress_file.close()
#tiled = False
tiled = (n_projections[0], len(data_types) * n_projections[1])

#figsize = (12.0, 9.0)
figsize = (6.0, 6.0)

#print(mpl.rcParams['axes.color_cycle'])

if tiled:
    plt.figure(figsize=(tiled[0] * figsize[0], tiled[1] * figsize[1]))

for d, data_type in enumerate(data_types):
    print("data = %s ..." % data_type)
    s = 0
    seed = seeds[s]
    y_train, x_train, y_test, x_test = dataReader.main("%s_%d" %
                                                       (data_type, seed))
    x_test = x_train
    x = x_test
    x = x[1:1000, :]
    for i in range(n_projections[1] * n_projections[0]):
        if tiled:
            plt.subplot(tiled[1], tiled[0],
                        d * n_projections[1] * tiled[0] + i + 1)
        else:
            plt.figure(figsize=figsize)
        ax = plt.gca()
        a = 2 * i
        b = 2 * i + 1
        plt.plot(x[:, a], x[:, b], '.k', label="data")
        plt.xlim([-1.2, 1.2])
        plt.ylim([-1.2, 1.2])