Example #1
    def __init__(self, properties):
        if not os.path.exists(DEV_PATH):
            preprocessing.run()

        logging.info('Preparing development set...')
        self.devset = json.load(open(DEV_PATH))
        self.devdata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(
            self.devset)

        logging.info('Preparing trainset...')
        self.trainset = json.load(open(TRAIN_PATH))
        self.traindata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(
            self.trainset)
        info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
        logging.info(info)

        self.EPOCH = properties['EPOCH']
        self.BATCH = properties['BATCH']
        self.EMB_DIM = properties['EMB_DIM']
        self.HIDDEN_DIM = properties['HIDDEN_DIM']
        self.DROPOUT = properties['DROPOUT']
        self.EARLY_STOP = properties['EARLY_STOP']
        self.pretrained = properties['pretrained_input']

        print('\nInitializing model...')
        print(self.fname())
        self.init()
Example #2
    def __init__(self, stop=True, vector=''):
        if not os.path.exists(DEV_PATH):
            preprocessing.run()

        self.stop = stop
        self.vector = vector
        self.alignments = []

        print('Preparing test set...')
        self.testset = json.load(open(TEST_PATH))

        print('Preparing development set...')
        self.devset = json.load(open(DEV_PATH))

        print('Preparing trainset...')
        self.trainset = json.load(open(TRAIN_PATH))

        self.word2vec = None
        if 'word2vec' in self.vector:
            self.word2vec = word2vec.init_word2vec(WORD2VEC_PATH)

        self.trainidx = self.trainelmo = self.devidx = self.develmo = None
        self.testidx = self.testelmo = None
        self.fulltrainidx = self.fulltrainelmo = self.fulldevidx = self.fulldevelmo = None
        self.fulltestidx = self.fulltestelmo = None
        if 'elmo' in self.vector:
            self.trainidx, self.trainelmo, self.devidx, self.develmo, self.testidx, self.testelmo = elmo.init_elmo(
                True, ELMO_PATH)
Example #3
    def __init__(self):
        if not os.path.exists(DEV_PATH):
            preprocessing.run()

        logging.info('Preparing development set...', extra=d)
        self.devset = json.load(open(DEV_PATH))
        self.devdata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(
            self.devset)

        logging.info('Preparing trainset...', extra=d)
        self.trainset = json.load(open(TRAIN_PATH))
        self.traindata, self.voc2id, self.id2voc, self.vocabulary = utils.prepare_traindata(
            self.trainset)
        info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
        logging.info(info, extra=d)

        self.translation = features.init_translation(traindata=self.trainset,
                                                     alpha=0.7,
                                                     sigma=0.3)

        logging.info('Preparing SimBOW...', extra=d)
        self.simbow = SemevalQuestionCosine()
        self.simbow.train()

        self.trainidx, self.trainelmo, self.devidx, self.develmo = features.init_elmo()
        self.fulltrainidx, self.fulltrainelmo, self.fulldevidx, self.fulldevelmo = features.init_elmo(
            stop=False)
        self.word2vec = features.init_word2vec()
Example #4
def main():
    """Entry point if called as an executable"""

    if USE_SBATCH:
        mode = "sbatch"
    else:
        mode = "direct"

    if not os.path.exists(PHE_DIR):
        os.makedirs(PHE_DIR)

    # estimate hsq with GCTA
    for hsq in HSQ:
        for n_snp in N_SNP:
            for n_ind in N_IND:
                phe_dir = os.path.join(PHE_DIR, "hsq_" + str(hsq) + "-snp_" + str(n_snp) + "-ind_" +
                                       str(n_ind))

                phe_file = os.path.join(phe_dir, "phe_\\i")

                out_dir_hsq = os.path.join(HSQ_DIR, os.path.basename(phe_dir))

                if os.path.exists(out_dir_hsq):
                    shutil.rmtree(out_dir_hsq)
                os.makedirs(out_dir_hsq)

                print("estimating hsq...")
                sys.stdout.flush()
                preprocessing.run([MYGCTA, GCTA,
                                   "--grm-bin", GRM,
                                   "--pheno", phe_file + ".phen",
                                   "--qcovar", os.path.join(PHE_DIR, "age.txt"),
                                   "--qcovar", PCS,
                                   "--covar", os.path.join(PHE_DIR, "centre.txt"),
                                   "--covar", os.path.join(PHE_DIR, "sex.txt"),
                                   "--out", os.path.join(out_dir_hsq, os.path.basename(phe_file)),
                                   "--reml-no-constrain"],
                                  mode=mode,
                                  slurm_par=["-J", "simu_hsq",
                                             "--mem", "2G",
                                             "-D", out_dir_hsq,
                                             "-W"],
                                  array=range(1, N_ITER+1),
                                  check=False)
                print("creating zip file...")
                sys.stdout.flush()
                shutil.make_archive(out_dir_hsq, "zip", os.path.dirname(out_dir_hsq), os.path.basename(out_dir_hsq))
Example #5
def knn():
    test_X, test_y, training_X, training_y = preprocessing.run()

    #Hyperparam Tuning
    """
    k_num_values = 20
    val_error = np.zeros(k_num_values)
    train_error = np.zeros(k_num_values)

    for k in range(k_num_values):
        #k values are (1 to 20), but are stored as (0 to 19)
        ztrain, zval = kfoldcv.knn5F(training_X, training_y, k+1)
        val_error[k] = zval.mean()
        train_error[k] = ztrain.mean()
    """

    #for random seed 30 when shuffling data
    train_error = np.array([
        0.910490152897502, 0.912516202101554, 0.9122242496696659,
        0.9104851192348832, 0.9084590700308312, 0.9145460265525702,
        0.9116474758279326, 0.9104880555380775, 0.9148295895467605,
        0.9130950733027119, 0.9130950733027119, 0.9130950733027119,
        0.9130950733027119, 0.9130950733027119, 0.9130950733027119,
        0.9119356530128568, 0.9165683005096582, 0.9165683005096582,
        0.9165683005096582, 0.9165683005096582
    ])

    val_error = np.array([
        0.9235448312945289, 0.9282094367522516, 0.9258502486893398,
        0.9269995967199891, 0.9223820405968544, 0.9154187390778329,
        0.9096115069229735, 0.9211990858986423, 0.920002688533405,
        0.9188600618362683, 0.9235045032934532, 0.9188869471703185,
        0.9200430165344805, 0.9130864363489716, 0.9200699018685305,
        0.9247143433257158, 0.9235179459604786, 0.9339696195725231,
        0.9281892727517139, 0.9328001075413361
    ])
    BEST_K = 7

    #Training
    clf = KNeighborsClassifier(n_neighbors=BEST_K)
    pca = PCA(n_components=100)

    pca_training_X = pca.fit_transform(training_X)
    pca_test_X = pca.transform(test_X)

    clf.fit(pca_training_X, training_y)

    #Testing
    predictions = clf.predict(pca_test_X)
    confusion_matrix = np.zeros((12, 12))
    for prediction, label in zip(predictions, test_y):
        confusion_matrix[prediction][label] += 1

    print('KNN Accuracy: ', accuracy_score(predictions, test_y))

    #Graphing
    visualize_KNN_k(train_error, val_error, 'KNN_K_Value_Plot')
    visualize_precision_recall(confusion_matrix, 'KNN_Precision_Recall_Plot',
                               'KNN Precision and Recall Values by Category')
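BEST_K above is hard-coded from the earlier tuning run. As a minimal sketch, assuming val_error holds the validation errors for k = 1..20 (index k-1 maps to k), the same choice can be re-derived:

import numpy as np

def best_k(val_error):
    # Pick the 1-based k whose validation error is lowest; for the values
    # listed above the minimum sits at index 6, i.e. k = 7.
    return int(np.argmin(val_error)) + 1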
Example #6
    def __init__(self):
        if not os.path.exists(DEV_PATH):
            preprocessing.run()
        logging.info('Preparing development set...', extra=d)
        self.devset = json.load(open(DEV_PATH))

        logging.info('Preparing trainset...', extra=d)
        self.trainset = json.load(open(TRAIN_PATH))

        logging.info('Preparing word2vec...', extra=d)
        self.word2vec = features.init_word2vec()
        # self.glove, self.voc2id, self.id2voc = features.init_glove()
        logging.info('Preparing elmo...', extra=d)
        self.trainidx, self.trainelmo, self.devidx, self.develmo = features.init_elmo()
        self.tfidf = {}
        self.dict = Dictionary()
Example #7
def main(config_file):
    """Entry point if called as an executable"""
    config = config_dataset.config_dataset(config_file)

    in_grm = os.path.join(config.grm_dir, 'grm-all', 'all')
    out_file_pca = os.path.join(config.grm_dir, 'grm-all', 'all.pca')

    in_grm_filtered = os.path.join(config.grm_dir, 'grm-all-' + str(config.grm_cutoff),
                                   'all-' + str(config.grm_cutoff))
    out_file_pca_filtered = os.path.join(config.grm_dir, 'grm-all-' + str(config.grm_cutoff),
                                         'all-' + str(config.grm_cutoff) + '.pca')

    nbpcs = 10

    if config.use_sbatch:
        rmode = "srun"
    else:
        rmode = "direct"

    # compute PCA for all individuals
    preprocessing.run([config.mygcta, config.gcta,
                       "--grm-bin", in_grm,
                       "--pca", str(nbpcs),
                       "--out", out_file_pca,
                       "--thread-num", str(config.nbproc)],
                      mode=rmode,
                      slurm_par=["-J", "gcta_pca",
                                 "-p", "common,dedicated",
                                 "--qos", "fast",
                                 "-c", str(config.nbproc)])
    plot_pca(out_file_pca)

    # compute PCA for unrelated individuals
    preprocessing.run([config.mygcta, config.gcta,
                       "--grm-bin", in_grm_filtered,
                       "--pca", str(nbpcs),
                       "--out", out_file_pca_filtered,
                       "--thread-num", str(config.nbproc)],
                      mode=rmode,
                      slurm_par=["-J", "gcta_pca",
                                 "-p", "common,dedicated",
                                 "--qos", "fast",
                                 "-c", str(config.nbproc)])
    plot_pca(out_file_pca_filtered)
Example #8
def main(config_file):

    """Entry point if called as an executable"""

    config = config_dataset.config_dataset(config_file)

    # ========= 1. All SNPS =============

    in_dir_gwas_allsnps = os.path.join(config.gwa_dir, 'gwas-all')
    out_dir = os.path.join(config.hsq_dir, 'genesis')
    out_dir_log = os.path.join(out_dir, 'log')

    os.makedirs(out_dir_log, exist_ok=True)

    # slurm configuration
    if config.use_sbatch:
        mode = "sbatch"
    else:
        mode = "direct"

    for pheno in config.phe_list:
        assoc_file = os.path.join(in_dir_gwas_allsnps, "all." + pheno + ".assoc.linear")
        res_file = os.path.join(out_dir, "all." + pheno + ".RData")

        if not os.path.exists(assoc_file):
            print("Warning: {} not found.".format(assoc_file))
            continue

        cmd = ["Rscript",
               os.path.join(os.path.dirname(os.path.abspath(__file__)), 'genesis.R'),
               assoc_file,
               res_file,
               str(config.nbproc)]
        slurm_par = ["-J", "genesis",
                     "--qos", "ghfc",
                     "-p", "ghfc",
                     # "-p", "common",
                     "-D", out_dir_log,
                     "-o", "all." + pheno + "-%j.out",
                     "-e", "all." + pheno + "-%j.out",
                     "-c", str(config.nbproc),
                     "--mem", "4G"]
        preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)
Example #9
def main():
    """Entry point if called as an executable"""
    in_grm_filtered = os.path.join(GRM_DIR, 'grm-all-' + str(GRM_CUTOFF),
                                   'all-' + str(GRM_CUTOFF))
    out_file_pca = os.path.join(GRM_DIR, 'grm-all-' + str(GRM_CUTOFF),
                                'all-' + str(GRM_CUTOFF))
    nbpcs = 10

    if USE_SBATCH:
        rmode = "srun"
    else:
        rmode = "direct"

    preprocessing.run([
        MYGCTA, GCTA, "--grm-bin", in_grm_filtered, "--pca",
        str(nbpcs), "--out", out_file_pca, "--thread-num",
        str(NBPROC)
    ],
                      mode=rmode,
                      slurm_par=["-c", str(NBPROC)])
Example #10
    def __init__(self,
                 stop=True,
                 lowercase=True,
                 punctuation=True,
                 w2v_dim=300):
        if not os.path.exists(DEV_PATH):
            preprocessing.run()

        self.w2v_dim = w2v_dim
        self.lowercase = lowercase
        self.stop = stop
        self.punctuation = punctuation

        logging.info('Preparing test set 2016...')
        self.testset2016 = json.load(open(TEST2016_PATH))
        self.test2016data = self.format_data(self.testset2016)

        logging.info('Preparing test set 2017...')
        self.testset2017 = json.load(open(TEST2017_PATH))
        self.test2017data = self.format_data(self.testset2017)

        logging.info('Preparing development set...')
        self.devset = json.load(open(DEV_PATH))
        self.devdata = self.format_data(self.devset)

        logging.info('Preparing trainset...')
        self.trainset = json.load(open(TRAIN_PATH))
        self.traindata = self.format_data(self.trainset)
        info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
        logging.info(info)

        self.word2vec = word2vec.init_word2vec(lowercase=self.lowercase,
                                               punctuation=self.punctuation,
                                               stop=self.stop,
                                               dim=self.w2v_dim)

        # additional data
        self.init_additional()
Example #11
def svm():
    test_X, test_y, training_X, training_y = preprocessing.run()

    #Training
    clf = SVC(gamma='scale', decision_function_shape='ovo')
    pca = PCA(n_components=100)

    pca_training_X = pca.fit_transform(training_X)
    pca_test_X = pca.transform(test_X)

    clf.fit(pca_training_X, training_y)

    #Testing
    predictions = clf.predict(pca_test_X)
    confusion_matrix = np.zeros((12, 12))
    for prediction, label in zip(predictions, test_y):
        confusion_matrix[prediction][label] += 1

    print('SVM Accuracy: ', accuracy_score(predictions, test_y))

    #Graphing
    visualize_precision_recall(confusion_matrix, 'SVM_Precision_Recall_Plot',
                               'SVM Precision and Recall Values by Category')
Example #12
def extract(filename, inverse=False, resize=False):
    preprocessing.run(filename, inverse, resize)
    processing.run(filename)
    postprocessing.run(filename)
    return extraction.run(filename)
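A minimal usage sketch, assuming each *.run stage reads and writes intermediate files keyed by filename (the filename and flag values below are hypothetical):

# Hypothetical call: invert and resize a scanned image before extraction.
result = extract('scan_001.png', inverse=True, resize=True)
print(result)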
Example #13
def main():
    """Entry point if called as an executable"""
    # initiate seed for pandas (but not for GCTA)
    np.random.seed(2017)

    gen_file = os.path.join(PRU_DIR, 'all')

    if USE_SBATCH:
        mode = "sbatch"
    else:
        mode = "direct"

    if not os.path.exists(PHE_DIR):
        os.makedirs(PHE_DIR)

    # load SNP table
    snp_table = pd.read_table(gen_file + ".bim", na_values=".", header=None).iloc[:, 1]
    snp_table.dropna(inplace=True)

    # load individual table
    ind_table = pd.read_table(PCS, sep=' ', dtype={'FID': str, 'IID': str}).loc[:, ['FID', 'IID']]

    # simulate heritable phenotypes with GCTA
    for hsq in HSQ:
        for n_snp in N_SNP:
            for n_ind in N_IND:
                out_dir = os.path.join(PHE_DIR, "hsq_" + str(hsq) + "-snp_" + str(n_snp) + "-ind_" +
                                       str(n_ind))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                os.makedirs(out_dir)

                phe_file = os.path.join(out_dir, "phe_\\i")
                snp_file = os.path.join(out_dir, "snp_\\i.txt")
                ind_file = os.path.join(out_dir, "ind_\\i.txt")

                print("Generating SNP and individual lists...")
                for i in range(1, N_ITER+1):
                    # extract SNP subset
                    snp_file_i = snp_file.replace("\\i", str(i))
                    snp_list = snp_table.sample(n_snp)
                    snp_list.to_csv(snp_file_i, index=False)

                    # extract individual subset
                    ind_file_i = ind_file.replace("\\i", str(i))
                    ind_list = ind_table.sample(n_ind).sort_values(by=["FID", "IID"])
                    ind_list.to_csv(ind_file_i, index=False, sep='\t')

                print("Simulating phenotypes...")
                sys.stdout.flush()
                # simulate phenotypes with GCTA
                preprocessing.run([GCTA,
                                   "--bfile", gen_file,
                                   "--keep", ind_file,
                                   "--simu-qt",
                                   "--simu-hsq", str(hsq),
                                   "--simu-causal-loci", snp_file,
                                   "--out", phe_file],
                                  mode=mode,
                                  slurm_par=["-J", "simu_pheno",
                                             "--mem", "4G",
                                             "-D", out_dir,
                                             "-W"],
                                  array=range(1, N_ITER+1))
Example #14
    def __init__(self,
                 stop=True,
                 vector='',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 elmo_layer='top',
                 w2vdim=300):
        if not os.path.exists(DEV_PATH):
            preprocessing.run()

        self.lowercase = lowercase
        self.stop = stop
        self.punctuation = punctuation
        self.w2vdim = w2vdim
        self.proctrain = proctrain
        self.vector = vector
        self.elmo_layer = elmo_layer

        logging.info('Preparing test set 2016...')
        self.testset2016 = json.load(open(TEST2016_PATH))
        self.test2016data, _, _, _ = self.format_data(self.testset2016)

        logging.info('Preparing test set 2017...')
        self.testset2017 = json.load(open(TEST2017_PATH))
        self.test2017data, _, _, _ = self.format_data(self.testset2017)

        logging.info('Preparing development set...')
        self.devset = json.load(open(DEV_PATH))
        self.devdata, _, _, _ = self.format_data(self.devset)

        logging.info('Preparing trainset...')
        self.trainset = json.load(open(TRAIN_PATH))
        self.traindata, self.voc2id, self.id2voc, self.vocabulary = self.format_data(
            self.trainset)
        info = 'TRAIN DATA SIZE: ' + str(len(self.traindata))
        logging.info(info)

        self.word2vec = None
        if 'word2vec' in self.vector:
            self.word2vec = word2vec.init_word2vec(
                lowercase=self.lowercase,
                punctuation=self.punctuation,
                stop=self.stop,
                dim=self.w2vdim)

        self.fasttext = None
        if 'fasttext' in self.vector:
            self.fasttext = fasttext.init_fasttext(
                lowercase=self.lowercase,
                punctuation=self.punctuation,
                stop=self.stop,
                dim=self.w2vdim)

        self.trainidx = self.trainelmo = self.devidx = self.develmo = None
        self.test2016idx = self.test2016elmo = self.test2017idx = self.test2017elmo = None
        if 'elmo' in self.vector:
            self.trainidx, self.trainelmo, self.devidx, self.develmo, self.test2016idx, self.test2016elmo, self.test2017idx, self.test2017elmo = elmo.init_elmo(
                lowercase=self.lowercase,
                stop=self.stop,
                punctuation=self.punctuation,
                path=ELMO_PATH)

        self.alignments = self.init_alignments(ALIGNMENTS_PATH)

        # additional data
        self.init_additional()
Example #15
def main():
    """Entry point if called as an executable"""

    if USE_SBATCH:
        mode = "sbatch"
    else:
        mode = "direct"

    gen_file = os.path.join(PRU_DIR, 'all')

    # estimate hsq with GCTA
    for hsq in HSQ:
        for n_snp in N_SNP:
            for n_ind in N_IND:
                phe_dir = os.path.join(PHE_DIR, "hsq_" + str(hsq) + "-snp_" + str(n_snp) + "-ind_" +
                                       str(n_ind))

                phe_file = os.path.join(phe_dir, "phe_\\i")
                ind_file = os.path.join(phe_dir, "ind_\\i.txt")

                print("Remove header of individual lists...")
                for i in range(1, N_ITER+1):
                    ind_file_i = ind_file.replace("\\i", str(i))
                    phe_file_i = phe_file.replace("\\i", str(i))

                    with open(ind_file_i, 'r') as source_file, \
                            open(phe_file_i + ".ind", 'w') as target_file:
                        source_file.readline()  # skip the header line
                        shutil.copyfileobj(source_file, target_file)

                out_dir_hsq = os.path.join(HSQ_DIR + "_gctb", os.path.basename(phe_dir))

                if os.path.exists(out_dir_hsq):
                    shutil.rmtree(out_dir_hsq)
                os.makedirs(out_dir_hsq)

                print("estimating hsq...")
                sys.stdout.flush()

                # estimate required memory
                nsnp = sum(1 for _ in open(gen_file + ".bim"))
                mem = 4 * nsnp * n_ind + nsnp * 500
                mem_str = str(math.ceil(mem / 1e6)) + "M"

                preprocessing.run([MYPLINK, 'mpirun', '-np', str(NBPROC), '--oversubscribe', GCTB,
                                   "--bfile", gen_file,
                                   "--pheno", phe_file + ".phen",
                                   "--keep", phe_file + ".ind",
                                   "--qcovar", os.path.join(PHE_DIR, "age.txt"),
                                   "--qcovar", PCS,
                                   "--covar", os.path.join(PHE_DIR, "centre.txt"),
                                   "--covar", os.path.join(PHE_DIR, "sex.txt"),
                                   "--bayes", "S",
                                   "--out", os.path.join(out_dir_hsq, os.path.basename(phe_file))],
                                  mode=mode,
                                  slurm_par=["-J", "simu_hsq",
                                             "--mem", mem_str,
                                             "-c", str(NBPROC),
                                             "-D", out_dir_hsq,
                                             "-W"],
                                  array=range(1, N_ITER+1),
                                  check=False)
                print("creating zip file...")
                sys.stdout.flush()
                shutil.make_archive(out_dir_hsq, "zip", os.path.dirname(out_dir_hsq), os.path.basename(out_dir_hsq))
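A rough check of the memory estimate above, assuming the formula counts bytes (4 bytes per genotype in an nsnp x n_ind matrix plus ~500 bytes of per-SNP overhead) and divides by 1e6 to get megabytes:

# Hypothetical values: nsnp = 500_000, n_ind = 5_000
#   mem = 4 * 500_000 * 5_000 + 500_000 * 500 = 10_250_000_000
#   math.ceil(10_250_000_000 / 1e6) = 10250  ->  mem_str = "10250M"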
Example #16
def main():
    """Entry point if called as an executable"""

    in_prefix = 'all'
    in_file_allsnps = os.path.join(FIL_DIR, in_prefix)
    out_dir_gwas_allsnps = os.path.join(GWA_DIR, 'gwas-all')
    log_dir_gwas_allsnps = os.path.join(out_dir_gwas_allsnps, 'log')
    in_file_prunedsnps = os.path.join(PRU_DIR, in_prefix)
    out_dir_gwas_prunedsnps = os.path.join(GWA_DIR, 'gwas-pruned')
    log_dir_gwas_prunedsnps = os.path.join(out_dir_gwas_prunedsnps, 'log')

    ## use sex, center and age as covariates

    ## filter individuals in centre.txt keeping only those in the genotype file
    fam_table = pd.read_table(
        in_file_allsnps + ".fam",
        delim_whitespace=True,
        names=['FID', 'IID', 'PID', 'MID', 'Gender', 'Phenotype'])
    centre_table = pd.read_table(os.path.join(PHE_DIR, "centre.txt"),
                                 delim_whitespace=True,
                                 index_col=False)
    centre_table = centre_table[centre_table.IID.isin(fam_table.IID)]
    centre_table.to_csv(os.path.join(PHE_DIR, "centre.cov"),
                        sep='\t',
                        index=False)

    # slurm configuration
    if USE_SBATCH:
        smode = "sbatch"
    else:
        smode = "direct"

    ## All SNPs

    os.makedirs(log_dir_gwas_allsnps, exist_ok=True)

    for pheno in PHE_LIST:
        out_prefix = 'all.' + pheno
        preprocessing.run([
            MYPLINK, PLINK, "--bfile", in_file_allsnps, "--allow-no-sex",
            "--linear", "hide-covar", "--pheno",
            os.path.join(PHE_DIR, pheno + ".txt"), "--qcovar",
            os.path.join(PHE_DIR, "age.txt"), "--qcovar", PCS, "--covar",
            os.path.join(PHE_DIR, "centre.cov"), "--qcovar",
            os.path.join(PHE_DIR, "sex.txt"), "--out",
            os.path.join(out_dir_gwas_allsnps, out_prefix)
        ],
                          mode=smode,
                          slurm_par=["-J", "gwas", "-D", log_dir_gwas_allsnps])

    ## Pruned SNPs

    os.makedirs(log_dir_gwas_prunedsnps, exist_ok=True)

    for pheno in PHE_LIST:
        out_prefix = 'all.' + pheno
        preprocessing.run(
            [
                MYPLINK, PLINK, "--bfile", in_file_prunedsnps,
                "--allow-no-sex", "--linear", "hide-covar", "--pheno",
                os.path.join(PHE_DIR, pheno + ".txt"), "--qcovar",
                os.path.join(PHE_DIR, "age.txt"), "--qcovar", PCS, "--covar",
                os.path.join(PHE_DIR, "centre.cov"), "--qcovar",
                os.path.join(PHE_DIR, "sex.txt"), "--out",
                os.path.join(out_dir_gwas_prunedsnps, out_prefix)
            ],
            mode=smode,
            slurm_par=["-J", "gwas", "-D", log_dir_gwas_prunedsnps])
Example #17
import preprocessing
from warehouse import Warehouse
import numpy as np
import mdptoolbox
import pandas as pd

# Step 1
# Run only once to create the transition probability matrix
preprocessing.run()
warehouse = Warehouse()
warehouse.save_tpm()

# Step 2
warehouse = Warehouse()
tpm = warehouse.get_tpm()
rewards_matrix = warehouse.rewards_matrix()

mdp_p = mdptoolbox.mdp.PolicyIteration(tpm, rewards_matrix, 0.9, max_iter=100)
mdp_v = mdptoolbox.mdp.ValueIteration(tpm, rewards_matrix, 0.9, max_iter=100)

mdp_p.run()
mdp_v.run()

result_p = warehouse.test_rl_policy(mdp_p.policy)
result_v = warehouse.test_rl_policy(mdp_v.policy)

print("ValueIteration Robot traveled: ", result_v[0])
value_iter_states = result_v[1]

print("PolicyIteration Robot traveled: ", result_p[0])
policy_iter_states = result_p[1]
Example #18
"""
Atividade para trabalhar o pré-processamento dos dados.

Criação de modelo preditivo para diabetes e envio para verificação de peformance
no servidor.

@author: Aydano Machado <*****@*****.**>
"""

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import requests
import preprocessing

print('\n - Running preprocessing')
feature_cols = preprocessing.run()

print('\n - Reading the diabetes dataset file')
data = pd.read_csv('diabetes_dataset.csv')

# Creating X and y for the machine learning algorithm.
print(' - Creating X and y for the learning algorithm from the diabetes_dataset file')
# To change which columns are considered, just edit the array below.

# feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
#                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = data[feature_cols]
y = data.Outcome

# Creating the predictive model for this dataset
print(' - Creating predictive model')
Example #19
def run():
    X, y = preprocessing.run()
    clf = svm.SVC(gamma='scale', decision_function_shape='ovo')
    #for now, just run the function. eventually, we'll be iterating through these, passing in different tuning parameters each time
    z = kfoldcv.svm5F(X, y)
    return clf.fit(X, y)
Example #20
'''
vae_sess = tf.Session()
rnn_sess = tf.Session()
actor_sess = tf.Session()

env = EnvWrap(FLAGS.init_frame_skip, FLAGS.frame_skip, FLAGS.env,
              FLAGS.renderGame)
vaegan = VAEGAN.VAEGAN(vae_sess)
rnn = RNN.RNN(rnn_sess)
actor = ACTOR.ACTOR(actor_sess)
mcts = MCTS.Tree(rnn, actor)
trainer = Trainer()

#If called, train the VAEGAN AND RNN before the actor
if (FLAGS.preprocessing):
    preprocessing.run(env, vaegan, trainer, rnn)

if (FLAGS.playing):
    #Make the actor play and train VAEGAN, RNN and actor
    playing.run(env, vaegan, rnn, actor, trainer, mcts)
'''
def main():

    # Train alphazero using MCTS
    trainer.trainActor(mcts, vae, rnn, env, actor)

    # Once the VAE and RNN have been trained, use alphazero: it takes the
    # current state and asks the MCTS to create the next states. The current
    # state is handed to the MCTS, which starts its algorithm and, at every
    # iteration, asks alphazero to evaluate the node.

Example #21
def main(config_file):
    config = config_dataset.config_dataset(config_file)

    nbit = config.nbit
    nbproc = config.nbproc

    out_dir = config.permu_dir
    out_dir_log = os.path.join(out_dir, 'log')
    if not os.path.exists(out_dir_log):
        os.makedirs(out_dir_log)

    if config.use_sbatch:
        mode = "sbatch"
    else:
        mode = "direct"

    # Run permutations
    def run_permutations():

        in_file_pruned = os.path.join(config.pru_dir, 'all')
        
        # Estimate required memory in kilobytes
        nind = preprocessing.linecount(config.keep_ind)
        nsnp_tot = preprocessing.linecount(in_file_pruned + '.bim')
        nsnps = [preprocessing.linecount(f) for f in snplist_files]
        if nsnp_tot == sum(nsnps):
            ncomp = 1 + len(snplist_files)
        else:
            ncomp = 2 + len(snplist_files)
        mem = math.ceil(500000 + (0.025 + 0.009 * ncomp) * nind**2)
        sbatch_par_mem = str(mem) + 'K'
        
        array_lim = config.array_lim

        job_id = preprocessing.run_array(
                ['python3',
                 os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'permutations.py'),
                 '\\j' + '_' + '\\i',
                 in_file_pruned,
                 ",".join(snplist_files),
                 permu_path,
                 os.path.abspath(config_file)],
                mode=mode,
                slurm_par=["-J", "permu_pheno",
                           # "--qos", "fast",
                           "-p", "common",
                           "--mem", sbatch_par_mem,
                           "--cpus-per-task", str(nbproc),
                           "-D", out_dir_log],
                array=range(1, nbit+1),
                array_limit=array_lim)
        return job_id

    part2jid = {}

    # margins = [0, 10, 20, 30, 40, 50]
    margins = [0, 20, 50]

    # genic non-genic
    for margin in margins:
        snplist_files = [os.path.join(config.grm_dir, 'grm-genic', 'genic-margin' + str(margin) + '.snplist')]
        permu_path = os.path.join(out_dir, 'genic-margin' + str(margin))
        hsq_prefix = os.path.join("hsq-genic", "genic-margin" + str(margin))

        part2jid[hsq_prefix] = run_permutations()

        if margin > 0:
            snplist_files = [os.path.join(config.grm_dir, 'grm-genic', 'genic-margin0.snplist'),
                             os.path.join(config.grm_dir, 'grm-genic', 'updown-margin' + str(margin) + '.snplist')]
            permu_path = os.path.join(out_dir, 'updown-margin' + str(margin))
            hsq_prefix = os.path.join("hsq-genic", "updown-margin" + str(margin))

            part2jid[hsq_prefix] = run_permutations()

    # cnsexpression non-cnsexpression non-genic
    snplist_files = [os.path.join(config.grm_dir, 'grm-cnsexpression', 'cnsexpression-margin50.snplist'),
                     os.path.join(config.grm_dir, 'grm-cnsexpression', 'noncnsexpression-margin50.snplist')]
    permu_path = os.path.join(out_dir, "cnsexpression-margin50")
    hsq_prefix = os.path.join("hsq-cnsexpression", "cnsexpression-margin50")

    part2jid[hsq_prefix] = run_permutations()

    # neurodev non-neurodev non-genic
    snplist_files = [os.path.join(config.grm_dir, 'grm-neurodev', 'neurodev-margin50.snplist'),
                     os.path.join(config.grm_dir, 'grm-neurodev', 'nonneurodev-margin50.snplist')]
    permu_path = os.path.join(out_dir, "neurodev-margin50")
    hsq_prefix = os.path.join("hsq-neurodev", "neurodev-margin50")

    part2jid[hsq_prefix] = run_permutations()

    # maf
    maf_intervals = config.maf_intervals
    snplist_files = [os.path.join(config.grm_dir, 'grm-maf', 'maf{}-{}.snplist'.format(*maf_int)) for maf_int
                     in maf_intervals]
    permu_path = os.path.join(out_dir, "maf")
    hsq_prefix = os.path.join("hsq-maf", "maf")

    part2jid[hsq_prefix] = run_permutations()

    # == Once permutations are done, compute z-scores and p-value for the different partitions == #

    # to compute the p-values with sbatch
    for partition, jid in part2jid.items():
        cmd = ["python3",
               os.path.join(os.path.dirname(os.path.abspath(__file__)), 'permutations_zscores.py'),
               os.path.abspath(config_file),
               partition,
               out_dir]
        slurm_par = ["-J", "zscores_" + partition,
                     "--qos", "fast",
                     "-p", "dedicated",
                     # "-p", "common",
                     "-D", out_dir_log,
                     "--mem", "2G",
                     "--dependency", "afterany:" + jid]
        preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)
Example #22
def main(config_file):

    """Entry point if called as an executable"""

    config = config_dataset.config_dataset(config_file)

    # numit = 10000
    # burnin = 5000
    seed = 333
    ndist = 4
    gpin = [0., 0.00001, 0.0001, 0.001]

    # slurm configuration
    if config.use_sbatch:
        mode = "sbatch"
    else:
        mode = "direct"

    ncpus = config.nbproc

    out_dir = os.path.join(config.hsq_dir, 'bayesR')
    log_dir = os.path.join(out_dir, 'log')
    tmp_dir = os.path.join(out_dir, 'plink')

    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    in_files = {"all": os.path.join(config.fil_dir, 'all'), "pruned": os.path.join(config.pru_dir, 'all')}

    for key in in_files:

        in_file = in_files[key]

        for pheno in config.phe_list:
            pheno_file = os.path.join(config.phe_dir, pheno+'.txt')
            tmp_in_file = os.path.join(tmp_dir, key + "." + pheno)
            out_file = os.path.join(out_dir, key + "." + pheno)

            data = pd.read_table(pheno_file)
            data.rename(columns={data.columns[2]: "pheno"}, inplace=True)

            for qcov in config.quant_covar:
                data = data.merge(pd.read_table(qcov, sep=r'\s+'))
            for cov in config.qual_covar:
                data = data.merge(pd.read_table(cov, sep=r'\s+'))

            data.set_index(["FID", "IID"], inplace=True)

            model = smf.ols(formula='pheno~' + "+".join(data.columns.difference(["pheno"])), data=data).fit()
            resid_file = tmp_in_file + ".resid.txt"
            model.resid.to_csv(resid_file, sep=" ")

            preprocessing.run([config.plink,
                               "--bfile", in_file,
                               "--keep", config.keep_ind,
                               "--pheno", resid_file,
                               "--make-bed",
                               "--out", tmp_in_file
                               ])

            # Estimate required memory in megabytes
            nind = preprocessing.linecount(tmp_in_file + '.fam')
            nsnp = preprocessing.linecount(tmp_in_file + '.bim')
            mem = math.ceil((1 + 2e-6 * nind * nsnp) * 1.1)
            sbatch_par_mem = str(mem) + 'M'

            cmd = [config.bayesrv2,
                   "-bfile", tmp_in_file,
                   "-nthreads", str(ncpus),
                   "-ndist", str(ndist),
                   "-gpin", ",".join(map(str, gpin)),
                   "-out", out_file,
                   # "-numit", str(numit),
                   # "-burnin", str(burnin),
                   "-seed", str(seed)]
            slurm_par = ["-J", "bayesR_" + key,
                         "--qos", "ghfc",
                         "-p", "ghfc",
                         "-c", str(ncpus),
                         "-D", log_dir,
                         "-o", key + "." + pheno + "-%j.out",
                         "-e", key + "." + pheno + "-%j.out",
                         "--mem", sbatch_par_mem]
            jid = preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)

            if config.clean_permu:
                cmd = ["rm", tmp_in_file + ".*"]
                slurm_par = ["-J", "clean_bayesR",
                             "-p", "common,dedicated",
                             "--qos", "fast",
                             "-D", log_dir,
                             "-o", "clean." + key + "." + pheno + "-%j.out",
                             "-e", "clean." + key + "." + pheno + "-%j.out",
                             "--mem", "500M",
                             "--dependency", "afterany:" + jid]
                preprocessing.run(cmd, mode=mode, slurm_par=slurm_par)
Example #23
def main(config_file):
    """Entry point if called as an executable"""

    config = config_dataset.config_dataset(config_file)

    in_prefix = 'all'
    in_file_allsnps = os.path.join(config.fil_dir, in_prefix)
    out_dir_gwas_allsnps = os.path.join(config.gwa_dir, 'gwas-all')
    log_dir_gwas_allsnps = os.path.join(out_dir_gwas_allsnps, 'log')
    in_file_prunedsnps = os.path.join(config.pru_dir, in_prefix)
    out_dir_gwas_prunedsnps = os.path.join(config.gwa_dir, 'gwas-pruned')
    log_dir_gwas_prunedsnps = os.path.join(out_dir_gwas_prunedsnps, 'log')

    # use sex, center and age as covariates

    # Create dummy coded centre table
    if not os.path.isfile(os.path.join(config.phe_dir, "centre.cov")):
        # filter individuals in centre.txt keeping only those in the genotype file
        fam_table = pd.read_table(in_file_allsnps + ".fam", delim_whitespace=True,
                                  names=['FID', 'IID', 'PID', 'MID', 'Gender', 'Phenotype'])
        centre_table = pd.read_table(os.path.join(config.phe_dir, "centre.txt"), delim_whitespace=True,
                                     index_col=False)
        centre_table = centre_table[centre_table.IID.isin(fam_table.IID)]
        centre_table.to_csv(os.path.join(config.phe_dir, "centre.cov"), sep='\t', index=False)

    # slurm configuration
    if config.use_sbatch:
        smode = "sbatch"
    else:
        smode = "direct"

    # All SNPs

    os.makedirs(log_dir_gwas_allsnps, exist_ok=True)

    for pheno in config.phe_list:
        out_prefix = 'all.' + pheno
        preprocessing.run([config.myplink, config.plink,
                           "--bfile", in_file_allsnps,
                           "--allow-no-sex", "--linear", "hide-covar",
                           "--pheno", os.path.join(config.phe_dir, pheno+".txt"),
                           "--qcovar", os.path.join(config.phe_dir, "age.txt"),
                           "--qcovar", config.pcs,
                           "--covar", os.path.join(config.phe_dir, "centre.cov"),
                           "--qcovar", os.path.join(config.phe_dir, "sex.txt"),
                           "--ci", str(0.95),
                           "--out", os.path.join(out_dir_gwas_allsnps, out_prefix)],
                          mode=smode,
                          slurm_par=["-J", "gwas",
                                     "-D", log_dir_gwas_allsnps])

    # Pruned SNPs

    os.makedirs(log_dir_gwas_prunedsnps, exist_ok=True)

    for pheno in config.phe_list:
        out_prefix = 'all.' + pheno
        preprocessing.run([config.myplink, config.plink,
                           "--bfile", in_file_prunedsnps,
                           "--allow-no-sex", "--linear", "hide-covar",
                           "--pheno", os.path.join(config.phe_dir, pheno+".txt"),
                           "--qcovar", os.path.join(config.phe_dir, "age.txt"),
                           "--qcovar", config.pcs,
                           "--covar", os.path.join(config.phe_dir, "centre.cov"),
                           "--qcovar", os.path.join(config.phe_dir, "sex.txt"),
                           "--ci", str(0.95),
                           "--out", os.path.join(out_dir_gwas_prunedsnps, out_prefix)],
                          mode=smode,
                          slurm_par=["-J", "gwas",
                                     "-D", log_dir_gwas_prunedsnps])
Example #24
def run():
    X, y = preprocessing.run()
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
    distances, indices = nbrs.kneighbors(X)

    return distances, indices
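Since the model is fit on X and then queried with X itself, each row's first neighbor is the point itself at distance zero, so column 1 holds the nearest distinct neighbor. A minimal sketch of consuming the return values:

distances, indices = run()
# distances[:, 1]: each sample's distance to its nearest distinct neighbor;
# indices[:, 1]: that neighbor's row index in X.
print('mean nearest-neighbor distance:', distances[:, 1].mean())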
Example #25
import preprocessing
import learning
from utils import load_submission, load_train_df
from configuration import CONFIG


if __name__ == "__main__":
    # Create train and meteo preprocessed files
    preprocessing.run()
    # Define a model
    from sklearn.neighbors import KNeighborsRegressor

    _df = load_train_df(CONFIG.preprocessed_train_path)
    _submission_df = load_submission()
    _estimator = KNeighborsRegressor(n_neighbors=4, weights='distance')
    # estimator = LogisticRegression()
    _scoring = 'mean_squared_error'
    _k_fold = 3
    _n_jobs = 3
    _verbose = 0
    _fit_params = None
    _cols = ["YEAR", "WEEK_NUMBER", "WEEK_DAY", "TIME"]
    _weights = [1, 1, 1, 0.1]

    # Test the model
    print(learning.cross_val_score(_estimator, _cols, _k_fold, _weights, _scoring, _n_jobs, _verbose, _fit_params,
                                   chunksize=100000))

    # Create the corresponding submission file
    learning.create_submission_file(_estimator, _cols, weights=_weights)
Example #26
import preprocessing
import sys
import socket

if __name__ == "__main__":
    if len(sys.argv) == 3:
        preprocessing.run(sys.argv[1], sys.argv[2])
    elif len(sys.argv) == 2:
        ip_addr = socket.gethostbyname(socket.getfqdn())
        preprocessing.run(ip_addr, 8000, sys.argv[1])
    else:
        raise Exception("Correct argument form not supplied")