def main():
	(ppiF, goAnnotation, outF) = sys.argv[1:]

	ppis = utils.readData(ppiF, np.array([0,1]), np.array([0]))
	prot2GO = utils.readData(goAnnotation, np.array([1]), np.array([0]))

	out = getGoldStandard(ppis, prot2GO)
	outFH = open(outF, "w")
	outFH.write( "IDA\tIDB\tLabel\n" + "\n".join(out))
	outFH.close()
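# Note: the utils.readData used throughout these examples is not shown. A minimal
# sketch of the interface assumed above (an illustration, not the project's actual
# implementation): read a tab-separated file with a header line, index each row by
# a tuple built from the key columns, and map it to the list of value-column tuples.
def readData_sketch(path, keyCols, valCols):
    data = {}
    with open(path) as fh:
        fh.readline()  # skip the header line
        for line in fh:
            cols = line.rstrip("\n").split("\t")
            key = tuple(cols[i] for i in keyCols)
            data.setdefault(key, []).append(tuple(cols[i] for i in valCols))
    return data

# e.g. ppis = readData_sketch(ppiF, [0, 1], [0])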
Example #2
def runExperiments(features, es, logFile):
    # Reading the data into an array
    data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    data = m.conceptPreprocessing(data, es.removeDeniedConcepts,
                                  es.splitDeniedConcepts,
                                  es.removeUncertainConcepts,
                                  es.splitUncertainConcepts,
                                  es.removeFamilyConcepts,
                                  es.splitFamilyConcepts)

    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)
    # Looping over different feature parameters
    for featTypes in features:
        #for x in [True, False]:
        #es.fs_confidence = x

        logFile.write('Executing for ' + ','.join(featTypes) + ' model.\n')
        es.featTypes = featTypes

        if es.svmParamSweep:
            result_params = m.param_sweep_svm(data,
                                              es,
                                              gammaSweep=False,
                                              nFolds=10,
                                              verbose=False,
                                              random_seed=44)
            for name in result_params:
                logFile.write(
                    str(name) + ": " + str(result_params[name]) + '\n')
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                results = m.eval_bootstrapped_crossVal(estimator,
                                                       data,
                                                       bootstrap_data,
                                                       es,
                                                       10,
                                                       printTree=False)
            else:
                results = m.evalCrossval(estimator,
                                         data,
                                         es,
                                         10,
                                         printTree=False)
            for name in results:
                logFile.write(str(name) + ": " + str(results[name]) + '\n')
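# A possible invocation sketch for runExperiments (assumes the ExperimentSettings
# class and cfg paths used elsewhere in these examples; the log file name is a
# placeholder):
def run_selected_experiments(log_path="experiments.log"):
    es = ExperimentSettings()          # default experiment settings
    features = [["BOW"], ["DSM+1"]]    # feature-set combinations to compare
    with open(log_path, "w") as logFile:
        runExperiments(features, es, logFile)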
Example #3
 def __init__(self, name):
     self.con = lite.connect(name)
     with self.con:
         cur = self.con.cursor()
         print "Database",name,"loaded"
         sql = utils.readData(PATH+"/sqlScripts/player_save.sql")
         cur.executescript(sql)
Example #4
    def OnPlayerSelect(self, e):
        index = e.GetSelection()
        if index==wx.NOT_FOUND:
            return

        player = self.GetSelectedPlayer()
        if player == None:
            print "PlayerPanel -> OnPlayerSelect: player == None"

        else:
            stats = player.GetStats()
            value = stats_string.format(stats['AC'], stats['Listen'], 
                    stats['Spot'], stats['Search'], stats['Move_Silently'],
                    stats['Hide'])
            self.plstats_text.SetValue(value)

            items = player.GetItems()
            self.UpdateItemLbox(items)

            bstory_filename = path+'/../backstories/'+player.GetName()+'.txt'
            if os.path.isfile(bstory_filename):
                bstory_text = utils.readData(bstory_filename)
                self.bstory_tctrl.SetValue(bstory_text)
            else:
                self.bstory_tctrl.Clear()
Example #5
def runMain():
    print('Running ...')
    try:
        data = readData(INPUT_FILE_NAME) 
    except Exception as e:
        print('ERROR 1: File I/O Error While Reading Input Data! App terminating ...')
        return


    try:
        dataOutputList = processData(HEADERS_LIST, data)
    except Exception as f:
        print('ERROR 2: Error Processing Data! App terminating ...')
        return

    try:
        writeOutputToFile(OUTPUT_FILE_NAME, HEADERS_LIST, dataOutputList)
        writeData(HEADERS_LIST, 1)
        for element in dataOutputList:
            writeData(element, 0)

    except Exception as g:
        print('ERROR 3: File I/O Error While Writing Output Data! App terminating ...')
        print(g)
        return
Example #6
def main():
	(rawDataF, mappingF, outF) = sys.argv[1:]

	mapData = utils.readData(mappingF, np.array([0]), np.array([1]))	

	mapping = {}

	mappingFH = open(mappingF)
	mappingFH.readline()
	for line in mappingFH:
		line = line.rstrip()
		if len(line.split("\t")) != 2: continue
		(geneName, wormID) = line.split("\t")
		
		if geneName not in mapping: mapping[geneName] = set([])
		if wormID not in mapping: mapping[wormID] = set([])
		mapping[geneName].add(wormID)
		mapping[wormID].add(geneName)

	outFH = open(outF, "w")
	dataFH = open(rawDataF)
	outFH.write(dataFH.readline())
	for line in dataFH:
		line = line.rstrip()
		lineSplit = line.split("\t")
		idA = lineSplit[0]
		mapA = mapID(idA, mapping)
#		mapB = mapID(idB, mapping)
		if mapA == "" : continue
		print >> outFH, "%s\t%s" % (mapA, "\t".join(lineSplit[1:]))
	dataFH.close()
	outFH.close()
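# mapID() is not shown in this example; given the bidirectional `mapping` dict of
# sets built above, a plausible sketch (hypothetical helper) would be:
def mapID_sketch(identifier, mapping):
    if identifier not in mapping:
        return ""
    return ",".join(sorted(mapping[identifier]))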
Example #7
def train_test_data_read(train_label_file, train_text_file, test_label_file, test_text_file, word_ebd_file, all_text_file):
    trainCorpus, testCorpus = utils.readData(train_label_file, train_text_file, test_label_file, test_text_file,
                                         word_ebd_file, all_text_file)
    train_x, train_y = corpus_read(trainCorpus)

    test_x, test_y = corpus_read(testCorpus)

    return train_x,  train_y, test_x, test_y
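# corpus_read() is not defined in this snippet; judging from the LR() example
# further below, a plausible sketch (hypothetical helper) turns each item's
# embedding and label into numpy arrays:
import numpy as np

def corpus_read_sketch(corpus):
    xs = np.array([item.emb for item in corpus])
    ys = np.array([int(item.label) for item in corpus])
    return xs, ys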
Example #8
def main():
	(proteinComplexes, goAnnotation, outF) = sys.argv[1:]

	complexes2prot = utils.readData(proteinComplexes, np.array([1]), np.array([0]))
	prot2GO = utils.readData(goAnnotation, np.array([1]), np.array([0]))

	complexes2prot = filterComplexes(complexes2prot, 2,40)
	print len(complexes2prot)
	complexes2prot = mergeComplexes(complexes2prot)
	print len(complexes2prot)


	prot2complexes = getProtsToComplex(complexes2prot)
	out = getGoldStandard(prot2complexes, prot2GO)
	outFH = open(outF, "w")
	outFH.write( "IDA\tIDB\tLabel\n" + "\n".join(out))
	outFH.close()
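# filterComplexes() and mergeComplexes() are not shown; a rough sketch of the size
# filter used above (hypothetical helper, keeping complexes whose member count lies
# in [minSize, maxSize]):
def filterComplexes_sketch(complexes2prot, minSize, maxSize):
    return {cmplx: prots for cmplx, prots in complexes2prot.items()
            if minSize <= len(prots) <= maxSize}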
Example #9
def evaluateForFeats(feats):
    log = ''
    
    for run in feats:
        run = ','.join(run)
        data = utils.readData(cfg.PATH_OUTPUT+run + '/', cfg.PATH_PREPROCESSED_TEST)
        gs = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)
        log += str([x.key for x in data]) + '\n'
        log += str([x.key for x in gs]) + '\n'
        
        labels = [x.severity for x in data]
        labels_gs = [x.severity for x in gs]
        log += str(labels) + '\n'
        log += str(labels_gs) + '\n'
        
        log += str("Scores for " + run + ": \n")
        log += m.getScores(labels_gs, labels)
        log += '\n\n'
    return log
Example #10
def increaseDataSize(bacteria_name,
                     fasta_file,
                     p_bed_path,
                     n_bed_path,
                     genome_file_path,
                     output_folder,
                     both_sides_padding_size=20):
    print("INCREASING BED SEQ SIZES: ", bacteria_name, "\n", 10 * "__")
    if not os.path.exists(output_folder):
        print("CREATING: ", output_folder)
        os.mkdir(output_folder)
    output_folder = "{}/{}".format(output_folder, bacteria_name)
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    input_p_bed_df = BedTool(p_bed_path).to_dataframe()
    input_n_bed_df = BedTool(n_bed_path).to_dataframe()
    print("INPUT - P {} N {}".format(input_p_bed_df.shape,
                                     input_n_bed_df.shape))

    print("BED SLOP POSITIVE")
    os.system("bedtools slop -b {} -i {} -g {} > {}/positive.bed".format(
        both_sides_padding_size, p_bed_path, genome_file_path, output_folder))
    print("BED SLOP NEGATIVES")
    os.system("bedtools slop -b {} -i {} -g {} > {}/negative.bed".format(
        both_sides_padding_size, n_bed_path, genome_file_path, output_folder))
    print("GET FASTA POSITIVE")
    os.system(
        "bedtools getfasta -fi {} -bed {}/positive.bed -s -fo {}/positive.fasta"
        .format(fasta_file, output_folder, output_folder))
    print("GET FASTA NEGATIVES")
    os.system(
        "bedtools getfasta -fi {} -bed {}/negative.bed -s -fo {}/negative.fasta"
        .format(fasta_file, output_folder, output_folder))
    p_df, n_df = readData(
        "{}/positive.fasta".format(output_folder),
        "{}/negative.fasta".format(output_folder),
    )
    p_bed_df = BedTool("{}/positive.bed".format(output_folder)).to_dataframe()
    n_bed_df = BedTool("{}/negative.bed".format(output_folder)).to_dataframe()
    print("P: ", p_df.shape, " N: ", n_df.shape, "P: ", p_bed_df.shape, " N: ",
          n_bed_df.shape)
    p_bed_df["sequence"] = p_df.values
    n_bed_df["sequence"] = n_df.values
    p_bed_df["label"] = [1] * len(p_df)
    n_bed_df["label"] = [0] * len(n_df)
    dataset_df = pd.concat([p_bed_df, n_bed_df])
    print("SAVING DATASET: P {}  + N {} = {}".format(p_bed_df.shape,
                                                     n_bed_df.shape,
                                                     dataset_df.shape))
    p_bed_df.to_csv("{}/positive.csv".format(output_folder))
    n_bed_df.to_csv("{}/negative.csv".format(output_folder))
    dataset_df.to_csv("{}/dataset.csv".format(output_folder))
    print(dataset_df.head())
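# The os.system() calls above build shell command strings; an equivalent sketch
# using subprocess with the same `bedtools slop` flags (paths are placeholders and
# bedtools must be on PATH):
import subprocess

def bedtools_slop(bed_path, genome_file_path, out_path, padding=20):
    with open(out_path, "w") as out_fh:
        subprocess.run(["bedtools", "slop", "-b", str(padding),
                        "-i", bed_path, "-g", genome_file_path],
                       stdout=out_fh, check=True)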
Example #11
def LR(train_label_file, train_text_file, test_label_file, test_text_file,
       word_ebd_file, all_text_file):
    # sent=utils.Sent()
    train_set, test_set = utils.readData(train_label_file, train_text_file,
                                         test_label_file, test_text_file,
                                         word_ebd_file, all_text_file)
    print("len of train data: ", len(train_set))
    len_train = len(train_set)
    print("len of test data: ", len(test_set))
    len_test = len(test_set)
    # print("label of train data=",train_set[0].label)
    # print("embedding of train data=",train_set[0].emb)

    # Training dataset
    train_x = []
    train_y = []
    for i in range(0, len_train):
        x = train_set[i].emb
        train_x.append(x)
        y = train_set[i].label
        # if you use 20ng remember to use y = np.int(y)
        y = np.int(y)
        train_y.append(y)
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    # Test dataset
    test_x = []
    test_y = []
    for i in range(0, len_test):
        x = test_set[i].emb
        test_x.append(x)
        y = test_set[i].label
        y = np.int(y)
        test_y.append(y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)

    # Logistic Regression
    if (len(test_set) < 7000):
        logreg = LogisticRegression()
        logreg.fit(train_x, train_y)
        y_pred = logreg.predict(test_x)
    else:
        logreg = LogisticRegression(multi_class="multinomial",
                                    solver='newton-cg')
        logreg.fit(train_x, train_y)
        y_pred = logreg.predict(test_x)
    # with open('y_pred.pkl', 'wb') as f:
    #    pickle.dump(y_pred, f)
    # with open('test_y.pkl', 'wb') as f:
    #    pickle.dump(test_y, f)
    return y_pred, test_y
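# A small scoring sketch for the (y_pred, test_y) pair returned by LR() (assumes
# scikit-learn, which the snippet already relies on for LogisticRegression):
from sklearn.metrics import accuracy_score, classification_report

def report_scores(y_true, y_pred):
    print("accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))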
Example #12
def main(useAnnotatorWeighing=True):
    '''
    This script evaluates the system performance for the different runs on testdata, created with testSetup.py
    
    '''

    #runs = ['DSM+1,DIST_HIER','CONCEPTSwithoutContext','CONCEPTS+CONTEXT', 'BOW', 'DSM+2','CATEGORICAL_QUESTIONSET,QUESTIONSET,LONG_QUESTIONSET','DSM','SNOMED+1','DSM+1']
    runs = ['DSM+1']
    for run in runs:
        data = utils.readData(cfg.PATH_OUTPUT + run + '/',
                              cfg.PATH_PREPROCESSED_TEST)
        gs = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)
        print([x.key for x in data])
        print([x.key for x in gs])

        labels = [x.severity for x in data]
        labels_gs = [x.severity for x in gs]
        print(labels)
        print(labels_gs)

        print("Scores for", run, ": ")
        m.getScores(labels_gs, labels)
Example #13
 def __init__(self, filePath, actor, joints):
     """
     filePath - path to file of FoB data
     joints - collection of joints, ordered by heaviness
     """
     self.filePath = filePath
     self.actor = actor
     self.joints = joints
     self.controlJoints = [self.actor.controlJoint(None, "modelRoot", joint) for joint in self.joints]
     self.exposedJoints = [self.actor.exposeJoint(None, "modelRoot", joint) for joint in self.joints]
     self.data = utils.readData(filePath, config.num_sensors)
     self.dataStream = izip(*self.data)
     self.prevFobSegments = segmentsFromJoints(self.exposedJoints[:-1], self.exposedJoints[1:])
     self.prevJointSegments = segmentsFromJoints(self.exposedJoints[:-1], self.exposedJoints[1:])
     self.stopped = False
     self.paused = False
Example #14
def main():
#	src/AddAnotationtoPPI_data.py data/WORM/final_PPI_ratio5.ids.txt data/WORM/worm_gene_uniprot.map.txt data/WORM/final_PPI_ratio5.gprofiler_out.map.txt data/WORM/Human_elegans_uniprot_names.txt data/WORM/uniprotGeneNameMap.txt data/WORM/human.var.txt data/WORM/gene.omim.combined data/WORM/allComplexes.tab test/test.out
	(protIDsF, worm_gene2UniprotF, goMapF, othologMapF, human_gene2UniprotF, snpF, disF, complexF, outF) = sys.argv[1:]

	protIDs = utils.readData(protIDsF, np.array([0]), np.array([0]))
	worm_gene2uniprot = utils.readData(worm_gene2UniprotF, np.array([0]), np.array([1]))
	goData = utils.readData(goMapF, np.array([0]), np.array([1,2]))

	orthmap = utils.readData(othologMapF, np.array([1]), np.array([0]))
	mapData = utils.readData(human_gene2UniprotF, np.array([1]), np.array([0]))
	snpData = utils.readData(snpF, np.array([0]), np.array([1,2,3]), mapData)
	disData = utils.readData(disF, np.array([0]), np.array([1]), primKeyMap= mapData)
	complexData = utils.readData(complexF, np.array([0]), np.array([2]))

	header =  "GeneName\tUniprotIDs\tHumanOrthologIDs\tEnriched_GO_terms\tSNP\tDisease\tComplexe"
	cats = header.split("\t")
	counts = {"wAnno" : set([])}
	for cat in cats:
		counts[cat] = set([])

	outFH = open(outF, "w")
	print >> outFH, header
	for protID in protIDs:
		line = protID[0]
		line += "\t" + annoToString(getAnnotation(protID, worm_gene2uniprot))
		line += "\t" + annoToString(getAnnotation(protID, orthmap, [worm_gene2uniprot]))
		line += "\t" + annoToString(getAnnotation(protID, goData))
		line += "\t" + annoToString(getAnnotation(protID, snpData, [worm_gene2uniprot, orthmap]))
		line += "\t" + annoToString(getAnnotation(protID, disData, [worm_gene2uniprot, orthmap]))
		line += "\t" + annoToString(getAnnotation(protID, complexData, [worm_gene2uniprot, orthmap]))
		lineSplit = line.split("\t")
		for i in range(len(lineSplit)):
			col = lineSplit[i]
			if col != "-":
				counts[cats[i]].add(protID[0])
		
		for i in range(3,len(lineSplit)):
			col = lineSplit[i]
			if col != "-":
				counts["wAnno"].add(protID[0])
		
		print >> outFH, line

	for cat in counts:
		print "%s\t%i" % (cat, len(counts[cat]))
Example #15
def reSaveTextData(in_data, out_data):
    dataset, batch_num = utils.readData(in_data, batch_size=7002)
    # read all data
    sample = dataset[0]
    text_dict = utils.sampletxt2data(sample)
    text = list(text_dict.values())
    img_name = list(text_dict.keys())
    # label_name_dict = {'overall_sentiment_int':4}
    y_dict, y = utils.sampley2data(sample)
    df = pd.DataFrame(list(
        zip(img_name, text, list(y[0]), list(y[1]), list(y[2]), list(y[3]),
            list(y[4]))),
                      columns=[
                          'img_name', 'text', 'Humour', 'Sarcasm', 'offensive',
                          'Motivational', 'Overall_Sentiment'
                      ])
    df.to_csv(out_data, sep=',', encoding='utf-8', index=False)
Example #16
    item3 = c * x**2 / (1. + (x / Is3)**3)

    y = A * np.exp(-(item1 + item2 + item3) * L)
    return y


def otherOptimization(func, theta0, datas, labels):
    result = scipy.optimize.minimize(func,
                                     theta0, (datas, labels),
                                     method='Powell',
                                     options={'maxiter': 100})
    #result = scipy.optimize.minimize(func, theta0, (datas, labels), method='Nelder-Mead')
    return result


datas, labels = utils.readData('datas.xlsx')

datas = np.array(datas)
labels = np.array(labels)

#theta0 = (.2, 200., 3., 1., .01, 20., 500.0)
theta0 = (.2, 100000., 100., 1., 0.01, 20., 500.0)  # many initializations tried; fit is still poor

result = otherOptimization(objectFunction, theta0, datas, labels)
print(result)

minVal = np.min(datas)
maxVal = np.max(datas)

theta = [a for a in result.x]
Example #17
def main():
    (
        clusterF,
        worm_gene2UniprotF,
        othologMapF,
        disF,
        human_gene2UniprotF,
        oboF,
        omim_doid_F,
        omim2NameF,
        gene2DOIDF,
        outF,
    ) = sys.argv[1:]

    obo = readOntology(oboF, False)

    omim_doid = utils.readData(omim_doid_F, np.array([2]), np.array([0]))
    omim_name = utils.readData(omim2NameF, np.array([0]), np.array([1]))
    orthmap = utils.readData(othologMapF, np.array([1]), np.array([0]))
    worm_gene2uniprot = utils.readData(worm_gene2UniprotF, np.array([0]), np.array([1]))
    doid2Name = utils.readData(oboF, np.array([0]), np.array([1]))

    mapData = utils.readData(human_gene2UniprotF, np.array([1]), np.array([0]))
    disData = utils.readData(disF, np.array([0]), np.array([1]), primKeyMap=mapData)
    stats = importr("stats")

    doid2Gene = {}
    gene2DOIDFH = open(gene2DOIDF)
    for line in gene2DOIDFH:
        line = line.rstrip()
        (gene, thisDOID) = line.split("\t")
        if not thisDOID.startswith("DOID"):
            continue
        allParents = set([])
        #                allParents = getAll(thisDOID, obo)
        allParents.add(thisDOID)
        for doid in allParents:
            if doid not in doid2Gene:
                doid2Gene[doid] = set([])
            doid2Gene[doid].add(gene)
    gene2DOIDFH.close()

    allGenes = 20313
    allProts = 1713  # TOUPDATE when new complexes are used

    outFH = open(outF, "w")
    clusterFH = open(clusterF)
    for line in clusterFH:
        line = line.rstrip("\n")
        cluster_genes = line.split("\t")[0].split(",")
        cluster_prots = set([])
        cluster_omim = set([])
        doid_cluster_counts = {}
        for gene in cluster_genes:
            cluster_prots.update(getAnnotation((gene,), orthmap, [worm_gene2uniprot]))
            cluster_omim.update(getAnnotation((gene,), disData, [worm_gene2uniprot, orthmap]))
            doidIDs = mapIDs(cluster_omim, omim_doid)
            if len(doidIDs) > 0:
                for doidID in doidIDs:
                    parentDOID = set([])
                    # 					parentDOID = getAll(doidID[0], obo)
                    parentDOID.add(doidID[0])
                    for allIDs in parentDOID:
                        if allIDs not in doid_cluster_counts:
                            doid_cluster_counts[allIDs] = set([])
                        doid_cluster_counts[allIDs].add(gene)
        pvals = []
        ids = []
        for doidID in doid_cluster_counts:
            if doidID == "DOID:0000004" or doidID == "---":
                continue
            if len(doid_cluster_counts[doidID]) == 1:
                continue
            mat = [[allProts, allGenes], [len(doid_cluster_counts[doidID]), len(doid2Gene[doidID])]]
            pval = fisher_exact(mat)[1]
            pvals.append(pval)
            ids.append(doidID)
        pvals = stats.p_adjust(FloatVector(pvals), method="BH")
        enrichedDOIDs = set([])

        for i in range(len(pvals)):
            if pvals[i] <= 0.05:
                enrichedDOIDs.add(
                    "%s,%s,%i,%.4f" % (ids[i], doid2Name[(ids[i],)][0], len(doid_cluster_counts[ids[i]]), pvals[i])
                )

        tmp = set()
        for mim in cluster_omim:
            if mim in omim_name:
                tmp.add((mim[0], omim_name[mim][0][0]))
        doidIDs = mapIDs(cluster_omim, omim_doid)
        cluster_omim = tmp
        print >> outFH, "%s\t%s\t%s\t%s" % (
            line,
            annoToString(cluster_prots, sepB=","),
            annoToString(cluster_omim),
            ";".join(enrichedDOIDs),
        )

    clusterFH.close()
    outFH.close()
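# The Benjamini-Hochberg correction above goes through rpy2 (stats.p_adjust); a
# pure-Python sketch of the same step, assuming statsmodels is available:
from statsmodels.stats.multitest import multipletests

def bh_adjust(pvals):
    if len(pvals) == 0:
        return []
    _, pvals_corrected, _, _ = multipletests(pvals, method="fdr_bh")
    return list(pvals_corrected)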
Example #18
init = tf.global_variables_initializer()
saver_latest = tf.train.Saver(max_to_keep=1)
saver_acc = tf.train.Saver(max_to_keep=1)
saver_loss = tf.train.Saver(max_to_keep=1)

with sess.as_default():
    init.run()

if min_indices > max_indices:
    raise AssertionError('min_indices cannot be larger than max_indices')

if not train_labels_path:
    train_labels_path = os.path.join(os.path.dirname(train_images_path), 'labels')

src_files, src_labels_list, total_frames = readData(train_images_path, train_images_ext, train_labels_path,
                                                    train_labels_ext)
if start_id < 0:
    if end_id < 0:
        raise AssertionError('end_id must be non negative for random selection')
    elif end_id >= total_frames:
        raise AssertionError('end_id must be less than total_frames for random selection')
    print('Using {} random images for training'.format(end_id + 1))
    img_ids = np.random.choice(total_frames, end_id + 1, replace=False)
else:
    if end_id < start_id:
        end_id = total_frames - 1
    print('Using all {} images for training'.format(end_id - start_id + 1))
    img_ids = range(start_id, end_id + 1)

if start_id < 0:
    log_template = '{:d}_{}_{}_{}_random_{}_{}'.format(
Example #19
    {'centroidDet': 'pretretedInputDataInitialCentroidsNear', 'recombMet': 'medianNewCentroids','distance': 'euclideanDistance'},
    {'centroidDet': 'pretretedInputDataInitialCentroidsNear', 'recombMet': 'medianNewCentroids', 'distance': 'manhattanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'averageNewCentroids', 'distance': 'euclideanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'averageNewCentroids', 'distance': 'manhattanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'medianNewCentroids', 'distance': 'euclideanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'medianNewCentroids', 'distance': 'manhattanDistance'}
]
# Input and output file names for the program
INPUT_FILE_NAME = "irisData.txt"
OUTPUT_FILE_NAME = "resultatsKMeans.txt"

# Function that runs the tests
def applyTest(tst,rawData):
    k = KMeans(tst['centroidDet'], tst['recombMet'], tst['distance'])
    return k.setupClusters(rawData)

if __name__ == "__main__":
    data=readData(INPUT_FILE_NAME)
    resultTest=''
    testNum=1
    for tst in tests:
        resultTest += '------------------------------------ Test N '+str(testNum)+' ------------------------------------\n'
        resultTest+='Centroid determination method: ' + tst['centroidDet'] +'\n'
        resultTest+='New centroid determination method: ' + tst['recombMet'] +'\n'
        resultTest += 'Distance method: ' + tst['distance'] + '\n'
        rslt, iterations=applyTest(tst, data)
        resultTest += 'Quantite iterations: ' + str(iterations) + '\n'
        resultTest+=printTestSummary(rslt)
        testNum+=1
        plotResults(rslt, tst['centroidDet']+'_'+tst['recombMet']+'_'+tst['distance'])
    printToFile(resultTest)
Example #20
			print 'Training model for single tags'
			clf = Pipeline([('vectorizer', DictVectorizer(sparse=False)),('classifier', LogisticRegression(solver='lbfgs',multi_class='multinomial'))])
			clf.fit(X, y1)
			self.MaxEntClassifier = clf
			filename1 = self.modelFileMaxEnt
			pickle.dump(clf, open(filename1, 'wb'))
			
			print 'Training model for pairs of tags'
			clf2 = Pipeline([('vectorizer', DictVectorizer(sparse=False)),('classifier', LogisticRegression(solver='lbfgs',multi_class='multinomial'))])
			clf2.fit(X, y2)
			self.TwoLabelClassifier = clf2
			filename2 = self.modelFileMaxEntPair
			pickle.dump(clf2, open(filename2, 'wb'))
		
			self.tags = self.MaxEntClassifier.classes_
			self.tagPairs = self.TwoLabelClassifier.classes_
		else:
			print('Cannot fit in test mode')
			exit(0)


if __name__ == '__main__':
	df_train, df_test, corpus, tags = readData('../data/')
	posTagger = MEMM()
	X_train, X_test, y_train1, y_test1, y_train2, y_test2 = posTagger.preprocess(df_train[:50000], df_test[:50000])
	#print("Fitting model")
	#posTagger.fit(X_train, y_train1, y_train2)
	print ("Sample Tags using Viterbi decoding")
	posTagger.viterbiDecoding(df_test[:46])

Example #21
from utils import readData
from aco import ACO, World
from plot import plot

if __name__ == '__main__':

    #noCities, cost_matrix, points = readData("D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\ulysses16.txt")
    noCities, cost_matrix, points = readData(
        "D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\data.txt")
    #noCities, cost_matrix, points = readData("D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\data2.txt")
    #noCities, cost_matrix, points = readData("D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\data3.txt")

    paramACO = {
        "ant_count": 10,
        "generations": 100,
        "alpha": 1.0,
        "beta": 10.0,
        "rho": 0.5,
        "q": 10
    }
    paramWorld = {"cost_matrix": cost_matrix, "noCities": noCities}

    aco = ACO(paramACO)
    graph = World(paramWorld)

    path, cost = aco.solve(graph)
    print("\n")
    print("\n")
    print("BEST: Cost: " + str(cost) + " \nPath: " + str(path))
    plot(points, path)
Example #22
def main():
	(protAnnoF, oboQueryF, oboF, omim_doid_F, gene2DOIDF, outF) = sys.argv[1:]
	
	obo = readOntology(oboF, False)

	omim_doid = utils.readData(omim_doid_F, np.array([2]), np.array([0]))

	qnames = utils.readData(oboQueryF, np.array([0,1]), np.array([0]))	

	protAnnoFH = open(protAnnoF)
	protAnnoFH.readline()
	prot2DOID = {}

	omimperOBO = {}
	COUNTperQTerm = {}
	for line in protAnnoFH:
		line = line.rstrip()
		prot = line.split("\t")[0]
		for m in re.finditer('OMIM:\d{6}', line):
			omimID = m.group(0)
			if (omimID, ) not in omim_doid: continue
			for doidID in omim_doid[(omimID, )]:
				parentDOID = getAll(doidID[0], obo)
				parentDOID.add(doidID[0])
				for (queryDOID, name) in qnames:
					if queryDOID in parentDOID:
						if queryDOID not in COUNTperQTerm: COUNTperQTerm[queryDOID] = set([])
						if queryDOID not in omimperOBO: omimperOBO[queryDOID] = set([])
						COUNTperQTerm[queryDOID].add(prot)
						omimperOBO[queryDOID].add((prot, omimID))

	protAnnoFH.close()


	allProts = len(utils.readData(protAnnoF, np.array([0]), np.array([0])))
	doid2Gene = {}
	gene2DOIDFH = open(gene2DOIDF)
	for line in gene2DOIDFH:
		line = line.rstrip()
		(gene, thisDOID) = line.split("\t")
		if not thisDOID.startswith("DOID"): continue
		allParents = getAll(thisDOID, obo)
		allParents.add(thisDOID)
		for doid in allParents:
			if doid not in doid2Gene: doid2Gene[doid] = set([])
			doid2Gene[doid].add(gene)
	gene2DOIDFH.close()

	allGenes = 20313 # len(allGenes)

	pvals = []
	names = []
	for (doid, name) in qnames:
		if doid not in doid2Gene: continue
		if doid not in COUNTperQTerm: continue
		mat = [[allProts, allGenes],[len(COUNTperQTerm[doid]), len(doid2Gene[doid])]]
		pval = fisher_exact(mat)[1]
		pvals.append(pval)
		names.append((doid, name))
	
	stats = importr('stats')
	selNames = {}
	pvals = stats.p_adjust(FloatVector(pvals), method = 'BH')
	for i in range(len(pvals)):
#		if pvals[i]>0.05: continue
		selNames[names[i]] = pvals[i]
	
	catCounts = {}
	for selname in selNames:
		pval = selNames[selname]
		doid, name = selname
		if len(COUNTperQTerm[doid]) == 0: continue
		counts = len(COUNTperQTerm[doid])
		omimCounts = len(omimperOBO[doid])
		if counts not in catCounts: catCounts[counts] = set([])
		catCounts[counts].add("%s\t%s\t%i\t%i\t%f" % (name, doid, counts, omimCounts, pval))

	outFH = open(outF + ".dat", "w")
	print >> outFH, "Name\tDOID\tCounts\nAll sites\tNA\t%i" % (allProts)
	for counts in sorted(catCounts.keys(), reverse=True):
		print >> outFH, "\n".join(catCounts[counts])
		print "\n".join(catCounts[counts])
	outFH.close()
Example #23
    (options, args) = parser.parse_args()

    highestScore = 0.0
    eId = 0

    train_id2sent, train_id2pos, train_id2ner, train_id2nerBILOU, train_id2arg2rel, test_id2sent, test_id2pos, test_id2ner, test_id2nerBILOU, test_id2arg2rel = readCoNLL2004_prepared_corpus(
    )
    words, w2i, c2i, nertags, postagCount = vocabNER(train_id2sent,
                                                     train_id2pos,
                                                     train_id2nerBILOU)

    id2arg2rel, rels, classweights = getRelVocab(train_id2arg2rel,
                                                 train_id2nerBILOU)

    fulltrain_data = readData(train_id2sent, train_id2pos, c2i)
    test_data = readData(test_id2sent, test_id2pos, c2i)

    #print w2i
    #print c2i
    #print nertags
    #print postags

    train_data, train_id2arg2rel_train = {}, {}
    numInstances = len(fulltrain_data) / 5 * 4
    count = 0
    for index in fulltrain_data:
        train_data[index] = fulltrain_data[index]
        train_id2arg2rel_train[index] = train_id2arg2rel[index]

        count += 1
Example #24
            store = [data, 1]
        elif btw_hold(data, store[0]):
            store = ave_func(store, [data, 1])
        else:
            if store[1] <= winsize:
                win.append(store)
            else:
                if len(win) > 0:
                    for w in win:
                        store[1] += w[1]
                win = []
                fs.append(map(int, store))
            store = [data, 1]
    if store is not None:
        fs.append(map(int, store))
    results = []
    index = 0
    for f in fs:
        if len(results) == 0 or f[0] != results[-1][1]:
            results.append((indexes[index], f[0]))
        index += f[1]
    return results


if __name__ == "__main__":
    if len(sys.argv) != 5:
        utils.printUsage(("datafile", "threshold", "winsize", "outputfile"))
    results = smooth(utils.readData(sys.argv[1], int), float(sys.argv[2]),
                     int(sys.argv[3]))
    utils.writeData(sys.argv[4], results, '%d\t%d\n')
Example #25
from utils import readData

readData()
Example #26
 def __init__(self, filePath, points):
     self.filePath = filePath
     self.data = utils.readData(filePath, config.num_sensors)
     self.points = points
     self.dataStream = izip(*self.data)
Example #27
#!/usr/bin/env python
# coding: utf-8

# In[4]:

import sys
import numpy as np
import open3d as o3d

sys.path.insert(1, "../data")
from utils import readData, readPointCloud

# In[5]:

ground_truth = readData("../data/01.txt")
ground_truth = ground_truth[:77][:]

# In[11]:


def computeTransformation(point_ind):
    #     final_pose = []
    T = ground_truth[point_ind][:]
    T = np.reshape(T, (3, 4))
    b = np.array([0, 0, 0, 1])
    T = np.vstack((T, b))
    #     print (T.shape)
    return T


# In[8]:
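# A short usage sketch (hypothetical): applying the 4x4 pose returned by
# computeTransformation() to an Nx3 point array in homogeneous coordinates.
def apply_pose(points_xyz, T):
    homogeneous = np.hstack([points_xyz, np.ones((points_xyz.shape[0], 1))])
    return (T @ homogeneous.T).T[:, :3]

# e.g. transformed = apply_pose(some_points, computeTransformation(0))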
Example #28
def driver():
    data = utils.readData()
    clusterDataPairs(data, config.MIN_CLUSTERS, config.MAX_CLUSTERS)
Example #29
                    results.append(item)
            else:
                item = QAPair()
                item.question = q
                item.answers = a
                item.begQue = ind[0]
                item.endQue = ind[1]
                item.begAns = ind[1] + 1
                item.endAns = ind[2]
                results.append(item)
        return results


if __name__ == '__main__':
    #TEST CODE
    data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    s = Segment()
    for d in data:
        text = d.getTextObject()
        segments = s.segment(text)
#
#         print(segments)

#         for segment in segments:
#
#             print("Question in TextObject:", text.get_covered_tokens(segment.begQue, segment.endQue))
#             print("Question concepts:", text.get_covered_concepts(segment.begQue, segment.endQue))
#             print("Question in segment:", segment.question)
#
#             print("Answer in TextObject:", text.get_covered_tokens(segment.begAns, segment.endAns))
#             print("Answer concepts:", text.get_covered_concepts(segment.begAns, segment.endAns))
Example #30
def runForExperimentSettings(features, es):

    # Reading the train/test_data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts,
                                        es.splitDeniedConcepts,
                                        es.removeUncertainConcepts,
                                        es.splitUncertainConcepts,
                                        es.removeFamilyConcepts,
                                        es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts,
                                       es.splitDeniedConcepts,
                                       es.removeUncertainConcepts,
                                       es.splitUncertainConcepts,
                                       es.removeFamilyConcepts,
                                       es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    # train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts)
    # test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        #else argument added here to not override the train_data/y_train setting, otherwise we can only do one featType at a time
        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_datac,
             y_trainc) = m.get_bootstrapped_trainset(train_data,
                                                     y_train,
                                                     bootstrap_data,
                                                     es,
                                                     estimator,
                                                     th_bs=0.6)
        else:
            train_datac = train_data
            y_trainc = y_train

        concatenated_data = []
        concatenated_data.extend(train_datac)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_datac, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[0:len(train_datac)]
        test_feats = featurized[len(train_datac):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_trainc, es)
        train_feats, y_trainc, train_bucket = ss.runSampleSelection(
            train_feats, y_trainc, [i for i in range(len(train_datac))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_datac, train_bucket,
                                     es.weighInterAnnot)

        model = m.train(estimator,
                        x_train,
                        y_trainc,
                        weights_train,
                        model=None)

        y_pred = m.test(x_test, estimator=model)
        #         print(y_pred)
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data,
                        outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
Example #31
    for i in range(pcd.shape[0]):
        x = int(pcd[i, 0] - x_min)
        z = int(pcd[i, 2] - z_min)

        occupancy[x, z] += 1

    occupancy /= pcd.shape[0]
    return occupancy > THRESH

def numpy_to_image(arr: np.ndarray, path: str):
    cv2.imwrite(f'{path}.png',(arr * 255).astype(np.uint8))


if __name__ == "__main__":
    transf = utils.readData(DATASET_TR_PATH)
    
    if os.path.exists(RESULT_PATH_1):
        shutil.rmtree(RESULT_PATH_1)
    os.makedirs(RESULT_PATH_1)

    for filename in os.listdir(DATASET_PATH):
        arr = utils.readPointCloud(DATASET_PATH + filename)[:, :3]
        arr = utils.lidar_to_world(arr)
        ind = int(filename[:-4])
        arr = utils.make_homogenous_and_transform(arr, transf[ind].reshape(3, 4))

        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(arr)
        pcd = pcd.voxel_down_sample(voxel_size = 1)
        arr = np.asarray(pcd.points)
Example #32
if optim_type == "rmsprop":
    optimizer_D = optim.RMSprop(discriminator.parameters(), lr=lr)
    optimizer_G = optim.RMSprop(generator.parameters(), lr=lr)
elif optim_type == "adam":
    optimizer_D = optim.Adam(discriminator.parameters(),
                             lr=lr,
                             betas=(0.001, 0.8))
    optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(0.001, 0.8))
elif optim_type == "sgd":
    optimizer_D = optim.SGD(discriminator.parameters(), lr=lr, momentum=0.1)
    optimizer_G = optim.SGD(generator.parameters(), lr=lr, momentum=0.1)
else:
    raise TypeError("optim type not found %s" % (optim_type))

#calculate background
x = utils.readData(filepath)

lx = np.floor(np.min(x[:, 0]))
hx = np.ceil(np.max(x[:, 0]))
ly = np.floor(np.min(x[:, 1]))
hy = np.ceil(np.max(x[:, 1]))

x_aix = np.arange(lx, hx, 0.01)
y_aix = np.arange(ly, hy, 0.01)
xx, yy = np.meshgrid(x_aix, y_aix)
print(xx.shape, yy.shape)
xx = torch.from_numpy(xx)
yy = torch.from_numpy(yy)
bc = torch.stack((xx, yy), dim=2)
bc = bc.view(-1, 2)
bc_cuda = bc.view(-1, 2).cuda().float()
Example #33
end_id = args.end_id
save_seg = args.save_seg
save_stitched = args.save_stitched
gpu_id = args.gpu_id
loss_type = args.loss_type

stitch_labels = args.stitch_labels
show_img = args.show_img
save_raw = args.save_raw

psi_act_type = args.psi_act_type

normalize_labels = args.normalize_labels
n_layers = args.n_layers

src_files, src_labels_list, total_frames = readData(images_path, images_ext)

if end_id < start_id:
    end_id = total_frames - 1

eval_mode = False
if labels_path and labels_ext:
    _, labels_list, labels_total_frames = readData(labels_path=labels_path,
                                                   labels_ext=labels_ext)
    if labels_total_frames != total_frames:
        raise SystemError(
            'Mismatch between no. of frames in GT and seg labels')
    eval_mode = True
else:
    save_seg = True
Example #34
                               name='train_labels')

global_steps = tf.Variable(0, name="global_step", trainable=False)
phPredction = ASRCNN(phTrainInput)
loss = utils.computeLoss(phTrainTarget, phPredction)
curr_lr_op = tf.train.exponential_decay(lr,
                                        global_steps,
                                        decay_step,
                                        decay_ratio,
                                        staircase=True)
train_op = tf.train.AdamOptimizer(learning_rate=curr_lr_op).minimize(
    loss, global_step=global_steps)
gpu_options = tf.GPUOptions(allow_growth=allow_growth)

# data
trainData1, testData, trainData2, trainTarget2, testTarget, trainTarget1, minNDVI, maxNDVI, perm = utils.readData(
    data_file, rcstart, rcend, opt.mode, data_scale)

trainData = [trainData1, trainData2]
trainTarget = [trainTarget1, trainTarget2]
num_patches_x = (image_size - patch_size + patch_stride) // patch_stride
num_patches_y = (image_size - patch_size + patch_stride) // patch_stride
num_patches = num_patches_x * num_patches_y

print(f'Extracting 80% for training and 20% for validation ...')
pos = np.int(np.ceil(num_patches * 2 * 0.2 / batch_size) * batch_size)
valPerm = perm[:pos]
trainPerm = perm[pos:]
start_time = time.time()


def train_one_epoch(sess, n_epoch, saver):
Example #35
def main(useAnnotatorWeighing=True):
    """
    This script allows for 10-fold cross validation over the data in the training set. Experiments only yield results, they don't yield annotated files.
    The standard deviation seen over the different folds for each metric are reported as well.
    
    Configure your model settings by modifying the ExperimentSettings object in the script.
    """

    # Making folders from config
    # cfg.makeFolders()

    # Here, you can specify the feature sets you would like to use. It is arranged in an array of arrays, to enable combinations
    features = [["DSM+2"], ["BOW"], ["DSM+1"], ["DSM"], ["SNOMED"],
                ["SNOMED+1"], ["DSM+2"], ["CONCEPTS"]]
    #features = [["DSM"],["DSM+1","DIST_HIER"],["DSM+1"], ["CATEGORICAL_QUESTIONSET","QUESTIONSET","LONG_QUESTIONSET"]]

    # Options:
    # 'CONCEPTS', 'DSM+1', 'DSM', 'DSM_HIER', 'MED', 'BOW', 'BOW_ANSWERS', 'CATEGORICAL_QUESTIONSET', 'QUESTIONSET'
    # 'WORD_VECTOR', 'WORD_VECTOR_ANSWERS', 'CONCEPT_VECTOR', 'DIST_WORDVECTOR', 'DIST_CONCEPTVECTOR'
    # 'CONCEPT_CLUSTERS', 'PREAMBLE_CLUSTERS'

    # if you want anything set differently than default, please change the corresponding parameter in es (ExperimentSettings)
    es = ExperimentSettings()
    es.fs_varianceFilter = True
    es.bootstrap = False
    es.ss_prototyping = False
    es.weighInterAnnot = False
    #es.ml_algorithm='XGBOOST'
    #es.ml_algorithm = 'RANDOM'
    '''es.removeDeniedConcepts=True
    es.removeUncertainConcepts=False
    es.splitDeniedConcepts=False
    es.splitFamilyConcepts=True'''

    es.removeDeniedConcepts = False
    es.splitDeniedConcepts = False
    es.splitUncertainConcepts = False
    es.splitFamilyConcepts = False

    #es.fs_confidence=True
    #es.fs_confidenceValueDistinction = True
    #es.fs_chiSquare = False
    #es.fs_varianceFilter = True
    #es.fs_varianceThreshold = 0.05
    #es.fs_confidence = True
    #es.fs_informationGain = False
    #es.fs_confidenceWithCoverage = True
    #es.fs_confidenceTopK = 100
    #es.fs_confidenceCoverageOverlap = 3
    #es.fs_confidenceCutOff = 0.05'''

    # Reading the data into an array
    data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    data = m.conceptPreprocessing(data, es.removeDeniedConcepts,
                                  es.splitDeniedConcepts,
                                  es.removeUncertainConcepts,
                                  es.splitUncertainConcepts,
                                  es.removeFamilyConcepts,
                                  es.splitFamilyConcepts)

    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)
    # Looping over different feature parameters
    for featTypes in features:
        #for x in [True, False]:
        #es.fs_confidence = x

        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        if es.svmParamSweep:
            result_params = m.param_sweep_svm(data,
                                              es,
                                              gammaSweep=False,
                                              nFolds=10,
                                              verbose=False,
                                              random_seed=44)
            for name in result_params:
                print(str(name) + ":", result_params[name])
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                results = m.eval_bootstrapped_crossVal(estimator,
                                                       data,
                                                       bootstrap_data,
                                                       es,
                                                       10,
                                                       printTree=False)
            else:
                results = m.evalCrossval(estimator,
                                         data,
                                         es,
                                         10,
                                         printTree=False)
            for name in results:
                print(str(name) + ":", results[name])
Example #36
# coding: utf-8

import sys
from utils import readRelations, readData, wordStatForQuestion, wordStatForRelation, \
  convert_data
from data import data_static, argConfig, dataMgr
from train import train_model

if __name__ == "__main__":
    print "Start to read relations..."
    relation_list_seg, relation_list_seg_all = \
       readRelations("KBQA_data/sq_relations/relation.2M.list")
    print "\n"

    print "Start to read training data..."
    training_data = readData(
        "KBQA_data/sq_relations/train.replace_ne.withpool")
    print "Start to read testing data..."
    testing_data = readData("KBQA_data/sq_relations/test.replace_ne.withpool",
                            False)
    print "Start to read validation data"
    valid_data = readData("KBQA_data/sq_relations/valid.replace_ne.withpool",
                          False)
    print "\n"

    print "start to get word dictionary for questions and relations"
    question_words = wordStatForQuestion(training_data)
    relation_words = wordStatForRelation(relation_list_seg,
                                         relation_list_seg_all, training_data)
    print "\n"

    print "Start to convert data to vectors..."
Example #37
    m = re.findall ( '<concepts_FILEUMLS(.*?)\/>', text, re.DOTALL)
    for n in m:
        text = text.replace("<concepts_FILEUMLS"+n+"/>\n","")
        text = text.replace("<concepts_FILEUMLS"+n+"/>","")
    return text

if __name__ == '__main__':
    #cfg.makeFolders()
    #texts = utils.readData(cfg.PATH_INPUT)
    #for text in texts:
    #    texts[text]["tokens"] = utils.dumbTokeniser(texts[text]["note"])
    
    basic = cfg.PATH_TEST
    preprocessed = cfg.PATH_PREPROCESSED_TEST
    
    data = utils.readData(basic, preprocessed)
    
    #TODO reset this to 0.80 idf threshold
    matcher = DictionaryMatcher(4, 0.80)
    matcher.loadLibrary('FILEUMLS')
    #matcher.loadLibrary('DSM')
    # matcher.loadDefinitions()
    
    #matcher.saveModel()
    #matcher = matcher.loadModel('FILEUMLS')
    
    for d in data:
        text = d.getTextObject()
        #for line in matcher.processText(text, True):
        #    print(line)
        
Example #38
import Model
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
'''
a = '12 ,  34   '
re_splitA = re.compile(r'[\s\,]+')
print(re_splitA.split(a))
print(re.split(r'[\s\,]+', a))
print(re_splitA.split(a)[2] == '')      # True

b = [a for a in re_splitA.split(a) if a.isdigit()]
print(b)                   # work ! ! !
'''

print('Test...')
X, y = utils.readData('datas.xlsx')

X = np.atleast_2d(X).T
y = np.atleast_2d(y).T

print('Data info\n')
print('X = ', X.dtype, X.shape)
print('y = ', y.dtype, y.shape)

# Test datas
x = np.atleast_2d(np.linspace(X.min(), X.max(), 2000)).T

y_pred, sigma_pred = Model.gpSklearn(X, y, x)

fig = plt.figure(1)
plotf, = plt.plot(X, y, label='Origin')
Example #39
def main(useAnnotatorWeighing=True):
    '''
    This script runs the experiments by training on a trainset and testing on a test set. Also allows bootstrapping (which is hard coded in this script as well)
    Configure your model settings by modifying the ExperimentSettings object in the script.

    The output of these models are annotated files in the output folder, which can be evaluated (in metrics) using testEval.py
    '''

    # Making folders from config
    # cfg.makeFolders()

    # Here, you can specify the feature sets you would like to use. It is arranged in an array of arrays, to enable combinations
    features = [["DSM+1"]]
    #features = [["CONCEPTS"]]#['BOW'],
    #     features = [["CONCEPTS"]]

    # if you want anything set differently than default, please change the corresponding parameter in es (ExperimentSettings)
    es = ExperimentSettings()
    #     es.fs_varianceFilter = True
    #     es.bootstrap = True
    #     es.ss_prototyping = True
    #     es.weighInterAnnot = False
    #     es.ml_algorithm='RF'
    #remove these!
    #     es.removeDeniedConcepts=False
    #     es.splitFamilyConcepts=False
    #     es.splitUncertainConcepts=False

    # Reading the train/test_data into an array
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts,
                                        es.splitDeniedConcepts,
                                        es.removeUncertainConcepts,
                                        es.splitUncertainConcepts,
                                        es.removeFamilyConcepts,
                                        es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts,
                                       es.splitDeniedConcepts,
                                       es.removeUncertainConcepts,
                                       es.splitUncertainConcepts,
                                       es.removeFamilyConcepts,
                                       es.splitFamilyConcepts)

    # Reading in bootstrap data as well when enabled
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    # train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts)
    # test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over different feature parameters
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_data, y_train) = m.get_bootstrapped_trainset(train_data,
                                                                y_train,
                                                                bootstrap_data,
                                                                es,
                                                                estimator,
                                                                th_bs=0.6)

        concatenated_data = []
        concatenated_data.extend(train_data)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_data, concatenated_data, es)

        featurized = m.featurize(concatenated_data)

        train_feats = featurized[0:len(train_data)]
        test_feats = featurized[len(train_data):len(featurized)]

        # Do feature selection on train data
        train_feats = fs.runFeatureSelection(train_feats, y_train, es)
        train_feats, y_train, train_bucket = ss.runSampleSelection(
            train_feats, y_train, [i for i in range(len(train_data))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_data, train_bucket,
                                     es.weighInterAnnot)

        model = m.train(estimator, x_train, y_train, weights_train, model=None)

        y_pred = m.test(x_test, estimator=model)
        #         print(y_pred)
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        utils.genOutput(data=test_data,
                        outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
Example #40
def generateData(
        bacteria_index,
        save_csv=False,
        save_data=True,
        out_dir="./data/promoters/",
        nextflow_path='./nextflow',
        nextflow_pipeline="pipeline_unbalanced.nf",  # 'pipeline_without_docker.nf'
        manually_balance_data=False):
    if (using_unbalanced):
        print("GENERATING UNBALANCED DATA WITH RATIO 1:10")
    else:
        print("GENERATE DATA")

    # bacteriaDir = "./bacteria"
    bacteria_report = {}
    if bacteria_index is None:
        index = createIndex()
    else:
        index = bacteria_index

    data_root = "./data/"
    if not os.path.exists(data_root):
        os.makedirs(data_root)

    w = csv.writer(open(data_root + "report.csv", "w"))
    vocab_size = None
    tokenizer = None

    hot_encoded_train_features = np.empty((0, 160), int)
    hot_encoded_train_labels = np.empty((0, ), int)
    hot_encoded_test_features = np.empty((0, 160), int)
    hot_encoded_test_labels = np.empty((0, ), int)
    hot_encoded_val_features = np.empty((0, 160), int)
    hot_encoded_val_labels = np.empty((0, ), int)

    tetra_freq_train_features = np.empty((0, 256), int)
    tetra_freq_train_labels = np.empty((0, ), int)
    tetra_freq_test_features = np.empty((0, 256), int)
    tetra_freq_test_labels = np.empty((0, ), int)
    tetra_freq_val_features = np.empty((0, 256), int)
    tetra_freq_val_labels = np.empty((0, ), int)

    rnn_token_train_features = np.empty((0, 37), int)
    rnn_token_train_labels = np.empty((0, ), int)
    rnn_token_test_features = np.empty((0, 37), int)
    rnn_token_test_labels = np.empty((0, ), int)
    rnn_token_val_features = np.empty((0, 37), int)
    rnn_token_val_labels = np.empty((0, ), int)
    global_rnn_complete = np.empty((0, 37), int)

    start_time = datetime.datetime.now().time().strftime('%H:%M:%S')
    bar = progressbar.ProgressBar(max_value=len(index))
    for i, row in index.iterrows():
        bacteria_start_time = datetime.datetime.now().time().strftime(
            '%H:%M:%S')

        # print("\n\n", 20*"*", i+1, f". {row['BACTERIA']}", 20*"*" )
        print("\n\n {} {} {} {}".format(20 * "*", i + 1, row['BACTERIA'],
                                        20 * "*"))
        #nextflow run main_pipeline.nf --bacteria ecoli && rsync outDir/ outDirOriginal/ -a --copy-links -v
        print("\n\n {} {} {} {}".format(20 * "*", i + 1,
                                        "NEXTFLOW DATA GENERATION", 20 * "*"))
        # print("\n\n", 10*"*", "NEXTFLOW DATA GENERATION",10*"*" )

        stderr = None
        stdout = None

        if (nextflow_path is not None):
            print("\n\nGENERATING NEXTFLOW DATA USING PIPELINE: ",
                  nextflow_pipeline, "\n\n")
            out = subprocess.Popen(
                [
                    nextflow_path,
                    'run',
                    nextflow_pipeline,  # e.g. 'pipeline_without_docker.nf', 'pipeline_unbalanced_without_docker.nf', 'main_pipeline.nf'
                    '--bacteria',
                    str(row['BACTERIA']),
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT)
            stdout, stderr = out.communicate()
            error_msg = ""

            print("\n\nOUT: \n\n", stdout)
            print("\n\nERRORS: \n\n ", stderr)

            bacteria_report[row['BACTERIA']] = {
                'stdout': stdout,
                'stderr': stderr
            }
        else:
            print("NEXTFLOW GENERATION SKIPPED.")

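        # Because stderr is redirected into stdout above (stderr=subprocess.STDOUT),
        # communicate() always returns None for stderr, so this branch also runs
        # when nextflow was skipped; pipeline errors surface via stdout instead.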
        if stderr is None:
            # print("\n\nConverting symlink to copy of file", row['BACTERIA'])
            # cmd = f"rsync outDir/{row['BACTERIA']} outDirOriginal/ -a --copy-links -v"

            if (nextflow_path is not None):
                cmd = "rsync " + out_dir + str(
                    row['BACTERIA']) + " outDirOriginal/ -a --copy-links -v"
                out = os.popen(cmd).read()

            try:
                p_df, n_df = readData(
                    out_dir + str(row['BACTERIA']) + "/positive.fasta",
                    out_dir + str(row['BACTERIA']) + "/negative.fasta")

                p_bed_df = BedTool(out_dir + str(row['BACTERIA']) +
                                   "/positive.bed").to_dataframe()
                n_bed_df = BedTool(out_dir + str(row['BACTERIA']) +
                                   "/negative.bed").to_dataframe()
                p_bed_df["sequence"] = p_df.values
                n_bed_df["sequence"] = n_df.values
                p_bed_df["label"] = [1] * len(p_df)
                n_bed_df["label"] = [0] * len(n_df)
                dataset_df = pd.concat([p_bed_df, n_bed_df])
                print("SAVING DATASET: P {}  + N {} = {}".format(
                    p_bed_df.shape, n_bed_df.shape, dataset_df.shape))
                p_bed_df.to_csv(out_dir + str(row['BACTERIA']) +
                                "/positive.csv")
                n_bed_df.to_csv(out_dir + str(row['BACTERIA']) +
                                "/negative.csv")
                dataset_df.to_csv(out_dir + str(row['BACTERIA']) +
                                  "/dataset.csv")

                print("\n\n" + 10 * "*" + "FASTA TO HOT ENCODING" + 10 * "*")
                print("P: {} N: {}".format(len(p_df), len(n_df)))
                if (manually_balance_data and len(p_df) < len(n_df)):
                    print(
                        "Manually balancing Positives and Negatives. Decreasing Negatives from {} -> {}. Ratio {}:{}"
                        .format(len(n_df), len(p_df), 1,
                                len(p_df) * 100 / len(n_df)))
                    n_df = n_df.sample(n=len(p_df))
                    print("FINAL DATA SHAPES -> P: {} N : {}".format(
                        p_df.shape, n_df.shape))

                hot_p_data, hot_n_data = fastaToHotEncoding(p_df, n_df)
                hot_encoded_dataset_df = joinPositiveAndNegative(
                    hot_p_data, hot_n_data)
                print("\n\n", hot_encoded_dataset_df.head(), "\n\n")
                X_hot_train, X_hot_test, y_hot_train, y_hot_test = generateTrainAndTestSplit(
                    hot_encoded_dataset_df.values)

                print("""
          X: {}
          Y: {}
          TX: {}
          TY: {}
        """.format(X_hot_train.shape, y_hot_train.shape, X_hot_test.shape,
                   y_hot_test.shape))

                if (row["IS_TRAINING"] == True):
                    hot_encoded_train_features = np.append(
                        hot_encoded_train_features, X_hot_train, axis=0)
                    hot_encoded_train_labels = np.append(
                        hot_encoded_train_labels, y_hot_train, axis=0)
                    hot_encoded_test_features = np.append(
                        hot_encoded_test_features, X_hot_test, axis=0)
                    hot_encoded_test_labels = np.append(
                        hot_encoded_test_labels, y_hot_test, axis=0)
                else:
                    print("\nAPPENDING TO VALIDATION DATA")
                    hot_encoded_val_features = np.append(
                        hot_encoded_val_features, X_hot_train, axis=0)
                    hot_encoded_val_labels = np.append(hot_encoded_val_labels,
                                                       y_hot_train,
                                                       axis=0)
                    hot_encoded_val_features = np.append(
                        hot_encoded_val_features, X_hot_test, axis=0)
                    hot_encoded_val_labels = np.append(hot_encoded_val_labels,
                                                       y_hot_test,
                                                       axis=0)

                print("\n\n", 10 * "*", "FASTA TO TETRA-NUCLEOTDE FRECUENCY",
                      10 * "*")
                tetra_n_array_positive = fastaToTetraNucletideDic(
                    p_df.values, 1)
                tetra_n_array_negative = fastaToTetraNucletideDic(
                    n_df.values, 0)
                joined_df = joinPositiveAndNegative(tetra_n_array_positive,
                                                    tetra_n_array_negative)
                joined_df = joined_df.fillna(0)
                print("\nHEAD-FASTA TO TETRA-NUCLEOTDE FRECUENCY")
                print("\n\n", joined_df.head(), "\n\n")
                X_train, X_test, y_train, y_test = generateTrainAndTestSplit(
                    joined_df.values)

                print("""
          X: {}
          Y: {}
          TX: {}
          TY: {}
        """.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

                if (row["IS_TRAINING"] == True):
                    tetra_freq_train_features = np.append(
                        tetra_freq_train_features, X_train, axis=0)
                    tetra_freq_train_labels = np.append(
                        tetra_freq_train_labels, y_train, axis=0)
                    tetra_freq_test_features = np.append(
                        tetra_freq_test_features, X_test, axis=0)
                    tetra_freq_test_labels = np.append(tetra_freq_test_labels,
                                                       y_test,
                                                       axis=0)
                else:
                    print("APPENDING TO VALIDATION DATA")
                    tetra_freq_val_features = np.append(
                        tetra_freq_val_features, X_train, axis=0)
                    tetra_freq_val_labels = np.append(tetra_freq_val_labels,
                                                      y_train,
                                                      axis=0)
                    tetra_freq_val_features = np.append(
                        tetra_freq_val_features, X_test, axis=0)
                    tetra_freq_val_labels = np.append(tetra_freq_val_labels,
                                                      y_test,
                                                      axis=0)

                print("\n\n", 10 * "*", "RNN DATA PROCESSING", 10 * "*")

                tetran_word_dataset = fasta_to_tetranucleotide_list(
                    p_df.values, n_df.values)
                tetran_word_dataset = tetran_word_dataset.dropna()
                print("\n\n", tetran_word_dataset.head(), "\n\n")
                X_tetran_train, X_tetran_test, y_tetran_train, y_tetran_test = generateTrainAndTestSplit(
                    tetran_word_dataset.values)

                print("""\n
          X:  {}
          Y:  {}
          TX: {}
          TY: {}
          COMPLETE:        {}
          COMPLETE+LABELS: {}
        """.format(
                    np.array(X_tetran_train).shape,
                    np.array(y_tetran_train).shape,
                    np.array(X_tetran_test).shape,
                    np.array(y_tetran_test).shape,
                    np.array(tetran_word_dataset.iloc[:, :-1].values).shape,
                    np.array(tetran_word_dataset.values).shape))

                if (row["IS_TRAINING"] == True):
                    rnn_token_train_features = np.append(
                        rnn_token_train_features, X_tetran_train, axis=0)
                    rnn_token_train_labels = np.append(rnn_token_train_labels,
                                                       y_tetran_train,
                                                       axis=0)
                    rnn_token_test_features = np.append(
                        rnn_token_test_features, X_tetran_test, axis=0)
                    rnn_token_test_labels = np.append(rnn_token_test_labels,
                                                      y_tetran_test,
                                                      axis=0)
                else:
                    print("APPENDING TO VALIDATION DATA")
                    rnn_token_val_features = np.append(rnn_token_val_features,
                                                       X_tetran_train,
                                                       axis=0)
                    rnn_token_val_labels = np.append(rnn_token_val_labels,
                                                     y_tetran_train,
                                                     axis=0)
                    rnn_token_val_features = np.append(rnn_token_val_features,
                                                       X_tetran_test,
                                                       axis=0)
                    rnn_token_val_labels = np.append(rnn_token_val_labels,
                                                     y_tetran_test,
                                                     axis=0)
                global_rnn_complete = np.append(
                    global_rnn_complete,
                    tetran_word_dataset.iloc[:, :-1].values,
                    axis=0)

            except Exception as e:
                print('\n\nFAILED : \n\n' + str(e))
                print(traceback.format_exc())
                error_msg = str(e)

        if (nextflow_path is not None):
            w.writerow([row['BACTERIA'], stdout, stderr, error_msg])

        bar.update(i)
        bacteria_end_time = datetime.datetime.now().time().strftime('%H:%M:%S')
        bacteria_total_time = (
            datetime.datetime.strptime(bacteria_end_time, '%H:%M:%S') -
            datetime.datetime.strptime(bacteria_start_time, '%H:%M:%S'))
        print("\n\nBACTERIA: ", row['BACTERIA'], " - TOTAL ELAPSED TIME: ",
              bacteria_total_time)

    print("\n\nTOKENIZING RNN DATASET\n\n")
    str_global_rnn_complete = tetranucleotide_list_to_string_list(
        global_rnn_complete)
    str_rnn_token_train_features = tetranucleotide_list_to_string_list(
        rnn_token_train_features)
    str_rnn_token_test_features = tetranucleotide_list_to_string_list(
        rnn_token_test_features)
    str_rnn_token_val_features = tetranucleotide_list_to_string_list(
        rnn_token_val_features)

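    # A single Keras Tokenizer is fit on the full tetranucleotide "sentence"
    # corpus so train, test and validation share one vocabulary; Keras word
    # indices start at 1, so the +1 below reserves index 0 (used for padding).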
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(str_global_rnn_complete)
    vocab_size = len(tokenizer.word_index) + 1

    print("\nTokenizer Summary")
    print("\n document_count: ", tokenizer.document_count)
    print("\n vocab size: ", vocab_size)

    rnn_token_train_features = tokenizer.texts_to_sequences(
        str_rnn_token_train_features)
    rnn_token_test_features = tokenizer.texts_to_sequences(
        str_rnn_token_test_features)
    rnn_token_val_features = tokenizer.texts_to_sequences(
        str_rnn_token_val_features)
    # X_train_pad = pad_sequences(rnn_token_train_features, maxlen=37, padding="post")
    # X_test_pad  = pad_sequences(rnn_token_test_features, maxlen=37, padding="post")
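    # The tokenized sequences are assumed to already be exactly 37 tokens long,
    # so the pad_sequences step above is left commented out and np.array() below
    # yields rectangular 2-D arrays matching the (n, 37) accumulators.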
    rnn_token_train_features = np.array(rnn_token_train_features)
    rnn_token_test_features = np.array(rnn_token_test_features)
    rnn_token_val_features = np.array(rnn_token_val_features)

    print("\n\nTOTAL HOT ENCODING FEATURES"
          "\nHOT ENCODED FEATURE TRAIN", hot_encoded_train_features.shape,
          "\nHOT ENCODED LABELS  TRAIN", hot_encoded_train_labels.shape,
          "\nHOT ENCODED FEATURE TEST", hot_encoded_test_features.shape,
          "\nHOT ENCODED LABELS  TEST", hot_encoded_test_labels.shape,
          "\nHOT ENCODED FEATURE VAL", hot_encoded_val_features.shape,
          "\nHOT ENCODED LABELS  VAL", hot_encoded_val_labels.shape, "\n")
    print(
        "\n\nTOTAL TETRA-NUCLEOTDE FRECUENCY FEATURES"
        "\nTETRA-FREQ FEATURE TRAIN", tetra_freq_train_features.shape,
        "\nTETRA-FREQ LABELS  TRAIN", tetra_freq_train_labels.shape,
        "\nTETRA-FREQ FEATURE TEST", tetra_freq_test_features.shape,
        "\nTETRA-FREQ LABELS  TEST", tetra_freq_test_labels.shape,
        "\nTETRA-FREQ FEATURE VAL", tetra_freq_val_features.shape,
        "\nTETRA-FREQ LABELS  VAL", tetra_freq_val_labels.shape, "\n")
    print(
        "\n\nTOTAL RNN TETRANUCLEOTIDE STRING TOKEN SEQUENCES FEATURES"
        "\nRNN TOKEN FEATURE TRAIN", rnn_token_train_features.shape,
        "\nRNN TOKEN LABELS  TRAIN", rnn_token_train_labels.shape,
        "\nRNN TOKEN FEATURE TEST", rnn_token_test_features.shape,
        "\nRNN TOKEN LABELS  TEST", rnn_token_test_labels.shape,
        "\nRNN TOKEN FEATURE VAL", rnn_token_val_features.shape,
        "\nRNN TOKEN LABELS  VAL", rnn_token_val_labels.shape,
        "\nRNN TOKEN ALL", global_rnn_complete.shape, "\nVocab", vocab_size,
        "\n")
    # Save files
    if (save_data):
        saveData(hot_encoded_train_features, hot_encoded_train_labels,
                 hot_encoded_test_features, hot_encoded_test_labels,
                 hot_encoded_val_features, hot_encoded_val_labels,
                 tetra_freq_train_features, tetra_freq_train_labels,
                 tetra_freq_test_features, tetra_freq_test_labels,
                 tetra_freq_val_features, tetra_freq_val_labels,
                 rnn_token_train_features, rnn_token_train_labels,
                 rnn_token_test_features, rnn_token_test_labels,
                 rnn_token_val_features, rnn_token_val_labels, vocab_size,
                 tokenizer, save_csv)
        try:
            print("\n\nDeleting Temporary Files\n\n")
            os.system('rm -rf __pycache__')
            os.system('rm -rf .nextflow')
            #os.system('rm -rf outDirOriginal')
            #os.system('rm -rf work')
            #os.system('rm .nextflow.*')
            #os.system('mv -v *.genome ./data')
            #os.system('mkdir -p ./data/bacteria')
            #os.system('mv ./outDir/* ./data/bacteria')
            #os.system('rm -rf ./outDir')
        except Exception as e:
            print("\n\nError deleting temporary data. " + str(e))
    else:
        print("NOT SAVING BINARY DATA")

    end_time = datetime.datetime.now().time().strftime('%H:%M:%S')
    total_time = (datetime.datetime.strptime(end_time, '%H:%M:%S') -
                  datetime.datetime.strptime(start_time, '%H:%M:%S'))
    print("\n\nTOTAL ELAPSED TIME: ", total_time)

    return hot_encoded_train_features, \
      hot_encoded_train_labels, \
      hot_encoded_test_features, \
      hot_encoded_test_labels,  \
      hot_encoded_val_features, \
      hot_encoded_val_labels,  \
      tetra_freq_train_features, \
      tetra_freq_train_labels, \
      tetra_freq_test_features, \
      tetra_freq_test_labels, \
      tetra_freq_val_features, \
      tetra_freq_val_labels, \
      rnn_token_train_features, \
      rnn_token_train_labels,  \
      rnn_token_test_features, \
      rnn_token_test_labels, \
      rnn_token_val_features, \
      rnn_token_val_labels, \
      vocab_size, tokenizer
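
# Hypothetical usage sketch (not part of the original snippet): regenerate the
# three feature sets for every bacterium in the default index while skipping
# the nextflow step. It assumes the FASTA/BED files already exist under
# ./data/promoters/ and that the module-level using_unbalanced flag is defined;
# the unpacking order mirrors the return statement above.
if __name__ == "__main__":
    (hot_X_tr, hot_y_tr, hot_X_te, hot_y_te, hot_X_val, hot_y_val,
     tetra_X_tr, tetra_y_tr, tetra_X_te, tetra_y_te, tetra_X_val, tetra_y_val,
     rnn_X_tr, rnn_y_tr, rnn_X_te, rnn_y_te, rnn_X_val, rnn_y_val,
     vocab_size, tokenizer) = generateData(bacteria_index=None,
                                           save_csv=True,
                                           save_data=False,
                                           nextflow_path=None)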