Example No. 1
def main():
    seq = 1
    data = ReadData(dsName='airsim', subType='mr', seq=seq)
    barNames = data.getNewImgNames(subtype='bar')
    pinNames = data.getNewImgNames(subtype='pin')
    dirBar = data.path + '/images_bar'
    dirPin = data.path + '/images_pin'

    if not os.path.exists(dirBar):
        os.makedirs(dirBar)
    if not os.path.exists(dirPin):
        os.makedirs(dirPin)

    N = data.imgs.shape[0]

    for i in range(0, N):
        img = data.imgs[i]
        img = np.reshape(img, (360, 720, 3))
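        # Undistort the same frame with pincushion- and barrel-style fisheye
        # parameters; K, D_pincus, D_barrel, K_pincus and K_barrel are
        # module-level calibration constants defined outside this excerpt.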

        pin = cv2.fisheye.undistortImage(img, K, D=D_pincus, Knew=K_pincus)
        bar = cv2.fisheye.undistortImage(img, K, D=D_barrel, Knew=K_barrel)

        # cv2.imshow('input', img)
        # cv2.imshow('pin', pin)
        # cv2.imshow('bar', bar)
        # cv2.waitKey(1)
        cv2.imwrite(barNames[i], bar * 255.0)
        cv2.imwrite(pinNames[i], pin * 255.0)
        print(i / N)
Example No. 2
def main():
    filenames = ReadData.listStdDir(CSV_FILEPATH)
    for filename in filenames:
        print filename
        tweets = ReadData.CSVFileToMatrix(filename)
        for tweet in tweets:
            processTweet(tweet)
Example No. 3
def ReadAndExtractAll(fname='../data/features_all_v2.2.pkl'):
    '''
    read all data, extract features, write to dill
    '''

    short_pid, short_data, short_label = ReadData.ReadData(
        '../../data1/short.csv')
    long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv')
    QRS_pid, QRS_data, QRS_label = ReadData.ReadData('../../data1/QRSinfo.csv')
    center_waves = ReadData.read_mean_wave(
        '../../data1/center_wave_euclid_direct.csv')

    all_pid = QRS_pid
    all_feature = GetAllFeature(short_data, long_data, QRS_data, long_pid,
                                short_pid, center_waves)
    all_label = QRS_label

    print('ReadAndExtractAll done')
    print('all_feature shape: ', np.array(all_feature).shape)

    #    with open(fname, 'wb') as output:
    #        dill.dump(all_pid, output)
    #        dill.dump(all_feature, output)
    #        dill.dump(all_label, output)

    return
Example No. 4
    def __init__(self, filename):
        points = RD.obj_vertices(filename)
        normals = RD.obj_normals(filename)
        self.pwns = [points[i] + normals[i] for i in range(len(points))]
        sorted_pwns = Lc.locate(self.pwns)
        self.plates = []
        for key in sorted_pwns:
            self.plates.append(Plate.plate(key, sorted_pwns[key]))

        self.intersect_matrix = np.matrix([[0, 0, 1], [0, 0, 1], [1, 1, 0]])
        n = 10
        self.inter_segments = []
        self.inter_segment_markers = []
        for i in range(len(self.plates)):
            for j in range(i + 1, len(self.plates)):
                if self.intersect_matrix[i, j] == 1:
                    segment = Bd.intersect_of_plate(self.plates[j],
                                                    self.plates[i])
                    Bd.sub_segment(segment, n)
                    self.inter_segments.append(segment)
                    self.plates[i].add_segment(segment)
                    self.plates[j].add_segment(segment)
                    self.inter_segment_markers.append(
                        [[i, len(self.plates[i].segments) - 1],
                         [j, len(self.plates[j].segments) - 1]])
Example No. 5
def ReadAndExtractAll(fname='../data/features_all_v2.5.pkl'):
    '''
    read all data, extract features, write to dill
    '''

    short_pid, short_data, short_label = ReadData.ReadData(
        '../../data1/short.csv')
    long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv')
    QRS_pid, QRS_data, QRS_label = ReadData.ReadData('../../data1/QRSinfo.csv')
    center_waves = ReadData.read_mean_wave('../../data1/centerwave_raw.csv')

    all_pid = QRS_pid
    feature_list, all_feature = GetAllFeature(short_data, long_data, QRS_data,
                                              long_pid, short_pid,
                                              center_waves)
    all_label = QRS_label

    print('ReadAndExtractAll done')
    print('all_feature shape: ', np.array(all_feature).shape)
    print('feature_list shape: ', len(feature_list))
    np.nan_to_num(all_feature)

    with open(fname + '_feature_list.csv', 'w') as fout:
        for i in feature_list:
            fout.write(i + '\n')

    with open(fname, 'wb') as output:
        dill.dump(all_pid, output)
        dill.dump(all_feature, output)
        dill.dump(all_label, output)
    print('write done')
    return
Example No. 6
def read_data():
    long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv')

    #    mat1 = [truncate_long(ts, 9000) for ts in long_data]
    #    mat2 = [truncate_long(ts, 6000) for ts in long_data]
    mat3 = [truncate_long(ts, 3000) for ts in long_data]

    #    mat4 = [sample_long(ts, 10) for ts in mat1]
    #    mat5 = [sample_long(ts, 10) for ts in mat2]
    #    mat6 = [sample_long(ts, 10) for ts in mat3]

    label_onehot = ReadData.Label2OneHot(long_label)

    #    plt.plot(mat1[0])
    #    plt.plot(mat4[0])

    mat = mat3

    all_feature = np.array(mat, dtype=np.float32)
    all_label = np.array(label_onehot, dtype=np.float32)

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(all_feature, long_label):
        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]
        break

    train_data = np.expand_dims(np.array(train_data, dtype=np.float32), axis=2)
    test_data = np.expand_dims(np.array(test_data, dtype=np.float32), axis=2)

    return train_data, train_label, test_data, test_label
Example No. 7
    def test_exponential(self):

        import numpy as np

        retval = ReadData.exponential(0, 0)
        self.assertEqual(retval, 1)
        retval = ReadData.exponential(np.array([0, 0]), 1)
        self.assertEqual(retval[0], 1)
        self.assertEqual(retval[1], 1)
Example No. 8
def init():
    start = time.time()

    setTextMtime()
    ReadData.init()
    getDcit6()

    end = time.time()
    return "It run time is : %.03f seconds" % (end - start)
Example No. 9
def main():

    timer = time.time()
    # Read both the training and the test data
    ratings, users, books = r.PandaReader("./data/BX-Book-Ratings-Train.csv",
                                          "./data/BX-Users.csv",
                                          "./data/BX-Books.csv")
    testRatings = r.ReadTest("./data/BXBookRatingsTest.csv", users, books)

    print("Read data time: ", time.time() - timer)

    timer = time.time()
    # Construct the dictionaries necessary
    userRatingMap, bookRatingMap = knn.ConstructTrainModel(ratings)
    userRatingTestMap, bookRatingTestMap = knn.ConstructTrainModel(testRatings)

    print("matrix creation time: ", time.time() - timer)

    timer = time.time()
    # Choose similarity
    function = 'Cor'
    # Validate or test
    '''
    f = open("wknn" + function , "w")
    f.write("k" + " threshold" + " mae" + " time" + " weight" + "\n")
    
    min = [0, 0]
    
    for k in range(1, 50, 2):
        print("K = ", k)
    
        for split in range(0, 5):
            print("split = ", split)
            timer = time.time()
    
            sim, mae = knn.ValidateData(userRatingMap, bookRatingMap,
                                        split_1=split * int(len(userRatingMap) / 5), split_2=(split + 1) * int(len(userRatingMap) / 5),
                                        k=k, function=function, threshold=8, weighted=True)
            min[0] += mae
            min[1] += time.time() - timer
    
            print("Test k time: ", time.time() - timer)
    
        min[0] /= 5
        min[1] /= 5
        f.write("%d %d %.2f %.2f %d\n" % (k, 8, min[0], min[1], 1))
        min[0] = min[1] = 0
    '''

    sim, mae = knn.TestData(userRatingMap,
                            userRatingTestMap,
                            bookRatingMap,
                            k=3,
                            function=function,
                            threshold=0,
                            weighted=True)
    print("Validation time: ", time.time() - timer)
Example No. 10
def train():

    x = tf.placeholder(tf.float32, [None, 224, 224, 3])
    y = tf.placeholder(tf.float32, [None, 5])

    train_data, train_label = ReadData.get_outorder_data_train(train_dir)
    test_data, test_label = ReadData.get_outorder_data_test(test_dir)

    #logits = inference(x,True,regularizer)
    #logits = alexnet(x, 0.5, 5)
    #logits = vgg_net(x)
    logits = VGGNet_11(x, 0.5)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(loss, name='loss')

    train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)

    correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), tf.cast(tf.argmax(y, 1), tf.int32))
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    saver = tf.train.Saver()
    # Tensorboard
    filewriter_path = 'tensorboard'
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', acc)
    merged_summary = tf.summary.merge_all()
    #merge_summary = tf.summary.merge([loss_summary, acc_summary])
    writer = tf.summary.FileWriter(filewriter_path)

    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    f = open("loss_acc.txt", "a+")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # saver.restore(sess, MODEL_SAVE_PATH + MODEL_NAME)
        for i in range(60):  # 20000
            train_loss, train_acc, n_batch = 0, 0, 0
            for train_data_batch, train_label_batch in ReadData.get_batch(train_data, train_label, BATCH_SIZE):
                _, err = sess.run([train_op, loss], feed_dict={x: train_data_batch, y: train_label_batch})
                train_loss += err
                n_batch += 1
            print(i, " train loss: %f" % (np.sum(train_loss) / n_batch))

            saver.save(sess, MODEL_SAVE_PATH + MODEL_NAME)
            test_acc = 0
            test_acc = sess.run(acc, feed_dict={x: test_data, y: test_label})
            print(i, " test acc: %f" % test_acc)
            new_context = str(i) + " " + str(np.sum(train_loss) / n_batch) + " " + str(test_acc) + '\n'
            f.write(new_context)
            result = sess.run(merged_summary, feed_dict={x: test_data, y: test_label})
            writer.add_summary(result, i)
        writer.close()
        f.close()
Example No. 11
def test():
    appointMap = ReadData.getAppointMap()
    for user in appointMap:
        lastestTime = ReadData.get_user_lastest_time(user, appointMap)
        userDict = ReadData.get_recent_user_classId_Dict(
            lastestTime, appointMap)

        itemUser = getItemUserMap(userDict)
        print 'userID : ', user
        print 'his classID : ', userDict[user]
        print recommendByUserFC(userDict, itemUser, user, 8)
        print '=========================================='
Example No. 12
def test():
    appointMap = ReadData.getAppointMap()

    for userID in appointMap:
        lastestTime = ReadData.get_user_lastest_time(userID, appointMap)
        print userID, '========================='
        history_classID, col_recom, apriori_recom = getCandidate(
            lastestTime, userID, appointMap)
        print 'history : ', history_classID
        print 'col_recom  : ', col_recom
        print 'apriori_recom : ', apriori_recom
        print 'hour  : ', getHourOfCandidate(lastestTime, userID, appointMap)
        print 'storeID : ', getStoreOfCandidate(lastestTime, userID,
                                                appointMap)
Example No. 13
def lstm_raw():
    print('Loading data...')
    folder_path = r'H:\network_diagnosis_data\cut-1000'
    
    X_t, y_t, dicc = ReadData.ReadRaw2HierData(folder_path, N)
    nb_classes = np.max(y_t) + 1
    X_t = ReadData.to_num(X_t, max_features)
    X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2, random_state=42)

    print('Padding sequences')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)  # padding = 'post'
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)  # truncating = 'post'
    y_train = to_categorical(y_train, nb_classes)
    y_test = to_categorical(y_test, nb_classes)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    
    print('Building model...')
    
    model = Sequential()
    model.add(Embedding(max_features, Embedding_Dim, dropout=0.2))
    model.add(LSTM(Embedding_Dim, dropout_W=0.2, dropout_U=0.2))  # 
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy', #binary_crossentropy
                  optimizer='adam',
                  metrics=['accuracy'])
    
    from keras.utils.visualize_util import plot
    plot(model, to_file=r'.\data\lstm-model.png')
    print('Training...')
#     model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
#               validation_data=(X_test, y_test))
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=num_epoch,
              validation_split=0.1,verbose=1)
    
    score, acc = model.evaluate(X_test, y_test,
                                batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
    
    from keras.utils.visualize_util import plot
    data_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    plot(model, to_file=r'.\data\lstm-model' + data_today + '.png')
    json_string = model.to_json()  # equivalent to json_string = model.get_config()
    open('.\data\lstm-model' + data_today + '.json', 'w+').write(json_string)
    model.save_weights('.\data\keras-lstm' + data_today + '.h5', overwrite=True)
    print('model saved')
Example No. 14
    def run(self, name, datafiles, goldnet_file):
        import numpy

        os.chdir(os.environ["gene_path"])

        datastore = ReadData(datafiles[0], "steadystate")
        for file in datafiles[1:]:
            datastore.combine(ReadData(file, "steadystate"))
        datastore.normalize()

        settings = {}
        settings = ReadConfig(settings)
        # TODO: CHANGE ME
        settings["global"]["working_dir"] = os.getcwd() + '/'

        # Setup job manager
        print "Starting new job manager"
        jobman = JobManager(settings)

        # Make GENIE3 jobs
        genie3 = GENIE3()
        genie3.setup(datastore, settings, name)

        print "Queuing job..."
        jobman.queueJob(genie3)

        print jobman.queue
        print "Running queue..."
        jobman.runQueue()
        jobman.waitToClear()

        print "Queue finished"
        job = jobman.finished[0]
        print job.alg.gene_list
        print job.alg.read_output(settings)
        jobnet = job.alg.network
        print "PREDICTED NETWORK:"
        print job.alg.network.network
        print jobnet.original_network

        if goldnet_file != None:
            goldnet = Network()
            goldnet.read_goldstd(goldnet_file)
            print "GOLD NETWORK:"
            print goldnet.network
            print jobnet.analyzeMotifs(goldnet).ToString()
            print jobnet.calculateAccuracy(goldnet)

        return jobnet.original_network
Example No. 15
    def load_train_test_data(self, train_data, test_data):
        """

        :param train_data: train data(raw data) is file we wanted our algorithm to train with so we can use that result with test data.
        :param test_data: test data(raw data) for checking that our prediction is right or not and finding the accuracy.
        :return: well formed train and test data with having rows as one image and index is label of the image.
        """
        try:
            # next line will give you transposed and well formatted train data.
            train_data = ReadData.load_data(train_data)
            # next line will give you transposed and well formatted test data.
            test_data = ReadData.load_test_data(test_data)
            return train_data, test_data
        except Exception as e:
            print(e)
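A minimal usage sketch for the loader above (not taken from the original project; the owning class name and the CSV paths are hypothetical placeholders):

classifier = DigitClassifier()  # hypothetical class that defines load_train_test_data
train_df, test_df = classifier.load_train_test_data('train.csv', 'test.csv')
print(len(train_df), len(test_df))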
Example No. 16
def slide_and_cut(tmp_data, tmp_label, tmp_pid):

    out_pid = []
    out_data = []
    out_label = []

    window_size = 6000

    cnter = {'N': 0, 'O': 0, 'A': 0, '~': 0}
    for i in range(len(tmp_data)):
        #print(tmp_label[i])
        if tmp_label[i] in cnter:
            cnter[tmp_label[i]] += len(tmp_data[i])

    stride_N = 500
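    # The rarer a class is relative to 'N', the smaller its stride below, so its
    # records are windowed more densely and the expanded dataset comes out
    # roughly class-balanced.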
    stride_O = int(stride_N // (cnter['N'] / cnter['O']))
    stride_A = int(stride_N // (cnter['N'] / cnter['A']))
    stride_P = int(0.85 * stride_N // (cnter['N'] / cnter['~']))

    stride = {'N': stride_N, 'O': stride_O, 'A': stride_A, '~': stride_P}

    for i in range(len(tmp_data)):
        tmp_stride = stride[tmp_label[i]]
        tmp_ts = tmp_data[i]
        for j in range(0, len(tmp_ts) - window_size, tmp_stride):
            out_pid.append(tmp_pid[i])
            out_data.append(tmp_ts[j:j + window_size])
            out_label.append(tmp_label[i])

    out_label = ReadData.Label2OneHot(out_label)
    out_data = np.expand_dims(np.array(out_data, dtype=np.float32), axis=2)
    out_label = np.array(out_label, dtype=np.float32)
    out_pid = np.array(out_pid, dtype=np.string_)

    return out_data, out_label, out_pid
Example No. 17
def processResponses(rawData,
                     headerID,
                     questionTextHeader,
                     filters=None,
                     minAnsweredFrac=0.5,
                     keepDisconnected=False,
                     doRnd=False,
                     weightByFreq=False):
    # read in and prepare raw data
    print("Reading and preparing raw data, minFrac = " + str(minAnsweredFrac))
    data = rd.readQuestionAndResponseData(rawData,
                                          questionTextHeader,
                                          doRound=doRnd)
    data['questions']['questions'] = data['questions']['questions'].set_index(
        rd.headerQuestion)
    data['responses'] = data['responses'].set_index(headerID)

    if filters == None:
        filters = [responseFilterFlag]
    # response data contains filtered data used in subsequent analysis
    print("Building response data")
    responseData = buildResponseData(data, minAnsweredFrac, keepDisconnected,
                                     filters, headerID, True)
    if len(responseData['responseVectors']) == 0:
        return None
    #print(responseData['responseVectors'])
    # compute similarity matrix
    print("Computing similarity matrix")
    print("Building feature vectors")
    buildFeatureVectors(responseData, {}, [], weightByFreq=weightByFreq)
    #print(responseData['featureVectors'])
    print("Computing feature similarity")
    simData = computePairwiseFeatureSimilarities(responseData)
    return {'data': data, 'responseData': responseData, 'simData': simData}
Example No. 18
def get_virus_host_graph(virus):
    #Specify virus as either 'HIV' or 'SARS-CoV-2'
    S2E = ReadData.load_S2E()
    VHG = nx.Graph()  #virus-host graph
    if virus == 'SARS-CoV-2':
        f = '%s/SARS-COV2/bait_prey_high_confidence.xlsx' % data_dir
        Gordon_table = pd.read_excel(f)
        n = Gordon_table.shape[0] - 1  #first line is header
        for i in range(n):
            Bait = list(Gordon_table.loc[i + 1])[0]  #skipping first line
            Prey = list(Gordon_table.loc[i + 1])[2]
            try:
                Prey = S2E[Prey]
            except KeyError:
                continue
            VHG.add_edge(Bait, Prey)
    if virus == 'HIV':
        f = '%s/HIV/NCBI/HIV-1_physical_interactions.tsv' % data_dir
        NCBI_table = pd.read_table(f, delimiter='\t')
        n = NCBI_table.shape[0] - 1  #first line is header
        for i in range(n):
            if list(NCBI_table.loc[i + 1])[3] >= 2:  # include only the ones with at least two interactions
                v = list(NCBI_table.loc[i + 1])[0]
                h = list(NCBI_table.loc[i + 1])[1]
                VHG.add_edge(v, str(h))

    return (VHG)
Example No. 19
def run():
    global data_path, problem_set, instance_num
    if request.method == 'POST':
        print(request.form, flush=True)
        return
    else:
        instance = request.args.get('instance')
        if instance == "" :
            return render_template("index.html",x = x,y = y,z = z)
    J = []
    problem_set = instance[:1]
    instance_num = instance[1:]
    ReadData(os.path.join(APP_STATIC, data_path + problem_set + "_" + instance_num), J)
    ga = NSGA(500, 3, Job_set=J, common_due_date=120)
    ga.run()
    pareto = [item for sublist in ga.nondominated_sort() for item in sublist]
    input = [[] for _ in range(2)]
    output = [[]]
    weight = []
    for point in pareto:
        input[0].append(point.obj[0])  # weighted tardiness
        input[1].append(point.obj[1])  # total_flow_time
        output[0].append(point.obj[2])  # pieces
        weight.append(point.weights)
    res = DEA_analysis(input, output)
    eff = [r['Efficiency'] for r in res]
    result["weight"] = weight
    result["Flow_time"] = input[1]
    result["Tardiness"] = input[0]
    result["Piece"] = output[0]
    result["DEA_score"] = eff
    return render_template("index.html",result = result)
Example No. 20
def searchBestDumping():
    datasetMetrics = {}
    linfoma = ReadData.getDatasets()

    for df, param in linfoma:
        print("searching Damping for dataset: ", param.name)
        metrics = []
        path = 'Experiments/SearchingDamping/Affinity_bin/' + param.name
        affinity_matrix = pd.read_csv(path, header=None)
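        # Sweep the AffinityPropagation damping factor from 0.50 to 0.95 in steps of 0.01.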
        for damp in np.array(range(50, 96, 1)) / 100:
            if (damp >= 1): damp = 0.99
            if ((damp % 0.1) <= 0.01):
                print(str(int(damp * 200 - 100)), '%')
            if (affinity_matrix is None):
                affinity = AffinityPropagationModel(df)
                affinity.fit(damp, param.affinity['preference'])
                affinity_matrix = pd.DataFrame(affinity.model.affinity_matrix_)
                affinity_matrix.to_csv(path, index=False, header=False)
            else:
                affinity = AffinityPropagationModel(df)
                affinity.fit(damp, param.affinity['preference'], 'precomputed',
                             affinity_matrix)

            metrics.append(affinity.getValidation())
        datasetMetrics[param.name] = (pd.DataFrame(
            metrics,
            columns=[
                'pureza', 'entropia', 'rand index', 'silhueta',
                'davies-bouldin', 'n_groups'
            ],
            index=np.array(range(50, 96, 1)) / 100), param)

    return datasetMetrics
Example No. 21
def apriori_classID(lastestTime, appointMap, minSupport=0.1, minConf=0.6):

    dataSet = ReadData.get_all_recent_class(lastestTime, appointMap)

    L, suppData = Apriori.apriori(dataSet, minSupport)

    rules = Apriori.generateRules(L, suppData, minConf)
    return rules
Example No. 22
def read_data():
    X = ReadData.read_centerwave('../../data1/centerwave_resampled.csv')
    _, _, Y = ReadData.ReadData('../../data1/QRSinfo.csv')
    all_feature = np.array(X)
    print(all_feature.shape)
    all_label = np.array(Y)
    all_label_num = np.array(ReadData.Label2OneHot(Y))
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    i_fold = 1
    print('all feature shape: {0}'.format(all_feature.shape))
    for train_index, test_index in kf.split(all_feature, all_label):
        train_data = all_feature[train_index]
        train_label = all_label_num[train_index]
        test_data = all_feature[test_index]
        test_label = all_label_num[test_index]
    print('read data done')
    return all_feature, all_label_num, train_data, train_label, test_data, test_label
Example No. 23
def expand_three_part():
    long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv')

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, other_index in kf.split(np.array(long_data),
                                             np.array(long_label)):
        train_data = np.array(long_data)[train_index]
        train_label = np.array(long_label)[train_index]
        train_pid = np.array(long_pid)[train_index]
        other_data = np.array(long_data)[other_index]
        other_label = np.array(long_label)[other_index]
        other_pid = np.array(long_pid)[other_index]

        kf_1 = StratifiedKFold(n_splits=2, shuffle=True)
        for val_index, test_index in kf_1.split(np.array(other_data),
                                                np.array(other_label)):
            val_data = np.array(other_data)[val_index]
            val_label = np.array(other_label)[val_index]
            val_pid = np.array(other_pid)[val_index]
            test_data = np.array(other_data)[test_index]
            test_label = np.array(other_label)[test_index]
            test_pid = np.array(other_pid)[test_index]

            break
        break

    train_data_out, train_label_out, train_data_pid_out = slide_and_cut(
        list(train_data), list(train_label), list(train_pid))
    val_data_out, val_label_out, val_data_pid_out = slide_and_cut(
        list(val_data), list(val_label), list(val_pid))
    test_data_out, test_label_out, test_data_pid_out = slide_and_cut(
        list(test_data), list(test_label), list(test_pid))

    print(
        len(set(list(train_pid)) & set(list(val_pid))
            & set(list(test_pid))) == 0)

    # with open('../../data1/expanded_three_part_window_6000_stride_500_6.pkl', 'wb') as fout:
    #     pickle.dump(train_data_out, fout)
    #     pickle.dump(train_label_out, fout)
    #     pickle.dump(val_data_out, fout)
    #     pickle.dump(val_label_out, fout)
    #     pickle.dump(test_data_out, fout)
    #     pickle.dump(test_label_out, fout)
    #     pickle.dump(test_data_pid_out, fout)

    ### use np.save to save larger than 4 GB data
    fout = open('../../data1/expanded_three_part_window_6000_stride_299.bin',
                'wb')
    np.save(fout, train_data_out)
    np.save(fout, train_label_out)
    np.save(fout, val_data_out)
    np.save(fout, val_label_out)
    np.save(fout, test_data_out)
    np.save(fout, test_label_out)
    np.save(fout, test_data_pid_out)
    fout.close()
    print('save done')
Example No. 24
def build_three_layer_network(drug_source,
                              virus,
                              target_type,
                              HI=None,
                              VHG=None):
    S2E = ReadData.load_S2E()

    if HI == None:
        HI = ReadData.load_HI()
    multi_graph = HI.copy()
    if VHG == None:
        VHG = get_virus_host_graph(virus)
    for edge in VHG.edges():
        multi_graph.add_edge(edge[0], edge[1])
    D2T = ReadData.load_drugbank(target_type=target_type)
    for drug, targets in D2T.items():
        for target in targets:
            multi_graph.add_edge(drug, str(target))
    return (multi_graph)
Example No. 25
def expand_all():
    long_pid, long_data, long_label = ReadData.ReadData('../../data1/long.csv')
    data_out, label_out, pid_out = slide_and_cut(long_data, long_label,
                                                 long_pid)

    ### use np.save to save larger than 4 GB data
    fout = open('../../data1/expanded_all_window_6000_stride_500.bin', 'wb')
    np.save(fout, data_out)
    np.save(fout, label_out)
    fout.close()
    print('save done')
Example No. 26
def read_data():
    long_pid, long_data, long_label = ReadData.ReadData('../../data1/centerwave.csv')
    
    mat1 = [truncate_long(ts, 9000) for ts in long_data]
    mat2 = [truncate_long(ts, 6000) for ts in long_data]
    mat3 = [truncate_long(ts, 3000) for ts in long_data]
    
    mat4 = [sample_long(ts, 10) for ts in mat1]
    mat5 = [sample_long(ts, 10) for ts in mat2]
    mat6 = [sample_long(ts, 10) for ts in mat3]
    
    label_onehot = ReadData.Label2OneHot(long_label)
    
#    plt.plot(mat1[0])
#    plt.plot(mat4[0])

    mat1 = np.expand_dims(np.array(mat1), axis=2)
    label_onehot = np.array(label_onehot)
    
    return mat1, label_onehot
Example No. 27
def findOutlier():
    train_data, test_data = rd.readData()
    plt.figure(figsize=(18, 10))
    plt.boxplot(x=train_data.values, labels=train_data.columns)
    plt.hlines([-7.5, 7.5], 0, 40, colors='r')
    plt.show()
    train_data = train_data[train_data['V9'] > -7.5]
    test_data = test_data[test_data['V9'] > -7.5]
    # train_data.describe()
    # test_data.describe()
    return train_data, test_data
Example No. 28
def createNormalizedDatasets():

    list_data = ReadData.getDatasets()
    datasets = []
    for data, param in list_data[1:2]:
        print(data)
        #Scaling the samples to have unit norm
        normalization = ['l1', 'l2']
        for norm in normalization:
            data_normalized = preprocessing.normalize(data,
                                                      norm=norm,
                                                      axis=0,
                                                      copy=True)
            name = 'data_normalized_' + norm
            data_normalized = pd.DataFrame(data_normalized,
                                           columns=data.columns)
            parameters = Parameters(name, 'Data/' + name + '.csv',
                                    param.k_clusters, param.eps, param.damping)
            datasets.append((data_normalized, parameters))

        #Mapping data to a defined distribution by quantile transforms
        distribution = ['uniform', 'normal']
        for dist in distribution:
            data_normalized = preprocessing.quantile_transform(
                data,
                axis=0,
                output_distribution=dist,
                random_state=Seed,
                copy=True)
            name = 'data_distribution_' + dist
            data_normalized = pd.DataFrame(data_normalized,
                                           columns=data.columns)
            parameters = Parameters(name, 'Data/' + name + '.csv',
                                    param.k_clusters, param.eps, param.damping)
            datasets.append((data_normalized, parameters))
        '''
        #Mapping data to a defined distribution by power transforms
        standardize = [True,False]
        for stand in standardize:
            data_normalized = preprocessing.power_transform(data,
                                                            method = 'yeo-johnson',
                                                            standardize = stand,
                                                            copy=True)
            name = 'data_distribution_' + dist
            data_normalized = pd.DataFrame(data_normalized, columns = data.columns)
            parameters = Parameters(name, 
                                    'Data/'+name+'.csv', 
                                    param.k_clusters,
                                    param.eps,
                                    param.damping)
            datasets.append( (data_normalized, parameters) )
        '''
    return datasets
Example No. 29
def main():
    import ReadData
    filename = r'train_data.txt'
    print '.........Reading data...........'
    UsersItems, Items = ReadData.ReadData(filename)
    print '.........Splitting data...........'
    train, test = ReadData.divideData(UsersItems)

    print '.............Training recommender............'
    flag = 0    # 0: run train/test evaluation, 1: generate final results
    near_num = 200; top_num = 1
    hiddenStates_num = 10; max_iter=30
    mPLSA = CpLSA()
    if flag==0:
        mPLSA.transformData(train)
        mPLSA.process(hiddenStates_num, max_iter)
        simUsers = mPLSA.calSimUsers(near_num)

    elif flag==1:
        mPLSA.transformData(UsersItems)
        mPLSA.process(hiddenStates_num, max_iter)
Example No. 30
def main(tweetList):
    tweetListPreprocessed = []
    tweetPreprocessed = ""

    for tweet in tweetList:
        if (tweet.text != 'Not Available'):
            tweetPreprocessed = preprocessText(tweet.text)

            # Save in new object
            tweetPre = read.make_tweet(tweet.id, tweet.name, tweet.language, tweetPreprocessed)
            tweetListPreprocessed.append(tweetPre)

    return tweetListPreprocessed
Example No. 31
    def pair2rank(self, pair_pref_prob_samp):
        # list(Nclass * Nclass)#
        # borda count #
        ppps = pair_pref_prob_samp
        Nclass = len(ppps)
        for i in range(Nclass):
            if ppps[i][i] != 0:
                print "warning: self comparison", i, ppps[i][i]
                ppps[i][i] = 0
        rscore = map(sum, ppps)
        # print "rscore", rscore
        rank = ReadData.rankOrder(rscore)
        return rank
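A tiny worked sketch of the Borda-style scoring used in pair2rank above (the matrix values are invented for illustration; ReadData.rankOrder itself is not shown here): each class's score is the row sum of its pairwise preference probabilities, and the rank follows from ordering those scores.

ppps = [[0.0, 0.7, 0.9],
        [0.3, 0.0, 0.6],
        [0.1, 0.4, 0.0]]  # hypothetical pairwise preference probabilities
rscore = [sum(row) for row in ppps]  # Borda-style row sums -> [1.6, 0.9, 0.5]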
Example No. 32
def initializeData():
    global CLASS_WORDS
    global CLASSES
    global CLASS_DICT
    global TRAINING_DATA_STATS
    global training_data
    global test_data

    ReadData.shuffleTrainingData()
    training_data = ReadData.TRAINING_DATA[:800]
    test_data = ReadData.TRAINING_DATA[800:]

    for i in range(len(CLASSES)):
        CLASS_DICT[CLASSES[i]] = Class(CLASSES[i])

    for data in training_data:
        # Tokenize each sentence into words
        sentence = removeSpecialCharacters(data['sentence'])
        sentence = nltk.word_tokenize(sentence)
        sentence = lemmatizeSentence(sentence)
        doc_num = CLASS_DICT[data['class']].getTotalDocuments()

        document = Document(doc_num, data['class'], sentence)
        CLASS_DICT[data['class']].addDocument(document)
        TRAINING_DATA_STATS.training_docs.append(sentence)

        for word in sentence:
            # Have we not seen this word already?
            CLASS_DICT[data['class']].addToTotalClassWordFreq(word)
            # Add the word to our words in class list
            CLASS_DICT[data['class']].documents[doc_num].addToDocsWordFreq(
                word)

            # This is frequency so we need to change this part
            CLASS_DICT[data['class']].addWords([word])

            TRAINING_DATA_STATS.addToTotalWordFreq(word)
            TRAINING_DATA_STATS.addWords([word])
Example No. 33
def run():
# terminal command
    cmd = "ls"
    subprocess.call(cmd, shell=True)

# read and aggregate the data, write the results to CSV
    rawdatafiles = ReadData.rawfilelisting("../data/updates/YoutubePakistan/")
    #ReadData.readupdatedata(rawdatafiles)

# data checks
#    ReadData.checklink()
#    ReadData.checkupdate()

# from CSV to graph
    timerange = 60
    number_of_files = int(timerange / 15)
    DrawGraph.draw(number_of_files)  # read four 15-minute files and draw the graph
Example No. 34
#
#     This will generate a file with the linear coefficients from 1 to 5 grams.
#
#
#
# _____________________________________________________________________________


# 1-. Read dataset and create tweetList filled with Tweet objects*

dataset = sys.argv[1]
maxNgram = int(sys.argv[2])

filename = os.path.basename(dataset).split('.')

tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-process state
    # Raw data -> tweetList
    # Clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# 3-. OBTAIN N-GRAMS and Linear Coefficients

for i in xrange(5, maxNgram+1):
    corpusNgrams, arrayLanguages,arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, i, corpusNgrams)
    # print linearCoefficients
    file = open('../Dataset/LICoefficients_'+str(maxNgram)+'gram_for-'+str(filename[0])+'.txt', 'a+')
    for li in linearCoefficients:
        file.write(str(i)+"\t"+str(li[0]))
Example No. 35
def write_temp(data):  # def line restored from the write_temp(...) calls below
    # write data a line per sentence, to temporary file
    temp = tempfile.NamedTemporaryFile()
    for obs in data:
        temp.write(obs + "\n")
    temp.flush()
    return temp


# Wikipedia data is already in a sentence per line format
with open(Globals.WIKI_POS, "w") as pos_file:
    tag(stanford_cmd, Globals.STANFORD_PATH, Globals.WIKI_TRAIN, pos_file)


# Twitter, we will read in data, and write to temporary file
# Training data
twitter_train_raw = ReadData.readTwitterData(Globals.TWITTER_TRAIN, splitwords=False)
twitter_temp_file = write_temp(twitter_train_raw[1])
with open(Globals.TWITTER_TRAIN_POS, 'w') as pos_file:
    tag(gate_cmd, Globals.GATE_PATH, twitter_temp_file.name, pos_file)
twitter_temp_file.close()

# Test data
twitter_test_raw = ReadData.readTwitterData(Globals.TWITTER_TEST, splitwords= False)
twitter_temp_file = write_temp(twitter_test_raw[1])
with open(Globals.TWITTER_TEST_POS, 'w') as pos_file:
    tag(gate_cmd, Globals.GATE_PATH, twitter_temp_file.name, pos_file)
twitter_temp_file.close()

# Blog data
blog = ReadData.readBlogData(Globals.BLOG_DATA, splitwords = False)
blog = [ txt for txt,label in blog ]
Example No. 36
    def run(self, kofile, tsfile, wtfile, datafiles, name, goldnet_file, normalize=False):
        os.chdir(os.environ["gene_path"])
        knockout_storage = ReadData(kofile, "knockout")
        print "Reading in knockout data"
        wildtype_storage = ReadData(wtfile, "steadystate")

        if datafiles == []:
          other_storage = None
        else:
          other_storage = ReadData(datafiles[0], "steadystate")
          for file in datafiles[1:]:
              other_storage.combine(ReadData(file, "steadystate"))

        timeseries_storage = None
        if tsfile != None:
            timeseries_storage = ReadData(tsfile, "timeseries")
            #for ts in timeseries_storage:
                #ts.normalize()

        #if normalize:
            #knockout_storage.normalize()
            #wildtype_storage.normalize()
            #other_storage.normalize()


        settings = {}
        settings = ReadConfig(settings)
        # TODO: CHANGE ME
        settings["global"]["working_dir"] = os.getcwd() + '/'

        # Setup job manager
        print "Starting new job manager"
        jobman = JobManager(settings)

        # Make inferelator jobs
        inferelatorjob = inferelator()
        inferelatorjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, other_storage, name)

        print "Queuing job..."
        jobman.queueJob(inferelatorjob)

        print jobman.queue
        print "Running queue..."
        jobman.runQueue()
        jobman.waitToClear()

        print "Queue finished"
        job = jobman.finished[0]
        #print job.alg.gene_list
        #print job.alg.read_output(settings)
        jobnet = job.alg.network
        #print "PREDICTED NETWORK:"
        #print job.alg.network.network
        print jobnet.original_network

        if goldnet_file != None:
            goldnet = Network()
            goldnet.read_goldstd(goldnet_file)
            #print "GOLD NETWORK:"
            #print goldnet.network
            #print jobnet.analyzeMotifs(goldnet).ToString()
            print jobnet.calculateAccuracy(goldnet)
            import AnalyzeResults
            tprs, fprs, rocs = AnalyzeResults.GenerateMultiROC(jobman.finished, goldnet )
            ps, rs, precs = AnalyzeResults.GenerateMultiPR(jobman.finished, goldnet)
            print "Area Under ROC"
            print rocs

            print "Area Under PR"
            print precs

        return jobnet.original_network
    settings["global"]["experiment_name"] + "-" + t + "/"
os.mkdir(settings["global"]["output_dir"])

# Read in configs for this algorithm
from dfg4grn import *
settings = ReadConfig(settings, "./config/default_values/dfg4grn.cfg")
settings = ReadConfig(settings, settings["dfg4grn"]["config"])

data = {}
knockouts = {}
wildtypes = {}
knockdowns = {}
multifactorials = {}
timeseries_as_steady_state = {}
# Loop over the directories we want, reading in the timeseries files
data = ReadData(data_file, "kranthi_data")
ss_names = open(ss_names).read().splitlines()
ts_names = open(ts_names).read().splitlines()
pert_names = open(pert_names).read().splitlines()

# Filter out the genes we don't want
genes_of_interest = open(genes_file).read().splitlines()
genes_of_interest = [x.upper() for x in genes_of_interest]
data.filter(genes_of_interest)

# Read in the legend so we know what the experiment names relate to
col_to_exp = {}
pert_cond = {}
for line in open(legend):
    line = line.strip()
    line = line.split(',')
Example No. 38
if sys.argv[1] == "dream4100":
    goldnet.read_goldstd(settings["global"]["dream4100_network_goldnet_file"])
    #Get a list of the knockout files
    ko_file = settings["global"]["dream4100_network_knockout_file"].split()
    kd_file = settings["global"]["dream4100_network_knockdown_file"].split()
    ts_file = settings["global"]["dream4100_network_timeseries_file"].split()
    wt_file = settings["global"]["dream4100_network_wildtype_file"].split()
    mf_file = settings["global"]["dream4100_network_multifactorial_file"].split()

# Read data into program
# Where the format is "FILENAME" "DATATYPE"
knockout_storage = ReadData(ko_file[0], "knockout")
knockdown_storage = ReadData(kd_file[0], "knockdown")
timeseries_storage = ReadData(ts_file[0], "timeseries")
wildtype_storage = ReadData(wt_file[0], "wildtype")
mf_storage = ReadData(mf_file[0], "multifactorial")

# Setup job manager
jobman = JobManager(settings)

# Make BANJO jobs
mczjob = MCZ()
mczjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, knockdown_storage, "MCZ_Alone")
jobman.queueJob(mczjob)

clrjob = CLR()
clrjob.setup(knockout_storage, settings, "clr_" + t + "_Bins-" + str(6), "plos", 6)
jobman.queueJob(clrjob)

#cojob = ConvexOptimization()
#cojob.setup(knockout_storage, settings, "ConvOpt_T-Plos",None, None, 0.04)
Example No. 39
# Usage: python selectionSort.py input 
# Chen-Yu Li     [email protected]
# 2014/3/5

## Standard module
import sys, os
import numpy as np
import re

## User's own module
sys.path.append('/home/cli56/scripts')
import ReadData

## Read input
print "Start loading data..."
array=ReadData.loadAscii(sys.argv[1])
print "Finished loading data!"

## set output file name
inputPrefix = re.split('\.', sys.argv[1])
inputPrefix_noType = ''
for i in range(len(inputPrefix)-1):
    inputPrefix_noType = inputPrefix_noType + inputPrefix[i]
    if i < (len(inputPrefix)-2):
        inputPrefix_noType = inputPrefix_noType + '.' 
    

s="_sorted.dat"
outputPrefix = inputPrefix_noType+s

Example No. 40
# Create date string to append to output_dir
t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
settings["global"]["output_dir"] = settings["global"]["output_dir"] + "/" + \
    settings["global"]["experiment_name"] + "-" + t + "/"
os.mkdir(settings["global"]["output_dir"])

# Read in the gold standard network

#goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"])
#ko_file, kd_file, ts_file, wt_file, mf_file, goldnet = get_example_data_files(sys.argv[1], settings)


# Read data into program
# Where the format is "FILENAME" "DATATYPE"
dex_storage = ReadData("datasets/RootArrayData/DexRatios.csv", "dex")
dexcombined = ReadData("datasets/RootArrayData/DexRatios.csv", "dex")
dex_storage2 = ReadData("datasets/RootArrayData/HHO3_DEX_ratios.csv", "dex")

cnlo_storage = ReadData("datasets/RootArrayData/Root_CNLO_Krouk.txt", "dex")
cnlo_no3_storage = ReadData("datasets/RootArrayData/Root_CNLO_Krouk.txt", "dex")
no3_1_storage = ReadData("datasets/RootArrayData/Root_NO3_Wang03.txt", "dex")
no3_2_storage = ReadData("datasets/RootArrayData/Root_NO3_Wang04.txt", "dex")
no3_3_storage = ReadData("datasets/RootArrayData/Root_NO3_Wang07.txt", "dex")
#ts_storage = ReadData("datasets/RootArrayData/Root_WT_Krouk11.txt", "dex")

tfs_file = open("datasets/RootArrayData/tfs.csv", 'r')
line = tfs_file.readlines()[0]
tfs = line.strip().split(',')
tfs = [x.upper() for x in tfs]
Example No. 41
import numpy as np

# import scipy as sp
from scipy import interpolate
import matplotlib.pyplot as plt  # needed for the plt.figure()/plt.plot() calls below
import ReadData
import libMATERIALS


Ge_table, Ge_start, dU_Ge, N_Ge1, N_Ge2 = libMATERIALS.initialize_ge()
TGe = interpolate.interp1d(Ge_table[2, :], Ge_table[0, :], "slinear")

Tx = np.zeros((100))
Ex = np.zeros((100))
ct2 = 0
for iter in range(10000, 173000, 1000):
    ct, L, dL, Ne, qfluxL, qfluxC, qfluxR = ReadData.readdata1(iter)
    qL, qC, qR, Tavg, Eavg = ReadData.readdata2(iter, Ne[0], Ne[1], Ne[2])
    ct2 = ct2 + 1
    for i in range(Ne[0]):
        Ex[i] = Ex[i] + np.sum(Eavg[i, :, :]) / (Ne[1] * Ne[2])


Ex = Ex / ct2 / (dL[0] * dL[1] * dL[2])
Tx = TGe(Ex)


x = np.linspace(0.5 * dL[0], L[0] - 0.5 * dL[0], Ne[0])


plt.figure()
plt.plot(x, Tx)
Example No. 42
'''
Created on Jun 8, 2012

@author: yyb
'''

'Write the information into GraphML format files'


import os
import ReadData


ReadData.readSubDate()
ReadData.readLinks()


myDayDict = ReadData.dailyDict
myMonDict = ReadData.monthlyDict
myYrDict = ReadData.yearlyDict
myPaperDict = ReadData.paperDict
myCiteDict = ReadData.citeDict
totalDay = len(ReadData.dailyList)
totalMon = len(ReadData.monthlyList)
totalYr = len(ReadData.yearlyList)
 
 
#===============================================================================
# Write information into GraphML format files with a daily unit
#===============================================================================
def writeDailyGML(size):
Example No. 43
wt_file = settings["global"]["medium_network_wildtype_file"].split()

# Read in the gold standard network
#goldnet = Network()
#goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"])

#ko_file = settings["global"]["large_network_knockout_file"].split()
#kd_file = settings["global"]["large_network_knockdown_file"].split()
#ts_file = settings["global"]["large_network_timeseries_file"].split()

#wt_file = settings["global"]["large_network_wildtype_file"].split()

# Read data into program
# Where the format is "FILENAME" "DATATYPE"
knockout_storage = ReadData(ko_file[0], "knockout")
knockdown_storage = ReadData(kd_file[0], "knockdown")
timeseries_storage = ReadData(ts_file[0], "timeseries")
wildtype_storage = ReadData(wt_file[0], "wildtype")
wildtype_storage.combine(knockout_storage)
wildtype_storage.combine(knockdown_storage)
wildtype_storage.combine(timeseries_storage)

wildtype_storage.normalize()
knockout_storage.normalize()


# Setup job manager
jobman = JobManager(settings)

# Make BANJO jobs
Example No. 44
    def run(self, datafiles=None, name=None, goldnet_file=None, topd=None, restk=None):
        import numpy

        os.chdir(os.environ["gene_path"])

        print "Reading in data"
        data_storage = ReadData(datafiles[0], "steadystate")
        for file in datafiles[1:]:
            data_storage.combine(ReadData(file, "steadystate"))

        settings = {}
        settings = ReadConfig(settings)
        # TODO: CHANGE ME
        settings["global"]["working_dir"] = os.getcwd() + "/"

        # Setup job manager
        print "Starting new job manager"
        jobman = JobManager(settings)

        # Make nir jobs
        nirjob = NIR()
        nirjob.setup(data_storage, settings, name, topd, restk)

        print "Queuing job..."
        jobman.queueJob(nirjob)

        print jobman.queue
        print "Running queue..."
        jobman.runQueue()
        jobman.waitToClear()

        print "Queue finished"
        job = jobman.finished[0]
        print job.alg.gene_list
        print job.alg.read_output(settings)
        jobnet = job.alg.network
        print "PREDICTED NETWORK:"
        print job.alg.network.network

        if goldnet_file != None:
            goldnet = Network()
            goldnet.read_goldstd(goldnet_file)
            # print "GOLD NETWORK:"
            # print goldnet.network
            # print jobnet.analyzeMotifs(goldnet).ToString()
            print jobnet.calculateAccuracy(goldnet)
            import AnalyzeResults

            tprs, fprs, rocs = AnalyzeResults.GenerateMultiROC(
                jobman.finished, goldnet, True, job.alg.output_dir + "/ROC.pdf"
            )
            ps, rs, precs = AnalyzeResults.GenerateMultiPR(
                jobman.finished, goldnet, True, job.alg.output_dir + "/PR.pdf"
            )
            print "Area Under ROC"
            print rocs

            print "Area Under PR"
            print precs

        return job.alg.network.network
Example No. 45
import UtilsTweetSafa as utils
import Smoothing as linear
import numpy as np
import CrossValidation as cv

import sys

maxNgram = 5

# 1-. Read dataset and create tweetList filled with Tweet objects*

dataset = "../Dataset/output_complete.txt"
test = "../Dataset/mezclado.txt"
LI_Coefficients = "../Dataset/LICoefficients_5gram_for-output_complete.txt"

tweetList = read.read_tweets_dataset(dataset)
tweetListtest = read.read_tweets_dataset(test)
# 2-. Pre-process state

tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest= preprocess.main(tweetListtest)
shuffle(tweetListPreProcessed)
    # Raw data -> tweetList
    # Clean data -> tweetListPreProcessed

#utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. OBTAIN N-GRAMS
Example No. 46
# goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"])
settings["global"]["experiment_name"] = "GENIE3" + sys.argv[1]
ko_file, kd_file, ts_file, wt_file, mf_file, goldnet = get_example_data_files(
    sys.argv[1], settings
)  # Create date string to append to output_dir
t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
settings["global"]["output_dir"] = (
    settings["global"]["output_dir"] + "/" + settings["global"]["experiment_name"] + "-" + t + "/"
)
os.mkdir(settings["global"]["output_dir"])

# Get a list of the multifactorial files

# Read data into program
# Where the format is "FILENAME" "DATATYPE"
mf_storage = ReadData(mf_file[0], "multifactorial")
ko_storage = ReadData(ko_file[0], "knockout")
kd_storage = ReadData(kd_file[0], "knockdown")
wt_storage = ReadData(wt_file[0], "wildtype")

# Setup job manager
jobman = JobManager(settings)

# Make GENIE3 jobs
genie3job = GENIE3()
genie3job.setup(mf_storage, settings, "MF")
jobman.queueJob(genie3job)

mf_storage.combine(ko_storage)
genie3job = GENIE3()
genie3job.setup(mf_storage, settings, "MF_KO")
Example No. 47
import ReadData
import numpy
import pylab
from math import *

myArray = ReadData.loadAscii("DataSet_PythonStatisticalAnalysis.txt")
print "I have read in myArray and the first element is ", myArray[0]


def Mean(myArray):
    for i in myArray:
        print i
    # this prints everything in the array.
    # let's change this to average things in the array instead
    average = numpy.average(myArray)
    return average


def StandardDev(myArray):
    # do stuff
    standard_dev = numpy.std(myArray)
    return standard_dev


def NaiveStandardError(myArray):
    # calculate the standard error by calling the standard deviation function
    Neff = len(myArray)
    sig = StandardDev(myArray)
    se = sig / (numpy.sqrt(Neff - 1))
    return se
Example No. 48
    """
    # wrap application to handle tuple and non-tuple
    satisfies = (lambda f, e: f(e[0])) if isinstance(data[0], tuple) else (lambda f, e: f(e))
    defined_filters = set(filters.keys())
    applied = defined_filters.intersection(filter_names) if filter_names != None else defined_filters
    if not applied:
        raise ValueError("Must apply at least one filter. try filter_names = None")
    if filter_names != None and len(applied) != len(filter_names):
        missing = set(filter_names).difference(applied)
        raise ValueError("Unspecified filter names: %s" % ", ".join(missing))
    # ok down to business now
    clean_data = enumerate(data)
    for filter_name in applied:
        print "Applying filter: %s" % filter_name
        filter = filters[filter_name]
        clean_data = [ (i, obs) for i, obs in clean_data if satisfies(filter, obs) ]
    if return_indices:
        return [i for i, _ in clean_data]
    else:
        return [obs for _, obs in clean_data]


# an example
if __name__ == "__main__":
    import ReadData
    import Globals
    wikidata = ReadData.prepareWikiData(Globals.WIKI_TRAIN, splitwords= False)
    cleandata = clean_wiki(wikidata)
    print "Reduced data by %f" % (len(cleandata) / float(len(wikidata)))

Example No. 49
    for gene1 in goldnet.network:
        for gene2 in goldnet.network[gene1]:
            if goldnet.network[gene1][gene2] > 0:
                t.append(gene1)
    tfs[name] = list(set(t))

goldnet = Network()
goldnet.read_goldstd(goldnets[data.keys()[0]])

genie3nets = {}

for name in data.keys():
    for i in range(50):
        ts_storage = data[name]
        settings["global"]["time_series_delta_t"] = (1008.0 / (len(ts_storage[0].experiments)-1))
        combined = ReadData(exp_data_directory + '/' + name + '/' + timeseries_filename, "timeseries")[0]
        for ts in timeseries_as_steady_state[name][1:]:
            combined.combine(ts)
        combined.combine(multifactorials[name])

        genie3job = GENIE3()
        genie3job.setup(combined, settings, "Genie3_TimeSeries_SS_{0}-{1}".format(name, i))
        jobman.queueJob(genie3job)
        genie3nets[name] = genie3job


jobman.runQueue()
jobman.waitToClear()

for name in data.keys():
    for i in range(50):
Example No. 50
__author__ = 'saghar hosseini ([email protected])'
import numpy as np
from ReadData import *
from projection import *
##########################################################################################
#                              Load Data
##########################################################################################
# path="C:/Users/sagha_000/Documents/SVN/My_SVN/TimeVaryingSocialNetworks/datasets/as-733/"
path="F:/Saghar_SVN/TimeVaryingSocialNetworks/datasets/twitter-pol-dataset/graphs/"
dataset=ReadData(path)
edges=dataset.read_network_snapshot(1,hasHeader=True)
nodes_list=set(edges.keys())
output_path='F:/Saghar_SVN/TimeVaryingSocialNetworks/datasets/twitter-pol-dataset/Results/wo_OPD/'
############################################################################################
#                               Define Parameters
############################################################################################
numberOfSnapshots=1175
numCommunity=10
mu=0.1
lambdah_C=0.0
lambdah_B=0.0
sampleFraction=0.25
n=len(nodes_list)
K_B=1.0
K_C=1.0
#############################################################################################
#variables
learning_rate_C={}
initial_state=dict()
state=dict()
visit={}
settings["global"]["experiment_name"] = "GenBio-German-Last-Removed-" + sys.argv[1]

settings["global"]["n_processors"] = 1


# Set up output directory
t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
settings["global"]["output_dir"] = settings["global"]["output_dir"] + "/" + \
    settings["global"]["experiment_name"] + "-" + t + "/"
os.mkdir(settings["global"]["output_dir"])

jobman = JobManager(settings)

# Read data into program
# Where the format is "FILENAME" "DATATYPE"
c4d = ReadData("datasets/German_Data/Caldana-4d.tsv", "dex")
c4l = ReadData("datasets/German_Data/Caldana-4L.tsv", "dex")
c21d = ReadData("datasets/German_Data/Caldana-21d.tsv", "dex")
c21hl = ReadData("datasets/German_Data/Caldana-21HL.tsv", "dex")
c21l = ReadData("datasets/German_Data/Caldana-21L.tsv", "dex")
c21ll = ReadData("datasets/German_Data/Caldana-21LL.tsv", "dex")
c32l = ReadData("datasets/German_Data/Caldana-32L.tsv", "dex")
c32l2 = ReadData("datasets/German_Data/Caldana-32L2.tsv", "dex")

combined = ReadData("datasets/German_Data/Caldana-4d.tsv", "dex")

c21l.experiments = c21l.experiments[1:]

#settings["global"]["time_series_delta_t"] = [5,10,20,40,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340,360,640,1280]
settings["global"]["time_series_delta_t"] = [5,10,20,40,60,80,100,120,140,160,180,200,220,240,260,280,300,320,340,360,640,1280]
#settings["global"]["time_series_delta_t"] = settings["global"]["time_series_delta_t"][:-remove]
Example No. 52
  firstLine = True
  j = 0
  for line in inputFile[i]:
    if firstLine:
    # First line label
      inputFileLabel[i] = str(line)
      firstLine = False
    else:
    # For every beta point
      inputLine = str(line)
      fileExtension = inputLine.replace(' ','-')
      fileExtension = fileExtension.replace('\n','')
      filePath = "data/traces/" + varLabel + "Trace-" + fileExtension + ".dat"    
      print "\nReading data from " + filePath + "."
      (myArray, myArrayHeadings) = ReadData.loadAscii(filePath)
      yValData[i].append(myArray)
      xValData[i].append(myArrayHeadings)

      # Make histograms and generate error bars
      print "\nSorting Data..."
  
      xVals[i].append([])
      for k in range(0, len(xValData[i][j])):
        # For every x value
        xVals[i][j].append([])          
        xVals[i][j][k] = float(xValData[i][j][k])
    
      bins[i].append([])
      for l in range(0, len(xVals[i][j])):
        # For every bin in the histogram
Exemplo n.º 53
0
    def run(self, ko_file, wt_file, ts_file=None, kd_file=None, name=None):
        import numpy

        os.chdir(os.environ["gene_path"])

        print "Reading in knockout data"
        knockout_storage = ReadData(ko_file, "knockout")
        knockout_storage.normalize()
        wildtype_storage = ReadData(wt_file, "wildtype")
        wildtype_storage.normalize()
        knockdown_storage = ReadData(kd_file, "knockdown")
        knockdown_storage.normalize()

        wildtype_storage.combine(knockdown_storage)

        timeseries_storage = None
        if ts_file is not None:
            timeseries_storage = ReadData(ts_file, "timeseries")
            for ts in timeseries_storage:
                ts.normalize()

        settings = {}
        settings = ReadConfig(settings)
        # TODO: CHANGE ME
        settings["global"]["working_dir"] = os.getcwd() + '/'

        # Setup job manager
        print "Starting new job manager"
        jobman = JobManager(settings)

        # Make MCZ jobs
        mczjob = MCZ()
        mczjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, name)

        print "Queuing job..."
        jobman.queueJob(mczjob)

        print jobman.queue
        print "Running queue..."
        jobman.runQueue()
        jobman.waitToClear()

        print "Queue finished"
        job = jobman.finished[0]
        print job.alg.gene_list
        print job.alg.read_output(settings)
        jobnet = job.alg.network
        print "PREDICTED NETWORK:"
        print job.alg.network.network
        print jobnet.original_network

        return jobnet.original_network
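# --- Added usage sketch (hypothetical) ---
# The class that owns run() is not shown in this excerpt, so the class name and
# file paths below are placeholders rather than the project's actual API.
# pipeline = NetworkPipeline()
# predicted_network = pipeline.run("knockouts.tsv", "wildtype.tsv",
#                                  ts_file="timeseries.tsv",
#                                  kd_file="knockdowns.tsv",
#                                  name="MCZ-demo")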
def get_network_results(name, settings, cache):
  print "STARTING", name

  if name in cache.keys():
    print "CACHE HIT"
    return cache[name]

  ko_file, kd_file, ts_file, wt_file, mf_file, goldnet = get_example_data_files(name, settings)

  # Create date string to append to output_dir
  t = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
  settings["global"]["output_dir"] = settings["global"]["output_dir_save"] + "/" + \
      settings["global"]["experiment_name"] + "-" + t + "-" + name + "/"
  os.mkdir(settings["global"]["output_dir"])

  # Get a list of the multifactorial files

  # Read data into program
  # Where the format is "FILENAME" "DATATYPE"
  mf_storage = ReadData(mf_file[0], "multifactorial")
  knockout_storage = ReadData(ko_file[0], "knockout")
  knockdown_storage = ReadData(kd_file[0], "knockdown")
  wildtype_storage = ReadData(wt_file[0], "wildtype")
  timeseries_storage = ReadData(ts_file[0], "timeseries")
  gene_list = knockout_storage.gene_list

  # Setup job manager
  jobman = JobManager(settings)

  # MCZ
  mczjob = MCZ()
  mczjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, knockdown_storage, "MCZ")
  jobman.queueJob(mczjob)

  # CLR
  clrjob = CLR()
  clrjob.setup(knockout_storage, settings, "CLR", "plos", 6)
  jobman.queueJob(clrjob)

  # GENIE3
  mf_storage.combine(knockout_storage)
  mf_storage.combine(wildtype_storage)
  mf_storage.combine(knockdown_storage)
  genie3job = GENIE3()
  genie3job.setup(mf_storage, settings, "GENIE3")
  jobman.queueJob(genie3job)

  ## TLCLR
  tlclrjob = TLCLR()
  tlclrjob.setup(knockout_storage, wildtype_storage, settings, timeseries_storage, knockdown_storage, "TLCLR")
  jobman.queueJob(tlclrjob)

  #if sys.argv[1] != "dream4100":
      #cojob = ConvexOptimization()
      #cojob.setup(knockout_storage, settings, "ConvOpt_T-"+ str(0.01),None, None, 0.01)
      #jobman.queueJob(cojob)

  ### DFG4GRN
  dfg = DFG4GRN()
  settings["dfg4grn"]["eta_z"] = 0.01
  settings["dfg4grn"]["lambda_w"] = 0.001
  settings["dfg4grn"]["tau"] = 3
  dfg.setup(timeseries_storage, TFList(timeseries_storage[0].gene_list), settings, "DFG", 20)
  jobman.queueJob(dfg)

  ### Inferelator

  ### NIR
  nirjob = NIR()
  nirjob.setup(knockout_storage, settings, "NIR", 5, 5)
  jobman.queueJob(nirjob)

  #### TDARACNE
  settings = ReadConfig(settings, "./config/default_values/tdaracne.cfg")
  bjob = tdaracne()
  settings["tdaracne"]["num_bins"] = 4
  bjob.setup(timeseries_storage, settings, "TDARACNE")
  jobman.queueJob(bjob)


  print jobman.queue
  jobman.runQueue()
  jobman.waitToClear(name)
  SaveResults(jobman.finished, goldnet, settings, name)

  cache[name] = jobman.finished[:]

  return cache[name]
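# --- Added usage sketch (assumption, not from the original script) ---
# get_network_results() can be driven over several datasets while reusing the
# cache so repeated names are not recomputed; the dataset names below are
# placeholders and `settings` is assumed to be loaded as in the snippets above.
cache = {}
results = {}
for dataset_name in ["dream4_10_1", "dream4_100_1"]:
    results[dataset_name] = get_network_results(dataset_name, settings, cache)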
multifactorial_filename = exp_set + '-1_multifactorial.tsv'
dex_filename = exp_set + '-1_multifactorial.tsv'

goldstandard_filename = exp_set + '-1_goldstandard.tsv'

ts_only_data = {}
ts_pert_data = {}
pert_data = {}
ko_pert_data = {}

# Do TS only first
ts_only_data["timeseries"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + timeseries_filename, "timeseries")
#knockdowns.normalize()
ts_only_data["ss_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + wildtype_filename, "wildtype")
#wildtypes.normalize()
ts_only_data["multifactorial_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + multifactorial_filename, "multifactorial")

ts_only_data["knockout_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + knockout_filename, "knockout")

#pert_data = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + multifactorial_filename, "multifactorial")
#multifactorials.normalize()
ts_only_data["goldnet_file"] = exp_data_directory + "/" + exp_set + "/" + '/TS/' + goldstandard_filename



ts_pert_data["timeseries"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + timeseries_filename, "timeseries")
#knockdowns.normalize()
ts_pert_data["ss_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + wildtype_filename, "wildtype")
#wildtypes.normalize()
ts_pert_data["multifactorial_data"] = ReadData(exp_data_directory + "/" + exp_set + "/" + '/TS/' + multifactorial_filename, "multifactorial")
# Read in the gold standard network
goldnet = Network()
goldnet.read_goldstd(settings["global"]["large_network_goldnet_file"])

ko_file = settings["global"]["large_network_knockout_file"].split()
kd_file = settings["global"]["large_network_knockdown_file"].split()
ts_file = settings["global"]["large_network_timeseries_file"].split()

wt_file = settings["global"]["large_network_wildtype_file"].split()

# Read data into program
# Where the format is "FILENAME" "DATATYPE"
knockout_storage = ReadData(ko_file[0], "knockout")
knockdown_storage = ReadData(kd_file[0], "knockdown")
timeseries_storage = ReadData(ts_file[0], "timeseries")
wildtype_storage = ReadData(wt_file[0], "wildtype")
wildtype_storage.combine(knockout_storage)
wildtype_storage.combine(knockdown_storage)
wildtype_storage.combine(timeseries_storage)



# Setup job manager
jobman = JobManager(settings)

# Make BANJO jobs
mczjob = MCZ()
mczjob.setup(knockout_storage, wildtype_storage, settings, None, "mcz-test-run-1")
jobman.queueJob(mczjob)

print jobman.queue
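# --- Added sketch (method names taken from the other examples in this file) ---
# The excerpt stops after queueing the MCZ job; executing the queue and pulling
# the finished job would follow the same pattern used elsewhere here.
jobman.runQueue()
jobman.waitToClear()
mcz_result = jobman.finished[0]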
Exemplo n.º 57
0
'''
Created on Jun 25, 2012

@author: yyb
'''

import os
import ReadData
from copy import deepcopy
from datetime import datetime


ReadData.readItemInfo()
ReadData.readUserInfo()


myUsrInfo = ReadData.usrInfo
myItmInfo = ReadData.itmInfo


#===============================================================================
# Defining my own comparison function for sorting filenames in filelist
#===============================================================================
def compare(s1, s2):
    a = int(s1.split('.')[0])
    b = int(s2.split('.')[0])
    return cmp(a, b)
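# --- Added note (sketch) ---
# cmp() and the cmp= argument to sort() exist only in Python 2. Under Python 3
# the same numeric-filename ordering can be obtained with a key function; a
# self-contained example with a throwaway list:
filenames_demo = ['10.dat', '2.dat', '1.dat']
filenames_demo.sort(key=lambda s: int(s.split('.')[0]))
# filenames_demo is now ['1.dat', '2.dat', '10.dat']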


#===============================================================================
# A function for writing given data into GraphML format 
                t.append(gene1)
    tfs[name] = list(set(t))

for key in goldnets.keys():
    goldnet = Network()
    goldnet.read_goldstd(goldnets[key])
    goldnets[key] = goldnet



genie3nets = {}
for i in range(20):
    for name in data.keys():
        ts_storage = data[name]
        settings["global"]["time_series_delta_t"] = (1008.0 / (len(ts_storage[0].experiments)-1))
        combined = ReadData(exp_data_directory + '/' + name + '/' + timeseries_filename, "timeseries")[0]

        for ts in timeseries_as_steady_state[name][1:11]:
            combined.combine(ts)
        #combined.combine(knockouts[name])
        combined.combine(multifactorials[name])

        genie3job = GENIE3()
        genie3job.setup(combined, settings, "Genie3_TimeSeries_{0}_{1}".format(name, i))
        jobman.queueJob(genie3job)
        genie3nets[name] = genie3job
        genie3job.goldnet = goldnets[name]


jobman.runQueue()
jobman.waitToClear()
Exemplo n.º 59
0
        if line[k]!=' ':
          entry += line[k]
        else:
          params += [entry]
          entry=''
      beta[i].append(float(params[3]))
      nPart = float(params[0])
      nD = float(params[1])
      #interaction = float(params[11])

      fileExtension = inputLine.replace(' ','-')
      fileExtension = fileExtension.replace('\n','')
      scalarFilePath = "data/traces/scalarTrace-" + fileExtension + ".dat"    
      scalarFilePath = "data/traces/Energy-7-3-32-2.7366-7200-1-1-1-0.dat"
      print "\nReading data from " + scalarFilePath + "."
      (myArray, myArrayHeadings) = ReadData.loadAscii(scalarFilePath)
      scalarData[i].append(CalcStatistics.getAndOutputStats(myArray, myArrayHeadings))
      #Plotting.makePlots(myArray, myArrayHeadings, fileExtension)

print "\nSorting Data..."
# Rotate Data into Columns
col = []
for i in range(0, len(scalarData)):
  # For every input file
  col.append([])
  for j in range(0, len(scalarData[i][0])):
    # For every observable
    col[i].append([])
    for k in range(0, len(scalarData[i][0][0])):
      # For every statistic
      col[i][j].append([])