Пример #1
0
def task2A():
    """Render the intrinsic-dimensionality (scree plot) view.

    Loads the original, randomly sampled and stratified-sampled data sets
    from ``sampling``, drops the 'A15' column from each (presumably the
    class label -- confirm against the data set), and computes the
    normalised eigenvalue spectrum of every data set.

    Returns:
        The rendered 'task.html' template with ``taskJS="task2a"``.  The
        ``data`` argument is a JSON string produced from a one-column
        DataFrame whose three cells are themselves JSON strings (original,
        random, stratified -- in that order), i.e. the payload is
        JSON-encoded twice.
    """

    def intrinsicDim(data):
        """Return the eigenvalue spectrum of *data* as a JSON string.

        The JSON describes a one-row frame with the columns "dimension"
        (1..n_features), "eigenValues" (each eigenvalue divided by the sum
        of all eigenvalues) and "cumulativeEigVals" (running sum of
        "eigenValues").
        """
        x = StandardScaler().fit_transform(data)
        # Plain ndarray product; np.asmatrix/np.matrix is not needed here.
        scatter = np.dot(x.T, x)
        # scatter is symmetric positive semi-definite, so its singular
        # values ARE its eigenvalues (sorted in descending order).  They
        # must not be squared again: squaring would report the fourth power
        # of the singular values of x instead of the variance shares.
        S = np.linalg.svd(scatter, compute_uv=False)
        eigVals = S / np.sum(S)

        # Derive the axis from the actual number of features instead of
        # hard-coding 14 dimensions.
        dimensions = np.arange(1, len(eigVals) + 1)
        cumulative = np.cumsum(eigVals).tolist()

        spectrum = pd.DataFrame({
            "dimension": [dimensions],
            "eigenValues": [eigVals],
            "cumulativeEigVals": [cumulative]
        })
        return spectrum.to_json()

    # Load every data set first (same order as before), then analyse them.
    frames = []
    for load in (sampling.originalData, sampling.randomSampling,
                 sampling.stratifiedSampling):
        frame = load()
        del frame['A15']
        frames.append(frame)

    data = [intrinsicDim(frame) for frame in frames]
    data = pd.DataFrame(data)
    data = data.to_json()
    return render_template('task.html', taskJS="task2a", data=data)
Пример #2
0
def task1B():
    """Render the k-means elbow view for the three data sets.

    For the original, randomly sampled and stratified-sampled data (each
    with the 'A15' column removed) a KMeans model is fitted for every
    cluster count from 1 to 9 and its ``inertia_`` is recorded.

    Returns:
        The rendered 'task.html' template with ``taskJS="task1b"``; ``data``
        is the JSON of a DataFrame with one row per data set (original,
        random, stratified) and one column per cluster count.
    """

    def kmeansElbow(data):
        """Map each cluster count in 1..9 to the inertia of its fitted model."""
        return {
            k: KMeans(n_clusters=k).fit(data).inertia_
            for k in range(1, 10)
        }

    # Load all three data sets up front, in the original order.
    frames = []
    for fetch in (sampling.originalData, sampling.randomSampling,
                  sampling.stratifiedSampling):
        frame = fetch()
        del frame['A15']
        frames.append(frame)

    elbowCurves = [kmeansElbow(frame) for frame in frames]
    payload = pd.DataFrame(elbowCurves).to_json()
    return render_template('task.html', taskJS="task1b", data=payload)
Пример #3
0
def task3A():
    """Render the 2-component PCA view for the three data sets.

    Each data set (original, random sample, stratified sample) is split
    into its 'A15' column and the remaining columns; the remaining columns
    are standardised and projected onto two principal components.

    Returns:
        The rendered 'task.html' template with ``taskJS="task3a"``; ``data``
        is the JSON of a DataFrame built from the list
        [projection, A15 column] repeated for original, random and
        stratified data, in that order.
    """

    def loadScaled(fetch):
        """Return (standardised features, 'A15' column) for one data set."""
        frame = fetch()
        labels = frame['A15']
        del frame['A15']
        return StandardScaler().fit_transform(frame), labels

    origX, origY = loadScaled(sampling.originalData)
    randX, randY = loadScaled(sampling.randomSampling)
    stratX, stratY = loadScaled(sampling.stratifiedSampling)

    # One PCA object is re-fitted for every data set.
    reducer = PCA(n_components=2)
    origProj = reducer.fit_transform(origX)
    randProj = reducer.fit_transform(randX)
    stratProj = reducer.fit_transform(stratX)

    # Debug output of the original-data projection (kept as in the source).
    print(origProj[:, :2])

    payload = pd.DataFrame(
        [origProj, origY, randProj, randY, stratProj, stratY]).to_json()
    return render_template('task.html', taskJS="task3a", data=payload)
Пример #4
0
def task3c():
    """Render the 3-component PCA view for the three data sets.

    Each data set (original, random sample, stratified sample) is split
    into its 'A15' column and the remaining columns; the remaining columns
    are standardised and projected onto three principal components.

    Returns:
        The rendered 'task3c.html' template with ``taskJS="task3c"``;
        ``data`` is the JSON of a one-column DataFrame whose three cells are
        JSON strings of the form ``{"values": [{"target", "PCA1", "PCA2",
        "PCA3"}, ...]}`` for original, random and stratified data.
    """

    def loadScaled(fetch):
        """Return (standardised features, 'A15' column) for one data set."""
        frame = fetch()
        labels = frame['A15']
        del frame['A15']
        return StandardScaler().fit_transform(frame), labels

    def return_dict_arr(data, yVal):
        """Pair every projected row with its target value.

        ``.tolist()`` converts NumPy scalars to native Python values:
        ``json.dumps`` raises TypeError for NumPy integer scalars, so
        integer-typed labels would otherwise break serialisation.  Float
        and string labels serialise exactly as before.
        """
        targets = np.array(yVal).tolist()
        rows = np.asarray(data).tolist()
        return [{
            "target": target,
            "PCA1": row[0],
            "PCA2": row[1],
            "PCA3": row[2]
        } for target, row in zip(targets, rows)]

    dataOriginal, dataOriginalY = loadScaled(sampling.originalData)
    dataRandom, dataRandomY = loadScaled(sampling.randomSampling)
    dataStrat, dataStratY = loadScaled(sampling.stratifiedSampling)

    # One PCA object is re-fitted for every data set.
    pca = PCA(n_components=3)

    originalPCA = {
        "values": return_dict_arr(pca.fit_transform(dataOriginal),
                                  dataOriginalY)
    }
    randomPCA = {
        "values": return_dict_arr(pca.fit_transform(dataRandom), dataRandomY)
    }
    stratPCA = {
        "values": return_dict_arr(pca.fit_transform(dataStrat), dataStratY)
    }

    data = [
        json.dumps(originalPCA),
        json.dumps(randomPCA),
        json.dumps(stratPCA)
    ]
    data = pd.DataFrame(data)
    data = data.to_json()
    return render_template('task3c.html', taskJS="task3c", data=data)
Пример #5
0
def task3B():
    """Render the 2-D MDS view for the three data sets.

    Each data set (original, random sample, stratified sample) is split
    into its 'A15' column and the remaining columns; the remaining columns
    are standardised.  A 2-component MDS embedding is then computed from
    precomputed pairwise distances, first with the euclidean metric for all
    three data sets and afterwards with the correlation metric.

    Returns:
        The rendered 'task.html' template with ``taskJS="task3b"``; ``data``
        is the JSON of a DataFrame built from the list
        [embedding, A15 column] for original, random and stratified data
        under the euclidean metric, followed by the same six entries under
        the correlation metric.
    """

    def loadScaled(fetch):
        """Return (standardised features, 'A15' column) for one data set."""
        frame = fetch()
        labels = frame['A15']
        del frame['A15']
        return StandardScaler().fit_transform(frame), labels

    # Loaded in the original order: original, random, stratified.
    datasets = [
        loadScaled(sampling.originalData),
        loadScaled(sampling.randomSampling),
        loadScaled(sampling.stratifiedSampling),
    ]

    # One MDS object is re-fitted for every distance matrix.
    embedder = manifold.MDS(n_components=2, dissimilarity='precomputed')

    payload = []
    for metric in ('euclidean', 'correlation'):
        for features, labels in datasets:
            distances = pairwise_distances(features, metric=metric)
            payload.append(embedder.fit_transform(distances))
            payload.append(labels)

    payload = pd.DataFrame(payload).to_json()
    return render_template('task.html', taskJS="task3b", data=payload)
Пример #6
0
def txt2tuple(with_sampling=True):
    """Convert every matching CSV file into word-tuple text files.

    All files matching "../data/csv/{FIELD}*.csv" are read through
    ``txt2list`` (tab-separated) and written out via ``write_tuple``.

    Args:
        with_sampling: When True (the default, and the only path the
            previous implementation could reach) records are sampled per
            year and one file per year is written to "../data/tuple/".
            When False every record is written to a single file under
            "../data/tuple/Full/".

    Raises:
        Exception: If no CSV file matches the pattern.
    """
    csv_list = glob.glob("../data/csv/{}*.csv".format(FIELD))
    if not csv_list:
        raise Exception("No matching file!")

    for count, input_file in enumerate(csv_list):
        # The part between FIELD and the ".csv" extension, minus its first
        # character (the separator following FIELD).  The dot is escaped
        # and the pattern anchored so only the real extension matches.
        file_name = re.search(r'{}(.+?)\.csv$'.format(FIELD),
                              input_file).group(1)[1:]

        start = time.time()
        print("Generating {} tuple ... ".format(file_name), flush=True)

        records = txt2list.txt2list(input_file).convert("\t")

        if with_sampling:
            print(count)
            _write_sampled_tuples(records, file_name)
        else:
            _write_full_tuples(records, file_name)

        print("finished in {:.2f} sec.".format(time.time() - start),
              flush=True)


def _segment_record(record):
    """Segment fields 0, 2 and 3 of one record with pynlpir.

    Going by the original variable names these are presumably the title,
    the keywords and the abstract -- confirm against the CSV layout.  Only
    the text after the last literal "<正>" marker of field 3 is segmented.
    """
    title_words = pynlpir.segment(record[0])
    keyword_words = pynlpir.segment(record[2])
    # Split on the literal marker "<正>".  A character class such as
    # [<正>] would also split on every single '<', '正' or '>' inside the
    # text and silently truncate it.
    abstract = re.split(r'<正>', record[3])[-1]
    abstract_words = pynlpir.segment(abstract)
    return title_words, keyword_words, abstract_words


def _write_sampled_tuples(records, file_name):
    """Write one tuple file per year from a per-year sample of *records*."""
    sampling_labels = np.arange(2013, 2018)
    # One [record index, publication year] pair per record.
    sampling_input = np.array(
        [[i, int(records[i][1])] for i in range(records.shape[0])])
    sampling_index = 0
    label_index = 1
    # The number of samples per year should be the same: 40 per label.
    sampling_type = 'rs'
    scale = 40 * len(sampling_labels)
    sampling_output = sampling.stratifiedSampling(
        sampling_input, sampling_labels, label_index, sampling_index,
        sampling_type, scale)

    for j, py in enumerate(sampling_labels):
        result = sampling_output[j]
        path = "../data/tuple/{}-{}-{}.txt".format(FIELD, file_name, str(py))
        # The context manager closes the file even if segmentation fails.
        with open(path, 'w+', encoding='utf-8') as output_file:
            output_file.write('bh,py,src,speech,word\n')
            for bh in result:
                TIss, KWs, ABss = _segment_record(records[bh])
                write_tuple(bh, py, TIss, 4, output_file, file_name)
                write_tuple(bh, py, KWs, 2, output_file, file_name)
                write_tuple(bh, py, ABss, 1, output_file, file_name)


def _write_full_tuples(records, file_name):
    """Write every record of *records* to a single tuple file."""
    path = "../data/tuple/Full/{}-{}.txt".format(FIELD, file_name)
    # The context manager closes the file even if segmentation fails.
    with open(path, 'w+', encoding='utf-8') as output_file:
        output_file.write('bh,py,src,speech,word' + '\n')
        for bh in range(records.shape[0]):
            record = records[bh]
            py = record[1]
            TIss, KWs, ABss = _segment_record(record)
            write_tuple(bh, py, TIss, 4, output_file)
            write_tuple(bh, py, KWs, 2, output_file)
            write_tuple(bh, py, ABss, 1, output_file)
Пример #7
0
def task1A():
    """Render the sampling overview with both sampled data sets.

    Returns:
        The rendered 'task.html' template with ``taskJS="task1a"``; ``data``
        is a dict mapping "stratifiedSampling" and "randomSampling" to the
        JSON string of the respective sample.
    """
    # Dict values are evaluated top to bottom, so the stratified sample is
    # still drawn before the random one.
    payload = {
        "stratifiedSampling": sampling.stratifiedSampling().to_json(),
        "randomSampling": sampling.randomSampling().to_json(),
    }
    return render_template('task.html', taskJS="task1a", data=payload)