Example #1
0
def main(args):
    database = parse_database(args.database)
    probe = np.load(args.probe)

    os.makedirs(args.output, exist_ok=True)
    copy_video(video_id_from_name(args.probe), args.videos,
               os.path.join(args.output, 'probe.avi'))

    start = timer()
    k_naive = top_k_naive(database, probe, args.k)
    end = timer()

    os.makedirs(os.path.join(args.output, 'naive'), exist_ok=True)
    with open(os.path.join(args.output, 'naive', 'results.txt'), 'w') as f:
        for i, v in enumerate(k_naive):
            d = np.linalg.norm(v[0] - probe)
            f.write('Video ID: {}, Distance: {}\n'.format(v[1], d))

            copy_video(v[1], args.videos,
                       os.path.join(args.output, 'naive', '{}.avi'.format(i)))

        f.write('Time: {} s\n'.format(end - start))
        f.write('DB Memory: {}\n'.format(
            human_size(sys.getsizeof(pickle.dumps(database)))))

    for basis_size in args.lsh:
        basis = lsh.generate_basis(basis_size, database[0][0].shape[0])
        probe_lsh = lsh.lsh(probe, basis)
        database_lsh = lsh_database(database, basis)

        start = timer()
        k_l = top_k_lsh(database_lsh, probe_lsh, args.k)
        end = timer()

        result_dir = 'lsh_{}'.format(basis_size)
        os.makedirs(os.path.join(args.output, result_dir), exist_ok=True)

        with open(os.path.join(args.output, result_dir, 'results.txt'),
                  'w') as f:
            for i, v in enumerate(k_l):
                d = lsh.hamming_distance(v[0], probe_lsh)
                f.write('Video ID: {}, Distance: {}\n'.format(v[1], d))

                copy_video(
                    v[1], args.videos,
                    os.path.join(args.output, result_dir, '{}.avi'.format(i)))

            f.write('Time: {} s\n'.format(end - start))
            f.write('DB Memory: {}\n'.format(
                human_size(sys.getsizeof(pickle.dumps(database_lsh)))))
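The lsh helpers this example relies on are not shown. Going only by the call signatures above (generate_basis(size, dim), lsh(vector, basis), hamming_distance(a, b)), a minimal random-hyperplane sketch could look like the following; the names and shapes are assumptions, not the project's actual implementation:

import numpy as np

def generate_basis(basis_size, dim, seed=None):
    # basis_size random hyperplanes in the original feature space
    rng = np.random.default_rng(seed)
    return rng.standard_normal((basis_size, dim))

def lsh(vector, basis):
    # the sign of each projection yields one bit of the binary signature
    return (basis @ vector) >= 0

def hamming_distance(a, b):
    # count of differing bits between two signatures
    return int(np.count_nonzero(a != b))

Under that reading, top_k_lsh ranks database entries by Hamming distance between signatures rather than the full Euclidean distance used in the naive path, which is what the timing and memory figures written to the results files compare.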
Example #2
0
	def __init__(self):
		self.train_dataframe = pd.read_csv('data/training.csv', header=0) 
		self.test_dataframe = pd.read_csv('data/test.csv', header=0)
		self.test_dataframe_refId = self.test_dataframe['RefId']
		
		self.preprocess_data()
		
		# Initialise classifiers
		self.attributes = list(self.train_dataframe.columns.values)[1:]					
		self.lsh_neighbours = 2000
		self.initialise_knn()
		self.initialise_pca()
		self.initialise_svm()
		self.initialise_nn()
		
		self.lsh = lsh.lsh(self.train_dataframe)
Example #3
0
    def __init__(self):
        self.train_dataframe = pd.read_csv("data/training.csv", header=0)
        self.test_dataframe = pd.read_csv("data/test.csv", header=0)
        self.test_dataframe_refId = self.test_dataframe["RefId"]

        self.preprocess_data()

        # Initialise classifiers
        self.attributes = list(self.train_dataframe.columns.values)[1:]
        self.lsh_neighbours = 2000
        self.initialise_knn()
        self.initialise_pca()
        self.initialise_svm()
        self.initialise_nn()

        self.lsh = lsh.lsh(self.train_dataframe)
Example #4
0
def compute_hashes(domains,
                   n,
                   num_perms=32,
                   max_items=100,
                   hash_function=lsh.md5hash):
    # domains is a dict of domain objects, keyed by domain name.

    # Create the LSH index.
    hashes = lsh.lsh(num_perms, hash_function)

    # Compute the MinHash digests.
    for dom in domains:
        dg = hashes.digest(domains[dom].ngrams[n])
        domains[dom].digest = dg
        hashes.insert(dom, dg)

    return hashes
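In this example lsh.lsh(num_perms, hash_function) behaves like a MinHash index: digest() turns a domain's n-gram set into a signature and insert() stores it under the domain name. A rough, self-contained sketch of what such a digest might compute follows; the helper names are illustrative only, not the module's real API:

import hashlib

def seeded_hash(value, seed):
    # hash an n-gram together with a per-permutation seed
    # (illustrative, not necessarily how lsh.md5hash works)
    data = "{}:{}".format(seed, value).encode("utf-8")
    return int.from_bytes(hashlib.md5(data).digest()[:8], "big")

def minhash_digest(ngrams, num_perms=32):
    # MinHash signature: the minimum hash value per simulated permutation
    return [min(seeded_hash(g, seed) for g in ngrams)
            for seed in range(num_perms)]

Two domains whose n-gram sets overlap heavily will agree on many signature positions, which is what makes the index useful for near-duplicate lookup.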
Example #5
0
from org.apache.lucene.search.highlight import QueryScorer
import datetime
import parser
import web
import jieba
import lsh
import extract_faces
#import extract_features
from flask import Flask, redirect, render_template, request, url_for

app = Flask(__name__,static_url_path='/static')
search_result = []  # store search results
dic1=dict()
dic2=dict()

LSH_1=lsh.lsh('features')
LSH_1.generate_hash()
LSH_2=lsh.lsh('faces_features')
LSH_2.generate_hash()

########################
# Keyword-based search #
########################
def search(keyword):
    STORE_DIR = "sports_index"
    vm_env.attachCurrentThread()
    directory=SimpleFSDirectory(File(STORE_DIR).toPath())
    searcher=IndexSearcher(DirectoryReader.open(directory))
    analyzer=StandardAnalyzer()

    res_cnt,res_list=get_res(searcher,analyzer,keyword)
Example #6
0
def lsh_database(database, basis):
    return [(lsh.lsh(r, basis), v) for r, v in database]
Example #7
0
def twitter():
    if int((sklearn.__version__).split(".")[1]) < 18:
        from sklearn.cross_validation import train_test_split
    else:
        from sklearn.model_selection import train_test_split
    #Twitter Data Set

    ip_file_path = 'data/twitter/'
    ip_file_name = 'twitter.txt'
    out = 'twitter'
    ip_label_file_name = 'twitter_label.txt'
    test_file_name = 'twitter_test.txt'
    test_label_file_name = 'twitter_test_label.txt'
    vocabulary = {}
    input_file = open(ip_file_path + ip_file_name, 'r+')
    file_lines = input_file.readlines()
    word_matrix = []
    for line in file_lines:
        line = line.strip()
        sentence = line.split(" ")
        word_matrix.append(sentence)
    label_matrix = np.genfromtxt(ip_file_path + ip_label_file_name,
                                 delimiter=' ')
    (train_data, test_data, train_labels,
     test_labels) = train_test_split(word_matrix,
                                     label_matrix,
                                     test_size=0.20,
                                     random_state=42)

    #prior_dict = prior_density(train_labels)
    for sentence in train_data:
        for word in sentence:
            if (word not in vocabulary):
                vocabulary[word] = 1
            else:
                vocabulary[word] = vocabulary[word] + 1
    #orderedDictionary = collections.OrderedDict(sorted(vocabulary.items(), key=lambda x: x[1]))
    updated_voc = {k: v for k, v in vocabulary.items() if v > 3}
    data_matrix = np.zeros((len(train_data), len(updated_voc)))
    test_matrix = np.zeros((len(test_data), len(updated_voc)))
    i = 0
    for sentence in train_data:
        for word in sentence:
            if (word not in updated_voc):
                continue
            index = list(updated_voc.keys()).index(word)
            data_matrix[i][index] = data_matrix[i][index] + 1
        i = i + 1
    i = 0
    for sentence in test_data:
        for word in sentence:
            if (word not in updated_voc):
                continue
            index = list(updated_voc.keys()).index(word)
            test_matrix[i][index] = test_matrix[i][index] + 1
        i = i + 1

    class_partition_set = class_partition(data_matrix, train_labels)
    prior_prob_set = {}
    for class_value, features in class_partition_set.items():
        prior_prob_set[class_value] = len(features) / len(train_data)

    #Bayes On Twitter
    class_densities_words = {}
    for class_value, features in class_partition_set.items():
        voc_size = len(features[0])
        matrix = np.matrix(features)
        sum_elements = np.sum(matrix)
        deno = voc_size + sum_elements
        c_sum = matrix.sum(axis=0).tolist()
        densities = []
        for i in range(0, len(c_sum[0])):
            num = c_sum[0][i] + 1
            prob = (num) / deno
            densities.append(prob)
        class_densities_words[class_value] = densities
    predictions = []
    accuracy = 0
    for i in range(len(test_matrix)):
        test_vector = np.zeros((len(updated_voc)))
        #testVector = list(testData[0])
        for word in test_data[i]:
            if (word not in updated_voc):
                continue
            index = list(updated_voc.keys()).index(word)
            test_vector[index] += 1
        probabilities = {}
        for class_value, densities in class_densities_words.items():
            prob = prior_prob_set[class_value]
            for j in range(0, len(test_vector)):
                if (test_vector[j] != 0):
                    if (test_vector[j] == 1):
                        prob = prob * densities[j]
                    else:
                        prob = prob * np.power(densities[j], test_vector[j])
            probabilities[class_value] = prob
        predictions.append(max(probabilities, key=probabilities.get))
    for x in range(len(test_data)):
        if test_labels[x] == predictions[x]:
            accuracy += 1
    accuracy_mycode = (accuracy / len(test_data)) * 100.0
    print("Accuracy without using library ", accuracy_mycode)

    lsh.lsh(data_matrix, train_labels, test_matrix, test_labels)
    nn.nearest_neighbour(data_matrix, train_labels, test_matrix, test_labels)
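One caveat in the class-probability loop above: multiplying many per-word densities directly can underflow to 0.0 for long documents. A common remedy (not part of the original code) is to score classes in log space; a small sketch, assuming the same test_vector, prior, and densities structures as above:

import numpy as np

def log_posterior(test_vector, prior, densities):
    # log prior plus the count-weighted sum of log word densities
    logp = np.log(prior)
    for idx, count in enumerate(test_vector):
        if count != 0:
            logp += count * np.log(densities[idx])
    return logp

Taking the argmax of log_posterior over the classes gives the same prediction as the product form while staying numerically stable.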
Example #8
0
def main(csv_txt, write_flag, path, label_path):
    # Read from csv file
    if csv_txt == 0:
        input_data, input_label = read_csv_file(path, label_path, 0)
        #input_label = read_csv_file(label_path,0)
    if csv_txt == 1:
        input_data, input_label = read_csv_file(path, label_path, 1)

    split_ratio = 0.80

    input_train, input_train_label, input_test, input_test_label = split_input_data(
        input_data, input_label, split_ratio)
    k = 3

    number_hash_function = 1
    #print len(input_data)
    #print len(input_data[0])
    orginal_dim = len(input_data[0])
    #k = int(orginal_dim/3)
    new_dim = 5
    hash_tables, all_hash_functions = lsh.lsh(number_hash_function,
                                              orginal_dim, new_dim,
                                              input_train, input_train_label)

    input_test = np.asarray(input_test)

    #print input_test.shape
    #print all_hash_functions[0].shape

    projected_test_data = np.dot(input_test, all_hash_functions[0])

    k = 3
    #print projected_test_data.shape
    predict_label = []
    for row in range(len(projected_test_data)):
        test_vector = projected_test_data[row]
        all_neighbour = np.asarray(
            lsh.get_all_localiy_sensetive_elements(hash_tables, test_vector))

        #print type(all_neighbour)
        if len(all_neighbour.shape) == 0:
            #print "NO NEIGHBOUR"
            cur_class = 0
        else:
            input_train = all_neighbour[:, 0:-1]
            input_train_label = all_neighbour[:, -1]

            if len(input_train_label) >= k:
                top_k_neighbour = nn.get_top_k_neighbours(
                    k, input_train, test_vector, input_train_label)

                cur_class = nn.find_class(np.atleast_2d(top_k_neighbour))
            else:
                #print 'less than k'
                cur_class = input_train_label[0]

        predict_label.append(cur_class)

    test_accuracy, macro, micro, score = nn.calculate_test_accuracy(
        input_test_label, predict_label)
    #test_accuracy= calculate_test_accuracy(true_label,predict_label)

    #print ('Test Accuracy :: {0}% for k = {1}').format(test_accuracy,k)
    print(
        'Test Accuracy          :: {0} %\n\nTest Macro F1-score    :: {1}\n\nTest Micro F1-score    :: {2}\n\nTest weighted F1 score :: {3}'
        .format(test_accuracy, macro * 100, micro * 100, score * 100))
Example #9
0
File: main.py Project: POOSARLADIVAKAR/LSH
def main():
    """
        This functions is main function in your folder which should run first
        calls other functions and calculate time taken to execute for evry function call and serializing all Dataframes obtained

        Parametrs
        ---------
        similar_docs: Python Dictionary()
            Dictionary of similar documents to Query Document
        query: string
            string representig query document's file name
        jscore : float
            User threshold for jaccard similarity score between two documents 

        
        Returns
        -------
        Precision_count : int
        Returns number of documents from input similar_docs dictionary that have higher jaccard score than User threshold
    """

    shingle_length = 4  #set shingle length
    if os.path.exists("./shingle_pickle4.py") == False:

        time_start = time.time()
        preprocess.preprocess(
            './temp')  # time the preprocessing of the files
        time_end = time.time()
        print(time_end - time_start)  # 0.01327657699584961

    if os.path.exists("./shingle_pickle4.py") == False:

        time_start = time.time()
        shingleDf = shingle.get_shingles(
            shingle_length)  # time the shingling step
        time_end = time.time()
        print(time_end - time_start)  # 207.01369958496123463
        # shingleDf.to_pickle("./shingle_pickle4.py")
        # shingleDf.to_pickle("./shingle_pickle.py")
        # un_pickle_df=pd.read_pickle("./shingle_pickle.py")

    if os.path.exists("./sig_nat.pickle") == False:

        time_start = time.time()
        signatureDf = minhashing.generate_signature_matrix(
            shingleDf, 100)  # Time calculation for min hashing
        time_end = time.time()
        print(time_end - time_start)  # 2041.65769958496173297

        # signatureDf.to_pickle("./signature_pickle4shingles.py")
        # un_pickle_df=pd.read_pickle("./signature_pickle.py")

    output0 = "Please Enter document number : "  # taking user input query and threshold jaccard score
    print(output0)
    query = int(input())
    output1 = "Please Enter threshold jscore: "
    print(output1)
    sig_matrix = pd.read_pickle("./sig_matc_4shigles.pickle")
    jscore = float(input())
    for i in range(query, query + 1):
        similar_documents = lsh.lsh(sig_matrix, sig_matrix.columns[i],
                                    100)  # applying Lsh on signature matrix
        print(similar_documents)
        if (len(similar_documents) == 0):
            print("No similar Documents found")
        else:
            p_count = lsh.precision(
                similar_documents, sig_matrix.columns[i],
                jscore=jscore)  # precision for the retrieved documents
            print("precision is = " + str(p_count / len(similar_documents)))
            r_count = lsh.recall(
                sig_matrix.columns, sig_matrix.columns[i],
                jscore=jscore)  # recall for the retrieved documents
            print("recall is = " + str(p_count / r_count))
Example #10
0
    if len(sys.argv) > 1:
        threshold = float(sys.argv[1])
    (row, b) = findLSHParameters(numHashFunctions, threshold)
    
    # Get the data
    documents = []
    unProcessedDocuments = readDataJson()
    for i in range(0, len(unProcessedDocuments)):
        documents.append(constructShingles(unProcessedDocuments[i][1], shingleLength))
    
    # Generate signature matrix
    print(len(documents))
    sm = minHash(documents, numHashFunctions)
    
    # Get candidate pairs
    candidatePairs = findCandidatePairs(lsh(sm, b, row))
    
    # Get similar pairs by comparing all signatures
    similarDocuments = findSimilarDocumentsCompareToOne(sm, threshold, unProcessedDocuments)

    printElements = similarDocuments
    printElements.sort(key=takeThird, reverse=True) 

    for doc in printElements:
        print(doc)

    print(len(similarDocuments))
    allSimilarDocuments = []
    for doc in similarDocuments:
        docDic = {"id1": doc[0], "id2": doc[1], "similarity": doc[2]}
Example #11
0
        f.write("Bayes Test Accuracy " + str(accuracyBayes))
        f.write("\n")
        f.write("Bayes Test Macro F1-score " + str(f1_score_macroBayes))
        f.write("\n")
        f.write("Bayes Test Micro F1-score " + str(f1_score_microBayes))
        f.write("\n")
        f.write("\n")
        # endregion

        # region Task-VI:LSH Custom Code

        print("Custom LSH Classifier Statistcs:")
        f.write("Custom LSH Classifier Statistcs:")
        f.write("\n")
        f.write("\n")
        accuracyLSH, f1_score_macroLSH, f1_score_microLSH = lsh.lsh(
            trainData, trainLabels, testData, testLabels)
        print("LSH Test Accuracy ", str(accuracyLSH))
        print("LSH Test Macro F1-score ", str(f1_score_macroLSH))
        print("LSH Test Micro F1-score ", str(f1_score_microLSH))
        f.write("Original Dimension :" + str(noOfColumns))
        f.write("\n")
        f.write("LSH Test Accuracy " + str(accuracyLSH))
        f.write("\n")
        f.write("LSH Test Macro F1-score " + str(f1_score_macroLSH))
        f.write("\n")
        f.write("LSH Test Micro F1-score " + str(f1_score_microLSH))
        f.write("\n")
        f.write("\n")

        # endregion
Example #12
0
def twitter(testMatrix, testlabelMatrix):
    vocabulary = {}
    Accuracy = {}
    F1ScoreMacro = {}
    F1ScoreMicro = {}
    # region handle older versions of sklearn
    if int((sklearn.__version__).split(".")[1]) < 18:
        from sklearn.cross_validation import train_test_split
    # otherwise we're using at least version 0.18
    else:
        from sklearn.model_selection import train_test_split
    # endregion
    # region Twitter Data Set

    outputFileName = "Twitter_stat.txt"
    f = open(outputFileName, "w")
    print("Processing Twitter DataSet.......")
    f.write("Processing Twitter DataSet.......")
    f.write("\n")

    # region Input File Information
    inputFilePath = '../data/twitter/'
    inputFileName = 'twitter.txt'
    out = 'twitter'
    inputLabelFileName = 'twitter_label.txt'
    labelMatrix = np.genfromtxt(inputFilePath + inputLabelFileName,
                                delimiter=' ')
    # endregion

    # region Generate Word Matrix from txt file

    inputFile = open(inputFilePath + inputFileName, 'r+')
    fileLines = inputFile.readlines()
    wordMatrix = []
    for line in fileLines:
        line = line.strip()
        sentence = line.split(" ")
        wordMatrix.append(sentence)
    # endregion

    # region Original Dimension Analysis KNN/Bayes/LSH

    trainData = wordMatrix
    trainLabels = labelMatrix
    testData = testMatrix
    testLabels = testlabelMatrix

    # region Generate Vocabulary Dictionary
    for sentence in trainData:
        for word in sentence:
            if (word not in vocabulary):
                vocabulary[word] = 1
            else:
                vocabulary[word] = vocabulary[word] + 1
    updatedVoc = {k: v for k, v in vocabulary.items() if v > 2}
    # endregion

    # region Convert Word Matrix into MultiNomial Form
    dataMatrix = np.zeros((len(trainData), len(updatedVoc)))
    testMatrix = np.zeros((len(testData), len(updatedVoc)))
    i = 0
    for sentence in trainData:
        for word in sentence:
            if (word not in updatedVoc):
                continue
            index = list(updatedVoc.keys()).index(word)
            dataMatrix[i][index] = dataMatrix[i][index] + 1
        i = i + 1
    i = 0
    for sentence in testData:
        for word in sentence:
            if (word not in updatedVoc):
                continue
            index = list(updatedVoc.keys()).index(word)
            testMatrix[i][index] = testMatrix[i][index] + 1
        i = i + 1
    # endregion

    noOfColumns = len(dataMatrix[0])

    # region K-Nearest Neighbour Classifier
    print("Custom K-NN Classifier Statistcs:")
    print("K-Value Taken:5")
    f.write("Custom K-NN Classifier Statistcs:")
    f.write("\n")
    f.write("K-Value Taken:5")
    f.write("\n")
    accuracyNN, f1_score_macroNN, f1_score_microNN = nn.nearestNeighbour(
        dataMatrix, trainLabels, testMatrix, testLabels)
    print("KNN Test Accuracy ", str(accuracyNN))
    print("KNN Test Macro F1-score ", str(f1_score_macroNN))
    print("KNN Test Micro F1-score ", str(f1_score_microNN))
    f.write("\n")
    f.write("KNN Test Accuracy " + str(accuracyNN))
    f.write("\n")
    f.write("KNN Test Macro F1-score " + str(f1_score_macroNN))
    f.write("\n")
    f.write("KNN Test Micro F1-score " + str(f1_score_microNN))
    f.write("\n")
    f.write("\n")
    # endregion

    # region Bayes Classifier
    print("Custom Bayes Classifier Statistcs:")
    f.write("Custom Bayes Classifier Statistcs:")
    f.write("\n")
    accuracyBayes, f1_score_macroBayes, f1_score_microBayes = naiveBayesMultiNomial(
        dataMatrix, trainLabels, testMatrix, testLabels, updatedVoc)
    print("Bayes Test Accuracy ", str(accuracyBayes))
    print("Bayes Test Macro F1-score ", str(f1_score_macroBayes))
    print("Bayes Test Micro F1-score ", str(f1_score_microBayes))
    f.write("Original Dimension :" + str(noOfColumns))
    f.write("\n")
    f.write("Bayes Test Accuracy " + str(accuracyBayes))
    f.write("\n")
    f.write("Bayes Test Macro F1-score " + str(f1_score_macroBayes))
    f.write("\n")
    f.write("Bayes Test Micro F1-score " + str(f1_score_microBayes))
    f.write("\n")
    f.write("\n")
    # endregion

    # region LSH Custom Code
    print("Custom LSH Classifier Statistcs:")
    f.write("Custom LSH Classifier Statistcs:")
    f.write("\n")
    f.write("\n")
    accuracyLSH, f1_score_macroLSH, f1_score_microLSH = lsh.lsh(
        dataMatrix, trainLabels, testMatrix, testLabels)
    print("LSH Test Accuracy ", str(accuracyLSH))
    print("LSH Test Macro F1-score ", str(f1_score_macroLSH))
    print("LSH Test Micro F1-score ", str(f1_score_microLSH))
    f.write("Original Dimension :" + str(noOfColumns))
    f.write("\n")
    f.write("LSH Test Accuracy " + str(accuracyLSH))
    f.write("\n")
    f.write("LSH Test Macro F1-score " + str(f1_score_macroLSH))
    f.write("\n")
    f.write("LSH Test Micro F1-score " + str(f1_score_microLSH))
    f.write("\n")
    f.write("\n")
    # endregion

    # endregion

    # region Random Projection
    '''
    K = 2
    while (K <= int(noOfColumns / 2)):

        Accuracy[K] = 0
        F1ScoreMacro[K] = 0
        F1ScoreMicro[K] = 0
        randomDataMatrix,randomTestMatrix = projections.randomProjectionTwitter(dataMatrix,testMatrix, noOfColumns, K, inputFilePath, out)
        #randomTestMatrix = projections.randomProjection(testMatrix, noOfColumns, K, inputFilePath, out)

        #accuracyOfMyCode, f1_score_macro, f1_score_micro = nn.nearestNeighbour(randomDataMatrix, trainLabels,randomTestMatrix,testLabels)

        accuracyOfMyCode, f1_score_macro, f1_score_micro = naiveBayesMultiNomial(randomDataMatrix, trainLabels,randomTestMatrix, testLabels,updatedVoc)

        print("Reduced Dimension :", K)
        f.write("Reduced Dimension :" + str(K))
        f.write("\n")
        print("Test Accuracy ", str(accuracyOfMyCode))
        print("Test Macro F1-score ", str(f1_score_macro))
        print("Test Micro F1-score ", str(f1_score_micro))
        f.write("Test Accuracy " + str(accuracyOfMyCode))
        f.write("\n")
        f.write("Test Macro F1-score " + str(f1_score_macro))
        f.write("\n")
        f.write("Test Micro F1-score " + str(f1_score_micro))
        f.write("\n")
        f.write("\n")
        Accuracy[K] = accuracyOfMyCode
        F1ScoreMacro[K] = f1_score_macro
        F1ScoreMicro[K] = f1_score_micro
        K = K * 2
        '''
    # endregion

    # region Plot and Save Data
    '''
    index = []
    accuracyList = []
    for k, accuracy in Accuracy.items():
        index.append(k)
        accuracyList.append(accuracy)
    #title = "Accuracy:Different D values Of custom K-NN for Twitter Data Set."
    #figname = "custom_K-NN_Accuracy_Twitter.png"
    #title = "Accuracy:Different D values Of Sklearn K-NN for Twitter Data Set."
    #figname = "Sklearn_K-NN_Accuracy_Twitter.png"
    #title = "Accuracy: Different D values Of custom Bayes for Twitter Data Set."
    #figname="custom_Bayes_Accuracy_Twitter.png"
    # title = "Accuracy: Different D values Of Sklearn Bayes for Twitter Data Set."
    # figname="Sklearn_Bayes_Accuracy_Twitter.png"
    plotData(index, accuracyList, 'D-Value', 'Accuracy', title, figname, np.arange(0, 100, step=10))

    index = []
    F1MacroList = []
    for k, f1_score_macro in F1ScoreMacro.items():
        index.append(k)
        F1MacroList.append(f1_score_macro)
    #title = "F1-Score(Macro):Different D values Of custom K-NN for Twitter Data Set."
    #figname = "custom_K-NN_F1-Score(Macro)_Twitter.png"
    #title = "F1-Score(Macro):Different D values Of Sklearn K-NN for Twitter Data Set."
    #figname = "Sklearn_K-NN_F1-Score(Macro)_Twitter.png"
    #title = "F1-Score(Macro): Different D values Of custom Bayes for Twitter Data Set."
    #figname = "custom_Bayes_F1-Score(Macro)_Twitter.png"
    # title = "F1-Score(Macro): Different D values Of Sklearn Bayes for Twitter Data Set."
    # figname = "Sklearn_Bayes_F1-Score(Macro)_Twitter.png"
    plotData(index, F1MacroList, 'D-Value', 'F1-Score(Macro)', title, figname, np.arange(0, 1, step=0.1))

    index = []
    F1MicroList = []
    for k, f1_score_micro in F1ScoreMicro.items():
        index.append(k)
        F1MicroList.append(f1_score_micro)
    #title = "F1-Score(Micro):Different D values Of custom K-NN for Twitter Data Set."
    #figname = "custom_K-NN_F1-Score(Micro)_Twitter.png"
    #title = "F1-Score(Micro):Different D values Of Sklearn K-NN for Twitter Data Set."
    #figname = "Sklearn_K-NN_F1-Score(Micro)_Twitter.png"
    #title = "F1-Score(Micro): Different D values Of custom Bayes for Twitter Data Set."
    #figname = "custom Bayes_F1-Score(Micro)_Twitter.png"
    # title = "F1-Score(Micro): Different D values Of Sklearn Bayes for Twitter Data Set."
    # figname = "Sklearn Bayes_F1-Score(Micro)_Twitter.png"
    plotData(index, F1MicroList, 'D-Value', 'F1-Score(Micro)', title, figname, np.arange(0, 1, step=0.1))
    '''
    # endregion

    # region Task-VII PCA Analysis:Twitter
    '''
    K = 2
    while (K <= int(noOfColumns / 2)):
        pca = PCA(n_components=K).fit(dataMatrix)
        data_reduce = pca.transform(dataMatrix)
        pca2 = PCA(n_components=K).fit(testMatrix)
        test_reduce = pca2.transform(testMatrix)
        #accuracyOfMyCode, f1_score_macro, f1_score_micro = nn.nearestNeighbour(data_reduce, trainLabels, test_reduce,testLabels)
        accuracyOfMyCode, f1_score_macro, f1_score_micro = naiveBayesMultiNomial(data_reduce, trainLabels, test_reduce,testLabels,updatedVoc)
        print("PCA Reduced Dimension :", K)
        print("Test Accuracy ", str(accuracyOfMyCode))
        print("Test Macro F1-score ", str(f1_score_macro))
        print("Test Micro F1-score ", str(f1_score_micro))
        f.write("Reduced Dimension :" + str(K))
        f.write("\n")
        f.write("Test Accuracy " + str(accuracyOfMyCode))
        f.write("\n")
        f.write("Test Macro F1-score " + str(f1_score_macro))
        f.write("\n")
        f.write("Test Micro F1-score " + str(f1_score_micro))
        f.write("\n")
        f.write("\n")
        Accuracy[K] = accuracyOfMyCode
        F1ScoreMacro[K] = f1_score_macro
        F1ScoreMicro[K] = f1_score_micro
        K = K * 2
    index = []
    accuracyList = []
    for k, accuracy in Accuracy.items():
        index.append(k)
        accuracyList.append(accuracy)
    #title = "Accuracy:Different D values Of PCA K-NN for Twitter Data Set."
    #figname = "PCA_K-NN_Accuracy_Twitter.png"
    title = "Accuracy: Different D values Of PCA Bayes for Twitter Data Set."
    figname = "PCA_Bayes_Accuracy_Twitter.png"
    plotData(index, accuracyList, 'D-Value', 'Accuracy', title, figname, np.arange(0, 100, step=10))

    index = []
    F1MacroList = []
    for k, f1_score_macro in F1ScoreMacro.items():
        index.append(k)
        F1MacroList.append(f1_score_macro)
    #title = "F1-Score(Macro):Different D values Of PCA K-NN for Twitter Data Set."
    #figname = "PCA_K-NN_F1-Score(Macro)_Twitter.png"
    title = "F1-Score(Macro): Different D values Of PCA Bayes for Twitter Data Set."
    figname = "PCA_Bayes_F1-Score(Macro)_Twitter.png"
    plotData(index, F1MacroList, 'D-Value', 'F1-Score(Macro)', title, figname, np.arange(0, 1, step=0.1))

    index = []
    F1MicroList = []
    for k, f1_score_micro in F1ScoreMicro.items():
        index.append(k)
        F1MicroList.append(f1_score_micro)
    #title = "F1-Score(Micro):Different D values Of PCA K-NN for Twitter Data Set."
    #figname = "PCA_K-NN_F1-Score(Micro)_Twitter.png"
    title = "F1-Score(Micro): Different D values Of PCA Bayes for Twitter Data Set."
    figname = "PCA Bayes_F1-Score(Micro)_Twitter.png"
    plotData(index, F1MicroList, 'D-Value', 'F1-Score(Micro)', title, figname, np.arange(0, 1, step=0.1))
    '''
    # endregion

    f.close()
Example #13
0
        #figname="task_3_Bayes_Accuracy_"+str(data_set_name.upper())+".png"
        #plot_data(index,accuracy_list,'Dimension','Accuracy',title,figname,np.arange(0,100,step=10))

        index = []
        F1MacroList = []
        for k, f1_score_macro in F1ScoreMacro.items():
            index.append(k)
            F1MacroList.append(f1_score_macro)
        title = "F1-Score(Macro) v/s Dimension for nearest neighbor classifier for " + str(data_set_name) + " Data Set."
        figname = "task_3_NN_F1-Score(Macro)_" + str(data_set_name) + ".png"
        #title = "F1-Score(Macro) v/s Dimension for Bayes classifier for " + str(data_set_name.upper()) + " Data Set."
        #figname = "task_3_Bayes_F1-Score(Macro)_" + str(data_set_name.upper()) + ".png"
        plot_data(index, F1MacroList, 'Dimension', 'F1-Score(Macro)', title, figname,np.arange(0,1,step=0.1))

        index = []
        F1MicroList = []
        for k, f1_score_micro in F1ScoreMicro.items():
            index.append(k)
            F1MicroList.append(f1_score_micro)
        title = "F1-Score(Micro) v/s Dimension for nearest neighbor classifier for " + str(data_set_name) + " Data Set."
        figname = "task_3_NN_F1-Score(Micro)_" + str(data_set_name) + ".png"
        #title = "F1-Score(Micro) v/s Dimension for Bayes classifier for " + str(data_set_name.upper()) + " Data Set."
        #figname = "task_3_Bayes_F1-Score(Micro)_" + str(data_set_name.upper()) + ".png"
        plot_data(index, F1MicroList, 'Dimension', 'F1-Score(Micro)', title, figname,np.arange(0,1,step=0.1))


        #prior_dict=prior_density(train_labels)
        #accuracy_mycode, f1_score_macro, f1_score_micro=nn.nearest_neighbour(train_data,train_labels,test_data,test_labels)
        lsh.lsh(train_data, train_labels, test_data, test_labels)
        #bayes.bayes_classifier(data_matrix,label_matrix)