Example #1
    def init_command_line_config(self):
        print "using command line argparser"
        from argparser import argdict
        conf = argdict
        self.pageLimit = conf["page_limit"]
        self.linkLimit = conf["link_limit"]
        self.relevantThreshold = conf["relevancy_threshold"]

        classifierString = conf["classifier"]
        self.classifier = None
        if "NB" in classifierString.upper():
            self.classifier = NaiveBayesClassifier()
        elif "SVM" in classifierString.upper():
            self.classifier = SVMClassifier()

        seeds = conf["seeds"]
        self.seedUrls = []
        urlsPerSeed = 10
        for keyword in seeds:
            if keyword is not None and keyword != "":
                if "http" in keyword:
                    self.seedUrls.append(keyword)
                else:
                    seedUrlGenerator = google.search(keyword)
                    searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                    self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))
            else:
                raise Exception("Seed is not valid: (" + str(keyword) + ") -- it must be a keyword or URL")

        print "seed urls: "
        print self.seedUrls

        self.blacklistDomains = conf["blacklist_domains"]
        self.labeled = {}

        self.labeled["relevantUrls"] = conf["relevant_urls"]
        for url in self.labeled["relevantUrls"]:
            if url is None or url == "":
                raise Exception("Relevant URL is not valid: (" + str(url) + ")")

        self.labeled["irrelevantUrls"] = conf["relevant_urls"]
        for url in self.labeled["irrelevantUrls"]:
            if url is None or url == "":
                raise Exception("Irrelevant URL is not valid: (" + str(url) + ")")

        self.vsm = {
            "on": conf["vsm"],
            "filterModel": conf["vsm_filter"],
            "minRepositoryDocNum": conf["min_repo_doc_num"],
            "filterIrrelevantThreshold": conf["irrelevancy_threshold"],
            "filterRelevantThreshold": conf["relevancy_threshold"]
        }
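
The project-specific argparser module imported above is not shown; it only needs to expose an argdict dictionary with the keys read here. A minimal sketch of such a module on top of the standard argparse library (every flag name and default below is an assumption inferred from the keys used above):

# argparser.py -- hypothetical sketch, not the original module
import argparse

_parser = argparse.ArgumentParser(description="Focused crawler options")
_parser.add_argument("--page-limit", dest="page_limit", type=int, default=100)
_parser.add_argument("--link-limit", dest="link_limit", type=int, default=1000)
_parser.add_argument("--relevancy-threshold", dest="relevancy_threshold", type=float, default=0.5)
_parser.add_argument("--classifier", default="NB")
_parser.add_argument("--seeds", nargs="+", default=[])
_parser.add_argument("--blacklist-domains", dest="blacklist_domains", nargs="*", default=[])
_parser.add_argument("--relevant-urls", dest="relevant_urls", nargs="*", default=[])
_parser.add_argument("--irrelevant-urls", dest="irrelevant_urls", nargs="*", default=[])
_parser.add_argument("--vsm", action="store_true")
_parser.add_argument("--vsm-filter", dest="vsm_filter", default="tf-idf")
_parser.add_argument("--min-repo-doc-num", dest="min_repo_doc_num", type=int, default=10)
_parser.add_argument("--irrelevancy-threshold", dest="irrelevancy_threshold", type=float, default=0.1)

# vars() turns the parsed Namespace into the plain dict the method indexes into
argdict = vars(_parser.parse_args())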
Example #2
    def init_config_file_config(self):
        print "using config file"
        conf = Config("config.ini")
        self.pageLimit = conf["pageLimit"]
        self.linkLimit = conf["linkLimit"]
        self.relevantThreshold = conf["relevantThreshold"]

        classifierString = conf["classifier"]
        self.classifier = None
        if "NB" in classifierString.upper():
            self.classifier = NaiveBayesClassifier()
        elif "SVM" in classifierString.upper():
            self.classifier = SVMClassifier()

        seedKeywords = linesFromFile(conf["seedFile"])
        self.seedUrls = []
        urlsPerSeed = 10
        for keyword in seedKeywords:
            if "http" in keyword:
                self.seedUrls.append(keyword)
            else:
                seedUrlGenerator = google.search(keyword)
                searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))

        print "seed urls: "
        print self.seedUrls

        self.blacklistDomains = linesFromFile(conf["blacklistFile"])

        self.trainingDocsPath = conf["trainingDocs"]
        self.trainingDocsPath = os.path.abspath(self.trainingDocsPath)
        self.labeled = {}
        self.labeled["relevantPath"] = os.path.join(self.trainingDocsPath, "relevant.txt");
        self.labeled["irrelevantPath"] = os.path.join(self.trainingDocsPath, "irrelevant.txt");
        self.labeled["relevantUrls"] = linesFromFile(self.labeled["relevantPath"]);
        self.labeled["irrelevantUrls"] = linesFromFile(self.labeled["irrelevantPath"]);

        self.vsm = {
            "on": conf["useVSM"],
            "filterModel": conf["VSMFilterModel"],
            "minRepositoryDocNum": conf["minRepositoryDocNum"],
            "filterIrrelevantThreshold": conf["filterIrrelevantThreshold"],
            "filterRelevantThreshold": conf["filterRelevantThreshold"]
        }
        print "done"
Example #3
def scoreClassifier(postType, featureDimension):
    #MLP
    acc_list = []
    for k in range(2, 11):
        l = MLPClassifier('CSV Files_' + postType +'/data_std_hr24.csv', 'CSV Files_' + postType +'/label_hr24.csv', k , featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('MLP Classifier')
    plt.savefig('Images_' + postType + "/"+"MLP_Classifier.png")
    plt.close()
    #plt.show()
    #LR
    acc_list = []
    avgCoeff = np.zeros(shape=(1,featureDimension))
    for k in range(2,11):
        l = LRClassifier('CSV Files_' + postType +'/data_std_hr24.csv', 'CSV Files_' + postType +'/label_hr24.csv',k, featureDimension)
        accuracy, lrCoeff = l.kfold_validator()
        acc_list.append(accuracy)
        avgCoeff = avgCoeff + lrCoeff

    avgCoeff /= 9
    print(avgCoeff)
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Logistic Regression')
    plt.savefig('Images_' + postType + "/" + "Logistic_Regression.png")
    plt.close()
    #plt.show()
    #SVM
    acc_list =[]
    for k in range(2,11):
        l = SVMClassifier('CSV Files_' + postType +'/data_std_hr24.csv', 'CSV Files_' + postType +'/label_hr24.csv',k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Support Vector Machine')
    plt.savefig('Images_' + postType + "/" + "Support_Vector_Machine.png")
    plt.close()
    #plt.show()
    #NB
    acc_list =[]
    for k in range(2,11):
        l = NBClassifier('CSV Files_' + postType +'/data_std_hr24.csv', 'CSV Files_' + postType +'/label_hr24.csv',k, featureDimension)
        acc_list.append(l.kfold_validator())
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title('Gaussian Naive Bayes')
    plt.savefig('Images_' + postType + "/" + "Gaussian_Naive_Bayes.png")
    plt.close()
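
The four blocks in scoreClassifier differ only in the classifier class, the chart title, and the output filename. A sketch of the same sweep factored into one helper (names mirror the snippet; the LR coefficient averaging would still need its own small loop):

def sweep_and_plot(clf_class, title, postType, featureDimension, take_first=False):
    """Run k-fold validation for k = 2..10 and save an accuracy plot."""
    acc_list = []
    for k in range(2, 11):
        clf = clf_class('CSV Files_' + postType + '/data_std_hr24.csv',
                        'CSV Files_' + postType + '/label_hr24.csv', k, featureDimension)
        result = clf.kfold_validator()
        # LRClassifier returns (accuracy, coefficients); the others return accuracy
        acc_list.append(result[0] if take_first else result)
    t = np.arange(2, 11, 1)
    plt.plot(t, acc_list, 'ro')
    plt.ylabel('accuracy')
    plt.xlabel('K-fold')
    plt.title(title)
    plt.savefig('Images_' + postType + '/' + title.replace(' ', '_') + '.png')
    plt.close()

With this helper each block above reduces to a single call, e.g. sweep_and_plot(MLPClassifier, 'MLP Classifier', postType, featureDimension).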
Example #4
def vcClassifier(postType, featureDimension):

    gp = GraphPlotting.GraphPlot()

    hours = [1, 6, 12, 24]
    graph_list = []
    #LR
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = LRClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                             'CSV Files/label_std_hr' + str(hour) + '.csv', k,
                             featureDimension)
            acc_list.append(l.kfold_validator()[0])
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Logistic Regression",
                 postType)

    #SVM
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = SVMClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                              'CSV Files/label_std_hr' + str(hour) + '.csv', k,
                              featureDimension)
            acc_list.append(l.kfold_validator())
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Support vector machines",
                 postType)

    #NB
    graph_list = []
    for hour in hours:
        acc_list = []
        print("For Hour: " + str(hour))
        for k in range(2, 11):
            l = NBClassifier('CSV Files/data_std_hr' + str(hour) + '.csv',
                             'CSV Files/label_std_hr' + str(hour) + '.csv', k,
                             featureDimension)
            acc_list.append(l.kfold_validator())
        graph_list.append((hour, acc_list))
    gp.PlotGraph(graph_list, "Hours", "Accuracy", "Navie Bayes", postType)
Example #5
def svm_classification(k,
                       lenData,
                       pctTest,
                       params,
                       C=1,
                       gamma=1,
                       kernel="rbf"):

    clear_csv()

    samples = []

    print(params)
    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])

    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)

    svmClassifier = SVMClassifier(kernel, C, gamma)

    firstRound = cross_validation(k, svmClassifier, data, lenData,
                                  "trainingFeatures", "testingFeatures",
                                  "First")

    secondRound = cross_validation(k, svmClassifier, data, lenData,
                                   "trainingFeatures", "testingFeatures",
                                   "Second")

    secondWithFirst = cross_validation(k, svmClassifier, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("SVM", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
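
A hedged usage example; the argument values below are purely illustrative:

# illustrative call: 10-fold cross-validation on 5000 country-level samples,
# 20% held out for testing, RBF-kernel SVM with C=1 and gamma=0.01
svm_classification(k=10, lenData=5000, pctTest=0.2, params=["PAIS"],
                   C=1, gamma=0.01, kernel="rbf")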
Example #6
    def initialize_data_vars(self, data_dict):
        ## Seen Unseen data I/O
        self.seen_data_input = data_dict['seen_data_input']
        self.seen_data_output = data_dict['seen_data_output']
        self.unseen_data_input = data_dict['unseen_data_input']
        self.unseen_data_output = data_dict['unseen_data_output']

        self.seen_attr_mat = data_dict['seen_attr_mat']
        self.unseen_attr_mat = data_dict['unseen_attr_mat']

        self.seen_class_ids = data_dict['seen_class_ids']
        self.unseen_class_ids = data_dict['unseen_class_ids']

        self.num_attr = self.seen_attr_mat.shape[1]
        self.num_seen_classes = self.seen_attr_mat.shape[0]
        self.num_unseen_classes = self.unseen_attr_mat.shape[0]

        ## Create training Dataset
        print('Creating training dataset...')
        self.X_train = self.seen_data_input
        self.a_train = self.seen_attr_mat[self.seen_data_output, :]

        ## Create testing Dataset
        print('Creating test dataset...')
        self.X_test = self.unseen_data_input
        self.a_test = self.unseen_attr_mat[self.unseen_data_output, :]

        if (normalize):  # 'normalize' is assumed to be defined elsewhere in the original module
            ## Updates self.seen_mean and self.seen_std
            self.X_train, self.X_test = self.preprocess(self.X_train,
                                                        self.X_test,
                                                        clamp_thresh=3.0)

        self.clfs = []
        if (self.binary):
            for _ in range(self.num_attr):
                self.clfs.append(SVMClassifier())
        else:
            for _ in range(self.num_attr):
                self.clfs.append(SVMRegressor())

        self.pprint()
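
The per-sample attribute targets above are built with NumPy integer-array indexing: seen_attr_mat[seen_data_output, :] selects one attribute row per class label. A small self-contained illustration:

import numpy as np

attr_mat = np.array([[1, 0, 1],    # attributes of class 0
                     [0, 1, 1]])   # attributes of class 1
labels = np.array([0, 1, 1, 0])    # class id of each sample

per_sample = attr_mat[labels, :]   # one attribute row per sample, in sample order
print(per_sample.shape)            # (4, 3)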
Example #7
def classfiySVM():
    svmInst = SVMClassifier(finalList, classList, testFinalList, testClassList)
    svmInst.classify()
Example #8
def main():
    conf = FCConfig("config.ini")

    seedUrls = linesFromFile(conf["seedFile"])
    repositoryDocNames = linesFromFile(conf["docsFile"])

    if conf["labelFile"]:
        print "Using labels"
        labels = intLinesFromFile(conf["labelFile"])
        relevantDocs = [
            doc for doc, lab in zip(repositoryDocNames, labels) if lab == 1
        ]
        irrelevantDocs = [
            doc for doc, lab in zip(repositoryDocNames, labels) if lab == 0
        ]
    else:
        # use VSM model to label training docs
        vsmModel = None
        if conf["VSMFilterModel"].lower() == "tf-idf":
            vsmModel = TfidfScorer(getUrlTexts(seedUrls))
        elif conf["VSMFilterModel"].lower() == "lsi":
            vsmModel = LSIScorer(getUrlTexts(seedUrls))
        print "constructed vsm model"

        relevantDocs, irrelevantDocs = vsmModel.labelDocs(
            repositoryDocNames, conf["minRepositoryDocNum"],
            conf["filterIrrelevantThreshold"], conf["filterRelevantThreshold"])

    print len(relevantDocs), len(irrelevantDocs)

    # Train classifier
    classifier = None
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    trainSize = conf["trainDocNum"]
    if (trainSize > testSize):
        raise Exception("Training size is larger than test size")
    trainDocs = relevantDocs[:trainSize] + irrelevantDocs[:trainSize]
    trainLabels = [1] * trainSize + [0] * trainSize
    if conf["classifier"].upper() == "NB":
        classifier = NaiveBayesClassifier()
    elif conf["classifier"].upper() == "SVM":
        classifier = SVMClassifier()
    classifier.trainClassifierFromNames(trainDocs, trainLabels)

    print "Training complete"

    # Test classifier
    testSize = min(len(relevantDocs), len(irrelevantDocs))
    testDocs = relevantDocs[:testSize] + irrelevantDocs[:testSize]
    testLabels = [1] * testSize + [0] * testSize
    predictedLabels = list(classifier.predictFromNames(testDocs))

    # Statistical analysis (recall and precision)
    allRelevant = testSize
    allIrrelevant = testSize
    predictedRelevant = predictedLabels.count(1)
    predictedIrrelevant = predictedLabels.count(0)
    correctlyRelevant = 0
    for i in range(0, testSize):
        if predictedLabels[i] == 1:
            correctlyRelevant += 1
    correctlyIrrelevant = 0
    for i in range(testSize, 2 * testSize):
        if predictedLabels[i] == 0:
            correctlyIrrelevant += 1
    relevantRecall = float(correctlyRelevant) / allRelevant
    relevantPrecision = float(correctlyRelevant) / (predictedRelevant)
    irrelevantRecall = float(correctlyIrrelevant) / allIrrelevant
    irrelevantPrecision = float(correctlyIrrelevant) / (predictedIrrelevant)
    print relevantRecall, relevantPrecision

    t = [(-1, p) for p in seedUrls]
    priorityQueue = PriorityQueue(t)
    crawler = Crawler(priorityQueue, classifier, 10)
    crawler.crawl()
    print crawler.relevantPagesCount

    print crawler.pagesCount
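
The manual recall/precision loops above can be cross-checked with scikit-learn's metrics; a sketch assuming testLabels and predictedLabels exactly as computed above:

# equivalent computation with scikit-learn, for verification
from sklearn.metrics import precision_score, recall_score

relevantPrecision = precision_score(testLabels, predictedLabels, pos_label=1)
relevantRecall = recall_score(testLabels, predictedLabels, pos_label=1)
irrelevantPrecision = precision_score(testLabels, predictedLabels, pos_label=0)
irrelevantRecall = recall_score(testLabels, predictedLabels, pos_label=0)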
Example #9
class FocusedCrawler:

    def init_command_line_config(self):
        print "using command line argparser"
        from argparser import argdict
        conf = argdict
        self.pageLimit = conf["page_limit"]
        self.linkLimit = conf["link_limit"]
        self.relevantThreshold = conf["relevancy_threshold"]

        classifierString = conf["classifier"]
        self.classifier = None
        if "NB" in classifierString.upper():
            self.classifier = NaiveBayesClassifier()
        elif "SVM" in classifierString.upper():
            self.classifier = SVMClassifier()

        seeds = conf["seeds"]
        self.seedUrls = []
        urlsPerSeed = 10
        for keyword in seeds:
            if keyword is not None and keyword != "":
                if "http" in keyword:
                    self.seedUrls.append(keyword)
                else:
                    seedUrlGenerator = google.search(keyword)
                    searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                    self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))
            else:
                raise Exception("Seed is not valid: (" + str(keyword) + ") -- it must be a keyword or URL")

        print "seed urls: "
        print self.seedUrls

        self.blacklistDomains = conf["blacklist_domains"]
        self.labeled = {}

        self.labeled["relevantUrls"] = conf["relevant_urls"]
        for url in self.labeled["relevantUrls"]:
            if url is None or url == "":
                raise Exception("Relevant URL is not valid: (" + str(url) + ")")

        self.labeled["irrelevantUrls"] = conf["relevant_urls"]
        for url in self.labeled["irrelevantUrls"]:
            if url is None or url == "":
                raise Exception("Irrelevant URL is not valid: (" + str(url) + ")")

        self.vsm = {
            "on": conf["vsm"],
            "filterModel": conf["vsm_filter"],
            "minRepositoryDocNum": conf["min_repo_doc_num"],
            "filterIrrelevantThreshold": conf["irrelevancy_threshold"],
            "filterRelevantThreshold": conf["relevancy_threshold"]
        }

    def init_config(self):
        if len(sys.argv) > 1:
            self.init_command_line_config()
        else:
            self.init_config_file_config()

    def init_config_file_config(self):
        print "using config file"
        conf = Config("config.ini")
        self.pageLimit = conf["pageLimit"]
        self.linkLimit = conf["linkLimit"]
        self.relevantThreshold = conf["relevantThreshold"]

        classifierString = conf["classifier"]
        self.classifier = None
        if "NB" in classifierString.upper():
            self.classifier = NaiveBayesClassifier()
        elif "SVM" in classifierString.upper():
            self.classifier = SVMClassifier()

        seedKeywords = linesFromFile(conf["seedFile"])
        self.seedUrls = []
        urlsPerSeed = 10
        for keyword in seedKeywords:
            if "http" in keyword:
                self.seedUrls.append(keyword)
            else:
                seedUrlGenerator = google.search(keyword)
                searchResultUrls = list(itertools.islice(seedUrlGenerator, 0, urlsPerSeed))
                self.seedUrls = list(set(self.seedUrls) | set(searchResultUrls))

        print "seed urls: "
        print self.seedUrls

        self.blacklistDomains = linesFromFile(conf["blacklistFile"])

        self.trainingDocsPath = conf["trainingDocs"]
        self.trainingDocsPath = os.path.abspath(self.trainingDocsPath)
        self.labeled = {}
        self.labeled["relevantPath"] = os.path.join(self.trainingDocsPath, "relevant.txt");
        self.labeled["irrelevantPath"] = os.path.join(self.trainingDocsPath, "irrelevant.txt");
        self.labeled["relevantUrls"] = linesFromFile(self.labeled["relevantPath"]);
        self.labeled["irrelevantUrls"] = linesFromFile(self.labeled["irrelevantPath"]);

        self.vsm = {
            "on": conf["useVSM"],
            "filterModel": conf["VSMFilterModel"],
            "minRepositoryDocNum": conf["minRepositoryDocNum"],
            "filterIrrelevantThreshold": conf["filterIrrelevantThreshold"],
            "filterRelevantThreshold": conf["filterRelevantThreshold"]
        }
        print "done"

    def setup_model(self):
        if self.vsm["on"]:
            self.setup_vsm_model()
        else:
            self.setup_labeled_model()
        self.testSize = min(len(self.relevantDocs), len(self.irrelevantDocs))

    def setup_labeled_model(self):
        print "Using labels provided by relevant.txt & irrelevant.txt"
        if self.labeled["irrelevantUrls"] is not None and len(self.labeled["irrelevantUrls"]) > 0:
            self.irrelevantDocs = [Webpage(url).save_tmp() for url in self.labeled["irrelevantUrls"] ]
        else:
            raise Exception("Irrelevant URLs must be provided for classification")
        if self.labeled["relevantUrls"] is not None and len(self.labeled["relevantUrls"]) > 0:
            self.relevantDocs = [Webpage(url).save_tmp() for url in self.labeled["relevantUrls"] ]
        else:
            raise Exception("Relevant URLs must be provided for classification")
        print "Found {} relevantDocs & {} irrelevantDocs".format(len(self.relevantDocs), len(self.irrelevantDocs))

    def setup_vsm_model(self):
        # use VSM model to label training docs
        self.vsm["model"] = None
        if self.vsm["filterModel"].lower() == "tf-idf":
            self.vsm["model"] = TfidfScorer(getUrlTexts(self.seedUrls))
        elif self.vsm["filterModel"].lower() == "lsi":
            self.vsm["model"] = LSIScorer(getUrlTexts(self.seedUrls))

        if self.vsm["model"] is None:
            print "No filter model specified. Cannot construct vsm model"
            sys.exit()
        else:
            print "constructed vsm model"

        self.relevantDocs, self.irrelevantDocs = self.vsm["model"].labelDocs(
            self.repositoryDocNames, self.vsm["minRepositoryDocNum"],
            self.vsm["filterIrrelevantThreshold"],
            self.vsm["filterRelevantThreshold"])

    def train_classifier(self):
        print "Training classifier"
        trainDocs = self.relevantDocs + self.irrelevantDocs
        trainLabels = [1]*len(self.relevantDocs) + [0]*len(self.irrelevantDocs)
        self.classifier.trainClassifierFromNames(trainDocs, trainLabels)
        print "Training complete"

    # Statistical analysis (recall and precision)
    def stat_analysis(self):
        testDocs = self.relevantDocs[:self.testSize] + self.irrelevantDocs[:self.testSize]
        testLabels = [1]*self.testSize + [0]*self.testSize
        self.predictedLabels = list(self.classifier.predictFromNames(testDocs))

        allRelevant = self.testSize
        allIrrelevant = self.testSize
        self.predictedRelevant = self.predictedLabels.count(1)
        self.predictedIrrelevant = self.predictedLabels.count(0)
        correctlyRelevant = 0
        for i in range(0, self.testSize):
            if self.predictedLabels[i] == 1:
                correctlyRelevant += 1
        correctlyIrrelevant = 0
        for i in range(self.testSize, 2*self.testSize):
            if self.predictedLabels[i] == 0:
                correctlyIrrelevant += 1
        self.relevantRecall = float(correctlyRelevant) / allRelevant
        self.relevantPrecision = float(correctlyRelevant) / (self.predictedRelevant)
        self.irrelevantRecall = float(correctlyIrrelevant) / allIrrelevant
        self.irrelevantPrecision = float(correctlyIrrelevant) / (self.predictedIrrelevant)
        print self.relevantRecall, self.relevantPrecision

    def crawl(self):
        t = [(-1,p) for p in self.seedUrls]
        priorityQueue = PriorityQueue(t)
        crawler = Crawler(
            priorityQueue,
            self.classifier,
            self.pageLimit,
            self.linkLimit,
            self.relevantThreshold,
            self.blacklistDomains)
        crawler.crawl()

        print crawler.relevantPages
        print float(len(crawler.relevantPages)) / len(crawler.visited)

    def cleanup_tmp_html_files(self):
        for filepath in self.relevantDocs + self.irrelevantDocs:
            os.remove(filepath)
Example #10
        a = self.evaluatorInstance.getAccuracy(Y_val_true, Y_val_pred)
        p, r, f = self.evaluatorInstance.getPRF(Y_val_true, Y_val_pred)
        print("Accuracy: " + str(a))
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F-measure: " + str(f))


if __name__ == '__main__':
    trainFilePath = sys.argv[1]  # path to the reformatted quasar-s json train file
    valFilePath = sys.argv[2]  # path to the val file
    retrievalInstance = Retrieval()
    featurizerInstance = [
        TfIdfFeaturizer(),
        CountFeaturizer(),
        HashVectorizer()
    ]
    classifierInstance = [
        NNClassifier(),
        SVMClassifier(),
        MultinomialNaiveBayes()
    ]

    for feature in featurizerInstance:
        for classifier in classifierInstance:
            trainInstance = Pipeline(trainFilePath, valFilePath,
                                     retrievalInstance, feature, classifier)

        print "-------------------------------------------------"
Example #11
    def SVMDriver(self):
        svm = SVMClassifier(self.train_x, self.train_y, self.test_x,
                            self.test_y)
        # -----= SVM
        svm.train_1()
        svm_labels = svm.predic()
        svm_acc = svm.getAccuracy()
        svm.printResult()
        self.acc['svm-NuSVC'] = {
            'accuracy': svm_acc,
            'train-time': svm.trainTime(),
            'test-time': svm.testTime(),
        }

        svm.train_2()
        svm_labels_2 = svm.predic()
        svm_acc = svm.getAccuracy()
        svm.printResult()
        self.acc['svm-LinearSVC'] = {
            'accuracy': svm_acc,
            'train-time': svm.trainTime(),
            'test-time': svm.testTime(),
        }
        svm.train_3()
        svm_labels_3 = svm.predic()
        svm_acc = svm.getAccuracy()
        svm.printResult()
        self.acc['svm-SVC'] = {
            'accuracy': svm_acc,
            'train-time': svm.trainTime(),
            'test-time': svm.testTime(),
        }
Example #12
def DirectAttributePrediction(classifier='SVM',
                              predicate_type='binary',
                              C=10.0):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./',
                                            name='train',
                                            predicate_type=predicate_type)
    test_attributes = get_class_attributes('./',
                                           name='test',
                                           predicate_type=predicate_type)
    N_ATTRIBUTES = train_attributes.shape[1]

    # Create training Dataset
    print('Creating training dataset...')
    X_train, y_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)

    print('X_train to dense...')
    X_train = X_train.toarray()

    Xplat_train, Xplat_val, yplat_train, yplat_val = train_test_split(
        X_train, y_train, test_size=0.10, random_state=42)

    print('Creating test dataset...')
    X_test, y_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    y_pred = np.zeros(y_test.shape)
    y_proba = np.copy(y_pred)

    print('X_test to dense...')
    X_test = X_test.toarray()

    # CHOOSING SVM
    if classifier == 'SVM':
        platt_params = []
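        # note: as written, platt_params is never populated before being saved below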
        for i in range(N_ATTRIBUTES):
            print('--------- Attribute %d/%d ---------' %
                  (i + 1, N_ATTRIBUTES))
            t0 = time()

            # SVM classifier
            if predicate_type == 'binary':
                clf = SVMClassifier()
            else:
                clf = SVMRegressor()

            # Training
            clf.fit(X_train, y_train[:, i])
            print('Fitted classifier in: %fs' % (time() - t0))
            if predicate_type == 'binary':
                clf.set_platt_params(Xplat_val, yplat_val[:, i])

            # Predicting
            print('Predicting for attribute %d...' % (i + 1))
            y_pred[:, i] = clf.predict(X_test)
            if predicate_type == 'binary':
                y_proba[:, i] = clf.predict_proba(X_test)

            print('Saving files...')
            np.savetxt('./DAP_' + predicate_type + '/prediction_SVM', y_pred)
            if predicate_type == 'binary':
                np.savetxt('./DAP_' + predicate_type + '/platt_params_SVM',
                           platt_params)
                np.savetxt('./DAP_' + predicate_type + '/probabilities_SVM',
                           y_proba)

    # CHOOSING NEURAL NETWORK
    if classifier == 'NN':
        if predicate_type != 'binary':
            clf = NeuralNetworkRegressor(dim_features=X_train.shape[1],
                                         nb_attributes=N_ATTRIBUTES)
        else:
            clf = NeuralNetworkClassifier(dim_features=X_train.shape[1],
                                          nb_attributes=N_ATTRIBUTES)

        print('Fitting Neural Network...')
        clf.fit(X_train, y_train)

        print('Predicting attributes...')
        y_pred = np.array(clf.predict(X_test))
        y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[1])).T
        y_proba = y_pred

        print('Saving files...')
        np.savetxt('./DAP_' + predicate_type + '/prediction_NN', y_pred)
        if predicate_type == 'binary':
            np.savetxt('./DAP_' + predicate_type + '/probabilities_NN',
                       y_proba)
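
Platt scaling, which set_platt_params performs here on a held-out split, is also available off the shelf in scikit-learn; a sketch using a plain linear SVM as the base estimator (an assumption, since SVMClassifier's internals are not shown):

# off-the-shelf Platt scaling, illustrative alternative to set_platt_params
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

base = LinearSVC(C=10.0)
calibrated = CalibratedClassifierCV(base, method='sigmoid', cv=3)
calibrated.fit(X_train, y_train[:, i])
proba = calibrated.predict_proba(X_test)[:, 1]  # probability the attribute is present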
Example #13
from VectorizedDataset import VectorizedDataset
from SpectrumKernel import SpectrumKernel
from SVMClassifier import SVMClassifier
import kernels
import utils

path = 'data/'

Kernel = SpectrumKernel

n_grams_list = [4, 3, 2]

VD = VectorizedDataset(Kernel, path, n_grams_list)

SVMC = {}
Cs = [0.9, 1.4, 1.2]
for i in range(3):
    SVMC[i] = SVMClassifier(kernel=kernels.rbf, C=Cs[i])
    SVMC[i].fit(VD.X[i]['train'], VD.Y[i]['train'])
    print("fit done for training {}".format(i))
    print("Training accuracy for classifier {} : ".format(i) + str(
        utils.compute_val_accuracy(SVMC[i], VD.X[i]['train'], VD.Y[i]
                                   ['train'])))
    print("Validation accuracy for classifier {} : ".format(i) + str(
        utils.compute_val_accuracy(SVMC[i], VD.X[i]['val'], VD.Y[i]['val'])))

utils.generate_submission_file(SVMC,
                               path,
                               n_grams_list,
                               submission_filename='Yte.csv')
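
The kernels.rbf function passed to SVMClassifier is not shown; a standard Gaussian (RBF) kernel sketch in the usual pairwise form (the gamma default is an assumption):

import numpy as np

def rbf(X1, X2, gamma=0.1):
    # k(x, y) = exp(-gamma * ||x - y||^2) for every pair of rows
    sq_dists = (np.sum(X1 ** 2, axis=1)[:, None]
                + np.sum(X2 ** 2, axis=1)[None, :]
                - 2 * X1 @ X2.T)
    return np.exp(-gamma * sq_dists)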