Exemplo n.º 1
0
def load_best_classifier_conf():
    """Create a ClassifiersWrapper loaded with four weighted classifiers.

    The classifiers are registered in this order, each with empty train
    parameters:

        SVMWithSGD                   weight 0.3
        LogisticRegressionWithSGD    weight 0.3
        NaiveBayes                   weight 0.3
        LogisticRegressionWithLBFGS  weight 0.7

    Returns:
        The populated ClassifiersWrapper instance.
    """
    # Both modules are imported at call time rather than at module load.
    from DataClassifierV2 import ClassifiersWrapper
    from pyspark.mllib.classification import SVMWithSGD, LogisticRegressionWithSGD, LogisticRegressionWithLBFGS, NaiveBayes

    # (classifier, weight) pairs, listed in registration order.
    weighted_classifiers = (
        (SVMWithSGD, 0.3),
        (LogisticRegressionWithSGD, 0.3),
        (NaiveBayes, 0.3),
        (LogisticRegressionWithLBFGS, 0.7),
    )

    wrapper = ClassifiersWrapper()
    for algorithm, weight_value in weighted_classifiers:
        # A new empty dict per registration, so no two entries share one.
        wrapper.addClassifier(classifier=algorithm, trainParameters={}, weight=weight_value)
    return wrapper
    # NOTE(review): the statements below sit at function-body indentation
    # directly after a `return`, so as the file stands they never execute.
    # They use names that are not defined in the visible code (newSource,
    # GoogleFinanceMarketSourceSpark, config, DataSetMakerV2) and look like
    # the body of a different function whose header is missing -- confirm.
    #newSource.lookingAll('NASDAQ:NVDA', ['NVIDIA'])
    #newSource.lookingAll('VTX:NESN', ['NESTLE'])
    #newSource.lookingAll('VTX:SCMN', ['SWISSCOM'])
    #newSource.lookingAll('VTX:NOVN', ['NOVARTIS'])

    # Result of newSource.doIt(); presumably a Spark RDD of news items, given
    # the map/cache/count calls made on it below -- TODO confirm.
    newsRDD = newSource.doIt()
    # Market source constructed with five ticker symbols.
    marketSource = GoogleFinanceMarketSourceSpark(
        ['NASDAQ:GOOGL', 'NASDAQ:NVDA', 'VTX:NESN', 'VTX:SCMN', 'VTX:NOVN'])
    # Every news item is passed through marketSource.addMarketStatusToNews.
    newsRDD = newsRDD.map(lambda x: marketSource.addMarketStatusToNews(x))
    #newsRDD = newsRDD.randomSplit([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1])[0]
    # Cached ahead of its two uses below: count() and dataSetMaker.process().
    newsRDD.cache()
    print('nb news : %d' % newsRDD.count())
    # The `n` argument is read from config.FEATURES_CONF['vecteur_size'].
    dataSetMaker = DataSetMakerV2(n=config.FEATURES_CONF['vecteur_size'])
    fullDataSet = dataSetMaker.process(newsRDD)  # TODO change
    fullDataSet.cache()
    # First wrapper: SVMWithSGD, NaiveBayes and LogisticRegressionWithLBFGS,
    # each registered with empty train parameters and weight 0.4
    # (the LogisticRegressionWithSGD registration is commented out).
    myClassifier = ClassifiersWrapper()
    myClassifier.addClassifier(classifier=SVMWithSGD,
                               trainParameters={},
                               weight=0.4)
    #myClassifier.addClassifier(classifier=LogisticRegressionWithSGD, trainParameters={}, weight=0.3)
    myClassifier.addClassifier(classifier=NaiveBayes,
                               trainParameters={},
                               weight=0.4)
    myClassifier.addClassifier(classifier=LogisticRegressionWithLBFGS,
                               trainParameters={},
                               weight=0.4)

    # Second wrapper; only its first registration (SVMWithSGD, weight 0.3)
    # is visible in this excerpt.
    myClassifier2 = ClassifiersWrapper()
    myClassifier2.addClassifier(classifier=SVMWithSGD,
                                trainParameters={},
                                weight=0.3)