Example no. 1
    def get_info_NLP(self):
        def get_data_i_FE2(i):
            print "Read" + str(i)
            return DataParser().AmericanCombo_i_FE2(i)

        def write_out_data(df, i):
            print "Writing out data"
            DataParser()._write_HDFStore_Combined_FE2(df, i)

        data_count = DataParser().number_of_datasets
        i = 0

        df_ALL = pd.DataFrame()
        df_ALL2 = pd.DataFrame()

        LOAN_COUNT = 0
        FICO_MEAN = 0
        FICO_MEDIAN = []
        BALANCE_MEAN = 0
        LOAN_LENGTH_MEAN = 0
        DEFAULT_LOANS_COUNT = 0
        FULLY_PAID_LOANS_COUNT = 0

        while i < data_count:
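            # Keep only each loan's history from the point it first became 90 days
            # delinquent, then add this dataset's contribution to the running statistics.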

            df_IN = get_data_i_FE2(i)

            # Find all loans that have instance of 90-dd
            df = df_IN.loc[df_IN['status_month_0'] == 3]
            loan_ids_with_90_dd = df['id_loan'].unique()
            mask = df_IN['id_loan'].isin(loan_ids_with_90_dd)
            df_with_90_dd = df_IN.loc[mask]

            # Remove all updates that occurred before the loan FIRST became 90-dd, i.e. before it became non-performing
            df_with_90_dd = df_with_90_dd.sort_values(
                ['id_loan', 'svcg_cycle'], ascending=[True, True])
            df_with_90_dd['90_dd'] = 0
            df_with_90_dd.loc[df_with_90_dd['status_month_0'] == 3,
                              '90_dd'] = 1
            df_with_90_dd['90_dd'] = df_with_90_dd.groupby(
                ['id_loan'])['90_dd'].apply(lambda x: x.cumsum())
            df_with_90_dd = df_with_90_dd.loc[df_with_90_dd['90_dd'] != 0]
            df_with_90_dd = df_with_90_dd.drop('90_dd', axis=1)

            # print df_with_90_dd[['id_loan', 'svcg_cycle','st', 'status_month_0', 'occr_30dd', ]]

            df_np = df_with_90_dd

            LOAN_COUNT += len(np.unique(df_np['id_loan'].values))
            FICO_MEAN += df_np['fico'].mean()
            BALANCE_MEAN += df_np['orig_upb'].mean()
            FICO_MEDIAN.append(df_np['fico'].median())
            LOAN_LENGTH_MEAN += df_np.groupby(['id_loan']).size().mean()
            DEFAULT_LOANS_COUNT += len(df_np.loc[df_np['label_good_bad_loan']
                                                 == 0]['id_loan'].unique())
            FULLY_PAID_LOANS_COUNT += len(df_np.loc[
                df_np['label_good_bad_loan'] == 1]['id_loan'].unique())

            df_np = df_np.sort_values("id_loan")
            df_np.reset_index(drop=True, inplace=True)
            # n = 10000
            # if len(df_X) < n:
            #     n = len(df_X)
            # df_ALL = pd.concat([df_X[:n], df_ALL], axis=0)

            m = 20000
            if len(df_np) < m:
                m = len(df_np)
            df_ALL2 = pd.concat([df_np[:m], df_ALL2], axis=0)

            i += 1

        LOAN_COUNT = LOAN_COUNT
        FICO_MEAN = FICO_MEAN / i
        BALANCE_MEAN = BALANCE_MEAN / i
        LOAN_LENGTH_MEAN = LOAN_LENGTH_MEAN / i
        FICO_MEDIAN = np.median(FICO_MEDIAN)

        print "LOAN_LENGTH_MEAN: " + str(LOAN_LENGTH_MEAN)
        print "FICO_MEDIAN: " + str(FICO_MEDIAN)
        print "FICO_MEAN: " + str(FICO_MEAN)
        print "LOAN_COUNT: " + str(LOAN_COUNT)
        print "BALANCE_MEAN: " + str(BALANCE_MEAN)
        print "DEFAULT_LOANS_COUNT: " + str(DEFAULT_LOANS_COUNT)
        print "FULLY_PAID_LOANS_COUNT: " + str(FULLY_PAID_LOANS_COUNT)

        # write_out_data(df_ALL, -3)
        write_out_data(df_ALL2, -3)
 def __init__(self):
     # Instantiate the peer class (DataParser)
     self.parser = DataParser()
Example no. 3
 def get_data_i_FE2(i):
     return DataParser().AmericanCombo_i_FE2(i)
def read_data_split_and_search():
    """
    This function provides a simple example of how to tune the parameters of a given algorithm.

    The BayesianSearch object will save:
        - A .txt file with all the cases explored and the recommendation quality
        - A _best_model file which contains the trained model and can be loaded with recommender.load_model()
        - A _best_parameter file which contains a dictionary with all the fit parameters; it can be passed to recommender.fit(**_best_parameter)
        - A _best_result_validation file which contains a dictionary with the results of the best solution on the validation set
        - A _best_result_test file which contains a dictionary with the results, on the test set, of the best solution chosen using the validation set
    """

    seed = 1205
    parser = DataParser()

    URM_all = parser.get_URM_all()
    ICM_obj = parser.get_ICM_all()

    # SPLIT TO GET TEST PARTITION
    URM_train, URM_test = split_train_in_two_percentage_global_sample(
        URM_all, train_percentage=0.90, seed=seed)

    # SPLIT TO GET THE HYBRID VALID PARTITION
    URM_train, URM_valid_hybrid = split_train_in_two_percentage_global_sample(
        URM_train, train_percentage=0.85, seed=seed)

    URM_valid_hybrid = parser.filter_URM_test_by_range(URM_train,
                                                       URM_valid_hybrid,
                                                       (3, -1))

    collaborative_algorithm_list = [
        # EASE_R_Recommender
        # PipeHybrid001,
        # Random,
        # TopPop,
        # P3alphaRecommender,
        # RP3betaRecommender,
        # ItemKNNCFRecommender,
        # UserKNNCFRecommender,
        # MatrixFactorization_BPR_Cython,
        # MatrixFactorization_FunkSVD_Cython,
        # PureSVDRecommender,
        # NMFRecommender,
        # PureSVDItemRecommender
        # SLIM_BPR_Cython,
        # SLIMElasticNetRecommender
        # IALSRecommender
        # MF_MSE_PyTorch
        # MergedHybrid000
        # LinearHybrid002ggg
        HybridCombinationSearch
    ]

    content_algorithm_list = [
        # ItemKNNCBFRecommender
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout

    evaluator_valid_hybrid = EvaluatorHoldout(URM_valid_hybrid,
                                              cutoff_list=[10])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])
    """
    earlystopping_keywargs = {"validation_every_n": 5,
                              "stop_on_validation": True,
                              "evaluator_object": evaluator_valid_hybrid,
                              "lower_validations_allowed": 5,
                              "validation_metric": 'MAP',
                              }
    
    print('IALS training...')
    ials = IALSRecommender(URM_train, verbose=False)
    ials_params = {'num_factors': 83, 'confidence_scaling': 'linear', 'alpha': 28.4278070726612,
                   'epsilon': 1.0234211788885077, 'reg': 0.0027328110246575004, 'epochs': 20}
    ials.fit(**ials_params, **earlystopping_keywargs)
    print("Done")
    
    
    print("PureSVD training...")
    psvd = PureSVDRecommender(URM_train, verbose=False)
    psvd_params = {'num_factors': 711}
    psvd.fit(**psvd_params)
    print("Done")
    """
    print("Rp3beta training...")
    rp3b = RP3betaRecommender(URM_train, verbose=False)
    rp3b_params = {
        'topK': 753,
        'alpha': 0.3873710051288722,
        'beta': 0.0,
        'normalize_similarity': False
    }
    rp3b.fit(**rp3b_params)
    print("Done")
    print("P3alpha training...")
    p3a = P3alphaRecommender(URM_train, verbose=False)
    p3a_params = {
        'topK': 438,
        'alpha': 0.41923120471415165,
        'normalize_similarity': False
    }
    p3a.fit(**p3a_params)
    print("Done")
    print("ItemKnnCF training...")
    icf = ItemKNNCFRecommender(URM_train, verbose=False)
    icf_params = {
        'topK': 565,
        'shrink': 554,
        'similarity': 'tversky',
        'normalize': True,
        'tversky_alpha': 1.9109121434662428,
        'tversky_beta': 1.7823834698905734
    }
    icf.fit(**icf_params)
    print("Done")
    print("UserKnnCF training...")
    ucf = UserKNNCFRecommender(URM_train, verbose=False)
    ucf_params = {
        'topK': 190,
        'shrink': 0,
        'similarity': 'cosine',
        'normalize': True
    }
    ucf.fit(**ucf_params)
    print("Done")
    print("ItemKnnCBF training...")
    icb = ItemKNNCBFRecommender(URM_train, ICM_obj, verbose=False)
    icb_params = {
        'topK': 205,
        'shrink': 1000,
        'similarity': 'cosine',
        'normalize': True,
        'feature_weighting': 'BM25'
    }
    icb.fit(**icb_params)
    print("Done")
    """
    print("SlimElasticNet training...")
    sen = SLIMElasticNetRecommender(URM_train, verbose=False)
    sen_params = {'topK': 954, 'l1_ratio': 3.87446082207643e-05, 'alpha': 0.07562657698792305}
    sen.fit(**sen_params)
    print("Done")
    """

    list_recommender = [icb, icf, ucf, p3a, rp3b]
    list_already_seen = []
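    # Tune every 3-way combination of the base recommenders; each combination gets its
    # own output folder named after the recommenders it contains.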

    for rec_perm in combinations(list_recommender, 3):

        if rec_perm not in combinations(list_already_seen, 3):

            recommender_names = '_'.join(
                [r.RECOMMENDER_NAME for r in rec_perm])
            output_folder_path = "result_experiments_v3/seed_" + str(
                seed) + '_3--1' + '/' + recommender_names + '/'

            # If directory does not exist, create
            if not os.path.exists(output_folder_path):
                os.makedirs(output_folder_path)

            # TODO: set the RIGHT EVALUATORS here!!!!
            runParameterSearch_Collaborative_partial = partial(
                runParameterSearch_Collaborative,
                URM_train=URM_train,
                ICM_train=ICM_obj,
                metric_to_optimize="MAP",
                n_cases=50,
                n_random_starts=20,
                evaluator_validation_earlystopping=evaluator_valid_hybrid,
                evaluator_validation=evaluator_valid_hybrid,
                evaluator_test=evaluator_test,
                output_folder_path=output_folder_path,
                allow_weighting=False,
                # similarity_type_list = ["cosine", 'jaccard'],
                parallelizeKNN=False,
                list_rec=rec_perm)
            pool = multiprocessing.Pool(processes=int(
                multiprocessing.cpu_count()),
                                        maxtasksperchild=1)
            pool.map(runParameterSearch_Collaborative_partial,
                     collaborative_algorithm_list)
def ComputeFscore(modelfile, testfile, outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    lrate = float(sys.argv[10])

    # maxParagraphLength = 20
    # maxParagraphs = 5
    # filterSizes = [int(i) for i in "1-2".split("-")]
    # num_filters = 16
    # wordEmbeddingDimension = 30
    # lrate = 0.001

    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)

    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    labelsCount = {}
    ConfusionMa = {}
    fScr = {}

    thres = 0.5
    valid = int(
        len(truePre) * 0.5
    )  #using first 50% data for threshold tuning - we have merged test and cv files
    labelsCount = {}
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
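    # For each label: tune the decision threshold on the first half of the data, then
    # compute F1 and the confusion matrix on the second half using that threshold.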
    for la in range(labels):
        if la % 25 == 0:
            print("Current label", la)
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)

        t = []
        p = []
        for i in range(valid, len(truePre)):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])

        p = np.array(p)
        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre

    f = open(outputfile, "a")
    output = sys.argv[9]

    sum_fscore = 0.0
    for i in range(labels):
        sum_fscore = sum_fscore + fScr[i]
        output = output + "," + str(fScr[i])
    output += "," + str(sum_fscore / float(labels - 1))
    print("Fscore at " + sys.argv[7] + " epochs: " +
          str(sum_fscore / float(labels - 1)))
    # print("Fscore at 400 epochs: " + str(sum_fscore / float(labels - 1)) )
    f.write(output + "\n")
    f.close()
Example no. 6
 def write_out_data(df, i):
     print "Writing out data"
     DataParser()._write_HDFStore_Combined_FE2(df, i)
Example no. 7
import sys

paragraphLength = int(sys.argv[1])
maxParagraphs = int(sys.argv[2])
filterSizes = [int(i) for i in sys.argv[3].split("-")]
print(filterSizes)
num_filters = int(sys.argv[4])
wordEmbeddingDimension = int(sys.argv[5])
batchSize = int(sys.argv[6])
epochEnd = int(sys.argv[7])
folder_name = sys.argv[8]
lrate = float(sys.argv[9])
nlabels = 10
vocabularySize = 101940

training = DataParser(maxParagraphs, paragraphLength, nlabels, vocabularySize)
training.getDataFromfile(
    "../wiki10_miml_dataset/preprocessed_data/toplabels_split/wiki10-top10labels_train.txt"
)
model = Model(maxParagraphs, paragraphLength, nlabels, vocabularySize,
              filterSizes, num_filters, wordEmbeddingDimension, lrate)

costfile = open("results/costfile.txt", "a")
output = folder_name

epoch = 0
# epochEnd=400
costepochs = []

for e in range(epoch, epochEnd):
Example no. 8
                predictions=predictions)

        if mode == tf.estimator.ModeKeys.EVAL:
            labels = tf.reshape(labels, shape=[-1, 1])  # sample labels
            eval_metric_ops = {"auc": tf.metrics.auc(labels, tf.sigmoid(logits))}
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                eval_metric_ops=eval_metric_ops)


if __name__ == '__main__':
    from DataParser import DataParser

    dataParser = DataParser(track_name='track2', data_dir=None)
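    # Placeholder batch (batch size 32) mimicking the id/weight feature columns the
    # model_fn expects, presumably for a quick standalone check of the graph.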
    features = {'feature_ids': tf.placeholder(dtype=tf.int32, shape=[32, 51]),
                'feature_weights': tf.placeholder(dtype=tf.float32, shape=[32, 51]),
                'video_weights': tf.placeholder(dtype=tf.float32, shape=[32, 128]),
                'audio_weights': tf.placeholder(dtype=tf.float32, shape=[32, 128]),
                'word_ids': tf.placeholder(dtype=tf.int32, shape=[32, 35]),
                'word_weights': tf.placeholder(dtype=tf.float32, shape=[32, 35]),
                'item_ids': tf.placeholder(dtype=tf.int32, shape=[32, 400]),
                'item_weights': tf.placeholder(dtype=tf.float32, shape=[32, 400]),
                'author_ids': tf.placeholder(dtype=tf.int32, shape=[32, 400]),
                'author_weights': tf.placeholder(dtype=tf.float32, shape=[32, 400]),
                'music_ids': tf.placeholder(dtype=tf.int32, shape=[32, 400]),
                'music_weights': tf.placeholder(dtype=tf.float32, shape=[32, 400]),
                'item_city_ids': tf.placeholder(dtype=tf.int32, shape=[32, 400]),
                'item_city_weights': tf.placeholder(dtype=tf.float32, shape=[32, 400]),
                'item_uid_ids': tf.placeholder(dtype=tf.int32, shape=[32, 150]),
Example no. 9
def ComputeFscore(modelfile, testfile, outputfile):

    CURRENT_DIR = os.path.dirname(os.path.abspath("./WikiCategoryLabelling/"))
    sys.path.append(os.path.dirname(CURRENT_DIR + "/WikiCategoryLabelling/"))

    maxParagraphLength = 250
    maxParagraphs = 10
    labels = 1000
    vocabularySize = 150000
    model = Model(maxParagraphLength, maxParagraphs, labels, vocabularySize)

    testing = DataParser(maxParagraphLength, maxParagraphs, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)

    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch()
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    labelsCount = {}
    ConfusionMa = {}
    fScr = {}

    thres = 0.5
    valid = int(len(truePre) * 0.35)
    labelsCount = {}
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
    for la in range(1000):
        if la % 25 == 0:
            print("Currnet label", la)
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)

        t = []
        p = []
        for i in range(valid, len(truePre)):
            t.append(truePre[i][la])
            p.append(pred[i][la])

        p = np.array(p)
        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre

    f = open(outputfile, "w")
    for i in range(1000):
        inp = str(i) + "," + str(thresLab[i]) + "," + str(fScr[i]) + "\n"
        f.write(inp)
    f.close()
 def Dataset_count(self):
     """str: Properties should be documented in their getter method."""
     return DataParser().number_of_datasets
 def process_all_data_HDF_Combo_reduced(self):
     for i in range(0, self.Dataset_count):
         print("Starting : " + str(i) + "/" + str(self.Dataset_count))
         df = self.Dataset_i_Reduced(i)
         DataParser()._write_HDFStore_Combined(df, i)
 def Dataset_Origin(self):
     dataset = DataParser()._read_HDFStore_Origination_Filtered(-1)
     assert isinstance(dataset, pd.DataFrame)
     return dataset
 def Dataset_Combo_i(self, i):
     dataset = DataParser().AmericanCombo_i_FE2(i)
     dataset = FeatureExtractionSecond().filter_main(dataset)
     assert isinstance(dataset, pd.DataFrame)
     return dataset
def read_data_split_and_search():
    """
    This function provides a simple example of how to tune the parameters of a given algorithm.

    The BayesianSearch object will save:
        - A .txt file with all the cases explored and the recommendation quality
        - A _best_model file which contains the trained model and can be loaded with recommender.load_model()
        - A _best_parameter file which contains a dictionary with all the fit parameters; it can be passed to recommender.fit(**_best_parameter)
        - A _best_result_validation file which contains a dictionary with the results of the best solution on the validation set
        - A _best_result_test file which contains a dictionary with the results, on the test set, of the best solution chosen using the validation set
    """

    parser = DataParser()
    seed = 1666
    URM_all = parser.get_URM_all()
    ICM_obj = parser.get_ICM_all()

    #URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.85, seed=seed)
    #URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage=0.85, seed=seed)
    k = 5

    output_folder_path = "result_experiments_CV/"

    collaborative_algorithm_list = [
        HybridCombinationSearchCV
        #HybridSuperLinear
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout

    icb = (ItemKNNCBFRecommender), {
        'topK': 164,
        'shrink': 8,
        'similarity': 'jaccard',
        'normalize': True
    }
    icbsup = (SpecialItemKNNCBFRecommender), {
        'topK': 1000,
        'shrink': 1000,
        'similarity': 'cosine',
        'normalize': True,
        'feature_weighting': 'BM25'
    }
    icbcf = (ItemKNN_CBF_CF), {
        'topK': 1000,
        'shrink': 1000,
        'similarity': 'asymmetric',
        'normalize': True,
        'asymmetric_alpha': 0.241892724784089,
        'feature_weighting': 'TF-IDF',
        'icm_weight': 1.0
    }
    icf = (ItemKNNCFRecommender), {
        'topK': 1000,
        'shrink': 1000,
        'similarity': 'cosine',
        'normalize': True,
        'feature_weighting': 'TF-IDF'
    }
    ucf = (UserKNNCFRecommender), {
        'topK': 163,
        'shrink': 846,
        'similarity': 'cosine',
        'normalize': True,
        'feature_weighting': 'TF-IDF'
    }
    rp3b = (RP3betaRecommender), {
        'topK': 926,
        'alpha': 0.4300109351916609,
        'beta': 0.01807360750913967,
        'normalize_similarity': False
    }
    p3a = (P3alphaRecommender), {
        'topK': 575,
        'alpha': 0.48009885897470206,
        'normalize_similarity': False
    }
    sbpr = (SLIM_BPR_Cython, {
        'topK': 1000,
        'epochs': 130,
        'symmetric': False,
        'sgd_mode': 'adam',
        'lambda_i': 1e-05,
        'lambda_j': 1e-05,
        'learning_rate': 0.0001
    })
    sslim = (SSLIMElasticNet, {
        'beta': 0.567288665094892,
        'topK': 1000,
        'l1_ratio': 1e-05,
        'alpha': 0.001
    })

    combo_algorithm_list = [
        icb, icbsup, icbcf, icf, ucf, p3a, rp3b, sbpr, sslim
    ]
    list_already_seen = []
    combinations_already_seen = list(combinations(list_already_seen, 3))
    """
    (icb, icf, p3a), (icb, icf, rp3b), (icb, icf, sen), (icb, p3a, rp3b), (icb, p3a, sen),
                                (icb, rp3b, sen), (icf, p3a, rp3b), (icf, p3a, sen)
    """
    combination_to_be_done = list(combinations(combo_algorithm_list, 3))

    for rec_perm in combination_to_be_done:

        if rec_perm not in combinations_already_seen:
            recommender_names = '_'.join(
                [r[0].RECOMMENDER_NAME for r in rec_perm])
            output_folder_path = "result_experiments_CV2/seed_" + str(
                seed) + '/linear/' + recommender_names + '/'
            print(F"\nTESTING THE COMBO {recommender_names}")

            if ((icb in rec_perm) or
                (icbsup in rec_perm)) and not ((icb in rec_perm) and
                                               (icbsup in rec_perm)):
                # If directory does not exist, create
                if not os.path.exists(output_folder_path):
                    os.makedirs(output_folder_path)

                    runParameterSearch_Collaborative_partial = partial(
                        runParameterSearch_Collaborative,
                        URM_train=URM_all,
                        ICM_train=ICM_obj,
                        metric_to_optimize="MAP",
                        n_cases=50,
                        n_random_starts=20,
                        output_folder_path=output_folder_path,
                        parallelizeKNN=False,
                        allow_weighting=False,
                        k=k,
                        seed=seed,
                        list_rec=rec_perm,
                        level='hybrid_search')
                    pool = multiprocessing.Pool(processes=int(
                        multiprocessing.cpu_count()),
                                                maxtasksperchild=1)
                    pool.map(runParameterSearch_Collaborative_partial,
                             collaborative_algorithm_list)
Example no. 15
    def get_info_BOTH(self):
        def get_data_i_FE2(i):
            print "Read" + str(i)
            return DataParser().AmericanCombo_i_FE2(i)

        def write_out_data(df, i):
            print "Writing out data"
            DataParser()._write_HDFStore_Combined_FE2(df, i)

        data_count = DataParser().number_of_datasets
        i = 0

        df_ALL = pd.DataFrame()
        df_ALL2 = pd.DataFrame()

        LOAN_COUNT = 0
        FICO_MEAN = 0
        FICO_MEDIAN = []
        BALANCE_MEAN = 0

        while i < data_count:

            df_X = get_data_i_FE2(i)
            df_IN = df_X.copy()

            df_X = df_X.sort_values("id_loan")
            df_X.reset_index(drop=True, inplace=True)
            n = 20000
            if len(df_X) < n:
                n = len(df_X)
            df_ALL = pd.concat([df_X[:n], df_ALL], axis=0)

            # Find all loans that have instance of 90-dd
            df = df_IN.loc[df_IN['status_month_0'] == 3]
            loan_ids_with_90_dd = df['id_loan'].unique()
            mask = df_IN['id_loan'].isin(loan_ids_with_90_dd)
            df_with_90_dd = df_IN.loc[mask]

            # Remove all updates that occurred before the loan FIRST became 90-dd, i.e. before it became non-performing
            df_with_90_dd = df_with_90_dd.sort_values(
                ['id_loan', 'svcg_cycle'], ascending=[True, True])
            df_with_90_dd['90_dd'] = 0
            df_with_90_dd.loc[df_with_90_dd['status_month_0'] == 3,
                              '90_dd'] = 1
            df_with_90_dd['90_dd'] = df_with_90_dd.groupby(
                ['id_loan'])['90_dd'].apply(lambda x: x.cumsum())
            df_with_90_dd = df_with_90_dd.loc[df_with_90_dd['90_dd'] != 0]
            df_with_90_dd = df_with_90_dd.drop('90_dd', axis=1)

            # print df_with_90_dd[['id_loan', 'svcg_cycle','st', 'status_month_0', 'occr_30dd', ]]

            df_np = df_with_90_dd

            df_np = df_np.sort_values("id_loan")
            df_np.reset_index(drop=True, inplace=True)
            # n = 10000
            # if len(df_X) < n:
            #     n = len(df_X)
            # df_ALL = pd.concat([df_X[:n], df_ALL], axis=0)

            # m = 20000
            # if len(df_np) < m:
            #     m = len(df_np)
            df_ALL2 = pd.concat([df_np, df_ALL2], axis=0)

            i += 1

        write_out_data(df_ALL, -1)
        write_out_data(df_ALL2, -3)
Example no. 16
def ComputePrecisionK(modelfile,testfile,outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    lrate = float(sys.argv[10])
    poolLength = int(sys.argv[11])
    keep_prob = float(sys.argv[12])

    keep_prob = 1.0
    
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs,maxParagraphLength,labels,vocabularySize,filterSizes\
                        ,num_filters,wordEmbeddingDimension,lrate, poolLength, keep_prob)

    testing = DataParser(maxParagraphs,maxParagraphLength,labels,vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)

    print("loading done")
    print("no of test examples: " + str(testing.totalPages))

    print("Computing Prec@k")
    
    # check whether batchSize should be passed in as a parameter

    batchSize = 1
    testing.restore()
    truePre=[]
    pred=[]
    for itr in range(testing.totalPages):
        data=testing.nextBatch(1)
        truePre.append(data[0])
        pre=model.predict(data)
        pred.append(pre[0])

    K_list = [1,3,5]     #prec@1 .....prec@NoofLabels
    precAtK = [0.0]*6	

    # # Prec would only be computed on the last 50% of the test data, since the first 50% is for cross validation
    # valid=int(len(truePre)*0.5)
    # pred = pred[valid:]
    # truePre = truePre[valid:]

    for i,v in enumerate(pred):
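        # Rank the labels by predicted probability and count how many of the top-k
        # labels are true; the running counts are normalised to Prec@k after the loop.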
        temp = [(labId,labProb) for labId,labProb in enumerate(v) ]
        temp = sorted(temp,key=lambda x:x[1],reverse=True)  #sorting based on label probability to get top k
        for ele in K_list:        #1....No of Labels
            pBag = 0              #no of true positive for this instance 
            for itr in range(ele): #top k ie top ele
                if truePre[i][0][temp[itr][0]] == 1:
                    precAtK[ele] += 1
                    # pBag += 1
            # precAtK[ele] += float(pBag)/float(ele)

    f = open(outputfile,"a")
    output = sys.argv[9]

    for k in K_list:
        precAtK[k] /= (k * len(pred))
        print("Prec@" + str(k) + " = " + str(precAtK[k]))
        output = output + "," + "Prec@" + str(k) + "=," + str(precAtK[k])

    f.write(output + "\n")
    f.close()
Example no. 17
 def get_data_i_FE2(i):
     print "Read" + str(i)
     return DataParser().AmericanCombo_i_FE2(i)
Example no. 18
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from DataParser import DataParser
from CsvParser import CsvParser

data_dir = os.path.join(os.getcwd(), '../notebooks/datasets')
signal_file = 'ap_100MeV_L1L1_tight_08mm.csv'
background_file = 'tritrig-wab-beam_100MeV_L1L1_tight.csv'

background = CsvParser(os.path.join(data_dir, background_file))
signal = CsvParser(os.path.join(data_dir, signal_file))

myData = DataParser(signal=signal, background=background)

X_train, Y_train, X_test, Y_test, classes = myData.load_dataset()

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)

X_train = X_train.T
Y_train = Y_train.T

clf.fit(X_train, Y_train)

importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
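
# A possible follow-up (not part of the original snippet): print the features ranked by
# importance. Feature names are not exposed by load_dataset() here, so positional
# indices are used.
print("Feature ranking:")
for rank, idx in enumerate(indices):
    print("%d. feature %d (%.4f +/- %.4f)" % (rank + 1, idx, importances[idx], std[idx]))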
def ComputePrecisionK(modelfile,testfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    filterSizes = [2,3]
    num_filters = 16
    wordEmbeddingDimension = 50
    lrate = float(0.001)
    poolLength = 5
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs,maxParagraphLength,labels,vocabularySize,\
                    filterSizes,num_filters,wordEmbeddingDimension,lrate,poolLength)

    testing = DataParser(maxParagraphs,maxParagraphLength,labels,vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)

    print("loading done")
    print("no of test examples: " + str(testing.totalPages))

    batchSize = 1
    testing.restore()
    truePre=[]
    pred=[]
    for itr in range(testing.totalPages):
        data=testing.nextBatch(1)
        truePre.append(data[0])
        pre=model.predict(data)
        pred.append(pre[0])

    labelids = open("../../dataset/sorted_labelid.txt","r").read().strip().split("\n")
    labelids = [ int(x) for x in labelids ]

    no_of_partition = 10
    partition_size = labels // no_of_partition
    prec1 = [0]*no_of_partition
    prec3 = [0]*no_of_partition
    prec5 = [0]*no_of_partition
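    # Labels are bucketed into 10 equal-size partitions by their position in
    # sorted_labelid.txt (presumably frequency-sorted); Prec@1/3/5 counts are
    # accumulated per partition below.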

    for i,v in enumerate(pred):
        temp = [(labId,labProb) for labId,labProb in enumerate(v) ]
        temp = sorted(temp,key=lambda x:x[1],reverse=True)  #sorting based on label probability to get top k
        #finding how many of these were true

        if truePre[i][0][temp[0][0]] == 1:
            prec1[ labelids.index( temp[0][0] ) // partition_size ] += 1
            prec3[ labelids.index( temp[0][0] ) // partition_size ] += 1
            prec5[ labelids.index( temp[0][0] ) // partition_size ] += 1

        if truePre[i][0][temp[1][0]] == 1:
            prec3[ labelids.index( temp[1][0] ) // partition_size ] += 1
            prec5[ labelids.index( temp[1][0] ) // partition_size ] += 1

        if truePre[i][0][temp[2][0]] == 1:
            prec3[ labelids.index( temp[2][0] ) // partition_size ] += 1
            prec5[ labelids.index( temp[2][0] ) // partition_size ] += 1

        if truePre[i][0][temp[3][0]] == 1:
            prec5[ labelids.index( temp[3][0] ) // partition_size ] += 1

        if truePre[i][0][temp[4][0]] == 1:
            prec5[ labelids.index( temp[4][0] ) // partition_size ] += 1

    print( prec1 )
    print( prec3 ) 
    print( prec5 )

    prec1 = [ ( float(x) /testing.totalPages )*100 for x in prec1  ]
    prec3 = [ ( float(x) /( 3 * testing.totalPages) )*100 for x in prec3  ]
    prec5 = [ ( float(x) /( 5 * testing.totalPages) )*100 for x in prec5  ]

    
    print( prec1 )
    print( prec3 ) 
    print( prec5 )
Example no. 20
import numpy as np
import os
import scipy.sparse as sps
from DataParser import DataParser
from Data_manager.split_functions.split_train_validation_random_holdout import \
    split_train_in_two_percentage_global_sample

from SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender

if __name__ == '__main__':
    parser = DataParser()
    URM_all = parser.get_URM_all()
    random_seed = 1205
    URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.85, seed=random_seed)
    slim = SLIMElasticNetRecommender(URM_train)
    slim.fit(topK=140, l1_ratio=1e-5, alpha=0.386)
    slim.save_model('stored_recommenders/slim_elastic_net/',
                    f'best_{random_seed}_23_10_20')
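    # The stored model could later be reloaded without re-training, e.g. (sketch,
    # mirroring the load_model usage shown in the other examples on this page):
    #   slim = SLIMElasticNetRecommender(URM_train)
    #   slim.load_model('stored_recommenders/slim_elastic_net/', f'best_{random_seed}_23_10_20')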
def ComputeFscore(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    #nlabels=1001
    #vocabularySize=76391
    labels = 8
    vocabularySize = 244
    model = Model(maxParagraphLength, maxParagraphs, labels, vocabularySize)

    testing = DataParser(maxParagraphLength, maxParagraphs, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)

    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    labelsCount = {}
    ConfusionMa = {}
    fScr = {}

    thres = 0.5
    valid = int(
        len(truePre) * 0.5
    )  #using first 50% data for threshold tuning - we have merged test and cv files
    labelsCount = {}
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
    for la in range(labels):
        if la % 25 == 0:
            print("Current label", la)
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)

        t = []
        p = []
        for i in range(valid, len(truePre)):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])

        p = np.array(p)
        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre

    f = open(outputfile, "w")
    sum_fscore = 0.0
    for i in range(labels):

        sum_fscore = sum_fscore + fScr[i]
        inp = str(i) + "," + str(thresLab[i]) + "," + str(fScr[i]) + "\n"
        f.write(inp)
    f.write(str(sum_fscore / float(labels - 1)))

    print(sum_fscore)
    print(sum_fscore / float((labels - 1)))
    f.close()
    return (sum_fscore / float((labels - 1)))
def read_data_split_and_search():
    """
    This function provides a simple example of how to tune the parameters of a given algorithm.

    The BayesianSearch object will save:
        - A .txt file with all the cases explored and the recommendation quality
        - A _best_model file which contains the trained model and can be loaded with recommender.load_model()
        - A _best_parameter file which contains a dictionary with all the fit parameters; it can be passed to recommender.fit(**_best_parameter)
        - A _best_result_validation file which contains a dictionary with the results of the best solution on the validation set
        - A _best_result_test file which contains a dictionary with the results, on the test set, of the best solution chosen using the validation set
    """

    seed = 1205
    parser = DataParser()

    URM_all = parser.get_URM_all()
    ICM_obj = parser.get_ICM_all()

    # SPLIT TO GET TEST PARTITION
    URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.90, seed=seed)

    # SPLIT TO GET THE HYBRID VALID PARTITION
    URM_train, URM_valid_hybrid = split_train_in_two_percentage_global_sample(URM_train, train_percentage=0.85,
                                                                              seed=seed)

    collaborative_algorithm_list = [
        # EASE_R_Recommender
        # PipeHybrid001,
        # Random,
        # TopPop,
        # P3alphaRecommender,
        # RP3betaRecommender,
        # ItemKNNCFRecommender,
        # UserKNNCFRecommender,
        # MatrixFactorization_BPR_Cython,
        # MatrixFactorization_FunkSVD_Cython,
        # PureSVDRecommender,
        # NMFRecommender,
        # PureSVDItemRecommender
        # SLIM_BPR_Cython,
        # SLIMElasticNetRecommender
        # IALSRecommender
        # MF_MSE_PyTorch
        # MergedHybrid000
        # LinearHybrid002ggg
        HybridCombinationSearch
    ]

    content_algorithm_list = [
        # ItemKNNCBFRecommender
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout

    evaluator_valid_hybrid = EvaluatorHoldout(URM_valid_hybrid, cutoff_list=[10])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

    """
    earlystopping_keywargs = {"validation_every_n": 5,
                              "stop_on_validation": True,
                              "evaluator_object": evaluator_valid_hybrid,
                              "lower_validations_allowed": 5,
                              "validation_metric": 'MAP',
                              }

    print('IALS training...')
    ials = IALSRecommender(URM_train, verbose=False)
    ials_params = {'num_factors': 83, 'confidence_scaling': 'linear', 'alpha': 28.4278070726612,
                   'epsilon': 1.0234211788885077, 'reg': 0.0027328110246575004, 'epochs': 20}
    ials.fit(**ials_params, **earlystopping_keywargs)
    print("Done")


    print("PureSVD training...")
    psvd = PureSVDRecommender(URM_train, verbose=False)
    psvd_params = {'num_factors': 711}
    psvd.fit(**psvd_params)
    print("Done")
    """

    rp3b = RP3betaRecommender(URM_train, verbose=False)
    rp3b_params = {'topK': 1000, 'alpha': 0.38192761611274967, 'beta': 0.0, 'normalize_similarity': False}
    try:
        rp3b.load_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                        f'{rp3b.RECOMMENDER_NAME}_for_second_search')
        print(f"{rp3b.RECOMMENDER_NAME} loaded.")
    except:
        print(f"Fitting {rp3b.RECOMMENDER_NAME} ...")
        rp3b.fit(**rp3b_params)
        print(f"done.")
        rp3b.save_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                        f'{rp3b.RECOMMENDER_NAME}_for_second_search')

    p3a = P3alphaRecommender(URM_train, verbose=False)
    p3a_params = {'topK': 131, 'alpha': 0.33660811631883863, 'normalize_similarity': False}
    try:
        p3a.load_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{p3a.RECOMMENDER_NAME}_for_second_search')
        print(f"{p3a.RECOMMENDER_NAME} loaded.")
    except:
        print(f"Fitting {p3a.RECOMMENDER_NAME} ...")
        p3a.fit(**p3a_params)
        print(f"done.")
        p3a.save_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{p3a.RECOMMENDER_NAME}_for_second_search')

    icf = ItemKNNCFRecommender(URM_train, verbose=False)
    icf_params = {'topK': 55, 'shrink': 1000, 'similarity': 'asymmetric', 'normalize': True, 'asymmetric_alpha': 0.0}
    try:
        icf.load_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{icf.RECOMMENDER_NAME}_for_second_search')
        print(f"{icf.RECOMMENDER_NAME} loaded.")
    except:
        print(f"Fitting {icf.RECOMMENDER_NAME} ...")
        icf.fit(**icf_params)
        print(f"done.")
        icf.save_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{icf.RECOMMENDER_NAME}_for_second_search')

    ucf = UserKNNCFRecommender(URM_train, verbose=False)
    ucf_params = {'topK': 190, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
    try:
        ucf.load_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{ucf.RECOMMENDER_NAME}_for_second_search')
        print(f"{ucf.RECOMMENDER_NAME} loaded.")
    except:
        print(f"Fitting {ucf.RECOMMENDER_NAME} ...")
        ucf.fit(**ucf_params)
        print(f"done.")
        ucf.save_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{ucf.RECOMMENDER_NAME}_for_second_search')

    icb = ItemKNNCBFRecommender(URM_train, ICM_obj, verbose=False)
    icb_params = {'topK': 65, 'shrink': 0, 'similarity': 'dice', 'normalize': True}
    try:
        icb.load_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{icb.RECOMMENDER_NAME}_for_second_search')
        print(f"{icb.RECOMMENDER_NAME} loaded.")
    except:
        print(f"Fitting {icf.RECOMMENDER_NAME} ...")
        icb.fit(**icb_params)
        print(f"done.")
        icb.save_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{icb.RECOMMENDER_NAME}_for_second_search')

    sen = SLIMElasticNetRecommender(URM_train, verbose=False)
    sen_params = {'topK': 992, 'l1_ratio': 0.004065081925341167, 'alpha': 0.003725005053334143}
    try:
        sen.load_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{sen.RECOMMENDER_NAME}_for_second_search')
        print(f"{sen.RECOMMENDER_NAME} loaded.")
    except:
        print(f"Fitting {sen.RECOMMENDER_NAME} ...")
        sen.fit(**sen_params)
        print(f"done.")
        sen.save_model(f'stored_recommenders/seed_{str(seed)}_hybrid_search/',
                       f'{sen.RECOMMENDER_NAME}_for_second_search')

    print("\nStart.")
    list_recommender = [icb, icf, ucf, p3a, rp3b, sen]
    list_already_seen = []
    combinations_already_seen = []
    """
    (icb, icf, p3a), (icb, icf, rp3b), (icb, icf, sen), (icb, p3a, rp3b), (icb, p3a, sen),
                                (icb, rp3b, sen), (icf, p3a, rp3b), (icf, p3a, sen)
    """

    for rec_perm in combinations(list_recommender, 3):

        if rec_perm not in combinations_already_seen:

            recommender_names = '_'.join([r.RECOMMENDER_NAME for r in rec_perm])
            output_folder_path = "result_experiments_v3/seed_" + str(
                seed) + '/linear_combination/' + recommender_names + '/'
            print(F"\nTESTING THE COMBO {recommender_names}")

            # If directory does not exist, create
            if not os.path.exists(output_folder_path):
                os.makedirs(output_folder_path)

            # TODO: set the RIGHT EVALUATORS here!!!!
            runParameterSearch_Collaborative_partial = partial(runParameterSearch_Collaborative,
                                                               URM_train=URM_train,
                                                               ICM_train=ICM_obj,
                                                               metric_to_optimize="MAP",
                                                               n_cases=50,
                                                               n_random_starts=20,
                                                               evaluator_validation_earlystopping=evaluator_valid_hybrid,
                                                               evaluator_validation=evaluator_valid_hybrid,
                                                               #evaluator_test=evaluator_test,
                                                               output_folder_path=output_folder_path,
                                                               allow_weighting=False,
                                                               # similarity_type_list = ["cosine", 'jaccard'],
                                                               parallelizeKNN=False,
                                                               list_rec=rec_perm)
            pool = multiprocessing.Pool(processes=int(multiprocessing.cpu_count()), maxtasksperchild=1)
            pool.map(runParameterSearch_Collaborative_partial, collaborative_algorithm_list)
def read_data_split_and_search():
    """
    This function provides a simple example of how to tune the parameters of a given algorithm.

    The BayesianSearch object will save:
        - A .txt file with all the cases explored and the recommendation quality
        - A _best_model file which contains the trained model and can be loaded with recommender.load_model()
        - A _best_parameter file which contains a dictionary with all the fit parameters; it can be passed to recommender.fit(**_best_parameter)
        - A _best_result_validation file which contains a dictionary with the results of the best solution on the validation set
        - A _best_result_test file which contains a dictionary with the results, on the test set, of the best solution chosen using the validation set
    """

    parser = DataParser()
    seed = 1666
    URM_all = parser.get_URM_all()
    ICM_obj = parser.get_ICM_all()

    URM_train, URM_test = split_train_in_two_percentage_global_sample(
        URM_all, train_percentage=0.85, seed=seed)
    URM_train, URM_validation = split_train_in_two_percentage_global_sample(
        URM_train, train_percentage=0.85, seed=seed)

    k = 5

    output_folder_path = "result_experiments_CV/"

    # If directory does not exist, create
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    collaborative = True

    content_algorithm_list = [
        #ItemKNNCBFRecommender
    ]

    collaborative_algorithm_list = [
        #Random,
        #TopPop,
        #P3alphaRecommender,
        #RP3betaRecommender,
        ItemKNNCFRecommender,
        #UserKNNCFRecommender,
        #MatrixFactorization_BPR_Cython,
        #MatrixFactorization_FunkSVD_Cython,
        #PureSVDRecommender,
        #SLIM_BPR_Cython,
        #SLIMElasticNetRecommender,
        #IALSRecommender,
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout

    evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[5])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

    if not collaborative:
        runParameterSearch_Content_partial = partial(
            runParameterSearch_Content,
            URM_train=URM_train,
            ICM_object=ICM_obj,
            ICM_name='1BookFeatures',
            n_cases=50,
            n_random_starts=20,
            metric_to_optimize="MAP",
            output_folder_path=output_folder_path,
            parallelizeKNN=False,
            allow_weighting=True,
            #similarity_type_list = ['cosine']
            k=k,
            seed=seed)

        pool = multiprocessing.Pool(processes=int(multiprocessing.cpu_count()),
                                    maxtasksperchild=1)
        pool.map(runParameterSearch_Content_partial, content_algorithm_list)

    else:
        runParameterSearch_Collaborative_partial = partial(
            runParameterSearch_Collaborative,
            URM_train=URM_train,
            metric_to_optimize="MAP",
            n_cases=50,
            n_random_starts=20,
            #evaluator_test = evaluator_test,
            output_folder_path=output_folder_path,
            similarity_type_list=["cosine"],
            parallelizeKNN=False,
            allow_weighting=False,
            k=k,
            seed=seed)

        pool = multiprocessing.Pool(processes=int(multiprocessing.cpu_count()),
                                    maxtasksperchild=1)
        pool.map(runParameterSearch_Collaborative_partial,
                 collaborative_algorithm_list)
Example no. 24
    def _combine_sums(self, ST_OR_ZIP, cols_to_sum):

        # def get_data_i_FE(i):
        #     return DataParser().AmericanCombo_i_FE(i)

        def get_data_i_FE2(i):
            return DataParser().AmericanCombo_i_FE2(i)

        def write_out_data(df, i):
            print "Writing out data"
            DataParser()._write_HDFStore_Combined_FE2(df, i)

        cols_sum = [col + '_sum' for col in cols_to_sum]

        df_store = pd.DataFrame(columns=['svcg_cycle', ST_OR_ZIP] + cols_sum)

        data_count = DataParser().number_of_datasets
        # data_count = 3  # STOP FOR TESTING PURPOSES!!!!!

        # i = 0
        #
        # while i < data_count:
        #
        #     print "A" + str(i) + " " + ST_OR_ZIP
        #
        #     # Get fetch current data
        #     df_active = get_data_i_FE2(i)
        #     df_active.reset_index(drop=True, inplace=True)
        #
        #     for COL_TO_SUM in cols_to_sum:
        #         COL_SUM = COL_TO_SUM + '_sum'
        #
        #         df_current = df_active[['svcg_cycle', ST_OR_ZIP, COL_TO_SUM]]
        #
        #         # Sum duplicates caused by fragmentation
        #         df_current.loc[:, COL_TO_SUM] = df_current.groupby(['svcg_cycle', ST_OR_ZIP])[COL_TO_SUM].apply(
        #             lambda x: x.cumsum() + sum(np.unique(x)) - x.cumsum())
        #
        #         # Remove duplicate entries
        #         df_current = df_current.drop_duplicates(subset=['svcg_cycle', ST_OR_ZIP], keep='last')
        #         # Merge outer
        #         df_store = pd.merge(df_store, df_current, on=['svcg_cycle', ST_OR_ZIP], how='outer')
        #         # set nan to 0's
        #         df_store.loc[pd.isnull(df_store[COL_SUM]), COL_SUM] = 0
        #         df_store.loc[pd.isnull(df_store[COL_TO_SUM]), COL_TO_SUM] = 0
        #
        #         # Add number of occurences in current to overall total
        #         df_store.loc[:, COL_SUM] = df_store[COL_SUM] + df_store[COL_TO_SUM]
        #         # drop column
        #         df_store = df_store.drop(COL_TO_SUM, 1)
        #         # Sum duplicate entries
        #         df_store.loc[:, COL_SUM] = df_store.groupby(['svcg_cycle', ST_OR_ZIP])[COL_SUM].apply(
        #             lambda x: x.cumsum() + sum(x) - x.cumsum())
        #
        #         # Remove duplicate entries
        #         df_store = df_store.drop_duplicates(subset=['svcg_cycle', ST_OR_ZIP], keep='last')
        #     i += 1
        #
        # # Get cummax of occr columns
        # if ST_OR_ZIP == 'st':
        #     total_cols_st = ['new_loans_per_state', 'occr_default_per_state', 'occr_paid_off_per_state']
        #     df_store = df_store.sort_values(['st', 'svcg_cycle'], ascending=[True, True])
        #     for col in total_cols_st:
        #         col_sum = col + '_sum'
        #         if col_sum in df_store.columns.values:
        #             df_store[col_sum] = df_store.groupby(['st'])[col_sum].transform(lambda v: v.cummax())
        # else:
        #     total_cols_zip = ['new_loans_per_zipcode', 'occr_default_per_zipcode', 'occr_paid_off_per_zipcode']
        #     df_store = df_store.sort_values(['zipcode', 'svcg_cycle'], ascending=[True, True])
        #     for col in total_cols_zip:
        #         col_sum = col + '_sum'
        #         if col_sum in df_store.columns.values:
        #             df_store[col_sum] = df_store.groupby(['zipcode'])[col_sum].transform(lambda v: v.cummax())
        #
        # i = 0
        #
        # while i < data_count:
        #
        #     print "B" + str(i) + " " + ST_OR_ZIP
        #
        #     # Get fetch current data
        #     df_current = get_data_i_FE2(i)
        #
        #     for COL_TO_SUM in cols_to_sum:
        #         COL_SUM = COL_TO_SUM + '_sum'
        #         df_store_tmp = df_store[['svcg_cycle', ST_OR_ZIP, COL_SUM]]
        #         df_current = pd.merge(df_current, df_store_tmp, on=['svcg_cycle', ST_OR_ZIP], how='left')
        #         df_current.loc[:, COL_TO_SUM] = df_current[COL_SUM]
        #         df_current.loc[:, COL_TO_SUM] = df_current[COL_TO_SUM].astype(int)
        #         df_current = df_current.drop(COL_SUM, 1)
        #     # WRITE OUT DATA TO FILE
        #     write_out_data(df_current, i)
        #
        #     i += 1

        i = 26
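        # NOTE: processing resumes at dataset index 26; the earlier indices appear to
        # have been handled already by the (now commented-out) passes above.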

        while i < data_count:

            print "C" + str(i) + " " + ST_OR_ZIP

            # Get fetch current data
            df_current = get_data_i_FE2(i)

            df_current['ones'] = 1
            df_current['small'] = 0.0001
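            # The +0.0001 / +1 terms below smooth the ratios and avoid division by zero
            # for areas with no recorded loans.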

            # default rate by zipcode
            df_current.loc[:, 'rt_default_per_zipcode'] = (
                df_current['occr_default_per_zipcode'] + df_current['small']
            ) / (df_current['new_loans_per_zipcode'] + df_current['ones'])
            df_current.loc[:, 'rt_default_per_zipcode'] = df_current[
                'rt_default_per_zipcode'].astype(float)

            # default rate by zipcode in last 12 months
            df_current.loc[:, 'rt_default_per_zipcode_12_mon'] = (
                df_current['occr_default_per_zipcode_12_mon'] +
                df_current['small']) / (df_current['active_loans_per_zipcode']
                                        + df_current['ones'])
            df_current.loc[:, 'rt_default_per_zipcode_12_mon'] = df_current[
                'rt_default_per_zipcode_12_mon'].astype(float)

            #  ----------------------------------------

            # default rate by state
            df_current.loc[:, 'rt_default_per_state'] = (
                df_current['occr_default_per_state'] + df_current['small']) / (
                    df_current['new_loans_per_state'] + df_current['ones'])
            df_current.loc[:, 'rt_default_per_state'] = df_current[
                'rt_default_per_state'].astype(float)

            # default rate by state in last 12 months
            df_current.loc[:, 'rt_default_per_state_12_mon'] = (
                df_current['occr_default_per_state_12_mon'] +
                df_current['small']) / (df_current['active_loans_per_state'] +
                                        df_current['ones'])
            df_current.loc[:, 'rt_default_per_state_12_mon'] = df_current[
                'rt_default_per_state_12_mon'].astype(float)

            cols_str = [
                'st', 'id_loan', 'flag_fthb', 'occpy_sts', 'channel',
                'prod_type', 'prop_type', 'loan_purpose', 'repch_flag',
                'flag_mod'
            ]
            for col_str in cols_str:
                df_current.loc[:, col_str] = df_current[col_str].astype(str)

            cols_int = [
                'ppmt_pnlty', 'delq_sts', 'net_sale_proceeds', 'occpy_sts'
            ]
            for col_int in cols_int:
                df_current.loc[:, col_int] = df_current[col_int].astype(str)

            # WRITE OUT DATA TO FILE
            write_out_data(df_current, i)

            i += 1
Example no. 25
def genAnalysis(modelfile, testfile, confusionFile):
    maxParagraphLength = 20
    maxParagraphs = 5
    filterSizes = [1]
    num_filters = 64
    wordEmbeddingDimension = 30
    lrate = float(1e-3)
    labels = 30938
    vocabularySize = 101939

    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate)

    testing = DataParser(maxParagraphs, maxParagraphLength, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)

    model.load(modelfile)

    print("loading done")

    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])

    valid = int(
        len(truePre) * 0.5
    )  #using first 50% of the data for threshold tuning - we have merged test and cv files
    thresLab = {}
    for la in range(labels):
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)
        thresLab[la] = bestThre

    print(thresLab)

    labelIDName = open("../labelId-labelName-full.txt").read().split("\n")
    labelIDName = [[int(x.split("\t")[0]),
                    x.split("\t")[1].rstrip()] for x in labelIDName]
    # print(labelIDName)

    #making it a dictionary
    labelname = dict(labelIDName)
    # print(labelName[9026])

    f = open(confusionFile, "w")
    for itr in range(valid,
                     testing.totalPages):  # generating the analysis on the remaining 50%
        predLabel = [pred[itr][i] > thresLab[i] for i in range(labels)]
        output = ""
        for i in range(labels):
            if predLabel[i] == 1:
                output = output + "," + labelname[i]

        tn, fp, fn, tp = confusion_matrix(truePre[itr][0], predLabel).ravel()
        f.write(
            str(itr) + "," + str(tn) + "," + str(fp) + "," + str(fn) + "," +
            str(tp) + "," + output + "\n")
    f.close()
Example no. 26
if (len(sys.argv) > 1):
    serverName = sys.argv[1]

# Optional server port number
if (len(sys.argv) > 2):
    serverPort = int(sys.argv[2])

# Create a 'serverSocket'
socket = ServerSocket(serverName, serverPort)

# Create a 'CloudStorage' object
cloudStorage = CloudStorage()
cloudStorage.connect()

# Create a 'DataParser' object
dataParser = DataParser(cloudStorage)

# Infinite loop
while True:
    # Accept a connection on the socket
    socket.s_accept()

    # Read the incoming data
    data = socket.read_data()

    # Parse and store the data
    dataParser.decodeAndStorage(data)

    # Close the socket
    socket.close_socket()