Example #1
def setup(opt):

    name = opt.dataset_name.lower()
    if name in ('cmu_mosi', 'cmu_mosei'):
        from data.reader import DataReader
    else:
        raise ValueError('Unsupported dataset: ' + opt.dataset_name)

    reader = DataReader(opt)
    reader = reader.prepare_data()

    return reader
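
# A minimal usage sketch, assuming an argparse-style options object; the only
# attribute visible here is dataset_name (DataReader may well need more):
#
#     from types import SimpleNamespace
#     opt = SimpleNamespace(dataset_name='CMU_MOSI')
#     reader = setup(opt)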
Example #2
    def execute(self, args):

        try:
            # This call validates the inputs: it raises if a required argument
            # is missing or an unexpected one is present.
            execute_args = super()._parse_execute_arguments(args)

            if IFunction.GLOBAL_HELP in execute_args.keys():
                # Regardless of anything else, if help is there, show it and quit
                self.get_help(1)
            else:

                start_month = date_range(1, -1)
                end_month = date_range(50, -1)

                # Filter the files and get the data from the last one since
                # we are just going over global stats.
                region = execute_args['-r']
                files = DataReader.get_dated_files(self.datasets, start_month, end_month)

                region_data = GlobalDataParser.parse_region_data([files[-1]], region)

                print("Province information for region : ", region)

                if region_data:
                    columns = [6, 30, 7, 8, 11, 11]
                    headers = ['Rank', 'Province', 'Cases', 'Deaths', 'Recovered', 'Mortality']
                    PrintTable.print_banner(columns, headers)

                    region = region_data[0]
                    rows = []

                    for province in region.provinces.keys():
                        mortality = "%.2f" % (region.provinces[province].get_mortality())

                        row = [
                            province,
                            region.provinces[province].confirmed_cases,
                            region.provinces[province].deaths,
                            region.provinces[province].recovered,
                            mortality + " %"
                        ]
                        rows.append(row)

                    # Sort them by mortality (parse the formatted "xx.xx %" value)
                    rows = sorted(rows,
                                  reverse=True,
                                  key=lambda x: float(x[4].rstrip(' %')))

                    # Now print them
                    rank = 1
                    for row in rows:
                        row.insert(0, rank)
                        PrintTable.print_row(columns, row)
                        rank += 1

                else:
                    print("NO data found....")

        except Exception as ex:
            print(str(ex))
Example #3
def main():
    # read the data
    reader = DataReader(main_path)

    questions = reader.questions
    workers = reader.workers
    answers = reader.answers

    print("Finished reading answers from " + reader.answer)
    print("Total questions: " + str(len(questions)))
    print("Total workers: " + str(len(workers)))
    print("Total answers: " + str(len(answers)))

    print(
        "class 0:",
        sum(1 for v in questions.values() if v == 0) / len(questions))

    # split into train/validation/test
    (q_train, q_validation, q_test) = SplitData(split_seed,
                                                list(questions.items()))
    (full, train, validation, test) = ArrangeData(questions, q_train,
                                                  q_validation, q_test,
                                                  workers, answers)

    print("Train size", len(train.questions), len(train.workers),
          len(train.answers))
    print("Validation size", len(validation.questions),
          len(validation.workers), len(validation.answers))
    print("Test size", len(test.questions), len(test.workers),
          len(test.answers))

    # clear the ground truth for the test set but keep a local copy
    test_set = deepcopy(test)
    for q in test.questions.keys():
        test.questions[q] = None
        full.questions[q] = None
    #end for

    # initialize algorithms
    algos = []
    algos.append(MajorityVoting(full, train, validation, test))
    algos.append(
        DawidSkene(full, train, validation, test, ds_seed, "mv_w", 100))

    for alg in algos:
        print("")
        print("Algorithm - " + alg.name)
        # run the algo
        test_ans = alg.run()

        # get back metrics
        (precision, recall, f1score,
         accuracy) = alg.validate(test_set, test_ans)
        print(("precision", precision))
        print(("recall", recall))
        print(("f1score", f1score))
        print(("accuracy", accuracy))
Example #4
    def execute(self, args):

        try:
            # This call validates the inputs: it raises if a required argument
            # is missing or an unexpected one is present.
            execute_args = super()._parse_execute_arguments(args)

            if IFunction.GLOBAL_HELP in execute_args.keys():
                # Regardless of anything else, if help is there, show it and quit
                self.get_help(1)
            else:

                start_month = date_range(1, -1)
                end_month = date_range(50, -1)

                # Filter the files and get the data from the last one since
                # we are just going over global stats.
                files = DataReader.get_dated_files(self.datasets, start_month,
                                                   end_month)
                region_data = GlobalDataParser.parse_region_data([files[-1]])

                # Accumulate province data for each region
                regional_details = {}
                for reg in region_data:
                    regional_details[reg.region] = reg.condense_provinces()

                # Get the mortality for each region
                mortality_stats = []
                for reg in regional_details.keys():
                    mortality = regional_details[reg].get_mortality()

                    mortality_stats.append(mortality_overall(reg, mortality))

                sorted_mortality_stats = sorted(mortality_stats,
                                                reverse=True,
                                                key=lambda x: x.rate)
                '''
                    Table 1:
                    Prints out each region with its rolled-up stats across
                    all provinces.
                '''
                columns = [6, 35, 11]
                headers = ['Rank', 'Region', 'Mortality']
                PrintTable.print_banner(columns, headers)

                entry = 1
                for stat in sorted_mortality_stats:
                    mortality_rate = "%.2f" % (stat.rate)
                    row = [entry, stat.region, '{}%'.format(mortality_rate)]
                    PrintTable.print_row(columns, row)
                    entry += 1

        except Exception as ex:
            print(str(ex))
            raise
Example #5
def load_data(path, date_col, frequency):
    """
    Loads the dataset, indexes it and imputes the missing values
    :param path: path to the data set on disk
    :param file_type: file type of the data
    :param date_col: date column name in the dataset
    :param frequency: frequency of the dates
    :return: pandas dataframe
    """
    dataframe = DataReader.read_data(path, date_col=date_col)
    dataframe = Indexer.index_dates(dataframe, date_col, frequency=frequency)
    dataframe = Imputer.impute(dataframe)
    return dataframe
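
# A minimal usage sketch; the path, column name, and daily frequency below are
# illustrative assumptions, not from the original:
#
#     df = load_data('data/sales.csv', date_col='date', frequency='D')
#     print(df.head())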
Example #6
    def convert_set(self, reader: DataReader):
        return [(self.convert_graph(d.graph), self.convert_plan(d.plan))
                for d in reader.copy().data]
Example #7
    def execute(self, args):

        try:
            # This call validates the inputs: it raises if a required argument
            # is missing or an unexpected one is present.
            execute_args = super()._parse_execute_arguments(args)

            if IFunction.GLOBAL_HELP in execute_args.keys():
                # Regardless of anything else, if help is there, show it and quit
                self.get_help(1)
            else:

                start_month = date_range(1, -1)
                end_month = date_range(50, -1)

                # Filter the files and get the data from the last one since
                # we are just going over global stats.
                files = DataReader.get_dated_files(self.datasets, start_month,
                                                   end_month)
                region_data = GlobalDataParser.parse_region_data([files[-1]])

                # Accumulate province data for each region
                regional_details = {}
                for reg in region_data:
                    regional_details[reg.region] = reg.condense_provinces()
                '''
                    Table 1:
                    Prints out each region with its rolled-up stats across
                    all provinces.
                '''
                columns = [35, 11, 11, 11]
                headers = ['Region', 'Confirmed', 'Deaths', 'Mortality']
                PrintTable.print_banner(columns, headers)

                overall_stats = OverallStats()

                data_rows = []
                for region_key in regional_details.keys():

                    data_row = [region_key]
                    data_row.append(
                        regional_details[region_key].confirmed_cases)
                    data_row.append(regional_details[region_key].deaths)
                    #data_row.append(regional_details[region_key].recovered)

                    mortality = '0.00'
                    try:
                        mortality = (
                            regional_details[region_key].deaths /
                            regional_details[region_key].confirmed_cases) * 100
                        mortality = "%.2f" % mortality
                    except ZeroDivisionError:
                        # no confirmed cases reported yet; keep the 0.00 default
                        pass
                    data_row.append(mortality)

                    overall_stats.add_deaths(
                        region_key, regional_details[region_key].deaths)
                    overall_stats.add_mortality(region_key, float(mortality))
                    overall_stats.add_confirmed(
                        region_key,
                        regional_details[region_key].confirmed_cases)
                    overall_stats.recovered_cases.append(
                        regional_details[region_key].recovered)

                    data_rows.append(data_row)

                sort_column = -1
                if '-c' in execute_args:
                    sort_column = 1
                elif '-d' in execute_args:
                    sort_column = 2

                if sort_column != -1:
                    sorted_rows = sorted(data_rows,
                                         reverse=True,
                                         key=lambda x: x[sort_column])
                    data_rows = sorted_rows

                for row in data_rows:
                    PrintTable.print_row(columns, row)
                '''
                    Table 2:
                    Global overall stats focusing on US vs World
                '''
                print("")
                columns = [36, 26]
                headers = ['General Statistic', 'Value']
                banner, cols = PrintTable.print_banner(columns, headers)

                total_cases = overall_stats.us_confirmed_cases + sum(
                    overall_stats.non_us_confirmed_cases)
                total_deaths = overall_stats.us_deaths + sum(
                    overall_stats.non_us_deaths)

                us_cases = "%d (%.3f)" % (
                    overall_stats.us_confirmed_cases,
                    float(overall_stats.us_confirmed_cases / total_cases) *
                    100)
                non_us_case = "%d (%.3f)" % (
                    sum(overall_stats.non_us_confirmed_cases),
                    float(
                        sum(overall_stats.non_us_confirmed_cases) /
                        total_cases) * 100)

                us_deaths = "%d (%.3f)" % (
                    overall_stats.us_deaths,
                    float(overall_stats.us_deaths / total_deaths) * 100)
                non_us_deaths = "%d (%.3f)" % (
                    sum(overall_stats.non_us_deaths),
                    float(sum(overall_stats.non_us_deaths) / total_deaths) *
                    100)

                PrintTable.print_row(
                    columns, ["Total Global Confirmed Cases", total_cases])
                PrintTable.print_row(columns, ["US Confirmed Cases", us_cases])
                PrintTable.print_row(columns,
                                     ["Non-US Confirmed Cases", non_us_case])
                PrintTable.print_row(columns,
                                     ["Total Global Deaths", total_deaths])
                PrintTable.print_row(columns, ["US Deaths", us_deaths])
                PrintTable.print_row(columns, ["Non-US Deaths", non_us_deaths])
                #PrintTable.print_row(columns, ["Total Recovered", sum(overall_stats.recovered_cases)])
                print(banner)
                PrintTable.print_row(
                    columns,
                    ["Highest Mortality", overall_stats.mortality_highest])
                PrintTable.print_row(columns, [
                    "Highest Mortality Region", overall_stats.mortality_winner
                ])
                print(banner)
                PrintTable.print_row(columns, [
                    "Highest Reported Deaths",
                    overall_stats.total_cases_highest
                ])
                PrintTable.print_row(columns, [
                    "Highest Reported Region", overall_stats.total_cases_winner
                ])
                print(banner)
                print("")

        except Exception as ex:
            print(str(ex))
            raise
Example #8
    def execute(self, args):

        try:
            # This call validates the inputs: it raises if a required argument
            # is missing or an unexpected one is present.
            execute_args = super()._parse_execute_arguments(args)

            if IFunction.GLOBAL_HELP in execute_args.keys():
                # Regardless of anything else, if help is there, show it and quit
                self.get_help(1)
            else:

                start_month = date_range(int(execute_args['-s']), -1)
                end_month = date_range(-1, -1)
                if '-e' in execute_args.keys():
                    end_month = date_range(int(execute_args['-e']), -1)

                files = DataReader.get_dated_files(self.datasets, start_month, end_month)

                # Get the region and then collect the data
                desired_region = execute_args['-r']
                region_data = GlobalDataParser.parse_region_data(files, desired_region)

                # Output table header
                columns = [16, 19, 19, 19, 11]
                headers = ['File', 'Confirmed', 'Deaths', 'Recovered', 'Mortality']
                PrintTable.print_banner(columns, headers)

                last_confirmed = -1
                last_death = -1
                last_recovered = -1

                # If -sum is given, only compare the first and last files
                data_to_scan = region_data
                if '-sum' in execute_args.keys():
                    data_to_scan = [region_data[0], region_data[-1]]

                for rdata in data_to_scan:
                    file_name = rdata.data_file.file_name

                    # Accumulate all of the province/state data
                    condensed = rdata.condense_provinces()
                    confirmed = condensed.confirmed_cases
                    deaths = condensed.deaths
                    recovered = condensed.recovered

                    # Mortality rate
                    mrate = '%.2f' % condensed.get_mortality()

                    # If we have a previous entry, show the change since it
                    if last_confirmed != -1:
                        confirmed = "{} ({})".format(confirmed, confirmed - last_confirmed)
                        deaths = "{} ({})".format(deaths, deaths - last_death)
                        recovered = "{} ({})".format(recovered, recovered - last_recovered)
                    last_confirmed = condensed.confirmed_cases
                    last_death = condensed.deaths
                    last_recovered = condensed.recovered

                    output_data = [
                        file_name,
                        confirmed,
                        deaths,
                        recovered,
                        mrate + ' %'
                    ]
                    PrintTable.print_row(columns, output_data)


        except Exception as ex:
            print(str(ex))
Example #9
def main():
    # read the data
    reader = DataReader(main_path)

    questions = reader.questions
    workers = reader.workers
    answers = reader.answers

    print("Finished reading answers from " + reader.answer)
    print("Total questions: " + str(len(questions)))
    print("Total workers: " + str(len(workers)))
    print("Total answers: " + str(len(answers)))
    print()

    # questions
    print(
        "class 0:",
        sum(1 for v in questions.values() if v == 0) / len(questions))
    print(
        "class 1:",
        sum(1 for v in questions.values() if v == 1) / len(questions))
    print()

    # workers
    w_accuracy = {}
    w_answers = {}
    w_count = {}
    for w, ans in workers.items():
        counts = [0.0, 0.0]  # renamed from "answers" to avoid shadowing the dict above
        total = 0.0
        correct = 0.0

        for (q, a) in ans:
            # accuracy
            total += 1.0
            correct += 1.0 if a == questions[q] else 0.0
            counts[a] += 1.0
        #end for
        w_accuracy[w] = correct / total
        w_answers[w] = counts[1] / (counts[0] + counts[1])
        w_count[w] = total
    #end for

    # values
    print("# of answers per worker")
    print(("mean", stats.mean(w_count.values())),
          ("stddev", stats.stdev(w_count.values())),
          ("median", stats.median(w_count.values())))
    print(("min", min(w_count.values())), ("max", max(w_count.values())))
    print()

    w_count_s = sorted(w_count.values())
    count_90 = (len(w_count_s) * 90) / 100
    w_count_s = w_count_s[:int(count_90)]
    print("# of answers per worker (90th percentile)")
    print(("mean", stats.mean(w_count_s)), ("max", max(w_count_s)))
    print()

    print("# of answers per worker")
    print(("mean", stats.mean(w_accuracy.values())),
          ("stddev", stats.stdev(w_accuracy.values())),
          ("median", stats.median(w_accuracy.values())))
    print(("min", min(w_accuracy.values())), ("max", max(w_accuracy.values())))
    print()

    #exit()
    # plotting
    plt.hist(list(w_count.values()))
    plt.title("Distribution of worker answer count")
    plt.xlabel("# answers")
    plt.ylabel("# workers")
    plt.show()

    bins = (
        np.arange(0, 13) / 11.0
    ) - 0.05  # so the bars are roughly centered around 0.0, 0.1, ..., 1.0

    plt.hist(list(w_accuracy.values()), bins=bins)
    plt.title("Distribution of worker accuracy")
    plt.xlabel("% of correct answers")
    plt.ylabel("# workers")
    plt.show()

    plt.hist(list(w_answers.values()),
             bins=(bins + 0.02158749248346362
                   ))  # shift so the prior line falls nearer the center of a bin
    plt.axvline(x=0.12158749248346362, ymin=0, ymax=1, color="red")
    plt.title("Distribution of worker answers")
    plt.xlabel("% of positive answers")
    plt.ylabel("# workers")
    plt.show()

    plt.scatter(list(w_answers.values()), list(w_accuracy.values()))
    plt.title("Distribution of worker answers to worker accuracy")
    plt.xlabel("% of positive answers")
    plt.ylabel("% of correct answers")
    plt.show()

    plt.scatter(list(w_count.values()), list(w_accuracy.values()))
    plt.title("Distribution of worker activity to worker accuracy")
    plt.xlabel("# answers")
    plt.ylabel("% of correct answers")
    plt.show()
Example #10
def main(FLAGS):
    # Define data reader.
    dataset = DataReader(FLAGS.DATA_DIR, FLAGS.SAMPLING_MINUTE,
                         FLAGS.SAMPLING_SIZE, FLAGS.BATCH_SIZE, FLAGS.USE_JSON)

    # Define model.
    model = RNN_cell(FLAGS.INPUT_SIZE, FLAGS.HIDDEN_LAYER_SIZE,
                     FLAGS.TARGET_SIZE)

    # Define label and model output.
    label = tf.placeholder(tf.float32, shape=[None], name='labels')
    label_one_hot = tf.one_hot(tf.cast(label, tf.int32),
                               depth=FLAGS.TARGET_SIZE)
    outputs = model.get_outputs()
    last_output = outputs[-1]
    output = tf.nn.softmax(last_output)
    output_class = tf.argmax(output, 1)

    # Define cross entropy loss.
    cross_entropy = -tf.reduce_mean(label_one_hot * tf.log(output))
    #cross_entropy = FLAGS.tf_Weighted_RMSE(label, output)
    #cross_entropy = tf.reduce_mean(tf.square(output - label))
    #cross_entropy = tf.reduce_mean(label*output)
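
    # Note: -tf.reduce_mean(y * tf.log(softmax)) produces NaNs once a softmax
    # output underflows to 0. A numerically stable variant (a sketch, not part
    # of the original) lets TF fuse the softmax and the log:
    #
    #   cross_entropy = tf.reduce_mean(
    #       tf.nn.softmax_cross_entropy_with_logits_v2(
    #           labels=label_one_hot, logits=last_output))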

    # Define adam training optimizer.
    learning_rate = tf.placeholder(tf.float32)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
    saved_eval = 10.0
    saved_epoch, saved_num = 0, 0  # referenced in the display block before any improvement

    # Define log summary
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('loss', cross_entropy)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())
        merged = tf.summary.merge_all()
        if FLAGS.SAVE_SUMMARY:
            writer = tf.summary.FileWriter(FLAGS.LOG_PATH, sess.graph)
        saver = tf.train.Saver(max_to_keep=10)
        for epoch in range(0, FLAGS.EPOCH):
            start_time = time.time()
            for num in range(0, dataset.train_batch_num):
                step = epoch * dataset.train_batch_num + num
                lr = FLAGS.LEARNING_RATE * (0.5**(np.floor(
                    epoch / FLAGS.LR_DECAY_EPOCH)))
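                # Step decay: with, e.g., LEARNING_RATE=1e-3 and
                # LR_DECAY_EPOCH=10 (illustrative values, not from the
                # original), epochs 0-9 use 1e-3, epochs 10-19 use 5e-4, etc.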

                # Train model
                train_x, train_y = dataset.get_train_batch()
                loss, summary, _ = sess.run(
                    [cross_entropy, merged, train_step],
                    feed_dict={
                        model._inputs: train_x,
                        label: train_y,
                        learning_rate: lr
                    })
                # Evaluate on validation set.
                valid_x = dataset.X_valid
                valid_y = dataset.Y_valid
                valid_predict = sess.run(output_class,
                                         feed_dict={model._inputs: valid_x})
                valid_eval = FLAGS.Weighted_RMSE(valid_y, valid_predict)

                if FLAGS.SAVE_SUMMARY:
                    writer.add_summary(summary, step)

                # Save model setting
                if step % FLAGS.SAVE_STEP == 0:
                    saver.save(sess,
                               os.path.join(FLAGS.MODEL_PATH, 'model'),
                               global_step=step)

                if valid_eval <= saved_eval:
                    saved_epoch = epoch
                    saved_num = num
                    saved_eval = valid_eval

                # Display setting
                if step % FLAGS.DISPLAY_STEP == 0:
                    rate = (step + 1) * FLAGS.BATCH_SIZE / (time.time() -
                                                            start_time)
                    remaining = (FLAGS.EPOCH * dataset.train_batch_num -
                                 step) * FLAGS.BATCH_SIZE / rate
                    print(
                        "###################################################")
                    print(
                        "progress  epoch %d  step %d / %d  samples/sec %0.1f  remaining %0.1fm"
                        % (epoch, num, dataset.train_batch_num, rate,
                           remaining / 60))
                    print("- Loss =", loss)
                    print("- Weighted RMSE on validation =", valid_eval)
                    print(
                        "- Accuracy on validation =",
                        np.sum(valid_y == valid_predict) /
                        np.shape(valid_y)[0])
                    print("- Best(but not saved) Weight RMSE on validation =",
                          saved_eval)
                    print("- Best model on validation at epoch =", saved_epoch,
                          "step =", saved_num)
                    print("- Min kp-index on validation set :",
                          np.min(valid_predict))
                    print("- Max kp-index on validation set :",
                          np.max(valid_predict))

    print("Finish!")
Example #11
def run():
    result = pd.DataFrame(columns=result_col_names)

    # variance_thresholds = [1, 0.25, 0.50, 0.75]
    variance_thresholds = [1]

    # combinations = list(itertools.combinations([1, 2], 2))

    # borda combinations
    # combinations = []
    combinations = [
        BordaCombinations.LS_SPEC, BordaCombinations.LS_IDETECT,
        BordaCombinations.LS_SPEC_IDETECT, BordaCombinations.SPEC_IDETECT,
        BordaCombinations.GLSPFS_LS, BordaCombinations.GLSPFS_SPEC,
        BordaCombinations.GLSPFS_IDETECT, BordaCombinations.GLSPFS_LS_SPEC,
        BordaCombinations.GLSPFS_LS_IDETECT,
        BordaCombinations.GLSPFS_SPEC_IDETECT,
        BordaCombinations.GLSPFS_LS_SPEC_IDETECT
    ]

    for variance_threshold in variance_thresholds:

        for dataset_name in dataSets:

            reader = DataReader(dataset_name)

            time.start_time()  # a project timing helper, not the stdlib time module

            dataset, n_features, y_true = reader.get_preprocessed_data()

            time.end_time("read dataset")

            # default values = state of the art
            control = init.get_initial_variables(is_default_values=False)

            rankings = np.zeros([n_features, 1])

            for method in methods:

                control.best_silhouette = 0

                current_result, best_rank = feat_selection.run_and_evaluate_fs_methods(
                    dataset, dataset_name, method, y_true, result_col_names,
                    variance_threshold, control)

                result = pd.concat([result, current_result])
                results_filename = "result_best_fs_" + dataset_name + ".csv"
                result.to_csv(FoldersLocation.results.value + results_filename,
                              sep=" ")

                rank = pd.Series(best_rank)
                logger.log("best rank: " + str(rank.shape), False)

                rankings = np.append(rankings, rank[:, None], 1)

            # Adding Borda Count results
            if not control.state_of_art:
                for comb in combinations:
                    borda_results = borda.get_borda_results(rankings,
                                                            dataset,
                                                            dataset_name,
                                                            result_col_names,
                                                            y_true,
                                                            combination=comb)
                    result = pd.concat([result, borda_results])
            variance_results_filename = "result_after_" + str(
                variance_threshold
            ) + "_variance_best_fs_" + dataset_name + ".csv"
            results_filename = FoldersLocation.results.value + variance_results_filename
            result.to_csv(results_filename, sep=" ")