예제 #1
0
def analyze_maj(fname, mode):
    """Generate the majority report for ``fname`` and compare it to the labels.

    The input CSV ``<fname>.csv`` is read from the hard-coded White and Hassan
    ``majority`` folder, the majority report is written into its ``reports``
    sub-folder, and that report is then loaded again to extract the per-query
    labels used as ground truth for ``gen_metrics_comparison``.

    Args:
        fname: Base name (without extension) of the input CSV file.
        mode: Value-to-class mode; its ``name`` attribute becomes part of the
            report file name, and it is forwarded to both report generators.
            Presumably a ``ValToClassMode`` member -- confirm against callers.
    """
    input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\majority\\'
    reports_dir = input_dir + 'reports\\'
    reports_filename = fname + '_' + mode.name + '_report'
    # Build the report path once and use it for both writing and reading.
    # reports_dir already ends with a separator, so adding another one before
    # the file name would only resolve to the same file on systems that
    # collapse repeated separators.
    report_path = reports_dir + reports_filename + '.csv'
    gen_majority_report(input_dir + fname + '.csv', report_path, mode)

    df = pd.read_csv(report_path)
    queries = get_queries_from_df(df)
    # Ground-truth label of every query, coerced to int.
    labels = {q: int(queries[q].label) for q in queries}

    gen_metrics_comparison(folder=reports_dir,
                           query_filenames=[reports_filename],
                           actual_values=labels,
                           cmp_filename=reports_filename + '_stats_report',
                           mode=mode)
예제 #2
0
def group_all():
    """Evaluate a decision-tree learner on the ECAI by-group stance features.

    Loads ``group_features_by_stance.csv`` from the ``all_equal_weights``
    by-group folder, runs ``test_models`` with a by-query split and the
    ``GROUP_ALL`` method, writes the per-query report into the ``reports``
    sub-folder and finally calls ``gen_all_metrics_comparison`` over that
    report together with the 'google labels' and 'majority' entries.
    """
    # Alternative feature files kept for reference:
    #   rel_only_group_features_by_stance_citation_range_1
    #   group_features_by_stance_citation_range_1
    #   group_features_by_stance_citation_range_only_clinical1
    #   group_features_by_stance_citation_range_only_rev1
    #   group_features_by_stance_citation_range_1_no_stance
    #   group_features_by_stance_citation_range_1_no_stance_no_rel
    model_input_root = 'C:\\research\\falseMedicalClaims\\ECAI\\model input\\'
    weighting = 'all_equal_weights'
    by_group_dir = model_input_root + weighting + '\\by_group'
    feature_name = "group_features_by_stance"

    frame = pd.read_csv(by_group_dir + '\\' + feature_name + '.csv')
    queries = get_queries_from_df(frame)
    # Ground-truth label of every query, coerced to int.
    labels = {query: int(queries[query].label) for query in queries}

    tree_learner = SKLearner(DecisionTreeClassifier(random_state=0))
    # The SVC learner and the two-layer net are built but are not part of the
    # evaluated learners; the constructions are retained as they were.
    svc_learner = SKLearner(svm.SVC(gamma='scale'))
    net_layers = [
        Layer(input=39, output=20),
        Layer(input=20, output=10),
    ]
    two_layer_net = TwoLayersNet(net_layers)

    learners = [tree_learner]
    predictions = test_models(learners, queries, dataHelper.Split.BY_QUERY,
                              dataHelper.Method.GROUP_ALL)

    reports_dir = by_group_dir + '\\reports\\'
    report_name = feature_name + '_query_report.csv'
    create_query_report_file(reports_dir + report_name, queries, learners,
                             predictions, labels)

    gen_all_metrics_comparison(
        folder=reports_dir,
        files=['google labels', 'majority', report_name],
        label_file=model_input_root + weighting + '\\labels.csv')
예제 #3
0
def w_h_report():
    """Run the metrics comparison for the h-index stance features twice.

    Labels are taken from ``h_index_stance_label_shrink_neg.csv`` in the
    White and Hassan ``mult_features`` folder. ``gen_metrics_comparison`` is
    then invoked once with the pessimistic and once with the optimistic
    three-class mode over the 'majority_nol' entry and the feature file's
    query report.

    NOTE(review): both invocations receive the same ``cmp_filename``; whether
    the second run replaces the first run's output depends on
    ``gen_metrics_comparison`` -- confirm.
    """
    base_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\'
    feature_name = "h_index_stance_label_shrink_neg"
    frame = pd.read_csv(base_dir + '\\' + feature_name + '.csv')

    reports_dir = base_dir + '\\reports\\'
    report_name = feature_name + '_query_report.csv'
    queries = get_queries_from_df(frame)
    # Ground-truth label of every query, coerced to int.
    labels = {query: int(queries[query].label) for query in queries}
    compared_files = ['majority_nol', report_name]

    modes = (ValToClassMode.THREE_CLASSES_PESSIMISTIC,
             ValToClassMode.THREE_CLASSES_OPTIMISTIC)
    for class_mode in modes:
        gen_metrics_comparison(folder=reports_dir,
                               query_filenames=compared_files,
                               actual_values=labels,
                               cmp_filename=feature_name + 'stats_report',
                               mode=class_mode)
예제 #4
0
def w_h():
    """Evaluate the multiple-binary classifier on the White and Hassan data.

    Loads ``group_features_by_stance_nol.csv`` from the ``mult_features``
    folder, evaluates a ``MultipleBinaryCls`` learner through ``test_models``
    (by-query split, ``GROUP_ALL`` method), writes the per-query report and
    runs ``gen_metrics_comparison`` against the 'majority' entry.

    NOTE(review): ``test_models`` receives ``THREE_CLASSES_PESSIMISTIC``
    whereas the report and the comparison use ``W_H`` -- confirm this
    difference is intended.
    """
    # Alternative inputs kept for reference:
    #   mult_features\no outlayers\group_features_by_stance_shrink_nol
    #   posterior\weighted_posterior_normed_ratio
    base_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\'
    feature_name = "group_features_by_stance_nol"
    frame = pd.read_csv(base_dir + '\\' + feature_name + '.csv')
    # Every column after the first two is handed to the tree learner as a feature.
    feature_columns = list(frame.columns)[2:]
    queries = get_queries_from_df(frame)
    # Ground-truth label of every query, coerced to int.
    labels = {query: int(queries[query].label) for query in queries}

    # Only ``multi_binary`` is evaluated below; the remaining learners are
    # still constructed, in the same order as before.
    tree_learner = SKLearner(DecisionTreeClassifier(random_state=0),
                             feature_columns)
    naive_bayes = SKLearner(GaussianNB())
    expected_val = ExpectedValLearner()
    multi_binary = MultipleBinaryCls(ValToClassMode.THREE_CLASSES_PESSIMISTIC)
    knn_learner = SKLearner(KNeighborsClassifier(n_neighbors=5))
    logistic = SKLearner(LogisticRegression(C=1e5))
    kmeans_cls = KMeansClassifier(base_dir + '\\dist.csv')
    forest_learner = SKLearner(RandomForestClassifier(random_state=0))

    learners = [multi_binary]
    predictions = test_models(learners, queries, dataHelper.Split.BY_QUERY,
                              dataHelper.Method.GROUP_ALL,
                              ValToClassMode.THREE_CLASSES_PESSIMISTIC)

    reports_dir = base_dir + '\\reports\\'
    report_name = feature_name + '_query_report.csv'
    create_query_report_file(reports_dir + report_name, queries, learners,
                             predictions, labels, ValToClassMode.W_H)

    gen_metrics_comparison(folder=reports_dir,
                           query_filenames=['majority', report_name],
                           actual_values=labels,
                           cmp_filename=feature_name + 'stats_report',
                           mode=ValToClassMode.W_H)
예제 #5
0
def group_all():
    """Evaluate a decision tree on the Yael by-group stance features.

    Reads ``group_features_by_stance.csv`` from the hard-coded by-group
    folder, runs ``test_models`` with a by-query split and the ``GROUP_ALL``
    method, and writes ``group_features_by_stance_report.csv`` via
    ``create_report_file``, passing along a ``MajorityClassifier`` built from
    ``majority.csv`` in the same folder.
    """
    # Alternative inputs kept for reference: group_features.csv,
    # group_features_by_paper_type.csv (same folder), and the pubmed
    # normed\group7 folder. DecisionTreeRegressor / LinearRegression were
    # alternative models.
    by_group_dir = 'C:\\research\\falseMedicalClaims\\ECAI\\model input\\Yael\\by_group'
    models = [DecisionTreeClassifier(random_state=0)]

    frame = pd.read_csv(by_group_dir + '\\group_features_by_stance.csv')
    queries = get_queries_from_df(frame)
    # Ground-truth label of every query, coerced to int.
    labels = {query: int(queries[query].label) for query in queries}

    majority = MajorityClassifier(by_group_dir + '\\majority.csv')
    predictions = test_models(models, queries, dataHelper.Split.BY_QUERY,
                              dataHelper.Method.GROUP_ALL)

    report_path = by_group_dir + '\\group_features_by_stance_report.csv'
    create_report_file(report_path,
                       queries=queries,
                       models=models,
                       predictions=predictions,
                       majority_classifier=majority,
                       labels=labels)
예제 #6
0
def ijcai():
    """Evaluate the multiple-binary classifier on the IJCAI ``dist`` features.

    Loads ``dist.csv`` from the ``non\\by_group`` folder, evaluates a
    ``MultipleBinaryCls`` learner in ``W_H`` mode through ``test_models``
    (by-query split, ``GROUP_ALL`` method), writes the per-query report and
    calls ``gen_all_metrics_comparison`` over that report and the 'majority'
    entry.
    """
    # Alternative input folders kept for reference: ...\model input\ecai_new,
    # ...\model input\GTIC.
    # Alternative feature files kept for reference:
    #   dummy_added_group_features_by_stance3
    #   dummy_added_group_features_by_stance_exp
    #   dist_exp
    #   dist_group_features_by_stance_paste_ecai2
    model_input_dir = 'C:\\research\\falseMedicalClaims\\IJCAI\\model input\\non'
    by_group_dir = model_input_dir + '\\by_group'
    feature_name = "dist"

    frame = pd.read_csv(by_group_dir + '\\' + feature_name + '.csv')
    # Every column after the first two is handed to the tree learner as a feature.
    feature_columns = list(frame.columns)[2:]
    queries = get_queries_from_df(frame)
    # Ground-truth label of every query, coerced to int.
    labels = {query: int(queries[query].label) for query in queries}

    # Only ``multi_binary`` is evaluated below; the remaining learners are
    # still constructed, in the same order as before.
    tree_learner = SKLearner(DecisionTreeClassifier(random_state=0),
                             feature_columns)
    naive_bayes = SKLearner(GaussianNB())
    expected_val = ExpectedValLearner()
    multi_binary = MultipleBinaryCls(ValToClassMode.W_H)
    knn_learner = SKLearner(KNeighborsClassifier(n_neighbors=5))
    kmeans_cls = KMeansClassifier(by_group_dir + '\\dist.csv')

    learners = [multi_binary]
    predictions = test_models(learners, queries, dataHelper.Split.BY_QUERY,
                              dataHelper.Method.GROUP_ALL)

    reports_dir = by_group_dir + '\\reports\\'
    report_name = feature_name + '_query_report.csv'
    create_query_report_file(reports_dir + report_name, queries, learners,
                             predictions, labels)

    gen_all_metrics_comparison(folder=reports_dir,
                               files=['majority', report_name],
                               actual_values=labels,
                               cmp_filename=feature_name + 'stats_report')
예제 #7
0
def learn_by_doctors_annotations(val2class,
                                 feature_file,
                                 directory,
                                 resample,
                                 majority_filename,
                                 quick=False,
                                 filter_queries=None):
    """Train and evaluate several learners on a White and Hassan feature file.

    The feature CSV is read from the hard-coded ``mult_features`` folder. A
    majority classifier, a ``MultipleCls`` learner, a random forest and a
    5-nearest-neighbours learner are evaluated through ``test_models``
    (by-query split, ``GROUP_ALL`` method). The per-query report and the
    metrics comparison are produced under ``<directory>\\reports``.

    Args:
        val2class: Value-to-class mode forwarded to the learners,
            ``test_models`` and the report generators.
        feature_file: Base name (without extension) of the feature CSV.
        directory: Sub-folder of the White and Hassan model-input folder whose
            ``reports`` folder receives the output.
        resample: Forwarded to the learners as ``resample``; when truthy the
            comparison file name gets the ``_resample`` suffix.
        majority_filename: Not used by this function body.
        quick: When truthy, ``quick_learn(queries)`` is called on every
            learner before evaluation.
        filter_queries: When truthy, the metrics comparison is run a second
            time with this value passed as ``filter_queries``.

    NOTE(review): the filtered comparison reuses the unfiltered
    ``cmp_filename``; whether it replaces the first output depends on
    ``gen_all_metrics_comparison`` -- confirm.
    """
    input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\'
    output_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\' + directory + '\\'
    reports_dir = output_dir + '\\reports\\'
    report_name = feature_file + '_query_report.csv'
    report_path = reports_dir + report_name

    frame = pd.read_csv(input_dir + '\\' + feature_file + '.csv')
    queries = get_queries_from_df(frame)
    # Ground-truth label of every query, coerced to int.
    labels = {query: int(queries[query].label) for query in queries}

    knn_learner = SKLearner(KNeighborsClassifier(n_neighbors=5),
                            features=None,
                            resample=resample)
    # The weighted SVC and the four-class multiple-binary learner are built
    # but are not part of the evaluated learners; the constructions are
    # retained as they were.
    class_weights = {1: 100, 3: 25, 5: 1}
    svc_learner = SKLearner(svm.SVC(class_weight=class_weights))
    forest_learner = SKLearner(RandomForestClassifier(random_state=0),
                               features=None,
                               resample=resample)
    multi_binary = MultipleBinaryCls(ValToClassMode.FOUR_CLASSES,
                                     resample=resample)
    majority = MajorityClassifier(val2class)
    learners = [
        majority,
        MultipleCls(val2class, resample=resample),
        forest_learner,
        knn_learner,
    ]

    if quick:
        for learner in learners:
            learner.quick_learn(queries)

    predictions = test_models(learners, queries, dataHelper.Split.BY_QUERY,
                              dataHelper.Method.GROUP_ALL, val2class)
    create_query_report_file(report_path, input_dir, feature_file, queries,
                             learners, predictions, labels, val2class)

    suffix = '_stats_report_resample' if resample else '_stats_report'
    comparison_args = dict(folder=reports_dir,
                           files=[report_name],
                           actual_values=labels,
                           cmp_filename=feature_file + suffix,
                           val2class=val2class)
    gen_all_metrics_comparison(**comparison_args)
    if filter_queries:
        gen_all_metrics_comparison(filter_queries=filter_queries,
                                   **comparison_args)