Example #1
def main(corpus_label, exp_name, precomputed_queries_filename="predictions.json"):
    experiment["precomputed_queries_filename"] = precomputed_queries_filename

    if corpus_label.lower() == "aac":
        corpus = ez_connect("AAC", SERVER)
        experiment["name"] = exp_name
        experiment["load_preselected_test_files_list"] = "test_guids_c3_aac.txt"
        experiment["test_files_condition"] = "metadata.num_in_collection_references:>0 AND metadata.year:>2010"
        experiment["doc_methods"] = doc_methods_aac

    elif corpus_label.lower() == "pmc":
        corpus = ez_connect("PMC_CSC", SERVER)

        experiment["files_dict_filename"] = "files_dict_test.json"
        # experiment["files_dict_filename"] = "precomputed_queries_test.json"
        experiment["name"] = exp_name
        experiment["load_preselected_test_files_list"] = "test_guids_pmc_c3_1k.txt"
        experiment["test_files_condition"] = "metadata.num_in_collection_references:>0 AND metadata.year:>2013"
        experiment["doc_methods"] = doc_methods_pmc

    # corpus.closeIndex("idx_*")

    exp = Experiment(experiment, options, use_celery=False, process_command_line=False)
    print("exp_dir: ", exp.exp["exp_dir"])
    print("Query file: ", precomputed_queries_filename)
    exp.run()
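A hypothetical invocation of this entry point, assuming the surrounding module defines `experiment`, `options`, the `doc_methods_*` dicts and `SERVER` (all referenced above but not shown in this snippet):

if __name__ == "__main__":
    # corpus label selects the AAC or PMC branch above
    main("aac", "aac_test_run", precomputed_queries_filename="predictions.json")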
Example #2
def initialize_experiment():
    experiment = Experiment(data)
    experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names),
                              name="Our method")
    experiment.add_classifier(NaiveBayesClassifier(data.features, data.target_names),
                              name="Naive Bayes")
    return experiment
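A minimal usage sketch for the experiment this returns, assuming the 10-fold cross-validation API that the later snippets on this page use (run(folds=...) and print_quality_comparison_at_cutoff):

experiment = initialize_experiment()
results = experiment.run(folds=10)
# compare the two classifiers on the usual quality metrics
results.print_quality_comparison_at_cutoff(cutoff=1, metrics=["Recall", "Precision", "F1"])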
Example #3
def main():
    from multi.config import MINERVA_ELASTICSEARCH_ENDPOINT
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac", endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    exp = Experiment(experiment, options, True)
    exp.run()
Example #4
def main():
    from multi.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac",
                            endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")
    ##    experiment["test_files"]=["456f8c80-9807-46a9-8455-cd4a7e346f9d"]

    exp = Experiment(experiment, options, False)
    exp.run()
Example #5
def main():
    from multi.config import MINERVA_ELASTICSEARCH_ENDPOINT
    cp.useElasticCorpus()
    root_dir = getRootDir("aac")

    cp.Corpus.connectCorpus(root_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")
    ##    experiment["test_files"]=["456f8c80-9807-46a9-8455-cd4a7e346f9d"]

    exp = Experiment(experiment, options, False)
    exp.run()
Example #6
def main():
    drive = "g"
    ##    cp.useLocalCorpus()
    cp.useElasticCorpus()

    cp.Corpus.setCorpusFilter(collection_id="AAC")

    cp.Corpus.connectCorpus(drive + ":\\nlp\\phd\\aac")
    ##    generator=AtharQueryGenerator(drive+r":\NLP\PhD\citation_context\doc_dict.json", reassign_guids=True)
    ##    experiment["test_files"]=cp.Corpus.listPapers("year >= 2011")
    experiment["test_files"] = cp.Corpus.listPapers("year:>=2011")

    exp = Experiment(experiment, options)
    exp.run()
Example #7
    def _run_synth_exp(self):
        # initialise result array
        results = np.zeros((self.param_values.shape[0], len(SCORE_NAMES), len(self.methods), self.num_runs))

        # iterate through parameter settings
        for param_idx in range(self.param_values.shape[0]):

            print('parameter setting: {0}'.format(param_idx))

            # read parameter directory
            path_pattern = self.output_dir + 'data/param{0}/set{1}/'

            # iterate through data sets
            for run_idx in range(self.num_runs):
                print('data set number: {0}'.format(run_idx))

                data_path = path_pattern.format(param_idx, run_idx)

                if os.path.exists(os.path.join(data_path, 'pred_.csv')):
                    os.remove(os.path.join(data_path, 'pred_.csv'))
                if os.path.exists(os.path.join(data_path, 'probs_.csv')):
                    os.remove(os.path.join(data_path, 'probs_.csv'))

                # read data
                doc_start, gt, annos = self.generator.read_data_file(data_path + 'full_data.csv')
                exp = Experiment(data_path, self.generator.num_labels, annos, gt, doc_start, None,
                                 alpha0_factor=self.alpha0_factor, alpha0_diags=self.alpha0_diags,
                                 beta0_factor=self.beta0_factor, begin_factor=self.begin_factor,
                                 max_iter=self.max_iter, crf_probs=self.crf_probs, bootstrapping=False)
                exp.methods = self.methods
                # run methods
                results[param_idx, :, :, run_idx], preds, probabilities, _, _, _ = exp.run_methods(
                    new_data=True, save_with_timestamp=False)


        if self.save_results:
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir)

            print('Saving results...')
            # np.savetxt(self.output_dir + 'results.csv', np.mean(results, 3)[:, :, 0])
            results.dump(self.output_dir + 'results')

        if self.show_plots or self.save_plots:
            plot_results(self.param_values, self.methods, self.param_idx, results, self.show_plots,
                         self.save_plots, self.output_dir)

        return results
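The commented-out np.savetxt line above hints at how the dumped results are consumed later; a minimal sketch, assuming NumPy's pickle-based dump/load round trip:

import numpy as np

# ndarray.dump() pickles the array, so it is read back with allow_pickle=True
results = np.load(output_dir + 'results', allow_pickle=True)
# average over the runs axis (axis 3), as in the commented-out savetxt call;
# the result has shape (num param settings, num score names, num methods)
mean_scores = results.mean(axis=3)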
Example #8
def main():
    drive = "g"
    ##    cp.useLocalCorpus()
    cp.useElasticCorpus()

    cp.Corpus.setCorpusFilter(collection_id="AAC")

    cp.Corpus.connectCorpus(drive + ":\\nlp\\phd\\aac")
    generator = AtharQueryGenerator(
        drive + r":\NLP\PhD\citation_context\doc_dict.json",
        reassign_guids=True)
    experiment["test_files"] = list(generator.docs.keys())

    exp = Experiment(experiment, options)
    exp.query_generator = generator
    exp.run()
Example #9
def initialize_experiment():
    experiment = Experiment(data)
    experiment.add_classifier(TemporalEvidencesClassifier(
        data.features, data.target_names),
                              name="Our method")
    experiment.add_classifier(NaiveBayesClassifier(data.features,
                                                   data.target_names),
                              name="Naive Bayes")
    return experiment
Example #10
    print('loading ground truth labels...')
    gt = pd.read_csv(savepath + '/gt.csv', header=None).values
    #gt = IOB2_to_IOB(gt)

    # debug with subset -------
    # s = 1000
    # gt = gt[:s]
    # annos = annos[:s]
    # doc_start = doc_start[:s]
    # text = text[:s]
    # gt_dev = gt_dev[:s]
    # doc_start_dev = doc_start_dev[:s]
    # text_dev = text_dev[:s]
    # -------------------------

    # annos is assumed to be loaded earlier in this (truncated) script
    exp = Experiment(None, 3, annos.shape[1], None)

    exp.alpha0_factor = 1
    exp.alpha0_diags = 100

    exp.save_results = True
    exp.opt_hyper = False  # True

    # run all the methods that don't require tuning here
    exp.methods = [
        'ibcc',
        'majority',
        'best',
        'worst',
    ]
Example #11
def main():
    corpus = ez_connect("AAC", "aws-server")
    exp = Experiment(experiment, options, False)
    exp.run()
Example #12
def main():
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc_coresc",
                            endpoint={
                                "host": "129.215.90.202",
                                "port": 9200
                            })

    ##    experiment["test_files"]=["456f8c80-9807-46a9-8455-cd4a7e346f9d"]

    experiment["test_files"] = [
        ##                                "bdc9a118-cb76-4d26-9c4d-e886794428f5",
        ##                                "65a4319e-4324-4529-96fa-66c52e392da0",
        ##                                "0cc28fb0-b116-4990-b816-3dc066273c34",
        ##                                "ef3e4284-c527-4e83-9b59-f8996b09df76",
        ##                                "b3129460-d284-4f69-83a8-f87f588e7800",
        ##                                "d8548dab-ff28-4f93-b2ae-16887e59e8ad",
        ##                                "42efd8ec-4c06-4754-a527-3045eed87766",
        ##                                "f4374057-7ab2-4567-b73b-aa5b72328d3e",
        ##                                "cbf989c5-79f5-4317-8515-2192e2a3fe2a",
        ##                                "37d1cc24-68a5-4a36-b55d-94acdfad08c1",
        ##                                "2b5202ec-e71b-4d1a-8ef4-439c4f505342",
        ##                                "11ba9f31-13f8-4a40-8bfc-6c9c7725e7ba",
        ##                                "e047f55f-ff56-44a6-a07c-794887330752",
        ##                                "d39d353a-f9ca-4ce3-ab42-e9a16f5bd372",
        ##                                "a407716f-4516-4cba-9c52-d4e3b09bcda6",
        ##                                "680724b2-50e7-4809-a86f-e63326059f7e",
        ##                                "1ce857ab-7692-4a95-9ba0-f517179a940e",
        ##                                "e12b2e84-a91d-4170-88a6-6ba983ceab1b",
        ##                                "5a6c0a35-dbe0-486a-8edf-3c3d3638f06e",
        ##                                "c40d5876-208c-4eb4-b239-652ed14f8560",
        ##                                "9a764770-fd73-474e-8f38-cf0128371e2c",
        ##                                "54432fc8-c1c4-42f9-95b0-c5fad39f8317",
        ##                                "a7dab0f1-5891-4d83-92c2-d25069c49d27",
        ##                                "283ed90d-3ff9-4161-8c4d-4e55a555973e",
        ##                                "6478c6ca-e16c-473f-9f4c-060143b3cc8f",
        ##                                "666f2c58-3180-465b-877c-28d14cbcdf98",
        ##                                "f5dedb99-f2a1-4ae9-b4a0-3c23e33cbfc9",
        ##                                "e5ed924b-8b78-4c76-bb6c-54d9790c8a15",
        ##                                "b8ace4e7-8523-471f-847b-b45aee8ccfc1",
        ##                                "ff30447d-828e-4699-bbf7-ce586aae9764",
        ##                                "aec8d55c-43e0-42cb-b832-77f888c2325a",
        ##                                "067862a3-d8fd-4252-b831-f6f120af82a1",
        ##                                "64956609-5a4d-4e05-bad1-0445c3d1834d",
        ##                                "cd1cd1ec-ecc9-4e70-96b3-7f1447ec0df3",
        ##                                "d61b922b-622b-440c-b040-3db563fd6f0e",
        ##                                "51d71d97-5abb-4a4d-ba77-7d18a11343f0",
        ##                                "b4c2215a-0a38-4e44-a5ab-f0d0114d89fc",
        ##                                "d3265a02-86ba-47e9-879c-a15043ca5808",
        ##                                "3e53830f-33a5-4192-9159-bcd01a3e66d3",
        ##                                "be50acb5-e165-4afb-b259-eeb9f28d0f2e",
        ##                                "fb8e6675-46d9-41c8-8ba0-598842a63fe8",
        ##                                "34043f1e-3424-4c4a-b782-9489fc274db5",
        ##                                "e07f7715-d400-4958-a0ac-2e6dab3b1843",
        ##                                "44f52aea-cae3-4e1c-85ce-da0038cbcea1",
        ##                                "b99f5b2d-6edd-4787-a50d-fef7d030ff05",
        ##                                "55ef06b7-ffc6-4e43-9362-daf4b9f6735f",
        ##                                "89c63f73-988a-4ece-99fd-e6c91fc9f6fd",
        ##                                "83293e90-8f3e-45db-8dae-49a179568d3e",
        ##                                "7fbcd237-d40d-4d44-9c9f-7e2f462e547e",
        ##                                "5303a3a3-c1cd-458c-9bef-56df3080169d",
        ##
        ##                                "a3af153f-cf9d-40ea-b64d-c6e52e0a187b",
        ##                                "38a67959-7ad9-426d-8357-51ab376b7a4b",
        ##                                "5f71b848-2f22-49bf-b32a-c1cc441a6dbe",
        ##                                "e875b2dc-3757-4728-8e8b-47c6a1d8241c",
        ##                                "a1bd31a8-66bc-4be9-aa85-e2e821aa18f5",
        ##                                "ef030f01-cdcb-4aaf-8ec1-c1a8778095da",
        ##                                "5b4f7822-1127-4d45-84ca-6755e1debaab",
        ##                                "a11c9c92-e294-4dfc-8e73-f432ad460776",
        ##                                "6c647939-ef22-4d8b-b887-121272168829",
        ##                                "d4c97daa-790c-40fe-a17e-fffa8e7fbd36",
        ##                                "65ee7543-549d-4821-b51d-8fc27dbe85cb",
        ##                                "4fe8ae7f-47bf-41fd-95ef-14ee7831f37e",
        ##                                "45d1bde0-2bd5-413b-89e3-9151d5a73ffb",
        ##                                "7bedaa57-30ff-4569-8456-59236171a80f",
        ##                                "67a054e0-744e-477b-80a4-06a268064bc7",
        ##                                "51a51cbb-952b-450b-970e-f6a23ecf9ce6",
        ##                                "b14209b3-d868-41cd-b1d0-f1a1489220f3",
        ##                                "53230a94-3baf-4825-a039-e8125890e737",
        ##                                "bb576feb-658e-45ef-810a-617b586159e5",
        ##                                "7d5ad1b5-2f3d-4728-b583-ec1ebbc3dac6",
        ##                                "f27cf9db-0d2d-490b-9917-c076a5ebca2c",
        ##                                "3b6679e5-deae-43ee-a98b-cac7029e92f4",
        ##                                "0d44cbba-1989-4654-b250-1b41285359ea",
        ##                                "b0cceb78-5f66-4084-accb-171040521cda",
        ##                                "18bf7b21-2456-49da-882a-06032ec46bec",
        ##                                "588b99bd-c358-440b-b30c-e1f3dc10b96b",
        ##                                "c4c1d5c0-7f40-465a-bbdb-351b4c9948a8",
        ##                                "0efcb373-ecd3-4e10-9f2a-1bbd3a6cbf58",
        ##                                "9bb0db11-2821-4d55-8f34-9bfd5d58f444",
        ##                                "49ff3f83-b4d7-4979-800f-785460c95552",
        ##                                "58e0a5d1-6343-4e2e-b544-6f690bff023e",
        ##                                "5a84843d-d7b0-43b0-846c-d30d3196ee8a",
        ##                                "6f244f35-8f61-4eb9-9de0-dfbbca63532b",
        ##                                "9d3cf2ea-162b-4e78-a311-b7333ad65c3a",
        ##                                "75e80547-50f5-4a12-8db6-e799a2e5029b",
        ##                                "f6c70cbd-e6c3-4ea5-b99f-ac0c455d832a",
        ##                                "f72d2af0-1e8a-40f5-9acf-ddbe9ddc4a7b",
        ##                                "9a81337e-1280-4b11-9b00-7e516d298ea1",
        ##                                "c7a83006-6ed3-46ec-b476-c49770dc4979",
        ##                                "aa45f968-61a4-421d-a532-a036fb8336ef",
        ##                                "f38092bc-b1e2-4ba8-ad60-b964825e52ac",
        ##                                "a755f020-c04d-4640-8b43-fb63b560bd6e",
        ##                                "1f06edd3-09d6-4033-b65e-a96d6a78f748",
        ##                                "33273c27-bcb7-4a4d-b339-c8af16c97b91",
        ##                                "802d2b57-0425-410e-82b6-f0024bc6f0dd",
        ##                                "699e837a-a662-49a0-b4c5-b3ef113eff34",
        ##                                "5b019a09-e21f-4109-a757-2c8396c8f169",
        ##                                "d4a70f39-7c5c-4566-8b5c-72208f3929ea",
        ##                                "6cf4f22d-c77b-4f0a-9e3c-378d7803f62b",

        ##                                "5cc55656-d309-4906-9cbf-7e34e734c352",
        ##                                "4222902b-e5fc-4eef-a7a7-79ec85d8e7c0",
        ##                                "4e408d49-6e51-441a-9c1c-8720d0d7032a",
        ##                                "aba078e1-c385-45b2-9adf-1ab7901b373b",
        ##                                "772eaabf-8996-486f-9bba-355cbf0c15e1",
        ##                                "38048193-6565-45ea-9950-64c7e4c266a3",
        ##                                "4d36eeb9-9121-4510-847e-99b80c77473e",
        ##                                "7b9e39c9-18a3-4112-ba9f-36b70d60f60f",
        ##                                "e31fd474-e2a0-4b3c-9d36-b31003b3bbc6",
        ##                                "69b92870-e050-4277-bd00-08f79aa6d9e6",
        ##                                "a67be750-73dc-427b-ac6a-e46adcaf7430",
        ##                                "1f8a95c4-856a-4f39-9c45-52d309d8c075",
        ##                                "b3606482-e22f-4809-948a-385a4f1e47cb",
        ##                                "7a4a67fb-3f4c-4c26-a060-775e8a4b7480",
        ##                                "bc39625f-1bc1-49ce-8ea3-4f8debe90b01",
        ##                                "202ef49f-c3f9-4d3a-b971-2d8094c06242",
        ##                                "28967c8d-2584-4898-9b62-0bfc669e2490",
        ##                                "355d6857-06ee-4430-a511-aaf0e8eaf23d",
        ##                                "0ce7eebd-0815-4f5d-b1c9-fafb65584994",
        ##                                "870c4608-525d-44d1-960f-4eb73589618c",
        ##                                "2420a665-d848-459f-9f51-456275d42e8b",
        ##                                "deb5362b-af92-4973-970f-ebe3fec12ee9",
        ##                                "d244e4d9-808c-4abd-a627-02716e9609c9",
        ##                                "de3f08f8-12fa-41d1-82cc-30c2c43cf52e",
        ##                                "575e9d63-94d7-483d-a980-8c974afc0ad9",
        ##                                "31410622-7133-4472-8225-8cf6b1eb1683",
        ##                                "80f3bd59-c5d6-43a0-97e3-155c1af50275",
        ##                                "cf2e8b40-5fab-4b17-acb1-6b676d909aa6",
        ##                                "51375367-8a16-4070-bab7-5ebcca3427c4",
        ##                                "dc293831-b099-45e2-a9ef-87ec8ccb8722",
        ##                                "aad3943f-37f9-4774-aa77-f312650b699e",
        ##                                "cdf828fc-2fb5-4c6f-8b42-8ff7c8ff0ff0",
        ##                                "6569f681-77e3-4ceb-a956-04e7a751f2b3",
        ##                                "2edffd94-b1db-46b7-bc4b-e4da9dcf4f51",
        ##                                "45a922bc-814a-40bd-a76a-fcbeca77bc81",
        ##                                "be4f19f7-de07-4674-8f03-1fbec9c7dd04",
        ##                                "48d9f4cf-c081-4520-b350-6ca3142987a7",
        ##                                "f35243d1-a3e3-4402-99b3-e576a27cde0d",
        ##                                "e8f567f8-3179-4214-bcbc-79332c1cfd1d",
        ##                                "209e32f7-a3cd-4e86-afee-2935a1f25514",
        ##                                "1cd47a2c-58c1-4c89-a689-cbdc0dd1f6b7",
        ##                                "2c64d4d5-3883-4fee-8c2c-1c0afb3835cf",
        ##                                "1323e0b5-c986-4ca6-855a-0b147d938e50",
        ##                                "d293f62a-983f-4ddc-a227-84d82bb36af1",
        ##                                "5b6439c9-466d-4bc0-aff2-e85de8eb9337",
        ##                                "da2b0b43-26b1-458b-b57a-83279ceb314e",
        ##                                "c21e2afc-0f92-490d-aa1b-7f826a83221d",
        ##                                "c5e67372-cf98-45db-bb7a-e3f4e7662774",
        ##                                "c7f91884-cfc8-406b-919e-658008c21279",
        ##                                "753b9d9a-ce8d-4fba-ac74-106526416738",
        ##                                "799680bf-5150-4fb2-b9b6-91fd0edc2593",
    ]

    exp = Experiment(experiment, options, True)
    exp.run()
Example #13
# doc_start = doc_start[idxs]
# text = text[idxs]
# gt_task1_val = gt_task1_val[idxs]
# -------------------------

num_reps = 10
batch_frac = 0.03
AL_iters = 10

for rep in range(1, num_reps):

    output_dir = '../data/bayesian_sequence_combination/output/ner_al_super_new/'
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    exp = Experiment(None, 9, annos.shape[1], None, max_iter=20, crf_probs=True, rep=rep)
    exp.save_results = True
    exp.opt_hyper = False  # True

    exp.nu0_factor = 0.1
    exp.alpha0_diags = 1  # best_diags
    exp.alpha0_factor = 1  # 9 # best_factor

    exp.methods = [
        'bac_seq_integrateIF',
        'HMM_crowd',
    ]

    results, preds, probs, results_nocrowd, preds_nocrowd, probs_nocrowd = exp.run_methods(
        annos, gt, doc_start, output_dir, text,
        ground_truth_val=gt_val, doc_start_val=doc_start_val, text_val=text_val)
Example #14
'''
@author: Edwin Simpson
'''

from evaluation.experiment import Experiment
import data.load_data as load_data
import numpy as np

output_dir = '../../data/bayesian_sequence_combination/output/ner/'

regen_data = False
gt, annos, doc_start, text, gt_nocrowd, doc_start_nocrowd, text_nocrowd, gt_task1_val, gt_val, doc_start_val, text_val, _ = \
    load_data.load_ner_data(regen_data)

# ------------------------------------------------------------------------------------------------
exp = Experiment(None, 9, annos.shape[1], None, max_iter=20)
exp.save_results = True
exp.opt_hyper = False  #True

best_bac_wm = 'bac_seq'  #'unknown' # choose model with best score for the different BAC worker models
best_bac_wm_score = -np.inf

best_nu0factor = 0.1
best_diags = 1
best_factor = 1
best_acc_bias = 0

exp.alpha0_diags = best_diags
exp.alpha0_factor = best_factor
exp.nu0_factor = best_nu0factor
exp.alpha0_acc_bias = best_acc_bias
Example #15
num_reps = 10
batch_frac = 0.03
AL_iters = 10

output_dir = os.path.join(evaluation.experiment.output_root_dir, 'ner_al')
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

# ACTIVE LEARNING WITH UNCERTAINTY SAMPLING
for rep in range(1, num_reps):

    beta0_factor = 0.1
    alpha0_diags = 1
    alpha0_factor = 1
    exp = Experiment(output_dir, 9, annos, gt, doc_start, text, annos, gt_val, doc_start, text,
                     alpha0_factor=alpha0_factor, alpha0_diags=alpha0_diags, beta0_factor=beta0_factor,
                     max_iter=20, crf_probs=True, rep=rep)
    exp.methods = [
        'bac_seq_integrateIF',
        'HMM_crowd',
    ]
    results, preds, probs, results_nocrowd, preds_nocrowd, probs_nocrowd = exp.run_methods(
        active_learning=True, AL_batch_fraction=batch_frac, max_AL_iters=AL_iters
    )

    beta0_factor = 10
    alpha0_diags = 1
    alpha0_factor = 1
    exp = Experiment(output_dir, 9, annos, gt, doc_start, text, annos, gt_val, doc_start, text,
                     alpha0_factor=alpha0_factor, alpha0_diags=alpha0_diags, beta0_factor=beta0_factor,
                     max_iter=20, crf_probs=True, rep=rep)
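The snippet is cut off here; presumably the second configuration repeats the same method list and active-learning run as the first part of the loop body above (an assumption, mirroring the earlier lines of this snippet):

    exp.methods = [
        'bac_seq_integrateIF',
        'HMM_crowd',
    ]
    results, preds, probs, results_nocrowd, preds_nocrowd, probs_nocrowd = exp.run_methods(
        active_learning=True, AL_batch_fraction=batch_frac, max_AL_iters=AL_iters
    )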
Example #16
beta0_factor = 1
alpha0_diags = 10
alpha0_factor = 10
best_begin_factor = 10

output_dir = os.path.join(
    evaluation.experiment.output_root_dir,
    'pico3_%f_%f_%f' % (beta0_factor, alpha0_diags, alpha0_factor))
exp = Experiment(output_dir,
                 3,
                 annos,
                 gt,
                 doc_start,
                 features,
                 annos,
                 gt_val,
                 doc_start,
                 features,
                 alpha0_factor=alpha0_factor,
                 alpha0_diags=alpha0_diags,
                 beta0_factor=beta0_factor,
                 max_iter=20,
                 begin_factor=best_begin_factor)
# run all the methods that don't require tuning here
exp.methods = [
    # 'bac_seq_integrateIF',
    'bac_seq_integrateIF_thenLSTM',
    'bac_seq_integrateIF_integrateLSTM_atEnd',
]
# this will run task 1 -- train on all crowdsourced data, test on the labelled portion thereof
exp.run_methods(new_data=regen_data)
Example #17
def main():
    corpus = ez_connect("PMC_CSC", "aws-server")
    corpus.closeIndex("idx_inlink_*")
    exp = Experiment(experiment, options, use_celery=False)
    exp.run()
Example #18
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
intervals_to_test = [  #test various settings for delta t_max
    ("Delta t_max=1200s", initialize_bins(start=0, end=60, width=10) +
     initialize_bins(start=60, end=1200, width=30)),
    ("Delta t_max=120s", initialize_bins(start=0, end=60, width=10) +
     initialize_bins(start=60, end=120, width=30)),
    ("Delta t_max=60s", initialize_bins(start=0, end=60, width=10)),
    ("Delta t_max=30s", initialize_bins(start=0, end=30, width=10)),
    ("Delta t_max=10s", initialize_bins(start=0, end=10, width=10)),
    #test various interval widths
    ("all intervals 2s wide", initialize_bins(start=0, end=300, width=2)),
    ("all intervals 4s wide", initialize_bins(start=0, end=300, width=4)),
    ("all intervals 6s wide", initialize_bins(start=0, end=300, width=6)),
    ("all intervals 8s wide", initialize_bins(start=0, end=300, width=8)),
    ("all intervals 30s wide", initialize_bins(start=0, end=300, width=30)),
    ("all intervals 50s wide", initialize_bins(start=0, end=300, width=50)),
    ("all intervals 100s wide", initialize_bins(start=0, end=300, width=100))
]

#run 10-fold cross-validation for each of the configured intervals
experiment = Experiment(data)
for (name, bins) in intervals_to_test:
    experiment.add_classifier(TemporalEvidencesClassifier(data.features,
                                                          data.target_names,
                                                          bins=bins),
                              name=name)
results = experiment.run(folds=10)

results.print_quality_comparison_at_cutoff(
    cutoff=1, metrics=["Recall", "Precision", "F1"])
Example #19
def main():
    corpus = ez_connect("AAC", "koko")
    corpus.closeIndex("idx_*")
    exp = Experiment(experiment, options, use_celery=False)
    exp.run()
Example #20
# ------------------------------------------------------------------------------------------------
# Rerunning with found parameters...

beta0_factor = 0.1
alpha0_diags = 0.1
alpha0_factor = 0.1
output_dir = os.path.join(
    evaluation.experiment.output_root_dir,
    'ner3_%f_%f_%f' % (beta0_factor, alpha0_diags, alpha0_factor))
exp = Experiment(output_dir,
                 9,
                 annos,
                 gt,
                 doc_start,
                 features,
                 annos,
                 gt_val,
                 doc_start,
                 features,
                 alpha0_factor=alpha0_factor,
                 alpha0_diags=alpha0_diags,
                 beta0_factor=beta0_factor,
                 max_iter=20)

exp.methods = [
    'best',  # does not use the hyperparameters
    'worst',  # does not use the hyperparameters
    'majority',  # does not use the hyperparameters
    'mace',  # worked best with its own default hyperparameters, smoothing=0.001, alpha=0.5, beta=0.5
    'ds',  # does not use the hyperparameters
    'HMM_Crowd',  # does not use alpha0_diags; default values happened to work best
]
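The snippet stops before the methods are run; a plausible continuation, mirroring the call used in the neighbouring snippets on this page (regen_data is assumed to be defined earlier in the full script):

exp.run_methods(new_data=regen_data)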
Example #21
# , debug_subset_size=1000) # include this argument to debug with small dataset

# ------------------------------------------------------------------------------------------------

# only hmm_Crowd actually uses these hyperparameters
beta0_factor = 0.1
alpha0_diags = 0.1
alpha0_factor = 0.1
output_dir = os.path.join(evaluation.experiment.output_root_dir, 'pico3')
exp = Experiment(output_dir,
                 3,
                 annos,
                 gt,
                 doc_start,
                 features,
                 annos,
                 gt_val,
                 doc_start,
                 features,
                 alpha0_factor=alpha0_factor,
                 alpha0_diags=alpha0_diags,
                 beta0_factor=beta0_factor,
                 max_iter=20)
# # run all the methods that don't require tuning here
exp.methods = [
    'best',
    'worst',
    'majority',
    'ds',
    'mace',
    'HMM_crowd',
]
Example #22
    #     exp.nu0_factor = nu_factors[best_idxs[0]]
    #     exp.alpha0_diags = diags[best_idxs[1]]
    #     exp.alpha0_factor = factors[best_idxs[2]]
    #
    #     print('Best values for %s: %f, %f, %f' % (method, exp.nu0_factor, exp.alpha0_diags, exp.alpha0_factor))
    #
    #     # this will run task 1 -- train on all crowdsourced data, test on the labelled portion thereof
    #     exp.methods = [method]
    #     exp.run_methods(annos, gt, doc_start, output_dir, text, rerun_all=True, return_model=True,
    #                 ground_truth_val=gt_dev, doc_start_val=doc_start_dev, text_val=text_dev,
    #                 new_data=regen_data
    #     )

    # ------------------------------------------------------------------------------------------------------------------

    exp = Experiment(None, nclasses, annos.shape[1], None, max_iter=5)

    exp.save_results = True
    exp.opt_hyper = False  #True

    # values obtained from tuning on dev:
    best_nu0factor = 1000  #1
    best_diags = 1000
    best_factor = 0.1

    exp.nu0_factor = best_nu0factor
    exp.alpha0_diags = best_diags
    exp.alpha0_factor = best_factor

    exp.methods = [
        # 'bac_seq_integrateIF',
Example #23
def main():
    global options, experiment
    corpus = ez_connect("AAC", "koko")

    # to_generate = "training_min2"
    to_generate = "training_min1"
    # to_generate = "testing"
    # to_generate = "PRUEBA"
    # to_generate = "export"

    # experiment["keyword_selector"] = "MultiMaximalSetSelector"
    # suffix = "mms"

    experiment["keyword_selector"] = "AllSelector"
    suffix = "w"

    if to_generate.startswith("training"):
        experiment["max_queries_generated"] = 100000000
        experiment["max_test_files"] = 500000

        # experiment["test_files_condition"] = "metadata.num_in_collection_references:>0 "
        # experiment["features_field_name"] = "_full_text"

        experiment["test_files_condition"] = "metadata.num_in_collection_references:>0 AND metadata.year:>2010"
        experiment["features_field_name"] = "_all_text"

        # experiment["test_guids_to_ignore_file"] = "test_guids_c3_aac_desktop.txt"
        experiment["load_preselected_test_files_list"] = "train_guids_aac_c6.txt"

        match = re.search(r"_min(\d)", to_generate)
        if match:
            min_multi = int(match.group(1))
            experiment["resolvable_cit_min_multi"] = min_multi
            experiment["precomputed_queries_filename"] = "precomputed_queries_training_min%d.json" % min_multi
            experiment["files_dict_filename"] = "files_dict_training_min%d.json" % min_multi
            experiment["feature_data_filename"] = "feature_data_at_%s_min%d.json.gz" % (suffix, min_multi)
        else:
            experiment["resolvable_cit_min_multi"] = 1
            experiment["precomputed_queries_filename"] = "precomputed_queries_training.json"
            experiment["files_dict_filename"] = "files_dict_training.json"
            experiment["feature_data_filename"] = "feature_data_at_%s.json.gz" % suffix

    elif to_generate == "testing":
        experiment["load_preselected_test_files_list"] = "test_guids_c3_aac_desktop.txt"
        experiment["name"] = "aac_generate_kw_trace_TEST"
        experiment["max_queries_generated"] = 1000

        experiment["precomputed_queries_filename"] = "precomputed_queries_new1k.json"
        experiment["files_dict_filename"] = "files_dict_new1k.json"
        experiment["feature_data_filename"] = "feature_data_test_at_%s.json.gz" % suffix
        # experiment["resolvable_cit_min_multi"]= 1
        experiment["max_test_files"] = 1000
        experiment["test_files_condition"] = "metadata.num_in_collection_references:>0 AND metadata.year:>2010"

    elif to_generate == "export":
        experiment["feature_data_filename"] = "feature_data_at_%s.json.gz" % suffix
        options["clear_existing_prr_results"] = 0
        options = {
            "run_prebuild_bows":
            0,  # should the whole BOW building process run?
            "overwrite_existing_bows":
            0,  # if a BOW exists already, should we overwrite it?
            "build_indexes": 0,  # rebuild indices?
            "generate_queries": 0,  # precompute the queries?
            "force_regenerate_resolvable_citations":
            0,  # find again the resolvable citations in a file?
            "overwrite_existing_queries":
            0,  # force rebuilding of queries too?
            # delete previous precomputed results? i.e. start from scratch
            "run_precompute_retrieval": 0,
            "run_feature_annotation": 0,
            "refresh_results_cache":
            1,  # should we clean the offline reader cache and redownload it all from elastic?
            "run_package_features": 1,
            "list_missing_files": 0,
        }

    exp = Experiment(experiment, options, False)
    exp.run()
Example #24
import sys

sys.path.append("..")

import pandas

from evaluation.experiment import Experiment
from evaluation.metrics import quality_metrics
from recsys.classifiers.temporal import TemporalEvidencesClassifier, configure_dynamic_cutoff
from recsys.dataset import load_dataset


#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
methods_to_test = [("Fixed cutoff", None),
                   ("dynamic cutoff=4", configure_dynamic_cutoff(1.0, 0.4, 4)),
                   ("dynamic cutoff=2", configure_dynamic_cutoff(1.0, 0.4, 2))]

#run all configured cutoffs with 10-fold cross-validation
experiment = Experiment(data)
for name, method in methods_to_test:
    experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names,
                              postprocess=method), name=name)
results = experiment.run(folds=10)

#print results
pandas.set_option('expand_frame_repr', False)
pandas.set_option('max_columns', 4)
print "Maximum 5 recommendations"
results.print_quality_comparison_at_cutoff(cutoff=5, metrics=quality_metrics)
print "Maximum 10 recommendations"
results.print_quality_comparison_at_cutoff(cutoff=10, metrics=quality_metrics)
Example #25
def houseA():
    data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
    cutoff_results_at = 14
    return data, cutoff_results_at
    
def houseB():   
    """
    This dataset is partially dominated by one of the sensors, which makes the evaluation results less statistically
    sound, e.g. it leads to large confidence intervals when running 10-fold cross-validation.  
    """
    data = load_dataset("../datasets/houseB.csv", "../datasets/houseB.config")
    cutoff_results_at = 15    
    return data, cutoff_results_at

#configuration
data, cutoff_results_at = houseA()

#run several classifiers on the same dataset, use 10-fold cross-validation
experiment = Experiment(data)
experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names), name="Our method")
experiment.add_classifier(NaiveBayesClassifier(data.features, data.target_names), name="Naive Bayes")
experiment.add_classifier(RandomClassifier(data.features, data.target_names), name="Random")
results = experiment.run(folds=10)

#print and plot results
results.print_quality_comparison_at_cutoff(cutoff=1, metrics=["Recall", "Precision", "F1"])
results.print_runtime_comparison()
plot_conf = plot.plot_config(config.plot_directory, sub_dirs=[data.name], img_type=config.img_type)
results.plot_quality_comparison(metrics=["Recall", "Precision", "F1"], plot_config=plot_conf,
                                cutoff_results_at=cutoff_results_at)
                                

Example #26
def houseB():
    """
    This dataset is partially dominated by one of the sensors, which makes the evaluation results less statistically
    sound, e.g. it leads to large confidence intervals when running 10-fold cross-validation.  
    """
    data = load_dataset("../datasets/houseB.csv", "../datasets/houseB.config")
    cutoff_results_at = 15
    return data, cutoff_results_at


#configuration
data, cutoff_results_at = houseA()

#run several classifiers on the same dataset, use 10-fold cross-validation
experiment = Experiment(data)
experiment.add_classifier(TemporalEvidencesClassifier(data.features,
                                                      data.target_names),
                          name="Our method")
experiment.add_classifier(NaiveBayesClassifier(data.features,
                                               data.target_names),
                          name="Naive Bayes")
experiment.add_classifier(RandomClassifier(data.features, data.target_names),
                          name="Random")
results = experiment.run(folds=10)

#print and plot results
results.print_quality_comparison_at_cutoff(
    cutoff=1, metrics=["Recall", "Precision", "F1"])
results.print_runtime_comparison()
plot_conf = plot.plot_config(config.plot_directory,
                             sub_dirs=[data.name],
                             img_type=config.img_type)
Example #27
from recsys.classifiers.binning import initialize_bins
from recsys.dataset import load_dataset

#configuration
data = load_dataset("../datasets/houseA.csv", "../datasets/houseA.config")
intervals_to_test = [#test various settings for delta t_max
                     ("Delta t_max=1200s", initialize_bins(start=0, end=60, width=10) +
                                           initialize_bins(start=60, end=1200, width=30)),
                     ("Delta t_max=120s",  initialize_bins(start=0, end=60, width=10) +
                                           initialize_bins(start=60, end=120, width=30)),
                     ("Delta t_max=60s",   initialize_bins(start=0, end=60, width=10)),
                     ("Delta t_max=30s",   initialize_bins(start=0, end=30, width=10)),
                     ("Delta t_max=10s",   initialize_bins(start=0, end=10, width=10)),
                     #test various interval widths
                     ("all intervals 2s wide",   initialize_bins(start=0, end=300, width=2)),
                     ("all intervals 4s wide",   initialize_bins(start=0, end=300, width=4)),
                     ("all intervals 6s wide",   initialize_bins(start=0, end=300, width=6)),
                     ("all intervals 8s wide",   initialize_bins(start=0, end=300, width=8)),
                     ("all intervals 30s wide",  initialize_bins(start=0, end=300, width=30)),
                     ("all intervals 50s wide",  initialize_bins(start=0, end=300, width=50)),
                     ("all intervals 100s wide", initialize_bins(start=0, end=300, width=100))]

#run 10-fold cross-validation for each of the configured intervals
experiment = Experiment(data)
for (name, bins) in intervals_to_test:
    experiment.add_classifier(TemporalEvidencesClassifier(data.features, data.target_names,
                              bins=bins), name=name)
results = experiment.run(folds=10)

results.print_quality_comparison_at_cutoff(cutoff=1, metrics=["Recall", "Precision", "F1"])