# Imports assumed by all examples below (not shown in the original snippets).
# The code is Python 2 (print statements, itertools.izip) and uses the
# pre-0.18 scikit-learn API (KFold built from a sample count, GridSearchCV in
# sklearn.grid_search). riskofbias, metrics, modhashvec and outputnames are
# modules from this project; sent_tokenizer, skip_domains and target_domains
# are module-level definitions that accompany these functions.
import os
import csv

import numpy as np
from itertools import izip
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier


def main():
    dat = riskofbias.RoBData(test_mode=False)
    dat.generate_data(doc_level_only=True)

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    docs = riskofbias.SentFilter(dat)

    for domain in riskofbias.CORE_DOMAINS:

        uids = np.array(docs.get_ids(filter_domain=domain))
        no_studies = len(uids)

        kf = KFold(no_studies, n_folds=5, shuffle=False)

        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='f1')

        for train, test in kf:

            X_train_d, y_train = docs.Xy(uids[train], domain=domain)
            X_test_d, y_test = docs.Xy(uids[test], domain=domain)

            vec = modhashvec.InteractionHashingVectorizer(norm=None,
                                                          non_negative=True,
                                                          binary=True,
                                                          ngram_range=(1, 2),
                                                          n_features=2**24)

            X_train = vec.fit_transform(X_train_d, low=2)
            X_test = vec.transform(X_test_d)

            clf.fit(X_train, y_train)

            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)

            stupid_metrics.add_preds_test([1] * len(y_test),
                                          y_test,
                                          domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
def main():
    dat = riskofbias.RoBData(test_mode=False)
    dat.generate_data(doc_level_only=True)


    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)


    multitask_docs = riskofbias.MultiTaskDocFilter(dat) # use the same ids as the multitask model
    multitask_uids = np.array(multitask_docs.get_ids())
    no_studies = len(multitask_uids)


    kf = KFold(no_studies, n_folds=5, shuffle=False)

    for domain in riskofbias.CORE_DOMAINS:

        docs = riskofbias.DocFilter(dat)
        uids = np.array(docs.get_ids(filter_domain=domain))

        print "%d docs obtained for domain: %s" % (len(uids), domain)


        tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='f1')

        for train, test in kf:

            X_train_d, y_train = docs.Xy(np.intersect1d(uids, multitask_uids[train]), domain=domain)
            X_test_d, y_test = docs.Xy(np.intersect1d(uids, multitask_uids[test]), domain=domain)

            vec = modhashvec.InteractionHashingVectorizer(norm=None, non_negative=True, binary=True)

            X_train = vec.fit_transform(X_train_d, low=2)
            X_test = vec.transform(X_test_d)

            clf.fit(X_train, y_train)

            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)

            stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
def main(out_dir="results"):
    model_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=skip_domains)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_train,
                                                          domain=skip_domains),
                                              low=7)

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing
    # every term with every domain, and the vectorizer (an instance of
    # ModularVectorizer) deals with inserting the actual interaction tokens
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train, domain=skip_domains)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2)

    # setup sentence classifier
    tuned_parameters = {
        "alpha": np.logspace(-4, -1, 5),
        "class_weight": [{
            1: i,
            -1: 1
        } for i in np.logspace(0, 2, 5)]
    }
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                            tuned_parameters,
                            scoring='recall')

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train, domain=skip_domains)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performing
    sent_clf = sent_clf.best_estimator_

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.

    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    X_train_d = docs.Xyi(uids_train, domain=skip_domains)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='f1')

    # bcw: note that I've amended the y method to
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_train, domain=skip_domains)

    # add interaction features (here both domain + high prob sentences)
    interactions = {domain: [] for domain in skip_domains}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(
            doc_sents)  # add base features
        sent_vec.builder_add_interaction_features(
            doc_X_i)  # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print doc_index
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count

        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_train, domain=skip_domains),
                         low=7)  # add base features
    vec.builder_add_docs(docs.Xyi(uids_train, domain=skip_domains),
                         low=2)  # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains),
                         low=2)  # then add sentence interaction terms

    X_train = vec.builder_fit_transform()
    clf.fit(X_train, y_train)

    ############
    # testing  #
    ############

    # Test on each domain in turn
    for domain in skip_domains:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #
        high_prob_sents = []

        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)

            # bcw -- I think this (using doc_domain and not
            # domain) was the bug before!
            #doc_domains = [doc_domain] * len(doc_sents)
            doc_domains = [domain] * len(doc_sents)

            doc_X_i = izip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(
                doc_sents)  # add base features
            sent_vec.builder_add_interaction_features(
                doc_X_i)  # then add interactions
            doc_sents_X = sent_vec.builder_transform()
            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print
        print "domain: %s" % domain
        print "High prob sents:"
        print '\n'.join(high_prob_sents)

        # build up test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(izip(X_test_d,
                                  domain_interactions))  # add interactions
        vec.builder_add_docs(
            izip(high_prob_sents,
                 sent_domain_interactions))  # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))
def main(out_dir="results"):

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=True)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    

    for train, test in kf:

        X_train_d, y_train, i_train = docs.Xyi(uids[train])

        interactions = {domain:[] for domain in riskofbias.CORE_DOMAINS}
        for doc_domain in i_train:
            for domain in riskofbias.CORE_DOMAINS:
                if domain == doc_domain:
                    interactions[domain].append(True)
                else:
                    interactions[domain].append(False)

        vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space
        vec.builder_clear()

        # import pdb; pdb.set_trace()

        vec.builder_add_docs(X_train_d, low=10) # add base features

        for domain in riskofbias.CORE_DOMAINS:
            X_train_d = docs.X_iter(uids[train])
            print np.sum(interactions[domain]), "/", len(interactions[domain]), "added for", domain
            vec.builder_add_interaction_features(X_train_d, interactions=interactions[domain], prefix=domain+"-i-", low=2) # then add interactions

        X_train = vec.builder_fit_transform()
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='accuracy')
        clf.fit(X_train, y_train)

        # free some memory now, only need the model
        del X_train_d # remove references to these
        del X_train

        del y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            X_test_d, y_test = filtered_data.Xy(uids[test], domain=domain)

            # build up test vector
            vec.builder_clear()
            vec.builder_add_docs(X_test_d) # add base features
            vec.builder_add_docs(X_test_d, prefix=domain+'-i-') # add interactions

            X_test = vec.builder_transform()

            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

            del X_test_d, X_test, y_test, y_preds

    model_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
def main():

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space

    for train, test in kf:

        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]),
                                             low=7)  # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]),
                                             low=2)  # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_  # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            vec.builder_clear()
            vec.builder_add_interaction_features(
                docs.X(uids[test], domain=domain))  # add base features
            vec.builder_add_interaction_features(
                docs.X_i(uids[test], domain=domain))  # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test),
                                          y_test,
                                          domain=domain)

            del X_test, y_test, y_preds

        del clf

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    f = open('test_data.csv','wb')
    w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\")
    w.writeheader()

    # parse the risk of bias data from Cochrane     
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space

    for k_i, (train, test) in enumerate(kf):

        if k_i == 1:
            break  # only run the first fold

        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16)

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)

            y_df = clf.decision_function(X_test) # get distances from the decision boundary
            # positive distances = more likely to be relevant sentences

            y_top3 = []
            y_top1 = []
            y_rand = []

            y_uids = np.array(docs.y_uids(uids[test], domain=domain))

            # import pdb; pdb.set_trace()

            for y_uid in np.unique(y_uids):

                mask = np.where(y_uids == y_uid)[0]
                doc_df = y_df[mask]

                doc_top3 = np.argpartition(doc_df, -3)[-3:]
                y_top3.extend(list(mask[doc_top3]))

                doc_top1 = np.argmax(doc_df)
                y_top1.append(mask[doc_top1])

                doc_rand = np.random.randint(0, len(doc_df))
                y_rand.append(mask[doc_rand])

            human_sent_indices = np.where(y_test==1)[0]
            algorithm_sent_indices = np.where(y_preds==1)[0]

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            # import pdb; pdb.set_trace()

            for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))):

                row = {"domain": domain,
                       "sent_text": doc,
                       "random": doc_i in y_rand,
                       "human": doc_i in human_sent_indices,
                       "algorithm": doc_i in algorithm_sent_indices,
                       "top3": doc_i in y_top3,
                       "top1": doc_i in y_top1,
                       "pmid": pmid}

                if row["random"] or row["human"] or row["top3"] or row["top1"]:
                    # please note, the sentences will only be included in the analysis if
                    # in the top1 or top3
                    # we do have data on whether the raw classifier has predicted yes/no
                    # 
                    # this in effect means where the classifier picks <= 3 sentences
                    # we use all raw classifier data
                    # where >3 sentences are predicted by raw classifier, only the
                    # top 3 are used; the rest are discarded
                    w.writerow(row)

            del X_test, y_test, y_preds

        del clf

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    f.close()
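
# The per-document "top 3" selection above relies on np.argpartition, which
# finds the indices of the k largest decision values without a full sort:
def _demo_top3_selection():
    doc_df_demo = np.array([0.1, 2.3, -0.5, 1.7, 0.9])  # margin distances
    doc_top3 = np.argpartition(doc_df_demo, -3)[-3:]    # 3 largest, unordered
    print sorted(doc_top3)  # [1, 3, 4]
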
def main():


    model_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)


    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(pmid_instance=0) # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1) # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='accuracy')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    interactions = {domain:[] for domain in target_domains}
    for doc_text, doc_domain in zip(X_train_d, i_train):
        for domain in target_domains:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10) # add base features

    for domain in target_domains:
        print np.sum(interactions[domain]), "/", len(interactions[domain]), "added for", domain
        vec.builder_add_interaction_features(X_train_d, interactions=interactions[domain], prefix=domain+"_I_", low=2) # then add interactions

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # free some memory now, only need the model
    del X_train_d # remove references to these
    del X_train
    del y_train
    clf = clf.best_estimator_ # and we only need the best performing, discard the rest


    # Test on each domain in turn

    for domain in target_domains:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)


        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None # don't need this bit


        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d) # add base features
        vec.builder_add_docs(X_test_d, prefix=domain+'_I_') # add interactions

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join('results', outputnames.filename(label="human-performance")))   
def main(out_dir="results"):

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(pmid_instance=0) # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1) # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)


    ###
    ###    sentence prediction
    ###

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #


    print "First, making sentence prediction model"

    #sent_docs = riskofbias.SentFilter(data)
    sent_docs = riskofbias.MultiTaskSentFilter(data)

    sent_models = {} #where the key is the domain name

    #sent_vec = modhashvec.InteractionHashingVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**24) # hashing vectorizer so doesn't change per domain in terms of feature space
    sent_vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26)
    for domain in riskofbias.CORE_DOMAINS:

        sent_uids = np.intersect1d(uids_train, np.array(sent_docs.get_ids(filter_domain=domain)))
        no_studies = len(sent_uids)

        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(-1, 1, 5)]}
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall')

        y_train = sent_docs.y(sent_uids, domain=domain)
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(sent_docs.X(sent_uids, domain=domain), low=7) # add base features
        sent_vec.builder_add_interaction_features(sent_docs.X_i(sent_uids, domain=domain), low=2) # then add interactions

        X_train = sent_vec.builder_fit_transform()

        clf.fit(X_train, y_train)

        sent_models[domain] = clf.best_estimator_
        # import pdb; pdb.set_trace()


    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='f1')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    # add interaction features (here both domain + high prob sentences)

    interactions = {domain:[] for domain in riskofbias.CORE_DOMAINS}

    high_prob_sents = []

    for doc_text, doc_domain in zip(X_train_d, i_train):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_sents_X = sent_vec.transform(doc_sents)

        doc_sents_preds = sent_models[doc_domain].predict(doc_sents_X)

        high_prob_sents.append(" ".join([sent for sent, sent_pred in zip(doc_sents, doc_sents_preds) if sent_pred==1]))

        print "high prob sents:"

        from collections import Counter
        prob_count = Counter(list(doc_sents_preds))
        print prob_count

        for domain in riskofbias.CORE_DOMAINS:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10) # add base features

    # print high_prob_sents

    for domain in riskofbias.CORE_DOMAINS:
        print np.sum(interactions[domain]), "/", len(interactions[domain]), "added for", domain
        vec.builder_add_docs(X_train_d, interactions=interactions[domain], prefix=domain+"-i-", low=2) # then add interactions

    vec.builder_add_docs(high_prob_sents, prefix="-s-", low=2)

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # Test on each domain in turn

    for domain in riskofbias.CORE_DOMAINS:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)


        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None # don't need this bit

        #
        #   get high prob sents from test data
        #

        high_prob_sents = []
        for doc_text in X_test_d:

            doc_sents = sent_tokenizer.tokenize(doc_text)
            doc_sents_X = sent_vec.transform(doc_sents)

            doc_sents_preds = sent_models[domain].predict(doc_sents_X)

            high_prob_sents.append(" ".join([sent for sent, sent_pred in zip(doc_sents, doc_sents_preds) if sent_pred==1]))

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d) # add base features
        vec.builder_add_docs(X_test_d, prefix=domain+'-i-') # add interactions
        vec.builder_add_docs(high_prob_sents, prefix="-s-")

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)


    model_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="human-performance")))   
def main(out_dir="results"):

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ###
    ###    sentence prediction
    ###

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #

    print "First, making sentence prediction model"

    sent_docs = riskofbias.SentFilter(data)

    sent_models = {}  #where the key is the domain name

    sent_vec = modhashvec.InteractionHashingVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**24
    )  # hashing vectorizer so doesn't change per domain in terms of feature space

    for domain in riskofbias.CORE_DOMAINS:

        sent_uids = np.intersect1d(
            uids_train, np.array(sent_docs.get_ids(filter_domain=domain)))
        no_studies = len(sent_uids)

        kf = KFold(no_studies, n_folds=5, shuffle=False)

        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {
            "alpha": np.logspace(-4, -1, 5),
            "class_weight": [{
                1: i,
                -1: 1
            } for i in np.logspace(-1, 2, 10)]
        }
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        X_train_d, y_train = sent_docs.Xy(sent_uids, domain=domain)

        X_train = sent_vec.fit_transform(X_train_d, low=2)

        clf.fit(X_train, y_train)

        sent_models[domain] = clf.best_estimator_
        # import pdb; pdb.set_trace()

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='f1')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    # add interaction features (here both domain + high prob sentences)

    interactions = {domain: [] for domain in riskofbias.CORE_DOMAINS}

    high_prob_sents = []

    for doc_text, doc_domain in zip(X_train_d, i_train):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_sents_X = sent_vec.transform(doc_sents)

        doc_sents_preds = sent_models[doc_domain].predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))

        print "high prob sents:"

        from collections import Counter
        prob_count = Counter(list(doc_sents_preds))
        print prob_count

        for domain in riskofbias.CORE_DOMAINS:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    # print high_prob_sents

    for domain in riskofbias.CORE_DOMAINS:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_docs(X_train_d,
                             interactions=interactions[domain],
                             prefix=domain + "-i-",
                             low=2)  # then add interactions

    vec.builder_add_docs(high_prob_sents, prefix="-s-", low=2)

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # Test on each domain in turn

    for domain in riskofbias.CORE_DOMAINS:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #

        high_prob_sents = []
        for doc_text in X_test_d:

            doc_sents = sent_tokenizer.tokenize(doc_text)
            doc_sents_X = sent_vec.transform(doc_sents)

            doc_sents_preds = sent_models[domain].predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '-i-')  # add interactions
        vec.builder_add_docs(high_prob_sents, prefix="-s-")

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))
def main(out_dir="results"):
    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(pmid_instance=0) # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1) # those with 2 (or more) assessments (to hide for training)
    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ########################
    # sentence prediction  #
    ########################


    # The first stage is to make the sentence prediction model using the
    #   training data set
    #
    print "First, making sentence prediction model"
    sent_docs = riskofbias.MultiTaskSentFilter(data)
    uids = np.array(sent_docs.get_ids())
    no_studies = len(uids)

    # sentence tokenization
    sent_vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space
    sent_vec.builder_clear()
    # add base features; this effectively generates the shared feature
    # space (i.e., features for all domains)
    sent_vec.builder_add_interaction_features(sent_docs.X(uids_train), low=7) 

    # now we add interaction features, which cross the domain with the
    # tokens. specifically, the X_i method returns token tuples crossing 
    # every term with every domain, and the vectorizer (an instance of 
    # ModularVectorizer) deals with inserting the actual interaction tokens 
    # that cross domains with tokens.
    domain_interaction_tuples = sent_docs.X_i(uids_train)
    sent_vec.builder_add_interaction_features(domain_interaction_tuples, low=2) 

    # setup sentence classifier
    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}
    # bcw: are we sure we want to do 'recall' here, and not (e.g.) F1?
    sent_clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=4)

    X_train = sent_vec.builder_fit_transform()
    y_train = sent_docs.y(uids_train)

    sent_clf.fit(X_train, y_train)
    del X_train, y_train
    # we only need the best performing
    sent_clf = sent_clf.best_estimator_ 

    # now we have our multi-task sentence prediction model,
    # which we'll use to make sentence-level predictions for
    # documents.


    ########################
    # document prediction  #
    ########################

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)
    docs = riskofbias.MultiTaskDocFilter(data)
    X_train_d = docs.Xyi(uids_train)


    tuned_parameters = {"alpha": np.logspace(-4, -1, 5)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='f1')

    # bcw: note that I've amended the y method to 
    # return interactions as well (i.e., domain strs)
    y_train = docs.y(uids_train)

    # add interaction features (here both domain + high prob sentences)
    interactions = {domain:[] for domain in riskofbias.CORE_DOMAINS}
    high_prob_sents = []
    interaction_domains = []

    for doc_index, (doc_text, doc_domain) in enumerate(X_train_d):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_domains = [doc_domain] * len(doc_sents)
        # interactions
        doc_X_i = izip(doc_sents, doc_domains)

        # sent_vec is from above.
        sent_vec.builder_clear()
        sent_vec.builder_add_interaction_features(doc_sents) # add base features
        sent_vec.builder_add_interaction_features(doc_X_i) # then add interactions
        doc_sents_X = sent_vec.builder_transform()

        ## bcw -- shouldn't we use the *true* sentence labels
        # here, rather than predictions????

        # sent_clf was trained above
        doc_sents_preds = sent_clf.predict(doc_sents_X)

        high_prob_sents.append(" ".join([sent for sent, sent_pred in 
                        zip(doc_sents, doc_sents_preds) if sent_pred==1]))
        interaction_domains.append("-s-" + doc_domain)

        if doc_index % 10 == 0:
            print doc_index
        # from collections import Counter
        # prob_count = Counter(list(doc_sents_preds))
        # print prob_count
        
        # for domain in riskofbias.CORE_DOMAINS:
        #     if domain == doc_domain:
        #         interactions[domain].append(True)
        #     else:
        #         interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space
    vec.builder_clear()
    vec.builder_add_docs(docs.X(uids_train), low=7) # add base features
    vec.builder_add_docs(docs.Xyi(uids_train), low=2) # add domain interactions
    # removed X_train_d since already been through the generator! (needed reset)
    vec.builder_add_docs(izip(high_prob_sents, interaction_domains), low=2)    # then add sentence interaction terms

    X_train = vec.builder_fit_transform()    
    clf.fit(X_train, y_train)

    ############
    # testing  #
    ############

    # Test on each domain in turn
    for domain in riskofbias.CORE_DOMAINS:
        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)
        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None # don't need this bit

        #
        #   get high prob sents from test data
        #
        high_prob_sents = []
        
        for doc_text in X_test_d:
            doc_sents = sent_tokenizer.tokenize(doc_text)

            # bcw -- I think this (using doc_domain and not 
            # domain) was the bug before!
            #doc_domains = [doc_domain] * len(doc_sents)
            doc_domains = [domain] * len(doc_sents)

            doc_X_i = izip(doc_sents, doc_domains)

            sent_vec.builder_clear()
            sent_vec.builder_add_interaction_features(doc_sents) # add base features
            sent_vec.builder_add_interaction_features(doc_X_i) # then add interactions
            doc_sents_X = sent_vec.builder_transform()
            doc_sents_preds = sent_clf.predict(doc_sents_X)

            high_prob_sents.append(" ".join(
                                    [sent for sent, sent_pred in 
                                        zip(doc_sents, doc_sents_preds) if sent_pred==1]))

        
        sent_domain_interactions = ["-s-" + domain] * len(high_prob_sents)
        domain_interactions = [domain] * len(high_prob_sents)

        print
        print "domain: %s" % domain
        print "High prob sents:"
        print '\n'.join(high_prob_sents)

        # build up test vector
        vec.builder_clear()
        vec.builder_add_docs(X_test_d) # add base features
        vec.builder_add_docs(izip(X_test_d, domain_interactions)) # add interactions
        vec.builder_add_docs(izip(high_prob_sents, sent_domain_interactions)) # sentence interactions

        X_test = vec.builder_transform()
        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)


    model_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join(out_dir, outputnames.filename(label="human-performance")))   
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane     
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space

    for train, test in kf:

        y_train = docs.y(uids[train])

        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall')

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            del X_test, y_test, y_preds

        del clf

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
def main(out_dir="results"):

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=True)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}

    for train, test in kf:

        X_train_d, y_train, i_train = docs.Xyi(uids[train])

        interactions = {domain: [] for domain in riskofbias.CORE_DOMAINS}
        for doc_domain in i_train:
            for domain in riskofbias.CORE_DOMAINS:
                if domain == doc_domain:
                    interactions[domain].append(True)
                else:
                    interactions[domain].append(False)

        vec = modhashvec.ModularVectorizer(
            norm=None,
            non_negative=True,
            binary=True,
            ngram_range=(1, 2),
            n_features=2**26)  # since multitask + bigrams = huge feature space
        vec.builder_clear()

        # import pdb; pdb.set_trace()

        vec.builder_add_docs(X_train_d, low=10)  # add base features

        for domain in riskofbias.CORE_DOMAINS:
            X_train_d = docs.X_iter(uids[train])
            print np.sum(interactions[domain]), "/", len(
                interactions[domain]), "added for", domain
            vec.builder_add_interaction_features(
                X_train_d,
                interactions=interactions[domain],
                prefix=domain + "-i-",
                low=2)  # then add interactions

        X_train = vec.builder_fit_transform()
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='accuracy')
        clf.fit(X_train, y_train)

        # free some memory now, only need the model
        del X_train_d  # remove references to these
        del X_train

        del y_train
        clf = clf.best_estimator_  # and we only need the best performing, discard the rest

        # Test on each domain in turn

        filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            X_test_d, y_test = filtered_data.Xy(uids[test], domain=domain)

            # build up test vector

            vec.builder_clear()
            vec.builder_add_docs(X_test_d)  # add base features
            vec.builder_add_docs(X_test_d,
                                 prefix=domain + '-i-')  # add interactions

            X_test = vec.builder_transform()

            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([1] * len(y_test),
                                          y_test,
                                          domain=domain)

            del X_test_d, X_test, y_test, y_preds

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='accuracy')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    interactions = {domain: [] for domain in target_domains}
    for doc_text, doc_domain in zip(X_train_d, i_train):
        for domain in target_domains:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    for domain in target_domains:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_interaction_features(X_train_d,
                                             interactions=interactions[domain],
                                             prefix=domain + "_I_",
                                             low=2)  # then add interactions

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # free some memory now, only need the model
    del X_train_d  # remove references to these
    del X_train
    del y_train
    clf = clf.best_estimator_  # and we only need the best performing, discard the rest

    # Test on each domain in turn

    for domain in target_domains:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '_I_')  # add interactions

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join('results',
                     outputnames.filename(label="human-performance")))
def main():


    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)


    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(pmid_instance=0) # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(pmid_instance=1) # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    for domain in riskofbias.CORE_DOMAINS:

        # print "%d docs obtained for domain: %s" % (len(uids), domain)

        tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='accuracy')

        # no_studies = len(uids)

        X_train_d, y_train = filtered_data.Xy(uids_train, domain=domain, pmid_instance=0)

        # get domain test ids
        # (i.e. the double assessed trials, which have a judgement for the current domain in
        #   *both* the 0th and 1st review)
        uids_domain_all = filtered_data.get_ids(pmid_instance=0, filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all, uids_domain_double_assessed)


        X_test_d, y_test = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain, domain=domain, pmid_instance=1)
        X_ignore = None # don't need this bit


        vec = modhashvec.InteractionHashingVectorizer(norm=None, non_negative=True, binary=True)

        X_train = vec.fit_transform(X_train_d, low=2)
        X_test = vec.transform(X_test_d)

        clf.fit(X_train, y_train)

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(os.path.join('results', outputnames.filename(label="human-performance")))