def main(out_dir="results"):

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=True)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}

    for train, test in kf:

        X_train_d, y_train, i_train = docs.Xyi(uids[train])

        interactions = {domain: [] for domain in riskofbias.CORE_DOMAINS}
        for doc_domain in i_train:
            for domain in riskofbias.CORE_DOMAINS:
                if domain == doc_domain:
                    interactions[domain].append(True)
                else:
                    interactions[domain].append(False)

        vec = modhashvec.ModularVectorizer(
            norm=None,
            non_negative=True,
            binary=True,
            ngram_range=(1, 2),
            n_features=2**26)  # since multitask + bigrams = huge feature space
        vec.builder_clear()

        # import pdb; pdb.set_trace()

        vec.builder_add_docs(X_train_d, low=10)  # add base features

        for domain in riskofbias.CORE_DOMAINS:
            X_train_d = docs.X_iter(uids[train])
            print np.sum(interactions[domain]), "/", len(
                interactions[domain]), "added for", domain
            vec.builder_add_interaction_features(
                X_train_d,
                interactions=interactions[domain],
                prefix=domain + "-i-",
                low=2)  # then add interactions

        X_train = vec.builder_fit_transform()
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='accuracy')
        clf.fit(X_train, y_train)

        # free some memory now, only need the model
        del X_train_d  # remove references to these
        del X_train

        del y_train
        clf = clf.best_estimator_  # and we only need the best performing, discard the rest

        # Test on each domain in turn

        filtered_data = riskofbias.SentFilter(data)

        for domain in riskofbias.CORE_DOMAINS:

            X_test_d, y_test = filtered_data.Xy(uids[test], domain=domain)

            # build up test vector

            vec.builder_clear()
            vec.builder_add_docs(X_test_d)  # add base features
            vec.builder_add_docs(X_test_d,
                                 prefix=domain + '-i-')  # add interactions

            X_test = vec.builder_transform()

            y_preds = clf.predict(X_test)

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([1] * len(y_test),
                                          y_test,
                                          domain=domain)

            del X_test_d, X_test, y_test, y_preds

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
示例#2
0
def main(out_dir="results"):

    model_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)
    human_metrics = metrics.BinaryMetricsRecorder(
        domains=riskofbias.CORE_DOMAINS)

    # parse the risk of bias data from Cochrane
    print "risk of bias data!"
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False, skip_small_files=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    ###
    ###    sentence prediction
    ###

    # The first stage is to make the sentence prediction model using the
    #   training data set
    #

    print "First, making sentence prediction model"

    sent_docs = riskofbias.SentFilter(data)

    sent_models = {}  #where the key is the domain name

    sent_vec = modhashvec.InteractionHashingVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**24
    )  # hashing vectorizer so doesn't change per domain in terms of feature space

    for domain in riskofbias.CORE_DOMAINS:

        sent_uids = np.intersect1d(
            uids_train, np.array(sent_docs.get_ids(filter_domain=domain)))
        no_studies = len(sent_uids)

        kf = KFold(no_studies, n_folds=5, shuffle=False)

        print "%d docs obtained for domain: %s" % (no_studies, domain)

        tuned_parameters = {
            "alpha": np.logspace(-4, -1, 5),
            "class_weight": [{
                1: i,
                -1: 1
            } for i in np.logspace(-1, 2, 10)]
        }
        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                           tuned_parameters,
                           scoring='recall')

        X_train_d, y_train = sent_docs.Xy(sent_uids, domain=domain)

        X_train = sent_vec.fit_transform(X_train_d, low=2)

        clf.fit(X_train, y_train)

        sent_models[domain] = clf.best_estimator_
        # import pdb; pdb.set_trace()

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-2, 2, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='f1')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    # add interaction features (here both domain + high prob sentences)

    interactions = {domain: [] for domain in riskofbias.CORE_DOMAINS}

    high_prob_sents = []

    for doc_text, doc_domain in zip(X_train_d, i_train):

        doc_sents = sent_tokenizer.tokenize(doc_text)
        doc_sents_X = sent_vec.transform(doc_sents)

        doc_sents_preds = sent_models[doc_domain].predict(doc_sents_X)

        high_prob_sents.append(" ".join([
            sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
            if sent_pred == 1
        ]))

        print "high prob sents:"

        from collections import Counter
        prob_count = Counter(list(doc_sents_preds))
        print prob_count

        for domain in riskofbias.CORE_DOMAINS:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    # print high_prob_sents

    for domain in riskofbias.CORE_DOMAINS:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_docs(X_train_d,
                             interactions=interactions[domain],
                             prefix=domain + "-i-",
                             low=2)  # then add interactions

    vec.builder_add_docs(high_prob_sents, prefix="-s-", low=2)

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # Test on each domain in turn

    for domain in riskofbias.CORE_DOMAINS:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        #
        #   get high prob sents from test data
        #

        high_prob_sents = []
        for doc_text in X_test_d:

            doc_sents = sent_tokenizer.tokenize(doc_text)
            doc_sents_X = sent_vec.transform(doc_sents)

            doc_sents_preds = sent_models[domain].predict(doc_sents_X)

            high_prob_sents.append(" ".join([
                sent for sent, sent_pred in zip(doc_sents, doc_sents_preds)
                if sent_pred == 1
            ]))

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '-i-')  # add interactions
        vec.builder_add_docs(high_prob_sents, prefix="-s-")

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join(out_dir, outputnames.filename(label="human-performance")))
示例#3
0
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)
    human_metrics = metrics.BinaryMetricsRecorder(domains=target_domains)

    # parse the risk of bias data from Cochrane
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=True)

    # filter the data by Document
    filtered_data = riskofbias.DocFilter(data)

    # get the uids of the desired training set
    # (for this experiment those which appear in only one review)

    uids_all = filtered_data.get_ids(
        pmid_instance=0)  # those with 1 or more assessment (i.e. all)
    uids_double_assessed = filtered_data.get_ids(
        pmid_instance=1
    )  # those with 2 (or more) assessments (to hide for training)

    uids_train = np.setdiff1d(uids_all, uids_double_assessed)

    # we need different test ids for each domain
    # (since we're testing on studies with more than one RoB assessment for *each domain*)

    docs = riskofbias.MultiTaskDocFilter(data)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 10)}
    clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"),
                       tuned_parameters,
                       scoring='accuracy')

    X_train_d, y_train, i_train = docs.Xyi(uids_train, pmid_instance=0)

    interactions = {domain: [] for domain in target_domains}
    for doc_text, doc_domain in zip(X_train_d, i_train):
        for domain in target_domains:
            if domain == doc_domain:
                interactions[domain].append(True)
            else:
                interactions[domain].append(False)

    vec = modhashvec.ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26)  # since multitask + bigrams = huge feature space
    vec.builder_clear()

    vec.builder_add_docs(X_train_d, low=10)  # add base features

    for domain in target_domains:

        print np.sum(interactions[domain]), "/", len(
            interactions[domain]), "added for", domain
        vec.builder_add_interaction_features(X_train_d,
                                             interactions=interactions[domain],
                                             prefix=domain + "_I_",
                                             low=2)  # then add interactions

    X_train = vec.builder_fit_transform()

    clf.fit(X_train, y_train)

    # free some memory now, only need the model
    del X_train_d  # remove references to these
    del X_train
    del y_train
    clf = clf.best_estimator_  # and we only need the best performing, discard the rest

    # Test on each domain in turn

    for domain in target_domains:

        uids_domain_all = filtered_data.get_ids(pmid_instance=0,
                                                filter_domain=domain)
        uids_domain_double_assessed = filtered_data.get_ids(
            pmid_instance=1, filter_domain=domain)
        uids_test_domain = np.intersect1d(uids_domain_all,
                                          uids_domain_double_assessed)

        X_test_d, y_test = filtered_data.Xy(uids_test_domain,
                                            domain=domain,
                                            pmid_instance=0)

        X_ignore, y_human = filtered_data.Xy(uids_test_domain,
                                             domain=domain,
                                             pmid_instance=1)
        X_ignore = None  # don't need this bit

        # build up test vector

        vec.builder_clear()
        vec.builder_add_docs(X_test_d)  # add base features
        vec.builder_add_docs(X_test_d,
                             prefix=domain + '_I_')  # add interactions

        X_test = vec.builder_transform()

        y_preds = clf.predict(X_test)

        model_metrics.add_preds_test(y_preds, y_test, domain=domain)
        human_metrics.add_preds_test(y_human, y_test, domain=domain)
        stupid_metrics.add_preds_test([1] * len(y_test), y_test, domain=domain)

    model_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(
        os.path.join('results', outputnames.filename(label="stupid-baseline")))
    human_metrics.save_csv(
        os.path.join('results',
                     outputnames.filename(label="human-performance")))