Example #1
import os

def extract(metrics, model_dir, params_filename, metrics_filename):
    # Recursively collect metrics + params JSON from model_dir and its subdirectories.
    metrics_file = os.path.join(model_dir, metrics_filename)
    params_file = os.path.join(model_dir, params_filename)
    if os.path.isfile(metrics_file):
        data = load_from_json(metrics_file)
        data.update(load_from_json(params_file))
        data['path'] = model_dir
        metrics.append(data)

    for subitem in os.listdir(model_dir):
        subdir = os.path.join(model_dir, subitem)
        if not os.path.isdir(subdir):
            continue
        extract(metrics, subdir, params_filename, metrics_filename)
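These snippets all call a project-local load_from_json helper rather than json.load directly. A minimal sketch of what such a helper presumably looks like (an assumption; each project's real helper differs, and example #8 below even uses a different signature):

import json

def load_from_json(path):
    # Hypothetical sketch: read a JSON file and return the parsed object.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)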
Example #2
import geograpy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def cityDic(places):
    geolocator = Nominatim(user_agent="specify_your_app_name_here")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    place_dicts = []
    for place in places:
        place_dict = {"text":place, "address":"", "latitude":"", "longtitude":""}
        location = geocode(place)
        if location:
            place_dict["address"] = location.address
            point = tuple(location.point)
            place_dict["latitude"] = point[0]
            place_dict["longtitude"] = point[1]

        place_dicts.append(place_dict)
    return place_dicts

if __name__ == '__main__':
    args = get_args()
    data = load_from_json(args.data)

    place_tags = []
    # TODO : Process only sentences with label 1
    for sentence in data["sentences"]:
        places = geograpy.get_place_context(text=sentence)
        place_dicts = cityDic(places.cities)  # NOTE: only cities are geocoded here
        place_tags.append(place_dicts)

    data["place_tags"] = place_tags
    write_to_json(data, data["id"], extension="json", out_dir=args.out_dir)
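Besides load_from_json, this script relies on the project helpers get_args and write_to_json. A minimal sketch of the write_to_json counterpart, inferred from the call above (hypothetical; the real helper may differ):

import json
import os

def write_to_json(data, name, extension="json", out_dir="."):
    # Hypothetical sketch: write `data` to <out_dir>/<name>.<extension>.
    path = os.path.join(out_dir, "{}.{}".format(name, extension))
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)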
Example #3
def ui_load(expenses):
    path = ui_input_path()

    serialized = load_from_json(path)
    expenses.do('set_serialized', serialized)
Example #4
def make_supplementary_table_three(
    *, args, topn_results_obs, topn_results_counter_diss, topn_results_counter_suff
):
    casecards = load_from_json(args.datapath / VIGNETTES_FILE)

    paired_results = {}
    doc_topn = {}
    doc_topn_caseav = {}
    doc_score, obs_score = [], []
    for num, card in enumerate(casecards.values()):
        if args.first is not None and num >= args.first:
            continue

        true_id = card["card"]["diseases"][0]["id"]
        pred_suff = topn_results_counter_suff[num]
        pred_diss = topn_results_counter_diss[num]
        pred_obs = topn_results_obs[num]
        doc_res_n = doctor_top_ns(card, true_id)
        for val in doc_res_n:
            if val[1] == 0:
                continue
            paired_results.setdefault(val[0], []).append(
                [
                    deepcopy(val[2]),
                    deepcopy(pred_suff[val[1] - 1]),
                    deepcopy(pred_diss[val[1] - 1]),
                    deepcopy(pred_obs[val[1] - 1]),
                ]
            )
        for val in doc_res_n:
            if val[1] == 0:
                continue

            if val[0] not in doc_topn:
                doc_topn[val[0]] = {
                    "count": 1,
                    "sufficiency": {
                        val[1]: np.array([1, deepcopy(pred_suff[val[1] - 1])])
                    },
                    "disablement": {
                        val[1]: np.array([1, deepcopy(pred_diss[val[1] - 1])])
                    },
                    "obs": {val[1]: np.array([1, deepcopy(pred_obs[val[1] - 1])])},
                    "doctor": {val[1]: np.array([1, deepcopy(val[2])])},
                }
            else:
                doc_topn[val[0]]["count"] += 1
                if val[1] not in doc_topn[val[0]]["sufficiency"]:
                    # this doctor has never had this score before
                    doc_topn[val[0]]["sufficiency"][val[1]] = np.array(
                        [1, deepcopy(pred_suff[val[1] - 1])]
                    )
                    doc_topn[val[0]]["disablement"][val[1]] = np.array(
                        [1, deepcopy(pred_diss[val[1] - 1])]
                    )
                    doc_topn[val[0]]["obs"][val[1]] = np.array(
                        [1, deepcopy(pred_obs[val[1] - 1])]
                    )
                    doc_topn[val[0]]["doctor"][val[1]] = np.array([1, deepcopy(val[2])])
                else:
                    doc_topn[val[0]]["sufficiency"][val[1]] += np.array(
                        [1, deepcopy(pred_suff[val[1] - 1])]
                    )
                    doc_topn[val[0]]["disablement"][val[1]] += np.array(
                        [1, deepcopy(pred_diss[val[1] - 1])]
                    )
                    doc_topn[val[0]]["obs"][val[1]] += np.array(
                        [1, deepcopy(pred_obs[val[1] - 1])]
                    )
                    doc_topn[val[0]]["doctor"][val[1]] += np.array(
                        [1, deepcopy(val[2])]
                    )

        this_card_res_doc = {n: [] for n in range(1, 10)}
        this_card_res_suff = {n: [] for n in range(1, 10)}
        this_card_res_diss = {n: [] for n in range(1, 10)}
        this_card_res_obs = {n: [] for n in range(1, 10)}
        for val in doc_res_n:
            if val[1] == 0:
                continue
            if val[1] > 9:
                continue
            this_card_res_doc[val[1]] += [val[2]]
            this_card_res_suff[val[1]] += [deepcopy(pred_suff[val[1] - 1])]
            this_card_res_diss[val[1]] += [deepcopy(pred_diss[val[1] - 1])]
            this_card_res_obs[val[1]] += [deepcopy(pred_obs[val[1] - 1])]
        this_card_res_doc = {k: mean_list(v) for k, v in this_card_res_doc.items()}
        this_card_res_suff = {k: mean_list(v) for k, v in this_card_res_suff.items()}
        this_card_res_diss = {k: mean_list(v) for k, v in this_card_res_diss.items()}
        this_card_res_obs = {k: mean_list(v) for k, v in this_card_res_obs.items()}
        for k, val in this_card_res_doc.items():
            if val == "none":  # no data collected on differentials of this size
                continue
            # if a value was collected for doctors, it was collected for the
            # other algorithms too
            if k not in doc_topn_caseav:
                doc_topn_caseav[k] = {
                    "count": 1,
                    "suff": deepcopy(this_card_res_suff[k]),
                    "diss": deepcopy(this_card_res_diss[k]),
                    "obs": deepcopy(this_card_res_obs[k]),
                    "doc": deepcopy(this_card_res_doc[k]),
                }
            else:  # we have recorded a case of this length before
                doc_topn_caseav[k]["count"] += 1
                doc_topn_caseav[k]["suff"] += deepcopy(this_card_res_suff[k])
                doc_topn_caseav[k]["diss"] += deepcopy(this_card_res_diss[k])
                doc_topn_caseav[k]["obs"] += deepcopy(this_card_res_obs[k])
                doc_topn_caseav[k]["doc"] += deepcopy(this_card_res_doc[k])

        # card mean score (only over entries where the doctor gave a score)
        scored = [_val for _val in doc_res_n if _val[1] != 0]
        if not scored:
            continue
        doc_mean_score = np.mean([val[2] for val in scored])
        obs_mean_score = np.mean(
            [deepcopy(pred_obs[val[1] - 1]) for val in scored]
        )
        doc_score += [doc_mean_score]
        obs_score += [obs_mean_score]

    # NOTE: the per-card mean scores accumulated above are discarded here;
    # the lists below are rebuilt from the per-doctor aggregates in doc_topn.
    doc_score = []
    doc_error = []
    obs_score = []
    obs_error = []
    suff_score = []
    suff_error = []
    diss_score = []
    diss_error = []

    for k, val in doc_topn.items():
        n = val["count"]
        if n < 50:  # require at least 50 scored cases per doctor
            continue

        # each stored entry is np.array([count, hits]); index 1 is the hit total
        docp = sum(val["doctor"].values())[1] / n
        obsp = sum(val["obs"].values())[1] / n
        suffp = sum(val["sufficiency"].values())[1] / n
        dissp = sum(val["disablement"].values())[1] / n
        doc_score += [docp]
        doc_error += [np.sqrt(docp * (1 - docp) / n)]
        obs_score += [obsp]
        obs_error += [np.sqrt(obsp * (1 - obsp) / n)]
        suff_score += [suffp]
        suff_error += [np.sqrt(suffp * (1 - suffp) / n)]
        diss_score += [dissp]
        diss_error += [np.sqrt(dissp * (1 - dissp) / n)]

    raw_data = {
        "doc_score": doc_score,
        "doc_error": doc_error,
        "obs_score": obs_score,
        "obs_error": obs_error,
        "sufficiency_score": suff_score,
        "sufficiency_error": suff_error,
        "disablement_score": diss_score,
        "disablement_error": diss_error,
    }
    df_results = pd.DataFrame(
        raw_data,
        columns=[
            "doc_score",
            "doc_error",
            "obs_score",
            "obs_error",
            "sufficiency_score",
            "sufficiency_error",
            "disablement_score",
            "disablement_error",
        ],
    )

    df_results.to_pickle(args.results / "supp_table_3_df.p")

    return df_results, doc_topn
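This function leans on a mean_list helper that is not shown; the "none" check above implies it averages a list of scores and returns the sentinel string "none" when the list is empty. A minimal sketch under that assumption:

import numpy as np

def mean_list(values):
    # Hypothetical sketch: average a list of scores, returning the
    # sentinel "none" when no data was collected for that bucket.
    if not values:
        return "none"
    return np.mean(values)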
Example #5
import utils
import random
import collections
from sklearn import svm, tree

# Set parameters and load dataset
DATASET = 'soundscapesDescriptor'
NUMBER_OF_DIMENSIONS_OF_FEATURE_VECTOR = 7 # Maximum number of dimensions of the feature vector; only the N most common tags will be used. Use a big number to "omit" this parameter
CLASSIFIER_TYPE = 'tree' # Use 'svm' or 'tree'
PERCENTAGE_OF_TRAINING_DATA = 0.5 # Percentage of sounds that will be used for training (others are for testing)
MAX_INPUT_TAGS_FOR_TESTING = 5 # Use a big number to "omit" this parameter and use as many tags as originally are in the sound
dataset = utils.load_from_json(DATASET + '.json')
CLASS_NAMES = list(dataset.keys())
N = len(dataset[CLASS_NAMES[0]]) # Number of sounds per class

# 3) Define vector space
# **********************

# Get all tags in the dataset (the vocabulary)
all_tags = list()
for class_name in CLASS_NAMES:
    class_tags = utils.get_all_tags_from_class(class_name, dataset)
    all_tags += class_tags

# Filter out low-frequency tags (keep only the top N most common tags)
most_common_tags = [tag for tag, count in collections.Counter(all_tags).most_common(NUMBER_OF_DIMENSIONS_OF_FEATURE_VECTOR)]
filtered_tags = [tag for tag in most_common_tags if tag in all_tags]

# Build our prototype feature vector (unique list of tags), and print first 10 tags
prototype_feature_vector = list(set(filtered_tags))
print('Created prototype feature vector with %i dimensions (originally %i dimensions)' % (len(prototype_feature_vector), len(set(all_tags))))
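The prototype feature vector defines the vector space: each sound can then be encoded as a binary vector marking which of the prototype tags it carries. A sketch of that encoding (feature_vector_from_tags is hypothetical, not part of the course utils module):

def feature_vector_from_tags(sound_tags, prototype_feature_vector):
    # 1.0 where the sound carries the prototype tag, 0.0 otherwise.
    return [1.0 if tag in sound_tags else 0.0 for tag in prototype_feature_vector]

# e.g. feature_vector_from_tags(['birds', 'field-recording'],
#                               ['birds', 'city', 'field-recording'])
# -> [1.0, 0.0, 1.0]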
Example #6
def make_table_one_and_supplementary_table_two(
    *, args, topn_results_obs, topn_results_counter_diss, topn_results_counter_suff
):
    casecards = load_from_json(args.datapath / VIGNETTES_FILE)

    results_obs = {
        "common": [],
        "rare": [],
        "very_rare": [],
        "almost_impossible": [],
        "uncommon": [],
        "very_common": [],
    }
    results_counter = {
        "common": [],
        "rare": [],
        "very_rare": [],
        "almost_impossible": [],
        "uncommon": [],
        "very_common": [],
    }
    wins_obs = {
        "common": 0,
        "rare": 0,
        "very_rare": 0,
        "almost_impossible": 0,
        "uncommon": 0,
        "very_common": 0,
    }
    wins_counter = {
        "common": 0,
        "rare": 0,
        "very_rare": 0,
        "almost_impossible": 0,
        "uncommon": 0,
        "very_common": 0,
    }
    draws = {
        "common": 0,
        "rare": 0,
        "very_rare": 0,
        "almost_impossible": 0,
        "uncommon": 0,
        "very_common": 0,
    }

    for num, card in enumerate(casecards.values()):
        if args.first is not None and num >= args.first:
            continue

        rareness = card["card"]["diseases"][0]["rareness"]
        r_obs = sum(topn_results_obs[num])
        r_suff = sum(topn_results_counter_suff[num])
        # convert cumulative top-n hit counts into ranks, capped at 20
        results_obs[rareness] += [min(21 - r_obs, 20)]
        results_counter[rareness] += [min(21 - r_suff, 20)]

        if r_obs > r_suff:
            wins_obs[rareness] += 1
        elif r_obs < r_suff:
            wins_counter[rareness] += 1
        else:
            draws[rareness] += 1

    results_obs_sum = []
    wins_obs_all, wins_counter_all, draws_all = 0, 0, 0
    for k, val in results_obs.items():
        results_obs_sum += val
        wins_obs_all += wins_obs[k]
        draws_all += draws[k]
        results_obs[k] = {"mean": np.mean(val), "std": np.std(val)}
    results_obs["all"] = {
        "mean": np.mean(results_obs_sum),
        "std": np.std(results_obs_sum),
    }

    results_counter_sum = []
    for k, val in results_counter.items():
        results_counter_sum += val
        wins_counter_all += wins_counter[k]
        results_counter[k] = {"mean": np.mean(val), "std": np.std(val)}
    results_counter["all"] = {
        "mean": np.mean(results_counter_sum),
        "std": np.std(results_counter_sum),
    }
    draws["all"] = draws_all
    wins_obs["all"] = wins_obs_all
    wins_counter["all"] = wins_counter_all

    print("> Observational Results")
    pprint(results_obs)
    print("")

    print("> Counterfactual Results")
    pprint(results_counter)
    print("")
Example #7
def load_networks(datapath, filename=NETWORKS_FILE):
    return load_from_json(datapath / filename)
Example #8
"""
# Network Parameters
num_epochs = 10
hidden_layers = [25, 25]
learning_rate = .01

# Iterate through different time periods
for response in ['daily', 'weekly', 'bi_weekly', 'monthly']:
    print('Beginning {} Models'.format(response))

    # Load which predictors will be used
    with open('saved_models/' + response + '/' + 'predictors.json', 'r') as fd:
        predictors = json.loads(fd.read())

    # Load the predictor values from files
    variables_dict = load_from_json(predictors, response, verbose=True)

    # Split the data into training and test sets
    xtrain, xtest, ytrain, ytest, ytrain_hot, ytest_hot = create_model_data(
        variables_dict, predictors, response, model_type='Both')
    # Iterate through regression and classification models
    for model_type in ['Regression', 'Classification']:
        print('Beginning {} Model for {} Predictions'.format(
            model_type, response))
        print('Setting up TensorBoard')

        # Create the model structure, define the cost optimization functions
        x = tf.placeholder(tf.float32, [None, len(predictors)], name='x')
        if model_type == 'Classification':
            y = tf.placeholder(tf.float32, [None, 2], name='y')
            output_layer = create_network(x, hidden_layers, num_classes=2)
Example #9
def main(args):

    # Source 1 times
    # Source 2 newind
    # Source 3 ind
    # Source 4 thehin
    # Source 5 scm
    # Source 6 people

    data = load_from_json(args.data)
    filename = args.input_dir + "/" + data["id"]

    with open(filename, "rb") as g:
        html_string = g.read()

    text = data["text"].splitlines()

    stoplist1 = None
    stoplist2 = None
    stoplist3 = None
    stoplist4 = None
    if args.source == 1:
        text = deletesamesubstr(text)
        stoplist1 = [
            "RELATED", "From around the web", "More from The Times of India",
            "Recommended By Colombia", "more from times of india Cities",
            "You might also", "You might also like",
            "more from times of india", "All Comments ()+^ Back to Top",
            "more from times of india News", "more from times of india TV",
            "more from times of india Sports",
            "more from times of india Entertainment",
            "more from times of india Life & Style",
            "more from times of india Business"
        ]
        stoplist2 = ["FOLLOW US", "FOLLOW PHOTOS", "FOLLOW LIFE & STYLE"]

    elif args.source == 3:
        stoplist1 = [
            "Tags:", "ALSO READ", "Please read our before posting comments",
            "TERMS OF USE: The views expressed in comments published on indianexpress.com are those of the comment writer's alone. They do not represent the views or opinions of The Indian Express Group or its staff. Comments are automatically posted live; however, indianexpress.com reserves the right to take it down at any time. We also reserve the right not to publish comments that are abusive, obscene, inflammatory, derogatory or defamatory."
        ]

    elif args.source == 4:
        stoplist3 = [
            "ShareArticle", "Updated:", "MoreIn", "SpecialCorrespondent",
            "METRO PLUS", "EDUCATION PLUS", "PROPERTY PLUS", "CINEMA PLUS",
            "DISTRICT PLUS"
        ]
        stoplist4 = [
            "METRO PLUS", "EDUCATION PLUS", "PROPERTY PLUS", "CINEMA PLUS",
            "DISTRICT PLUS"
        ]

    elif args.source == 5:
        stoplist1 = ["Print Email", "Video"]
        stoplist2 = [
            "Viewed", "Associated Press", "Get updates direct to your inbox",
            "Opinion"
        ]

    elif args.source == 6:
        stoplist2 = [
            'Email | Print', '+', 'stumbleupon', 'More Pictures',
            'Save Article',
            'Click the "PLAY" button and listen. Do you like the online audio service here?',
            'Good, I like it', 'Do you have anything to say?', 'Name'
        ]
        text = [line for line in text if not line.startswith("Source")]

    if text:
        text = deletecertainstr(text,
                                stoplist1=stoplist1,
                                stoplist2=stoplist2,
                                stoplist3=stoplist3)
        if text:
            text, data = addnewstime(text,
                                     html_string,
                                     data,
                                     args.source,
                                     stoplist=stoplist4)
            if args.source == 1:
                text = deletesamesubstr(text)
            if text:
                text = "".join([
                    line.strip() + "\n" if line.strip() != "" else ""
                    for line in text
                ])[:-1]
                data["text"] = text
                data = dump_to_json(data)

    return data
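The "".join(...) expression near the end of main strips each line, drops blank lines, and removes the trailing newline. A quick illustration of that normalization:

lines = ["  Headline ", "", "Body text.  "]
joined = "".join(
    line.strip() + "\n" if line.strip() != "" else "" for line in lines
)[:-1]
assert joined == "Headline\nBody text."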
Example #10
def run_vignettes_experiment(*, args):
    if args.reproduce is False:
        # run over the test_networks.json file and perform the inference calculation
        networks = load_from_json(args.datapath / NETWORKS_FILE)
        casecards = load_from_json(args.datapath / VIGNETTES_FILE)
        inference_output = None
    else:
        # use the pre-calculated inference output
        networks = None
        inference_output = load_from_json(args.datapath / RESULTS_FILE)
        casecards = load_from_json(args.datapath / VIGNETTES_FILE)

    topn_results_obs = []
    topn_results_counter_suff = []
    topn_results_counter_diss = []

    count_all = 0
    ind_obs_store = []
    ind_suff_store = []
    ind_diss_store = []

    total_to_run = len(casecards)
    if args.first is not None:
        total_to_run = args.first

    pbar = tqdm(total=total_to_run, desc="Casecards", unit="cards")

    for card in casecards.values():

        if args.first is not None and count_all >= args.first:
            continue

        if args.reproduce is False:
            if card["card"]["network_name"] not in networks:
                continue

        if inference_output is None and networks is not None:
            counter_suff, counter_diss, obs, true_id = run_single_vignette(
                card=card,
                networks=networks,
                datapath=args.datapath,
            )
        else:
            output = inference_output[str(card["card"]["id"])]
            counter_suff = output["sufficiency"]
            counter_diss = output["disablement"]
            obs = output["posterior"]
            true_id = card["card"]["diseases"][0]["id"]

        pred_suff = np.array(
            [
                1
                if true_id
                in sorted(counter_suff, key=counter_suff.get, reverse=True)[:i]
                else 0
                for i in range(1, 21)
            ]
        )
        pred_diss = np.array(
            [
                1
                if true_id
                in sorted(counter_diss, key=counter_diss.get, reverse=True)[:i]
                else 0
                for i in range(1, 21)
            ]
        )
        pred_obs = np.array(
            [
                1 if true_id in sorted(obs, key=obs.get, reverse=True)[:i] else 0
                for i in range(1, 21)
            ]
        )
        topn_results_obs += [pred_obs]
        topn_results_counter_suff += [pred_suff]
        topn_results_counter_diss += [pred_diss]
        count_all += 1

        pbar.update(1)

        if args.verbose and (
            (count_all % 10 == 0) or (count_all == len(casecards) - 1)
        ):
            pbar.write(f"N_processed: {count_all}")
            pbar.write(f"TopN CFSuff: {sum(topn_results_counter_suff) / count_all}")
            pbar.write(f"TopN CFDiss: {sum(topn_results_counter_diss) / count_all}")
            pbar.write(f"TopN Obs:    {sum(topn_results_obs) / count_all}\n")

    write_to_pickle(topn_results_obs, args.results / RESULTS_OBS_FILE)
    write_to_pickle(
        topn_results_counter_diss, args.results / RESULTS_CF_DISSABLEMENT_FILE
    )
    write_to_pickle(
        topn_results_counter_suff, args.results / RESULTS_CF_SUFFICIENCY_FILE
    )
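The results are persisted with a project-local write_to_pickle helper. A minimal sketch of what it presumably does (an assumption; the real helper may differ):

import pickle

def write_to_pickle(obj, path):
    # Hypothetical sketch: serialize `obj` to `path` with pickle.
    with open(path, "wb") as f:
        pickle.dump(obj, f)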