Example #1
0
def __save_references(
    project: str, issue_summary: Tuple[str, int, Set[str], Set[str], Set[str],
                                       Set[str], Set[str], Set[str], List[str],
                                       List[str]]
) -> None:
    """
    Save references for an issue in JSON format.
    :param project: Project to write references for
    :param issue_summary: Tuple containing the issue key, issue id, and the collected references
                          (URLs, revisions, mailing lists, PDF documents, archives, other issues,
                          commits and pull requests)
    :return: None
    """
    summary_dir = os.path.join("Projects", project, "Summary")
    if not os.path.isdir(summary_dir):
        os.mkdir(summary_dir)

    issue_dict = {
        "issue_key": issue_summary[0],
        "issue_id": issue_summary[1],
        "urls": list(issue_summary[2]),
        "revisions": list(issue_summary[3]),
        "mailing_lists": list(issue_summary[4]),
        "pdf_documents": list(issue_summary[5]),
        "archives": list(issue_summary[6]),
        "other_issues": list(issue_summary[7]),
        "commits": issue_summary[8],
        "pull_requests": issue_summary[9]
    }
    path = os.path.join(summary_dir, issue_summary[0] + ".json")
    utils.save_as_json(issue_dict, path)
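# Hedged usage sketch: the 10-field issue_summary tuple that __save_references above expects;
# the project name and all values here are invented.
example_summary = (
    "PROJ-1",                      # issue key
    10001,                         # issue id
    {"https://example.org/spec"},  # urls
    set(),                         # revisions
    set(),                         # mailing_lists
    set(),                         # pdf_documents
    set(),                         # archives
    set(),                         # other_issues
    ["abc1234"],                   # commits
    [],                            # pull_requests
)
# __save_references("PROJ", example_summary) would write Projects/PROJ/Summary/PROJ-1.json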
def preprocess_seq_galaxy_clades(fasta_file, samples_clades, LEN_AA):
    encoded_samples = list()
    aa_chars = utils.get_all_possible_words(amino_acid_codes)
    f_word_dictionaries, r_word_dictionaries = utils.get_words_indices(aa_chars)
    all_sample_names = list(samples_clades.keys()) 
    for sequence_obj in SeqIO.parse(fasta_file, "fasta"):
        row = list()
        seq_id = sequence_obj.id
        sequence = str(sequence_obj.seq)
        sequence = sequence.replace("*", '')
        if "X" not in sequence and all_sample_names.count(seq_id) > 0 and len(sequence) == LEN_AA:
            row.append(seq_id)
            clade_name = samples_clades[seq_id]
            clade_name = utils.format_clade_name(clade_name)
            row.append(clade_name)
            seq_chars = list(sequence)
            indices_chars = [str(r_word_dictionaries[i]) for i in seq_chars]
            joined_indices_kmers = ','.join(indices_chars)
            row.append(joined_indices_kmers)
            encoded_samples.append(row)
    sample_clade_sequence_df = pd.DataFrame(encoded_samples, columns=["SampleName", "Clade", "Sequence"])
    sample_clade_sequence_df.to_csv(PATH_SAMPLES_CLADES, index=None)
    utils.save_as_json(PATH_F_DICT, f_word_dictionaries)
    utils.save_as_json(PATH_R_DICT, r_word_dictionaries)
    return sample_clade_sequence_df, f_word_dictionaries, r_word_dictionaries
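# Minimal self-contained sketch of the per-character encoding step above, assuming a toy
# reverse dictionary; the real dictionaries come from utils.get_words_indices.
aa_chars = ["A", "C", "D", "E"]
r_word_dict = {c: i + 1 for i, c in enumerate(aa_chars)}  # amino acid -> index
sequence = "ACDEA"
joined_indices = ",".join(str(r_word_dict[ch]) for ch in sequence)
assert joined_indices == "1,2,3,4,1"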
Example #3
0
    def parse_issues(self, issues_raw: List[dict] = None) -> List[dict]:
        """
        For each raw issue, create a JSON file containing necessary information:
        1. Issue key
        2. Project information
            2.1 Project key
            2.2 Project name
        3. Author
        4. Date of creation
        5. Date of update
        6. Current status
        7. Summary
        8. Description
        9. List of attachments
            9.1 File name
            9.2 URL to attachment
        10. List of issue links
            10.1 Type of link
            10.2 Issue key
        11. List of remote links
            11.1 Title of link
            11.2 URL
        12. List of comments
            12.1 Author
            12.2 Date of creation
            12.3 Date of update
            12.4 Comment body

        The parsed data for each file is stored in "Projects/<project_name>/Issues/<issue_key>.json"
        :param issues_raw: List of dictionaries representing raw issues. If none is specified, then they are
        loaded from the cache
        :return: List of dictionaries of parsed issues
        """
        print("{}: parsing issues. This may take a while".format(self.project))
        count = 0
        issues_dir = self.issues_dir
        utils.create_dir_if_necessary(issues_dir)

        if not issues_raw:
            issues_raw = self.load_issues_raw()

        issues = []
        for count, issue in enumerate(issues_raw, start=1):
            filename = issue["key"] + ".json"
            path = os.path.join(issues_dir, filename)
            json_object = self.__prepare_json_object(issue)
            utils.save_as_json(json_object, path)
            issues.append(json_object)

            if count % 100 == 0:
                print("{}: Parsed {} issues".format(self.project, count))
        print("{}: Finished parsing issues! Totally parsed: {}".format(
            self.project, count))
        return issues
def get_galaxy_samples_clades(path_seq_clades):
    ncov_global_df = pd.read_csv(path_seq_clades, sep="\t")
    samples_clades = dict()
    for idx in range(len(ncov_global_df)):
        sample_row = ncov_global_df.iloc[idx]
        s_name = sample_row["seqName"]
        clade_name = sample_row["clade"]
        if sample_row["qc.overallStatus"] == "good":
            clade_name = utils.format_clade_name(clade_name)
            samples_clades[s_name] = clade_name
    utils.save_as_json(PATH_ALL_SAMPLES_CLADES, samples_clades)
    return samples_clades
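# Minimal sketch of the QC filter above on a tiny in-memory frame instead of the Nextclade TSV;
# the rows are invented, the column names follow the snippet.
import pandas as pd

toy = pd.DataFrame({
    "seqName": ["sample_1", "sample_2"],
    "clade": ["20A", "20B"],
    "qc.overallStatus": ["good", "bad"],
})
good_rows = toy[toy["qc.overallStatus"] == "good"]
samples_clades_sketch = dict(zip(good_rows["seqName"], good_rows["clade"]))
assert samples_clades_sketch == {"sample_1": "20A"}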
Example #5
0
 def __save_issues_raw(self, issues: List[dict]) -> None:
     """
     Persist raw issues in the corresponding folder.
     :param issues: List of dictionaries describing unparsed issues
     :return: None
     """
     directory = self.issues_raw_dir
     utils.create_dir_if_necessary(directory)
     print("\t{}: Successfully saved!".format(self.project))
     for issue in issues:
         key = issue["key"]
         filename = key + ".json"
         path = os.path.join(directory, filename)
         utils.save_as_json(issue, path)
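# Standard-library sketch of the same one-JSON-file-per-issue pattern; the original delegates
# to utils.create_dir_if_necessary and utils.save_as_json.
import json
import os

def save_issues_raw_sketch(directory, issues):
    os.makedirs(directory, exist_ok=True)
    for issue in issues:
        path = os.path.join(directory, issue["key"] + ".json")
        with open(path, "w") as fh:
            json.dump(issue, fh, indent=4)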
def find_pred_mut():
    mut_tr = utils.read_json(results_path + "tr_parent_child_pos_{}_{}.json".format(clade_parent, clade_child))
    mut_te = utils.read_json(results_path + "te_parent_child_pos_{}_{}.json".format(clade_parent, clade_child))

    mut_future_true = utils.read_json(results_path + "parent_child_pos_{}_{}.json".format(clade_child, clade_future))
    mut_future_gen = utils.read_json(results_path + "parent_gen_pos_{}_{}.json".format(clade_child, clade_future))

    novel_mut = list()
    novel_mut_orig = list()
    present_in_tr_mut = list()
    for key in mut_future_gen:
        if key not in mut_tr and key not in mut_te and key in mut_future_true:
            print(key, mut_future_gen[key], mut_future_true[key])
            s_key = key.split(">")
            s_key = "".join(s_key)
            novel_mut_orig.append(s_key)
            novel_mut.append(key)
    print("novel mut share: {}, {}, {}".format(str(len(novel_mut) / float(len(mut_future_true))), str(len(novel_mut)), str(len(mut_future_true))))
    utils.save_as_json(results_path + "predicted_novel_mutations_in_c_{}.json".format(clade_child), novel_mut)
    utils.save_as_json(results_path + "predicted_novel_mutations_in_c_{}_original.json".format(clade_child), novel_mut_orig)

    print("---")
    for key in mut_future_gen:
        if key in mut_future_true and key in mut_tr:
            print(key, mut_future_gen[key], mut_future_true[key], mut_tr[key])
            present_in_tr_mut.append(key)

    print("--")
    tr_pos = get_POS(mut_tr)
    print(tr_pos)
    true_pos = get_POS(mut_future_true)
    print()
    print(true_pos)
    gen_pos = get_POS(mut_future_gen)
    print()
    print(gen_pos)
    novel_pos = list()
    present_in_tr_pos = list()
    for pos in gen_pos:
        if pos in true_pos and pos not in tr_pos:
            novel_pos.append(pos)
        if pos in tr_pos:
            present_in_tr_pos.append(pos)
    print()
    print("% gen mut present in tr: {}".format(str(float(len(present_in_tr_pos))/len(tr_pos))))
    print()
    print("% novel mut pos: {}".format(str(float(len(novel_pos))/len(true_pos))))
    print()
    print(novel_pos)
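# Toy sketch of the "novel mutation" filter in find_pred_mut: keep generated mutations that never
# appeared in the training or test sets but do occur in the true future set; the dictionaries are invented.
mut_tr = {"A1>T": 3}
mut_te = {"C2>G": 1}
mut_future_true = {"G3>A": 2, "A1>T": 5}
mut_future_gen = {"G3>A": 4, "C2>G": 2}
novel = [k for k in mut_future_gen
         if k not in mut_tr and k not in mut_te and k in mut_future_true]
assert novel == ["G3>A"]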
Example #7
0
 def __save_json(json_list: List[dict], directory: str, issue_key: str = None) -> None:
     """
     Save the target list of dictionaries as a JSON file in the desired directory.
     If issue_key is specified, the file is named "<issue_key>.json"; otherwise it is named "all.json".
     :param json_list: List of dictionaries to save as JSON file
     :param directory: Directory where to save the file
     :param issue_key: Target issue key
     :return: None
     """
     utils.create_dir_if_necessary(directory)
     if issue_key:
         filename = issue_key + ".json"
     else:
         filename = "all.json"
     path = os.path.join(directory, filename)
     utils.save_as_json(json_list, path)
Example #8
0
    def parse_issue(self, issue_key: str) -> dict:
        """
        Parse a raw issue and store it in "Projects/<project_name>/Issues/<issue_key>.json".
        If the issue is not cached, then it is fetched first.
        :param issue_key: Key of the issue to parse
        :return: Dictionary representing the issue
        """
        filename = issue_key + ".json"
        utils.create_dir_if_necessary(self.issues_dir)
        path_raw = os.path.join(self.issues_raw_dir, filename)
        if not os.path.isfile(path_raw):
            issue_raw = self.fetch_issue_raw(issue_key, save=True)
        else:
            issue_raw = utils.load_json(path_raw)
        json_object = self.__prepare_json_object(issue_raw)

        path = os.path.join(self.issues_dir, filename)
        utils.save_as_json(json_object, path)
        return json_object
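# Standard-library sketch of the cache-or-fetch decision in parse_issue above; fetch_issue is a
# hypothetical callable supplied by the caller.
import json
import os

def load_or_fetch(path_raw, fetch_issue):
    if not os.path.isfile(path_raw):
        return fetch_issue()
    with open(path_raw) as fh:
        return json.load(fh)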
Example #9
0
def scrape_deaths():
    # Load content from Terviseamet's Covid dashboard and parse it
    log_status("Scraping data on deaths from " + TERVISEAMET_COVID_DASHBOARD)
    html = requests.get(TERVISEAMET_COVID_DASHBOARD).text
    soup = BeautifulSoup(html, "html.parser")

    # Extract number of deaths from page content and update JSON data on deaths
    deaths_container = soup.select(DEATHS_SELECTOR)
    if len(deaths_container) > 0:
        try:
            # Get number of deaths and the current date
            deaths_count = int(deaths_container[0].text.strip())
            current_date = (datetime.now() -
                            timedelta(days=1)).strftime("%Y-%m-%d")

            # Load existing deaths data
            json_deaths = read_json_from_file(DEATHS_PATH)

            # Add new entry to deaths data for current date
            deaths_output = {}
            if len(json_deaths):
                deaths_output = json_deaths
            deaths_output[current_date] = deaths_count

            # Save data on deaths
            save_as_json(DEATHS_PATH + ".tmp", deaths_output)

            # Log status
            log_status("Successfully scraped deaths. Total deaths: " +
                       str(deaths_count))
        except Exception:
            # Log error
            error_message = "Error when scraping data on deaths"
            log_status(error_message + ":")
            log_status(traceback.format_exc())
            raise Exception(error_message)
    else:
        # Log error
        error_message = "Error: could not find page element with data on deaths"
        log_status(error_message)
        raise Exception(error_message)
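# Self-contained sketch of the scraping step above, run against an inline HTML fragment instead of
# the live dashboard; the CSS class used as a selector is an assumption.
from bs4 import BeautifulSoup

fragment = '<div class="deaths-counter"> 123 </div>'
container = BeautifulSoup(fragment, "html.parser").select(".deaths-counter")
deaths_count = int(container[0].text.strip()) if container else None
assert deaths_count == 123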
Example #10
0
            'pageid': article['pageid'],
            'text': corpus.strip_mediawiki_markup(article['text'])
        }

    # limit = 0
    article_sentences = {}
    all_articles = []
    for title, article in semi_cleaned_articles.items():
        all_articles.append(Article(title, article['pageid'], corpus.get_sentences_and_citations(article['text'])))
        # limit += 1
        # if limit == 3:
        #     break

    train, dev, test = corpus.get_corpus_splits(all_articles)

    # print(get_corpus_stats(all_articles))
    # print(get_corpus_stats(train))
    # print(get_corpus_stats(dev))
    # print(get_corpus_stats(test))

    save_as_json(all_articles, 'all_articles.json')
    save_as_json(train, 'train.json')
    save_as_json(dev, 'dev.json')
    save_as_json(test, 'test.json')

    save_stats_to_md(all_articles, train, dev, test)

    # with open('article_sentences.json', 'w') as json_file:
    #     json.dump(article_sentences, json_file, sort_keys=True, indent=4)

Example #11
0
def start_training(forward_dict, rev_dict, gen_encoder=None, gen_decoder=None):
    pos_variations = dict()
    pos_variations_count = dict()
    start_time = time.time()
    print("Loading datasets...")
    #pretr_clade_files = glob.glob('data/pretrain/*.csv')
    tr_clade_files = glob.glob('data/train/*.csv')
    te_clade_files = glob.glob('data/test/*.csv')

    pretr_combined_X = list()
    pretr_combined_y = list()
    '''print("Loading pre-training datasets...")
    for name in pretr_clade_files:
        pretr_clade_df = pd.read_csv(name, sep="\t")
        pretr_X = pretr_clade_df["X"].tolist()
        pretr_y = pretr_clade_df["Y"].tolist()
        pretr_combined_X.extend(pretr_X)
        pretr_combined_y.extend(pretr_y)'''

    combined_X = list()
    combined_y = list()
    # load train data
    print("Loading training datasets...")
    for name in tr_clade_files:
        tr_clade_df = pd.read_csv(name, sep="\t")
        X = tr_clade_df["X"].tolist()
        y = tr_clade_df["Y"].tolist()
        combined_X.extend(X)
        combined_y.extend(y)

    combined_te_X = list()
    combined_te_y = list()
    # load test data
    print("Loading test datasets...")
    for te_name in te_clade_files:
        te_clade_df = pd.read_csv(te_name, sep="\t")
        te_X = te_clade_df["X"].tolist()
        te_y = te_clade_df["Y"].tolist()
        combined_te_X.extend(te_X)
        combined_te_y.extend(te_y)
        print(len(te_X), len(te_y))
    print()

    tr_unrelated_files = glob.glob("data/tr_unrelated/*.csv")
    print("Loading unrelated datasets...")
    unrelated_X = list()
    unrelated_y = list()
    for tr_unrelated in tr_unrelated_files:
        unrelated_clade_df = pd.read_csv(tr_unrelated, sep="\t")
        un_X = unrelated_clade_df["X"].tolist()
        un_y = unrelated_clade_df["Y"].tolist()
        unrelated_X.extend(un_X)
        unrelated_y.extend(un_y)
        print(len(un_X), len(un_y))

    unrelated_X = np.array(unrelated_X)
    unrelated_y = np.array(unrelated_y)
    print("Unrelated data sizes")
    print(len(unrelated_X), len(unrelated_y))

    print("train and test data sizes")
    print(len(combined_X), len(combined_y), len(combined_te_X),
          len(combined_te_y))

    kmer_f_dict = utils.read_json(PATH_KMER_F_DICT)
    kmer_r_dict = utils.read_json(PATH_KMER_R_DICT)

    vocab_size = len(kmer_f_dict) + 1

    print("Number of kmers: {}".format(str(len(kmer_f_dict))))
    print("Vocab size: {}".format(str(len(kmer_f_dict) + 1)))

    combined_X = np.array(combined_X)
    combined_y = np.array(combined_y)

    X_train = combined_X
    y_train = combined_y

    test_dataset_in = np.array(combined_te_X)
    test_dataset_out = np.array(combined_te_y)

    if gen_encoder is None or gen_decoder is None:
        encoder, decoder = neural_network.make_generator_model(
            len_final_aa_padding, vocab_size, embedding_dim, enc_units,
            batch_size, size_stateful)
    else:
        encoder = gen_encoder
        decoder = gen_decoder

    #print(len(pretr_combined_X))
    '''if len(pretr_combined_X) == 0:
        X_pretrain, X_train, y_pretrain, y_train  = train_test_split(combined_X, combined_y, test_size=pretrain_train_size)
        X_pretrain = np.array(X_pretrain)
        y_pretrain = np.array(y_pretrain)
        pre_train_cluster_indices, pre_train_cluster_indices_dict = utils.find_cluster_indices(y_pretrain, batch_size)
        df_pretrain = pd.DataFrame(list(zip(X_pretrain, y_pretrain)), columns=["X", "Y"])
        df_pretrain.to_csv(PRETRAIN_DATA, sep="\t", index=None)
        # save update train dataset
        df_train = pd.DataFrame(list(zip(X_train, y_train)), columns=["X", "Y"])
        df_train.to_csv(tr_clade_files[0], sep="\t", index=None)
    else: '''
    #X_pretrain = np.array(pretr_combined_X)
    #y_pretrain = np.array(pretr_combined_y)

    #print("Pretrain data sizes")
    #print(X_pretrain.shape, y_pretrain.shape)

    # divide into pretrain and train
    print("Train data sizes")
    print(X_train.shape, y_train.shape)
    X_train = np.array(X_train)
    y_train = np.array(y_train)

    # pretrain generator
    if to_pretrain is True:

        utils.create_dirs("data/generated_files/pre_train")
        pretrain_gen_train_loss = list()
        pretrain_gen_test_loss = list()

        pretrain_gen_test_seq_var = list()
        pretrain_gen_train_seq_var = list()
        pretrain_gen_batch_test_loss = list()
        pretrain_gen_batch_test_seq_var = list()

        print("Pretraining generator...")

        # balance tr data by mutations
        x_pretr_parent_child_mut_indices, x_pos_variations, x_pos_variations_count = utils.get_mutation_tr_indices(
            X_train, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict, "x")
        #print(x_pos_variations)
        #print()
        #print(x_pos_variations_count)

        y_pretr_parent_child_mut_indices, y_pos_variations, y_pos_variations_count = utils.get_mutation_tr_indices(
            y_train, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict, "y")
        #print(y_pos_variations)
        #print()
        print(y_pos_variations_count)

        #sys.exit()

        #print("Creating training data generator balanced by sample weights...")
        #pre_train_cluster_indices, pre_train_cluster_indices_dict, scatter_df = utils.find_cluster_indices(y_train, batch_size)

        print()
        print("Creating training data generator balanced by sample weights...")
        #training_generator = utils.calculate_sample_weights(y_train, batch_size, pos_variations_count)

        inputs_tokens_weights = utils.calculate_input_sample_weights(
            X_train, x_pos_variations_count)

        #pre_train_cluster_indices, pre_train_cluster_indices_dict = utils.find_cluster_indices(y_train, batch_size)
        pre_train_cluster_indices_dict = dict()

        mut_pattern, mut_pattern_dist, mut_pattern_dist_freq, mut_buckets = utils.create_mut_balanced_dataset(
            X_train, y_train, kmer_f_dict, len_final_aa_padding, batch_size)

        #sys.exit()

        #utils.save_as_json(PRETR_MUT_INDICES, pretr_parent_child_mut_indices)
        utils.save_as_json(PRETR_MUT_INDICES, y_pretr_parent_child_mut_indices)
        #utils.save_as_json(PRETR_MUT_INDICES, pretr_parent_child_mut_indices)
        # get pretraining dataset as sliced tensors
        n_pretrain_batches = int(X_train.shape[0] / float(batch_size))
        print("Num of pretrain batches: {}".format(str(n_pretrain_batches)))
        #updated_lr = pretr_lr
        for i in range(retrain_pretrain_start_index, pretrain_epochs):
            #pretrain_generator_optimizer = tf.keras.optimizers.Adam(learning_rate=pretr_lr)
            print("Pre training epoch {}/{}...".format(str(i + 1),
                                                       str(pretrain_epochs)))
            pretrain_gen_tr_loss, bat_te_gen_loss, bat_te_seq_var, bat_tr_seq_var, encoder, decoder = train_model.pretrain_generator(
                [
                    X_train, y_train, test_dataset_in, test_dataset_out,
                    te_batch_size, n_te_batches
                ], i, encoder, decoder, enc_units, vocab_size,
                n_pretrain_batches, batch_size, pretrain_epochs, size_stateful,
                forward_dict, rev_dict, kmer_f_dict, kmer_r_dict,
                y_pos_variations_count)
            print(
                "Pre training loss at epoch {}/{}: Generator loss: {}, variation score: {}"
                .format(str(i + 1), str(pretrain_epochs),
                        str(pretrain_gen_tr_loss),
                        str(np.mean(bat_tr_seq_var))))
            pretrain_gen_train_loss.append(pretrain_gen_tr_loss)
            pretrain_gen_batch_test_loss.append(bat_te_gen_loss)
            pretrain_gen_batch_test_seq_var.append(bat_te_seq_var)
            pretrain_gen_train_seq_var.append(bat_tr_seq_var)
            print()
            print("Pretrain: predicting on test datasets...")
            with tf.device('/device:cpu:0'):
                pretrain_gen_te_loss, pretrain_gen_te_seq_var = utils.predict_sequence(
                    i, 0, test_dataset_in, test_dataset_out, te_batch_size,
                    n_te_batches, len_final_aa_padding, vocab_size, enc_units,
                    encoder, decoder, size_stateful, "pretrain")
                pretrain_gen_test_loss.append(pretrain_gen_te_loss)
                pretrain_gen_test_seq_var.append(pretrain_gen_te_seq_var)
            print("Pre-training epoch {} finished".format(str(i + 1)))
            print()
            epoch_type_name = "pretrain_epoch_{}".format(str(i + 1))
            utils.save_predicted_test_data(test_dataset_in, test_dataset_out,
                                           te_batch_size, enc_units,
                                           vocab_size, len_final_aa_padding,
                                           size_stateful, epoch_type_name,
                                           PRETRAIN_GEN_ENC_MODEL,
                                           PRETRAIN_GEN_DEC_MODEL)  #
        np.savetxt(PRETRAIN_GEN_LOSS, pretrain_gen_train_loss)
        np.savetxt(PRETRAIN_GEN_TEST_LOSS, pretrain_gen_test_loss)
        np.savetxt("data/generated_files/pretrain_gen_test_seq_var.txt",
                   pretrain_gen_test_seq_var)
        np.savetxt("data/generated_files/pretrain_gen_batch_test_loss.txt",
                   pretrain_gen_batch_test_loss)
        np.savetxt("data/generated_files/pretrain_gen_batch_test_seq_var.txt",
                   pretrain_gen_batch_test_seq_var)
        np.savetxt("data/generated_files/pretrain_gen_batch_train_seq_var.txt",
                   pretrain_gen_train_seq_var)
        print("Pre-training finished")
        print()
        end_time = time.time()
        print("Pretraining finished in {} seconds".format(
            str(np.round(end_time - start_time, 2))))

    if gan_train is False:
        sys.exit()

    # GAN training
    # create discriminator model

    utils.create_dirs("data/generated_files/gan_train")
    train_cluster_indices, train_cluster_indices_dict = utils.find_cluster_indices(
        y_train, batch_size)
    disc_parent_encoder_model, disc_gen_encoder_model = neural_network.make_disc_par_gen_model(
        len_final_aa_padding, vocab_size, embedding_dim, enc_units, batch_size,
        size_stateful)

    discriminator = neural_network.make_discriminator_model(enc_units)

    # use the pretrained generator and train it along with discriminator
    print("Training Generator and Discriminator...")

    train_gen_total_loss = list()
    train_gen_true_loss = list()
    train_gen_fake_loss = list()
    train_disc_total_loss = list()
    train_disc_true_loss = list()
    train_disc_fake_loss = list()
    train_te_loss = list()
    train_gen_test_seq_var = list()
    train_gen_batch_test_loss = list()
    train_gen_batch_test_seq_var = list()

    n_train_batches = int(X_train.shape[0] / float(batch_size))
    print("Num of train batches: {}".format(str(n_train_batches)))

    # balance tr data by mutations
    tr_parent_child_mut_indices, pos_variations, pos_variations_count = utils.get_mutation_tr_indices(
        X_train, y_train, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict,
        pos_variations, pos_variations_count)
    print(pos_variations)
    print()
    print(pos_variations_count)
    utils.save_as_json(TR_MUT_INDICES, tr_parent_child_mut_indices)
    for n in range(epochs):
        print("Training epoch {}/{}...".format(str(n + 1), str(epochs)))
        epo_gen_true_loss, epo_gen_fake_loss, epo_total_gen_loss, epo_disc_true_loss, epo_disc_fake_loss, epo_total_disc_loss, epo_bat_te_loss, epo_bat_gen_seq_var, encoder, decoder = train_model.start_training_mut_balanced(
            [
                X_train, y_train, unrelated_X, unrelated_y, test_dataset_in,
                test_dataset_out, te_batch_size, n_te_batches
            ], n, encoder, decoder, disc_parent_encoder_model,
            disc_gen_encoder_model, discriminator, enc_units, vocab_size,
            n_train_batches, batch_size, tr_parent_child_mut_indices, epochs,
            size_stateful, forward_dict, rev_dict, kmer_f_dict, kmer_r_dict,
            pos_variations, pos_variations_count, train_cluster_indices_dict)

        print(
            "Training loss at epoch {}/{}, G true loss: {}, G fake loss: {}, Total G loss: {}, D true loss: {}, D fake loss: {}, Total D loss: {}"
            .format(str(n + 1), str(epochs), str(epo_gen_true_loss),
                    str(epo_gen_fake_loss), str(epo_total_gen_loss),
                    str(epo_disc_true_loss), str(epo_disc_fake_loss),
                    str(epo_total_disc_loss)))

        train_gen_total_loss.append(epo_total_gen_loss)
        train_gen_true_loss.append(epo_gen_true_loss)
        train_gen_fake_loss.append(epo_gen_fake_loss)

        train_disc_total_loss.append(epo_total_disc_loss)
        train_disc_true_loss.append(epo_disc_true_loss)
        train_disc_fake_loss.append(epo_disc_fake_loss)

        train_gen_batch_test_loss.append(epo_bat_te_loss)
        train_gen_batch_test_seq_var.append(epo_bat_gen_seq_var)

        # predict seq on test data
        print("Prediction on test data...")
        with tf.device('/device:cpu:0'):
            epo_tr_gen_te_loss, epo_tr_gen_seq_var = utils.predict_sequence(
                n, 0, test_dataset_in, test_dataset_out, te_batch_size,
                n_te_batches, len_final_aa_padding, vocab_size, enc_units,
                encoder, decoder, size_stateful, "gan_train")
            train_te_loss.append(epo_tr_gen_te_loss)
            train_gen_test_seq_var.append(epo_tr_gen_seq_var)
        print()
        epoch_type_name = "gan_train_epoch_{}".format(str(n + 1))
        utils.save_predicted_test_data(test_dataset_in, test_dataset_out,
                                       te_batch_size, enc_units, vocab_size,
                                       len_final_aa_padding, size_stateful,
                                       epoch_type_name, TRAIN_GEN_ENC_MODEL,
                                       TRAIN_GEN_DEC_MODEL)
    print("Training finished")
    # save loss files
    np.savetxt(TRAIN_GEN_TOTAL_LOSS, train_gen_total_loss)
    np.savetxt(TRAIN_GEN_FAKE_LOSS, train_gen_fake_loss)
    np.savetxt(TRAIN_GEN_TRUE_LOSS, train_gen_true_loss)
    np.savetxt(TRAIN_DISC_FAKE_LOSS, train_disc_fake_loss)
    np.savetxt(TRAIN_DISC_TRUE_LOSS, train_disc_true_loss)
    np.savetxt(TRAIN_DISC_TOTAL_LOSS, train_disc_total_loss)
    np.savetxt(TEST_LOSS, train_te_loss)
    np.savetxt("data/generated_files/train_gen_batch_test_loss.txt",
               train_gen_batch_test_loss)
    np.savetxt("data/generated_files/train_gen_batch_test_seq_var.txt",
               train_gen_batch_test_seq_var)
    np.savetxt("data/generated_files/train_gen_test_seq_var.txt",
               train_gen_test_seq_var)

    end_time = time.time()
    print("Program finished in {} seconds".format(
        str(np.round(end_time - start_time, 2))))
import utils

def main(suas):
    reading_levels = utils.get_reading_levels(suas)
    utils.enrich_collection(suas, reading_levels)

if __name__ == '__main__':
    suas = utils.get_suas_1970()
    main(suas)
    utils.save_as_json(utils.reading_levels_fname, suas)
Example #13
0
def start_training_mut_balanced(inputs, epo_step, encoder, decoder, disc_par_enc, disc_gen_enc, discriminator, enc_units, vocab_size, n_train_batches, batch_size, parent_child_mut_indices, epochs, size_stateful, forward_dict, rev_dict, kmer_f_dict, kmer_r_dict, pos_variations, pos_variations_count, train_cluster_indices_dict):
  """
  Training sequences balanced by mutation type
  """
  X_train, y_train, unrelated_X, unrelated_y, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches = inputs

  epo_avg_total_gen_loss = list()
  epo_ave_gen_true_loss = list()
  epo_avg_gen_fake_loss = list()

  epo_avg_total_disc_loss = list()
  epo_avg_disc_fake_loss = list()
  epo_avg_disc_real_loss = list()
  disc_real_loss = tf.constant(0)
  disc_fake_loss = tf.constant(0)
  total_disc_loss = tf.constant(0)
  gen_fake_loss = tf.constant(0)
  gen_true_loss = tf.constant(0)
  total_gen_loss = tf.constant(0)
  batch_mut_distribution = dict()

  epo_te_gen_loss = list()
  epo_te_seq_var = list()

  pos_size = dict() #get_mut_size(parent_child_mut_indices)

  mut_keys = list(parent_child_mut_indices.keys())

  epo_train_save_folder = "data/generated_files/gan_train/{}".format(str(epo_step+1))
  enc_train_save_folder = "data/generated_files/gan_train/{}/enc".format(str(epo_step+1))
  dec_train_save_folder = "data/generated_files/gan_train/{}/dec".format(str(epo_step+1))
  utils.create_dirs(epo_train_save_folder)
  utils.create_dirs(enc_train_save_folder)
  utils.create_dirs(dec_train_save_folder)

  for step in range(n_train_batches):
      #unrolled_x, unrolled_y, batch_mut_distribution = sample_true_x_y(parent_child_mut_indices, batch_size, X_train, y_train, batch_mut_distribution)
      unrolled_x, unrolled_y = sample_true_x_y(batch_size, X_train, y_train, train_cluster_indices_dict)
      un_X, un_y = utils.sample_unrelated_x_y(unrelated_X, unrelated_y, batch_size)
      seq_len = unrolled_x.shape[1]
      disc_gen = step % n_disc_step
      if disc_gen < n_disc_step - n_gen_step:
          # train discriminator
          _, _, disc_par_enc, disc_gen_enc, discriminator, disc_real_loss, disc_fake_loss, total_disc_loss = d_loop(seq_len, batch_size, vocab_size, enc_units, unrolled_x, unrolled_y, un_X, un_y, encoder, decoder, disc_par_enc, disc_gen_enc, discriminator, size_stateful, pos_size, pos_variations, pos_variations_count, step)
          # share weights with generator's encoder
          disc_par_enc.load_weights(GEN_ENC_WEIGHTS)
          disc_gen_enc.load_weights(GEN_ENC_WEIGHTS)
          #disc_gen_enc.layers[1].set_weights(disc_par_enc.layers[1].get_weights())
          print("Training epoch {}/{}, batch {}/{}, D true loss: {}, D fake loss: {}, Total D loss: {}".format(str(epo_step+1), str(epochs), str(step+1), str(n_train_batches), str(disc_real_loss.numpy()), str(disc_fake_loss.numpy()), str(total_disc_loss.numpy())))
      else:
          # train generator with unrolled discriminator
          # save disc weights to reset after unrolling
          discriminator.save_weights(DISC_WEIGHTS)
          disc_par_enc.save_weights(DISC_PAR_ENC_WEIGHTS)
          disc_gen_enc.save_weights(DISC_GEN_ENC_WEIGHTS)
          print("Applying unrolled steps...")
          # unrolling steps
          for i in range(unrolled_steps):
              print("Unrolled step: {}/{}".format(str(i+1), str(unrolled_steps)))
              # sample data for unrolling
              #unroll_x, unroll_y, _ = sample_true_x_y(parent_child_mut_indices, batch_size, X_train, y_train, batch_mut_distribution)
              unroll_x, unroll_y = sample_true_x_y(batch_size, X_train, y_train, train_cluster_indices_dict)
              un_unroll_X, un_unroll_y = utils.sample_unrelated_x_y(unrelated_X, unrelated_y, batch_size)
              # train discriminator
              _, _, disc_par_enc, disc_gen_enc, discriminator, d_r_l, d_f_l, d_t_l = d_loop(seq_len, batch_size, vocab_size, enc_units, unroll_x, unroll_y, un_unroll_X, un_unroll_y, encoder, decoder, disc_par_enc, disc_gen_enc, discriminator, size_stateful, pos_size, pos_variations, pos_variations_count, step)
              print("Unrolled disc losses: real {}, fake {}, total {}".format(str(d_r_l.numpy()), str(d_f_l.numpy()), str(d_t_l.numpy())))
          # finish unrolling
          # train generator with unrolled discriminator
          encoder, decoder, _, _, _, gen_true_loss, gen_fake_loss, total_gen_loss = g_loop(seq_len, batch_size, vocab_size, enc_units, unrolled_x, unrolled_y, un_X, un_y, encoder, decoder, disc_par_enc, disc_gen_enc, discriminator, size_stateful, pos_size, pos_variations, pos_variations_count, step)
          print("Training epoch {}/{}, batch {}/{}, G true loss: {}, G fake loss: {}, Total G loss: {}".format(str(epo_step+1), str(epochs), str(step+1), str(n_train_batches), str(gen_true_loss.numpy()), str(gen_fake_loss.numpy()), str(total_gen_loss.numpy())))
          encoder.save_weights(GEN_ENC_WEIGHTS)
          # reset weights of discriminator, disc_par_enc and disc_gen_enc after unrolling
          discriminator.load_weights(DISC_WEIGHTS)
          disc_par_enc.load_weights(DISC_PAR_ENC_WEIGHTS)
          disc_gen_enc.load_weights(DISC_GEN_ENC_WEIGHTS)
      # intermediate prediction on test data while training
      if (step + 1) % test_log_step == 0 and step > 0:
          print("Training: prediction on test data...")
          with tf.device('/device:cpu:0'):
              _, _ = utils.predict_sequence(epo_step, step, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches, seq_len, vocab_size, enc_units, encoder, decoder, size_stateful, "gan_train", True)
      
      print("Training epoch {}/{}, batch {}/{}, G true loss: {}, G fake loss: {}, Total G loss: {}, D true loss: {}, D fake loss: {}, Total D loss: {}".format(str(epo_step+1), str(epochs), str(step+1), str(n_train_batches), str(gen_true_loss.numpy()), str(gen_fake_loss.numpy()), str(total_gen_loss.numpy()), str(disc_real_loss.numpy()), str(disc_fake_loss.numpy()), str(total_disc_loss.numpy())))
      # write off results
      epo_ave_gen_true_loss.append(gen_true_loss.numpy())
      epo_avg_gen_fake_loss.append(gen_fake_loss.numpy())
      epo_avg_total_gen_loss.append(total_gen_loss.numpy())
      epo_avg_disc_fake_loss.append(disc_fake_loss.numpy())
      epo_avg_disc_real_loss.append(disc_real_loss.numpy())
      epo_avg_total_disc_loss.append(total_disc_loss.numpy())
  # save model
  print("Training epoch {} finished, Saving model...".format(str(epo_step+1)))
  print()

  tf.keras.models.save_model(encoder, TRAIN_GEN_ENC_MODEL)
  tf.keras.models.save_model(decoder, TRAIN_GEN_DEC_MODEL)
 
  # save trained models per epoch
  tf.keras.models.save_model(encoder, enc_train_save_folder)
  tf.keras.models.save_model(decoder, dec_train_save_folder)

  encoder.save_weights(GEN_ENC_WEIGHTS)
  decoder.save_weights(GEN_DEC_WEIGHTS)
  utils.save_as_json("data/generated_files/ave_batch_x_y_mut_epo_{}.json".format(str(epo_step)), batch_mut_distribution)
  return np.mean(epo_ave_gen_true_loss), np.mean(epo_avg_gen_fake_loss), np.mean(epo_avg_total_gen_loss), np.mean(epo_avg_disc_real_loss), np.mean(epo_avg_disc_fake_loss), np.mean(epo_avg_total_disc_loss), np.mean(epo_te_gen_loss), np.mean(epo_te_seq_var), encoder, decoder
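# Toy sketch of the discriminator/generator alternation in the training loop above: within every block
# of n_disc_step batches, the first n_disc_step - n_gen_step batches update the discriminator and the
# remaining n_gen_step batches update the generator. The values are assumptions; in the original they
# are module-level settings.
n_disc_step, n_gen_step = 5, 2
schedule = ["D" if step % n_disc_step < n_disc_step - n_gen_step else "G" for step in range(10)]
assert schedule == ["D", "D", "D", "G", "G", "D", "D", "D", "G", "G"]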
Example #14
0
def main():
    # Log status
    log_status("Starting to generate chart data at " + str(TODAY_DMYHM))


    # 1.  Create date ranges for charts

    log_status("Creating date ranges for charts")
    case_dates = pd.date_range(start=DATE_SETTINGS["first_case_date"], end=YESTERDAY_YMD)
    vaccination_dates = pd.date_range(start=DATE_SETTINGS["vaccination_start_date"], end=YESTERDAY_YMD)


    # 2.  Calculate data related to deaths

    try:
        deaths = read_json_from_file(DEATHS_PATH)
        manual_data = read_json_from_file(MANUAL_DATA_PATH)
    except Exception:
        # Log error
        log_status('Error when loading local data:')
        log_status(traceback.format_exc())
        exit()

    log_status("Calculating data related to deaths")

    manual_data["deceased"].update(deaths)
    deceased = list(manual_data["deceased"].values())
    n_deaths = deceased[-1]
    n_deaths_change = int(deceased[-1]) - int(deceased[-2])


    # 3.  Calculate data related to test results

    # Define columns to import
    column_list = [
        'Gender',
        'AgeGroup',
        'County',
        'ResultValue',
        'StatisticsDate'
    ]

    test_results = get_json_from_csv_file(TEST_RESULTS_PATH, column_list)
    
    log_status("Calculating data related to test results")

    # Find count of confirmed cases
    n_confirmed_cases = np.sum([res["ResultValue"] == "P" for res in test_results])

    # Find total number of tests
    n_tests_administered = len(test_results)
    log_status("Total number of tests: " + str(n_tests_administered))

    infections_by_county = get_infection_count_by_county(test_results, county_mapping)
    county_by_day = get_county_by_day(test_results, case_dates, county_mapping, county_sizes)
    confirmed_cases_by_county = get_confirmed_cases_by_county(test_results, county_mapping)
    tests_per_day_chart_data = get_tests_per_day_chart_data(test_results, case_dates)
    cumulative_cases_chart_data = get_cumulative_cases_chart_data(
        test_results,
        case_dates,
        tests_per_day_chart_data
    )
    cumulative_tests_chart_data = get_cumulative_tests_chart_data(test_results, case_dates)
    positive_test_by_age_chart_data = get_positive_tests_by_age_chart_data(test_results)
    positive_negative_chart_data = get_positive_negative_chart_data(test_results, county_mapping)
    county_daily_active = get_county_daily_active(test_results, case_dates, county_mapping, county_sizes)

    # Delete test result data from memory
    del test_results

    infections_by_county_10000 = get_infections_data_by_count_10000(infections_by_county, county_sizes)
    tests_pop_ratio = get_test_data_pop_ratio(infections_by_county_10000)
    new_cases_per_day_chart_data = get_new_cases_per_day_chart_data(cumulative_cases_chart_data)
    n_active_cases = cumulative_cases_chart_data["active"][-1]
    n_active_cases_change = (cumulative_cases_chart_data["active"][-1] - cumulative_cases_chart_data["active"][-2])
    per_100k = cumulative_cases_chart_data["active100k"][-1]
    active_infections_by_county = [
        {"MNIMI": k, "sequence": v, "drilldown": k}
        for k, v in county_daily_active["countyByDayActive"].items()
    ]
    active_infections_by_county_100k = [
        [k, round(v[-1] / county_sizes[k] * 100000, 2)]
        for k, v in county_daily_active["countyByDayActive"].items()
    ]


    # 4.  Calculate data related to test locations

    test_locations = read_json_from_file(TEST_LOCATIONS_PATH)

    municipalities_data = get_municipality_data(test_locations, county_mapping)

    # Delete test location data from memory
    del test_locations


    # 5.  Calculate data related to hospitalisation

    hospitalization = read_json_from_file(HOSPITALIZATION_PATH)

    log_status("Calculating data related to hospitalisation")

    # Set hospitalised and ICU time-series
    hospital = get_hospital_data(hospitalization, DATE_SETTINGS["first_case_date"])
    # TODO: Based on cross-checking with the hospitalisation data published by TEHIK, the data listed
    #       in the manual_data.json file with the field name "intensive" appears to show the number
    #       of patients on ventilation. We should fix the terminology and make sure that the intensive
    #       and on ventilation statistics are being calculated correctly.
    intensive = list(get_in_intensive_data(hospitalization, manual_data["intensive"]).values())
    on_ventilation = list(get_on_ventilation_data(hospitalization).values())
    # Delete hospitalization data from memory
    del hospitalization

    hospitalised = hospital["activehospitalizations"]
    n_on_ventilation = on_ventilation[-1]
    n_on_ventilation_change = int(on_ventilation[-1]) - int(on_ventilation[-2])


    # 6.  Calculate data related to vaccination

    vaccination = read_json_from_file(VACCINATIONS_PATH)

    log_status("Calculating data related to vaccination")

    vaccinated_people_chart_data = get_vaccinated_people_chart_data(vaccination, vaccination_dates)

    last_day_vaccination_data = [x for x in vaccination if x["MeasurementType"] == "Vaccinated" and x["VaccinationSeries"] == 1][-1]
    last_day_completed_vaccination_data = [x for x in vaccination if x["MeasurementType"] == "FullyVaccinated" and x["VaccinationSeries"] == 1][-1]
    last_day_doses_administered_data = [x for x in vaccination if x["MeasurementType"] == "DosesAdministered" and x["VaccinationSeries"] == 1][-1]
    # Delete vaccination data from memory
    del vaccination

    n_fully_vaccinated = last_day_completed_vaccination_data["TotalCount"]
    n_fully_vaccinated_change = last_day_completed_vaccination_data["DailyCount"]
    n_fully_vaccinated_percentage = last_day_completed_vaccination_data["PopulationCoverage"]
    n_vaccinated_at_least_one_dose = last_day_vaccination_data["TotalCount"]
    n_vaccinated_at_least_one_dose_change = last_day_vaccination_data["DailyCount"]
    n_vaccinated_at_least_one_dose_percentage = last_day_vaccination_data["PopulationCoverage"]
    # vaccination_number_total = (n_vaccinated_at_least_one_dose - n_fully_vaccinated)
    # vaccination_number_last_day = (n_vaccinated_at_least_one_dose_change - n_fully_vaccinated_change)


    # 7.  Create and save final JSON

    log_status("Compiling final JSON")

    final_json = {
        "updatedOn": TODAY_DMYHM,
        "confirmedCasesNumber": str(n_confirmed_cases),
        # TODO: For consistency, we should include the change in the number of confirmed cases as well.
        "hospitalisedNumber": str(hospital["activehospitalizations"][-1]),
        "hospitalChanged": str(hospital["activehospitalizations"][-1] - hospital["activehospitalizations"][-2]),
        "onVentilation": on_ventilation,
        "onVentilationNumber": n_on_ventilation,
        "onVentilationChanged": n_on_ventilation_change,
        "deceased": deceased,
        "deceasedNumber": str(n_deaths),
        "deceasedChanged": str(n_deaths_change),
        "testsAdministeredNumber": str(n_tests_administered),
        # TODO: For consistency, we should include the change in the number of tests as well.
        "activeCasesNumber": str(n_active_cases),
        "activeChanged": str(n_active_cases_change),
        "perHundred": str(per_100k), # TODO: This should be given a clearer name.
        "dates2": [str(x.date()) for x in case_dates],  # TODO: Change key to "caseDates"
        "dates3": [str(x.date()) for x in vaccination_dates],  # TODO: Change key to "vaccinationDates"
        "counties": counties,
        "age_groups": age_groups,
        "dataInfectionsByCounty": infections_by_county,
        "dataInfectionsByCounty10000": infections_by_county_10000,
        "dataActiveInfectionsByCounty100k": active_infections_by_county_100k,
        "dataActiveInfectionsByCounty": active_infections_by_county,
        "dataTestsPopRatio": tests_pop_ratio,
        "countyByDay": county_by_day,
        "dataCountyDailyActive": county_daily_active,
        "dataConfirmedCasesByCounty": confirmed_cases_by_county,
        "dataCumulativeCasesChart": cumulative_cases_chart_data,
        "dataNewCasesPerDayChart": new_cases_per_day_chart_data,
        "dataCumulativeTestsChart": cumulative_tests_chart_data,
        "dataTestsPerDayChart": tests_per_day_chart_data,
        "dataPositiveTestsByAgeChart": positive_test_by_age_chart_data,
        "dataPositiveNegativeChart": positive_negative_chart_data,
        "dataVaccinatedPeopleChart": vaccinated_people_chart_data,
        "dataMunicipalities": municipalities_data,
        "hospital": hospital, # TODO: Rename this to make it clearer what data it contains.
        # "vaccinationNumberTotal": vaccination_number_total,
        # "vaccinationNumberLastDay": vaccination_number_last_day,
        "fullyVaccinatedNumber": n_fully_vaccinated,
        "fullyVaccinatedNumberChange": n_fully_vaccinated_change,
        "fullyVaccinatedNumberPercentage": n_fully_vaccinated_percentage,
        "vaccinatedAtLeastOneDoseNumber": n_vaccinated_at_least_one_dose,
        "vaccinatedAtLeastOneDoseChange": n_vaccinated_at_least_one_dose_change,
        "vaccinatedAtLeastOneDosePercentage": n_vaccinated_at_least_one_dose_percentage,
    }

    # Dump JSON output
    log_status("Dumping JSON output")
    save_as_json(OUTPUT_FILE_LOCATION, final_json)

    # Log finish time
    finish = datetime.today().astimezone(ESTONIA_TIMEZONE).strftime("%d/%m/%Y, %H:%M")
    log_status("Finished update process at " + finish)
Example #15
0
def main():
    # Log status
    log_status("Starting data update process at " + str(today))

    # Get current number of deaths from Terviseamet's Covid dashboard
    try:
        scrape_deaths()
    except Exception:
        log_status("Aborting data update.")
        exit()

    # Load data from external services
    log_status("Downloading data from TEHIK: Test results")
    json_testing = get_json_data(TESTING_ENDPOINT)
    log_status("Downloading data from TEHIK: Location data")
    json_test_location = get_json_data(TEST_LOCATION_ENDPOINT)
    log_status("Downloading data from TEHIK: Hospitalisation data")
    json_hospitalisation = get_json_data(HOSPITALISATION_ENDPOINT)
    log_status("Downloading data from TEHIK: Vaccination data")
    json_vaccination = get_json_data(VACCINATION_ENDPOINT)

    # Validate data from remote endpoints
    # TODO: Add checks that the testing and vaccination data are up to date. We will need to adopt
    #       a different approach than for the test location and hospitalisation data due to the fact
    #       that the data structure of the JSON is different. Checking the "Last-Modified" header of the
    #       response may be the way to go and would handle the possibility that there are no tests or
    #       vaccinations on a particular day.
    ok = True
    if json_testing is None:
        log_status("Unable to retrieve testing data")
        ok = False
    if json_test_location is None:
        log_status("Unable to retrieve location data")
        ok = False
    elif not is_up_to_date(json_test_location, "LastStatisticsDate"):
        log_status("Location data is not up-to-date")
        ok = False
    if json_hospitalisation is None:
        log_status("Unable to retrieve hospitalisation data")
        ok = False
    elif not is_up_to_date(json_hospitalisation, "LastLoadStatisticsDate"):
        log_status("Hospitalisation data is not up-to-date")
        ok = False
    if json_vaccination is None:
        log_status("Unable to retrieve vaccination data")
        ok = False
    # TODO: Review whether this check is needed. I have commented it out for now.
    # if not is_header_last_modified_up_to_date(TEST_LOCATION_ENDPOINT):
    #     log_status("Location data last modified is not up-to-date")
    #     ok = False

    if not ok:
        log_status(
            "One or more of the TEHIK APIs has not been updated or could not be retrieved."
        )
        log_status("Aborting data update.")
        exit()

    # Load locally-stored data
    log_status("Loading local data files")
    try:
        json_deaths = read_json_from_file(DEATHS_FILE_LOCATION)
        json_manual = read_json_from_file(MANUAL_DATA_FILE_LOCATION)
    except Exception:
        # Log error
        log_status('Error when loading local data:')
        log_status(traceback.format_exc())
        exit()

    # Log status
    log_status("Calculating main statistics")

    # Statsbar
    # Find count of confirmed cases
    n_confirmed_cases = np.sum(
        [res["ResultValue"] == "P" for res in json_testing])

    # Find total number of tests
    n_tests_administered = len(json_testing)

    # Create date ranges for charts
    # dates1 = pd.date_range(start=DATE_SETTINGS["dates1_start"], end=yesterday)
    dates2 = pd.date_range(start=DATE_SETTINGS["dates2_start"], end=yesterday)
    dates3 = pd.date_range(start=DATE_SETTINGS["dates3_start"], end=yesterday)

    # Set recovered, deceased, hospitalised and ICU time-series
    hospital = get_hospital_data(json_hospitalisation,
                                 DATE_SETTINGS["dates2_start"])
    recovered = hospital["discharged"]
    json_manual["deceased"].update(json_deaths)
    deceased = list(json_manual["deceased"].values())
    hospitalised = hospital["activehospitalizations"]
    # TODO: Based on cross-checking with the hospitalisation data published by TEHIK, the data listed
    #       in the manual_data.json file with the field name "intensive" appears to show the number
    #       of patients on ventilation. We should fix the terminology and make sure that the intensive
    #       and on ventilation statistics are being calculated correctly.
    intensive = list(
        get_in_intensive_data(json_hospitalisation,
                              json_manual["intensive"]).values())
    on_ventilation = list(
        get_on_ventilation_data(json_hospitalisation).values())

    n_deaths = deceased[-1]
    n_deaths_change = int(deceased[-1]) - int(deceased[-2])

    # Get data for each chart
    log_status("Calculating data for charts")
    infections_by_county = get_infection_count_by_county(
        json_testing, county_mapping)
    infections_by_county_10000 = get_infections_data_by_count_10000(
        infections_by_county, county_sizes)
    tests_pop_ratio = get_test_data_pop_ratio(infections_by_county_10000)
    county_by_day = get_county_by_day(json_testing, dates2, county_mapping,
                                      county_sizes)
    confirmed_cases_by_county = get_confirmed_cases_by_county(
        json_testing, county_mapping)
    cumulative_cases_chart_data = get_cumulative_cases_chart_data(
        json_testing, recovered, deceased, hospitalised, intensive,
        on_ventilation, dates2)
    new_cases_per_day_chart_data = get_new_cases_per_day_chart_data(
        cumulative_cases_chart_data)
    cumulative_tests_chart_data = get_cumulative_tests_chart_data(
        json_testing, dates2)
    tests_per_day_chart_data = get_tests_per_day_chart_data(
        json_testing, dates2)
    positive_test_by_age_chart_data = get_positive_tests_by_age_chart_data(
        json_testing)
    positive_negative_chart_data = get_positive_negative_chart_data(
        json_testing, county_mapping)
    vaccinated_people_chart_data = get_vaccinated_people_chart_data(
        json_vaccination, dates3)
    county_daily_active = get_county_daily_active(json_testing, dates2,
                                                  county_mapping, county_sizes)
    n_active_cases = cumulative_cases_chart_data["active"][-1]
    n_active_cases_change = (cumulative_cases_chart_data["active"][-1] -
                             cumulative_cases_chart_data["active"][-2])
    active_infections_by_county = [{
        "MNIMI": k,
        "sequence": v,
        "drilldown": k
    } for k, v in county_daily_active["countyByDayActive"].items()]
    active_infections_by_county_100k = [[
        k, round(v[-1] / county_sizes[k] * 100000, 2)
    ] for k, v in county_daily_active["countyByDayActive"].items()]
    municipalities_data = get_municipality_data(json_test_location,
                                                county_mapping)
    per_100k = cumulative_cases_chart_data["active100k"][-1]

    # Calculate vaccination data
    log_status("Calculating vaccination data")
    last_day_vaccination_data = [
        x for x in json_vaccination if x["MeasurementType"] == "Vaccinated"
    ][-1]
    last_day_completed_vaccination_data = [
        x for x in json_vaccination
        if x["MeasurementType"] == "FullyVaccinated"
    ][-1]
    # TODO: Doses administered
    # last_day_doses_administered_data = [x for x in json_vaccination if x['MeasurementType'] == 'DosesAdministered'][-1]
    completed_vaccination_number_total = last_day_completed_vaccination_data[
        "TotalCount"]
    completed_vaccination_number_last_day = last_day_completed_vaccination_data[
        "DailyCount"]
    all_vaccination_number_total = last_day_vaccination_data["TotalCount"]
    all_vaccination_number_last_day = last_day_vaccination_data["DailyCount"]
    vaccination_number_total = (all_vaccination_number_total -
                                completed_vaccination_number_total)
    vaccination_number_last_day = (all_vaccination_number_last_day -
                                   completed_vaccination_number_last_day)
    fully_vaccinated_from_total_vaccinated_percentage = round(
        completed_vaccination_number_total * 100 /
        (all_vaccination_number_total), 2)

    # Create dictionary for final JSON
    log_status("Compiling final JSON")
    final_json = {
        "updatedOn":
        today,
        "confirmedCasesNumber":
        str(n_confirmed_cases),
        # TODO: For consistency, we should include the change in the number of confirmed cases as well.
        "hospitalisedNumber":
        str(hospital["activehospitalizations"][-1]),
        "hospitalChanged":
        str(hospital["activehospitalizations"][-1] -
            hospital["activehospitalizations"][-2]),
        "deceasedNumber":
        str(n_deaths),
        "deceasedChanged":
        str(n_deaths_change),
        "recoveredNumber":
        str(hospital["discharged"][-1]),
        "recoveredChanged":
        str(hospital["discharged"][-1] - hospital["discharged"][-2]),
        "testsAdministeredNumber":
        str(n_tests_administered),
        # TODO: For consistency, we should include the change in the number of tests as well.
        "activeCasesNumber":
        str(n_active_cases),
        "activeChanged":
        str(n_active_cases_change),
        "perHundred":
        str(per_100k),  # TODO: This should be given a clearer name.
        # TODO: I can't find anywhere in the app where "dates1" is used. Is it needed? Commented out for now.
        # "dates1": [str(x.date()) for x in dates1],
        "dates2": [str(x.date()) for x in dates2],
        "dates3": [str(x.date()) for x in dates3],
        "counties":
        counties,
        "age_groups":
        age_groups,
        "dataInfectionsByCounty":
        infections_by_county,
        "dataInfectionsByCounty10000":
        infections_by_county_10000,
        "dataActiveInfectionsByCounty100k":
        active_infections_by_county_100k,
        "dataActiveInfectionsByCounty":
        active_infections_by_county,
        "dataTestsPopRatio":
        tests_pop_ratio,
        "countyByDay":
        county_by_day,
        "dataCountyDailyActive":
        county_daily_active,
        "dataConfirmedCasesByCounties":
        confirmed_cases_by_county,
        "dataCumulativeCasesChart":
        cumulative_cases_chart_data,
        "dataNewCasesPerDayChart":
        new_cases_per_day_chart_data,
        "dataCumulativeTestsChart":
        cumulative_tests_chart_data,
        "dataTestsPerDayChart":
        tests_per_day_chart_data,
        "dataPositiveTestsByAgeChart":
        positive_test_by_age_chart_data,
        "dataPositiveNegativeChart":
        positive_negative_chart_data,
        "dataVaccinatedPeopleChart":
        vaccinated_people_chart_data,
        "dataMunicipalities":
        municipalities_data,
        "hospital":
        hospital,  # TODO: Rename this to make it clearer what data it contains.
        "vaccinationNumberTotal":
        vaccination_number_total,
        "vaccinationNumberLastDay":
        vaccination_number_last_day,
        "completedVaccinationNumberTotal":
        completed_vaccination_number_total,
        "completedVaccinationNumberLastDay":
        completed_vaccination_number_last_day,
        "allVaccinationNumberTotal":
        all_vaccination_number_total,
        "allVaccinationNumberLastDay":
        all_vaccination_number_last_day,
        "allVaccinationFromPopulationPercentage":
        last_day_vaccination_data["PopulationCoverage"],
        "completelyVaccinatedFromTotalVaccinatedPercentage":
        fully_vaccinated_from_total_vaccinated_percentage,
    }

    # Dump JSON output
    log_status("Dumping JSON output")
    save_as_json(OUTPUT_FILE_LOCATION, final_json)

    # Log finish time
    finish = datetime.today().astimezone(estonian_timezone).strftime(
        "%d/%m/%Y, %H:%M")
    log_status("Finished update process at " + finish)
Example #16
0
def pretrain_generator(inputs, epo_step, gen_encoder, gen_decoder, enc_units, vocab_size, n_batches, batch_size, epochs, size_stateful, forward_dict, rev_dict, kmer_f_dict, kmer_r_dict, pos_variations_count):
    # train_model.pretrain_generator([X_train, y_train, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches], i, encoder, decoder, enc_units, vocab_size, n_pretrain_batches, batch_size, pretrain_epochs, size_stateful, forward_dict, rev_dict, kmer_f_dict, kmer_r_dict, y_pos_variations_count)

    X_train, y_train, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches = inputs
    epo_avg_tr_gen_loss = list()
    epo_te_gen_loss = list()
    epo_tr_seq_var = list()
    epo_te_seq_var = list()
    batch_mut_distribution = dict()
    # pos_variations_count = dict()
    pos_size = dict()  # get_mut_size(pretr_parent_child_mut_indices)
    # for step, (unrolled_x, unrolled_y) in enumerate(zip(X_train, y_train)):

    epo_pre_train_save_folder = "data/generated_files/pre_train/{}".format(str(epo_step+1))
    enc_pre_train_save_folder = "data/generated_files/pre_train/{}/enc".format(str(epo_step+1))
    dec_pre_train_save_folder = "data/generated_files/pre_train/{}/dec".format(str(epo_step+1))

    utils.create_dirs(epo_pre_train_save_folder)
    utils.create_dirs(enc_pre_train_save_folder)
    utils.create_dirs(dec_pre_train_save_folder)

    for step in range(n_batches):
        unrolled_x, unrolled_y = sample_true_x_y(batch_size, X_train, y_train)

        '''print("Batch {} x and y:".format(str(step+1)))
        print(unrolled_x[:5, :])
        print(unrolled_y[:5, :])
        print()'''
        '''str_x = [",".join(str(pos) for pos in item) for item in unrolled_x]
        str_y = [",".join(str(pos) for pos in item) for item in unrolled_y]
        #print(str_x, str_y)
        muts = utils.get_mutation_tr_indices(str_x, str_y, kmer_f_dict, kmer_r_dict, forward_dict, rev_dict)
        print(kmer_f_dict)
        print(muts)
        print(unrolled_x)
        print()
        print(unrolled_y)
        print()'''

        seq_len = unrolled_x.shape[1]
        # verify levenshtein distance
        '''for i in range(len(unrolled_x)):
            re_x = utils.reconstruct_seq([kmer_f_dict[str(pos)] for pos in unrolled_x[i][1:]])
            re_y = utils.reconstruct_seq([kmer_f_dict[str(pos)] for pos in unrolled_y[i][1:]])
            #l_dist = utils.compute_Levenshtein_dist(re_x, re_y)
            print(re_x)
            print(re_y)
            #print(l_dist)
            print("---")'''
        '''import sys
        sys.exit()'''
        # print(pos_size)
        with tf.GradientTape() as gen_tape:
            # print(unrolled_x.shape, unrolled_y.shape)
            pred_logits, gen_encoder, gen_decoder, gen_loss = utils.loop_encode_decode_stateful(seq_len, batch_size, vocab_size, unrolled_x, unrolled_y, gen_encoder, gen_decoder, enc_units, teacher_forcing_ratio, True, size_stateful, pos_size, pos_variations_count, step)
            # print("Training: true input seq")
            # print(unrolled_x[:5, 1:], unrolled_x.shape)
            # print()
            print("Training: true output seq")
            print(unrolled_y[:batch_size,], unrolled_y.shape)
            print()
            print(tf.argmax(pred_logits, axis=-1)[:batch_size, :], pred_logits.shape)

            # compute generated sequence variation
            variation_score = utils.get_sequence_variation_percentage(unrolled_x, pred_logits)
            print("Pretr: generation variation score: {}".format(str(variation_score)))
            epo_tr_seq_var.append(variation_score)
            # print("Pretr: teacher forcing ratio: {}".format(str(teacher_forcing_ratio)))

        '''with tf.GradientTape() as pf_tape:

            # train pf model
            true_y = unrolled_y[:, 1:]
            pred_y = tf.argmax(pred_logits, axis=-1)
            true_o = pf_model(true_y)
            fake_o = pf_model(pred_y)

            true_pf_disc_loss, fake_pf_disc_loss = discriminator_loss(true_o, fake_o)
            total_pf_disc_loss = true_pf_disc_loss + fake_pf_disc_loss
            fake_pf_gen_loss = generator_loss(fake_o)

            print(true_pf_disc_loss, fake_pf_disc_loss, fake_pf_gen_loss)'''

        ##########################

        # gen_loss = gen_loss + fake_pf_gen_loss
        print("Pretrain epoch {}/{}, batch {}/{}, gen true loss: {}".format(str(epo_step+1), str(epochs), str(step+1), str(n_batches), str(gen_loss.numpy())))
        print()
        gen_trainable_vars = gen_encoder.trainable_variables + gen_decoder.trainable_variables
        gradients_of_generator = gen_tape.gradient(gen_loss, gen_trainable_vars)
        # print("Pretrain gradient norm before clipping: ", [tf.norm(gd) for gd in gradients_of_generator])
        gradients_of_generator = [tf.clip_by_norm(grad, clip_norm=pretrain_clip_norm) for grad in gradients_of_generator]
        # print("Pretrain gradient norm after clipping: ", [tf.norm(gd) for gd in gradients_of_generator])
        pretrain_generator_optimizer.apply_gradients(zip(gradients_of_generator, gen_trainable_vars))

        # optimize pf discriminator
        # pf_disc_trainable_vars = pf_model.trainable_variables
        # pf_gradients = pf_tape.gradient(total_pf_disc_loss, pf_disc_trainable_vars)
        # pf_discriminator_optimizer.apply_gradients(zip(pf_gradients, pf_disc_trainable_vars))

        if (step + 1) % test_log_step == 0 and step > 0:
            print("-------")
            print("Pretr: Prediction on test data at epoch {}/{}, batch {}/{}...".format(str(epo_step+1), str(epochs), str(step+1), str(n_batches)))
            print()
            gen_te_loss, gen_te_seq_var = utils.predict_sequence(epo_step, step, test_dataset_in, test_dataset_out, te_batch_size, n_te_batches, seq_len, vocab_size, enc_units, gen_encoder, gen_decoder, size_stateful, "pretrain", True)
            epo_te_gen_loss.append(gen_te_loss)
            epo_te_seq_var.append(gen_te_seq_var)
            print("-------")
            print()
        epo_avg_tr_gen_loss.append(gen_loss)

    # save model
    gen_encoder.save_weights(GEN_ENC_WEIGHTS)

    tf.keras.models.save_model(gen_encoder, PRETRAIN_GEN_ENC_MODEL)
    tf.keras.models.save_model(gen_decoder, PRETRAIN_GEN_DEC_MODEL)

    gen_encoder.save_weights(PRE_TR_GEN_ENC_WEIGHTS)
    gen_decoder.save_weights(PRE_TR_GEN_DEC_WEIGHTS)

    tf.keras.models.save_model(gen_encoder, enc_pre_train_save_folder)
    tf.keras.models.save_model(gen_decoder, dec_pre_train_save_folder)

    utils.save_as_json("data/generated_files/pretr_ave_batch_x_y_mut_epo_{}.json".format(str(epo_step)), batch_mut_distribution)
    # pretrain_gen_tr_loss, bat_te_gen_loss, bat_te_seq_var, bat_tr_seq_var, encoder, decoder, _
    return np.mean(epo_avg_tr_gen_loss), np.mean(epo_te_gen_loss), np.mean(epo_te_seq_var), np.mean(epo_tr_seq_var), gen_encoder, gen_decoder
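The training step above follows a standard TensorFlow pattern: compute the loss inside a tf.GradientTape, clip each gradient tensor with tf.clip_by_norm, and apply the clipped gradients through the optimizer. A self-contained toy sketch of that clip-then-apply step; the model, data, optimizer, and clip value below are made up for illustration and are not the project's own:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-3)
x = tf.random.normal((8, 4))
y = tf.random.normal((8, 1))

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))

grads = tape.gradient(loss, model.trainable_variables)
grads = [tf.clip_by_norm(g, clip_norm=1.0) for g in grads]  # per-tensor norm clipping
optimizer.apply_gradients(zip(grads, model.trainable_variables))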
Example #17
0
if __name__ == '__main__':
	reload(sys)
	sys.setdefaultencoding('utf-8')
	devs, log = utils.setup()
	
	
	# Use Google to find potential LinkedIn matches
	for dev in devs:
		try:
			print '\rGoogling for matches for %s...' % dev.get('name')
			dev['li_matches'] = google_for_li_matches(dev)
		except:
			print "%s occurred while processing: %s" % (sys.exc_info()[0].__name__, dev['name'])
			continue
	utils.save_as_json(devs, 'googlesearchresults')
	print "Done.\n"
	
	
	# Compare the LinkedIn profiles to the GitHub profile, score them and sort them. Return best at index[0]
	devs = utils.load_json('googlesearchresults')
	for dev in devs:
		print '\rEvaluating matches for %s...' % dev.get('name')
		dev['li_matches'] = evaluate_li_matches(dev)
	utils.save_as_json(devs, 'scoredresults')
	print "Done.\n"
	
				
	"""
	# Use Pipl to match remainder
	for dev in [dev for dev in devs if dev['email']]:
import utils
import elections_csv

def gen_elections_by_year(elections):
    elections_by_year = { e['year']: e for e in elections }
    for e in elections:
        e.pop('year')
        e.pop('president')

    return elections_by_year

def main(suas):
    elections = elections_csv.get_elections()
    elections_by_year = gen_elections_by_year(elections)
    year_per_sua = [elections_by_year[year] for year in utils.get_years(suas)]

    utils.enrich_collection(suas, year_per_sua)

if __name__ == '__main__':
    suas = utils.load_json_as_dict(utils.with_presidents_fname)
    main(suas)
    utils.save_as_json(utils.with_elections_fname, suas)
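The elections snippet above (and the presidents snippet further below) relies on a utils.enrich_collection helper that is not shown. Judging from the call sites, it appears to merge each enrichment record into the corresponding item of the main collection; a hypothetical sketch under that assumption:

def enrich_collection(collection, extras):
    # Assumed behavior: pairwise-merge each extra record into its item, in place.
    for item, extra in zip(collection, extras):
        item.update(extra)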
import utils

def main(inaugurals):
    reading_levels = utils.get_reading_levels(inaugurals)
    utils.enrich_collection(inaugurals, reading_levels)

if __name__ == '__main__':
    inaugurals = utils.get_inaugurals()
    main(inaugurals)
    utils.save_as_json(utils.inaugurals_reading_levels_fname, inaugurals)
import pickle
from utils import save_as_json, read_from_json
import pdb
out_dir = 'data/output/blocs/'
out_filename = 'dictionary_blocs.json'
blocs = ['investment_blocs_2020.json']

dict_blocs = {}
for bloc in blocs:
    bloc_name = bloc.replace('.json', '')
    this_bloc_dict = read_from_json(out_dir, bloc)
    dict_blocs[bloc_name] = this_bloc_dict

save_as_json(dict_blocs, out_dir, out_filename)

# check
parsed_dict_blocs = read_from_json(out_dir, out_filename)
pdb.set_trace()
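This snippet uses a (data, directory, filename) calling convention for save_as_json and a (directory, filename) convention for read_from_json, unlike the path-based helpers in the other examples. A hypothetical sketch of that pair, assuming they simply join the directory and filename and delegate to the json module:

import json
import os

def save_as_json(data, out_dir, filename):
    # Assumed implementation: write `data` as JSON under out_dir/filename.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, filename), "w") as f:
        json.dump(data, f, indent=2)

def read_from_json(out_dir, filename):
    # Assumed counterpart: load and return the parsed JSON document.
    with open(os.path.join(out_dir, filename)) as f:
        return json.load(f)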
Example #21
0
    def save_as_json(self):
        save_as_json('vults_items.json', self.items)
Example #22
0
def ui_save(expenses):
    path = ui_input_path()

    serialized = expenses.do('get_serialized', keep=False)
    save_as_json(serialized, path)
Example #23
0
    def save_as_json(self):
        save_as_json('digital_items.json', self.items)
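Examples #21 and #23 define a method named save_as_json that delegates to a module-level save_as_json helper. The two names do not collide, because the method lives in the class namespace while the call inside it resolves to the module-level function. A minimal self-contained sketch of the same pattern; the class name and the helper body are assumptions, not taken from the original projects:

import json

def save_as_json(filename, data):  # stand-in for the imported helper
    with open(filename, "w") as f:
        json.dump(data, f)

class ItemPipeline:
    def __init__(self):
        self.items = []

    def save_as_json(self):
        # Resolves to the module-level function above, not to this method.
        save_as_json('digital_items.json', self.items)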
import utils
import presidents_csv

def gen_pres_by_name(presidents):
    pres_by_name = { p['president']: p for p in presidents }
    for p in presidents:
        p.pop('president')

    return pres_by_name

def main(suas):
    presidents = presidents_csv.get_presidents()
    pres_by_name = gen_pres_by_name(presidents)
    president_per_sua = [pres_by_name[pres] for pres in utils.get_presidents(suas)]

    utils.enrich_collection(suas, president_per_sua)

if __name__ == '__main__':
    suas = utils.load_json_as_dict(utils.reading_levels_fname)
    main(suas)
    utils.save_as_json(utils.with_presidents_fname, suas)
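One subtlety in gen_pres_by_name (and the analogous gen_elections_by_year earlier): the index dict holds references to the same dictionaries as the input list, so popping the key afterwards also removes it from the values of the returned mapping. A toy illustration with made-up data:

presidents = [{"president": "Washington", "party": "None"}]
pres_by_name = {p["president"]: p for p in presidents}
for p in presidents:
    p.pop("president")

print(pres_by_name)  # {'Washington': {'party': 'None'}} -- the key is gone here as well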
Example #25
0
			print '\rGoogling for matches for %s...' % dev.get('name')
			dev['li_matches'] = get_matching_li_profiles(dev)
		except:
			print "%s occurred while processing: %s" % (sys.exc_info()[0].__name__, dev['name'])
			continue
	utils.save_as_json(devs, 'googlesearchresults')
	print "Done.\n"
	#raw_input('Continue?')
	"""
	
	# Compare the LinkedIn profiles to the GitHub, score them and sort them. Return best at index[0]
	devs = utils.load_json('googlesearchresults')
	for dev in devs:
		print '\rEvaluating matches for %s...' % dev.get('name')
		dev['li_matches'] = evaluate_li_matches(dev)
	utils.save_as_json(devs, 'scoredresults')
	print "Done.\n"
	#raw_input('Continue?')
				
	"""
	# Use Pipl to match remainder
	for dev in [dev for dev in devs if dev['email']]:
		if dev['li_matches']:
			if dev['li_matches'][0]['score'] < 75:
				print "Trying piplsearch for %s..." % dev.get('name')
				result = try_piplsearch(dev)
				if result:
					dev['li_matches'].append(result)
					# re-sort from highest scoring match to lowest
					dev['li_matches'] = sorted(dev['li_matches'], key=lambda k: k['score'], reverse=True)
		else: