Example #1
def download_kegg_info_files(kegg_set_ids, species_ini_file):
    """
    This is a KEGG-specific function that downloads the files containing
    information about the KEGG sets, such as their title, abstract, supporting
    publications, etc.

    Arguments:
    kegg_set_ids -- List of kegg set identifiers (e.g. hsa00010) for which
    info files will be downloaded.

    species_ini_file -- Path to the species INI config file. This
    is a string.

    Returns:
    Nothing, just downloads and saves files to keggset_info folder, which will
    be the SPECIES_DOWNLOAD_FOLDER + 'KEGG/keggset_info_folder'

    """
    species_file = SafeConfigParser()
    species_file.read(species_ini_file)

    sd_folder = species_file.get('species_info', 'SPECIES_DOWNLOAD_FOLDER')

    keggset_info_folder = os.path.join(sd_folder, KEGGSET_INFO_FOLDER)
    check_create_folder(keggset_info_folder)

    full_info_url = species_file.get('KEGG', 'KEGG_ROOT_URL') + \
        species_file.get('KEGG', 'SET_INFO_DIR')

    for kegg_id in kegg_set_ids:
        kegg_info_file = full_info_url + kegg_id
        download_from_url(kegg_info_file, keggset_info_folder)
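A minimal usage sketch, assuming the function above is importable and that the INI file defines the [species_info] and [KEGG] options it reads; the path and identifiers below are placeholders, not values from the original project:

# Hypothetical call; IDs and path are placeholders.
kegg_set_ids = ['hsa00010', 'hsa00020']   # placeholder KEGG set identifiers
species_ini_file = 'config/human.ini'     # must define [species_info] and [KEGG] sections
download_kegg_info_files(kegg_set_ids, species_ini_file)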
Example #2
    def _get_or_download_model(self, download: bool) -> Optional[str]:
        """
        Return downloaded model path, if model path does not exits and download is true, it will download
        and return the path
        Args:
            download: flag to decide whether to download model in case it not exists
        Returns:
            str: model path or None
        """
        home_dir = home_directory()
        downloaded_models_dir = os.path.join(home_dir, MODELS_DIR)

        if not os.path.exists(downloaded_models_dir):
            os.makedirs(downloaded_models_dir)

        model_hashed_name = get_hashed_name(self.embedding + self.model)
        model_path = os.path.join(downloaded_models_dir, model_hashed_name)

        if not os.path.exists(model_path):
            if not download:
                return

            model_download_path = model_path + '.' + self.embedding_cls.EMBEDDING_MODELS[
                self.model].format
            model_download_url = self.embedding_cls.EMBEDDING_MODELS[
                self.model].download_url
            print(f"Model does not exists, Downloading model: {self.model}")
            download_from_url(model_download_url, model_download_path)
            extract_file(model_download_path, model_path)
            os.remove(model_download_path)
            print(f"Model downloaded successfully!")
        return model_path
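The method above follows a common download-or-reuse-cache pattern: derive a deterministic local path from the model name and download only when it is missing. Below is a standalone sketch of that pattern using only the standard library; the helper is illustrative and not part of the original code.

import hashlib
import os
import urllib.request

def get_or_download(name, url, cache_dir, download=True):
    """Return a cached file path for `name`, downloading it from `url` if missing."""
    os.makedirs(cache_dir, exist_ok=True)
    hashed_name = hashlib.sha256(name.encode("utf-8")).hexdigest()
    path = os.path.join(cache_dir, hashed_name)
    if not os.path.exists(path):
        if not download:
            return None
        urllib.request.urlretrieve(url, path)  # assumes a plain, directly downloadable file
    return path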
Example #3
    def get_blacklist(self):

        source = BLACKLIST_SOURCE[self.ref_name]
        path = os.path.join(self.ref_path, 'regions', 'blacklist.bed')
        utils.download_from_url(source, path)

        # hg19 blacklist.bed has a coordinate overflow issue for the chrM row:
        # the entire chrM is blacklisted, but the annotated coordinates do not match the contig length.
        regtools.clean_chr_name_file(path, self.config, self.ref_path)
        regtools.sort_and_uniq_bed(path, self.ref_path)
Example #4
    def get_DHS(self):
        """
        download and clean the dnase.bed. Only called when creating standard genomes.
        :return:
        """

        source = DHS_SOURCE[self.ref_name]
        path = os.path.join(self.ref_path, 'regions', 'dnase.bed')
        utils.download_from_url(source, path)

        regtools.clean_chr_name_file(path, self.config, self.ref_path)
        regtools.sort_and_uniq_bed(path, self.ref_path)
Example #5
def upload_file():
    if request.method == 'POST':
        # check if the post request has the file part
        print(request.form)
        if 'text' in request.form:
            # raise NotImplementedError
            print(request.form["text"])
            url = request.form["text"]
            filename = download_from_url(url, app.config['UPLOAD_FOLDER'])
            return redirect(url_for('remove_output', filename=filename))
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser
        # submits an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return redirect(url_for('remove_output',
                                    filename=filename))
    return '''
Example #6
    def __init__(self):
        base_path = os.path.dirname(os.path.abspath(__file__))
        jar_path = os.path.join(base_path, METEOR_JAR)
        gz_path = os.path.join(base_path, os.path.basename(METEOR_GZ_URL))
        if not os.path.isfile(jar_path):
            if not os.path.isfile(gz_path):
                download_from_url(METEOR_GZ_URL, gz_path)
            tar = tarfile.open(gz_path, "r")
            tar.extractall(path=os.path.dirname(os.path.abspath(__file__)))
            tar.close()
            os.remove(gz_path)

        self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR,
                '-', '-', '-stdio', '-l', 'en', '-norm']
        self.meteor_p = subprocess.Popen(self.meteor_cmd,
                cwd=os.path.dirname(os.path.abspath(__file__)),
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        # Used to guarantee thread safety
        self.lock = threading.Lock()
Example #7
def download_file(item, path, course, output_dir):
    filepath = create_filepath(course, path)
    description = item["Description"]["Html"]
    topic_type = item["TopicType"]
    title = item["Title"]
    if topic_type == 1:
        filename = create_filename(item)
        full_path = f"{output_dir}/{filepath}/{filename}"
        # These documents are real files that we want to download
        download_from_url(f"""{ufora}{item["Url"]}""", full_path)
        if item["Url"].endswith(".html"):
            # HTML files on Ufora need a little special treatment
            # We'll prepend a title, <base> tag and convert them to pdf
            with open(full_path, "r") as f:
                content = f.read()
            filename_without_extension = ".".join(filename.split(".")[:-1])
            description_path = f"{output_dir}/{filepath}/{filename_without_extension}.pdf"
            create_metadata(description_path, content,
                            filename_without_extension)
            new_content = f"<base href={ufora}><h1>{title}</h1>{content}"
            with open(full_path, "w") as f:
                f.write(new_content)
        elif description:
            # Choosing this filename might cause an overlap...
            filename_without_extension = ".".join(filename.split(".")[:-1])
            description_path = f"{output_dir}/{filepath}/{filename_without_extension}.pdf"
            create_metadata(description_path, description,
                            filename_without_extension)
    elif topic_type == 3:
        # These documents are just clickable links, we'll render them in a pdf
        url = item["Url"]
        filename = create_filename_without_extension(item)
        full_path = f"{output_dir}/{filepath}/{filename}"
        create_metadata(f"{full_path}.pdf",
                        f"<a href={url}>{url}</a>{description}", item["Title"])
    else:
        print(f"Don't know this topic type: {topic_type}")
        exit()
Example #8
def prepare_data(data_dir,
                 filename_gold,
                 filename_negative,
                 remote_url,
                 embeddings_filenames,
                 embeddings_dir,
                 n_docs=None,
                 n_queries=None,
                 add_precomputed=False):
    """
    filename_gold points to a squad format file.
    filename_negative points to a csv file where the first column is doc_id and second is document text.
    If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document
    """

    logging.getLogger("farm").setLevel(logging.INFO)
    download_from_url(remote_url + filename_gold,
                      filepath=data_dir + filename_gold)
    download_from_url(remote_url + filename_negative,
                      filepath=data_dir + filename_negative)
    if add_precomputed:
        for embedding_filename in embeddings_filenames:
            download_from_url(
                remote_url + str(embeddings_dir) + embedding_filename,
                filepath=data_dir + str(embeddings_dir) + embedding_filename)
    logging.getLogger("farm").setLevel(logging.WARN)

    gold_docs, labels = eval_data_from_json(data_dir + filename_gold)

    # Reduce number of docs
    gold_docs = gold_docs[:n_docs]

    # Remove labels whose gold docs have been removed
    doc_ids = [x.id for x in gold_docs]
    labels = [x for x in labels if x.document_id in doc_ids]

    # Filter labels down to n_queries
    selected_queries = list(
        set(f"{x.document_id} | {x.question}" for x in labels))
    selected_queries = selected_queries[:n_queries]
    labels = [
        x for x in labels
        if f"{x.document_id} | {x.question}" in selected_queries
    ]

    n_neg_docs = max(0, n_docs - len(gold_docs))
    neg_docs = prepare_negative_passages(data_dir, filename_negative,
                                         n_neg_docs)
    docs = gold_docs + neg_docs

    if add_precomputed:
        docs = add_precomputed_embeddings(data_dir + embeddings_dir,
                                          embeddings_filenames, docs)

    return docs, labels
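A hedged usage sketch; every path and filename below is a placeholder. Note that the function joins paths by plain string concatenation, so data_dir and embeddings_dir should end with a separator:

# Hypothetical call; all paths and filenames are placeholders.
docs, labels = prepare_data(
    data_dir="data/retriever/",                    # trailing slash required
    filename_gold="gold_questions.json",           # SQuAD-format file (placeholder name)
    filename_negative="negative_passages.csv",     # doc_id, document text (placeholder name)
    remote_url="https://example.com/datasets/",
    embeddings_filenames=["passage_embeddings_1.pkl"],
    embeddings_dir="embeddings/",
    n_docs=1000,
    n_queries=50,
    add_precomputed=False,
)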
Example #9
def preprocess_google_news(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
       Distributed Representations of Words and Phrases and their Compositionality
       (https://arxiv.org/pdf/1310.4546.pdf). In Proceedings of NIPS, 2013.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GoogleNews")
    makedirs(output_dir, exist_ok=True)

    # Define filepaths
    google_news_vectors_zip_raw_download_url = "https://filesender.uninett.no/download.php?token=b0aea55e-72a7-4ac0-9409-8d5dbb322505&files_ids=645861"
    google_news_vectors_zip_raw_filename = "GoogleNews-vectors-negative300.bin.gz"
    google_news_vectors_zip_raw_filepath = join(
        raw_data_dir, google_news_vectors_zip_raw_filename
    )
    google_news_vectors_bin_raw_filepath = join(
        raw_data_dir, "GoogleNews-vectors-negative300.bin"
    )
    google_news_words_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_words.txt"
    )
    google_news_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300.npy"
    )
    google_news_normalized_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_normalized.npy"
    )
    google_news_vectors_annoy_index_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_annoy_index.ann"
    )
    google_news_vectors_scann_artifacts_dir = join(
        output_dir, "GoogleNews-vectors-negative300_scann_artifacts"
    )

    # -- GoogleNews-vectors-negative300.bin.gz --
    if not isfile(google_news_vectors_zip_raw_filepath):
        print(f"Downloading {google_news_vectors_zip_raw_filename}...")
        download_from_url(
            url=google_news_vectors_zip_raw_download_url,
            destination_filepath=google_news_vectors_zip_raw_filepath,
        )
        print("Done!")

    if not isfile(google_news_vectors_bin_raw_filepath):
        print(f"Extracting {google_news_vectors_zip_raw_filename}...")
        with gzip.GzipFile(google_news_vectors_zip_raw_filepath, "rb") as gzip_file_raw:
            with open(google_news_vectors_bin_raw_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from binary file and save result
    should_load_vectors = (
        not isfile(google_news_words_filepath)
        or not isfile(google_news_vectors_filepath)
        or not isfile(google_news_normalized_vectors_filepath)
    )
    if should_load_vectors:
        google_news_word_embeddings, google_news_words = load_word2vec_binary_format(
            word2vec_filepath=google_news_vectors_bin_raw_filepath,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(google_news_words_filepath):
        with open(google_news_words_filepath, "w") as file:
            for i, word in enumerate(google_news_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(google_news_vectors_filepath):
        np.save(google_news_vectors_filepath, google_news_word_embeddings)

    # Save normalized word embeddings
    google_news_word_embeddings_normalized = None
    if not isfile(google_news_normalized_vectors_filepath):
        google_news_word_embeddings_normalized = (
            google_news_word_embeddings
            / np.linalg.norm(google_news_word_embeddings, axis=1).reshape(-1, 1)
        )
        np.save(
            google_news_normalized_vectors_filepath,
            google_news_word_embeddings_normalized,
        )

    annoy_index_created = isfile(google_news_vectors_annoy_index_filepath)
    scann_instance_created = isdir(google_news_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if google_news_word_embeddings_normalized is None:
            google_news_word_embeddings_normalized = np.load(
                google_news_normalized_vectors_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=google_news_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(google_news_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=google_news_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(google_news_vectors_scann_artifacts_dir)
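A hedged call of the function above; the directories are placeholders. Building the Annoy index with Euclidean distance on L2-normalized vectors is deliberate: for unit vectors, Euclidean distance is a monotonic function of cosine similarity, so the index effectively ranks neighbours by cosine.

# Hypothetical call; directory paths are placeholders.
preprocess_google_news(
    raw_data_dir="data/raw",
    output_dir="data/processed",
    annoy_index_n_trees=100,        # more trees => higher precision, larger index
    scann_num_leaves_scaling=5,
)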
Example #10
def preprocess_fasttext_tps(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius. (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastTextTPS")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    env_config = dotenv_values(join("..", ".env"))
    tps_fasttext_model_filesender_token = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN"
    ]
    tps_fasttext_model_filesender_token_files_ids = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN_FILES_IDS"
    ]
    tps_fasttext_model_url = f"https://filesender.uninett.no/download.php?token={tps_fasttext_model_filesender_token}&files_ids={tps_fasttext_model_filesender_token_files_ids}"
    tps_fasttext_model_name = "fastText.TPS.300d"
    tps_fasttext_model_raw_filepath = join(
        raw_data_dir, f"{tps_fasttext_model_name}.bin"
    )
    tps_fasttext_model_words_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_words.txt"
    )
    tps_fasttext_model_vectors_filepath = join(
        output_dir, f"{tps_fasttext_model_name}.npy"
    )
    tps_fasttext_model_vectors_normalized_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_normalized.npy"
    )
    tps_fasttext_model_annoy_index_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_annoy_index.ann"
    )
    tps_fasttext_model_scann_artifacts_dir = join(
        output_dir, f"{tps_fasttext_model_name}_scann_artifacts"
    )

    if not isfile(tps_fasttext_model_raw_filepath):
        print(f"Downloading {tps_fasttext_model_name}...")
        download_from_url(
            url=tps_fasttext_model_url,
            destination_filepath=tps_fasttext_model_raw_filepath,
        )
        print("Done!")

    # Load output from trained fastText model
    fasttext_model = fasttext.load_model(tps_fasttext_model_raw_filepath)
    fasttext_model_words = fasttext_model.words
    fasttext_model_embedding_weights = np.zeros(
        (len(fasttext_model_words), fasttext_model.get_dimension())
    )
    for i, word in enumerate(fasttext_model.words):
        fasttext_model_embedding_weights[i] = fasttext_model.get_word_vector(word)

    # Save words
    if not isfile(tps_fasttext_model_words_filepath):
        with open(tps_fasttext_model_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_model.words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(tps_fasttext_model_vectors_filepath):
        np.save(tps_fasttext_model_vectors_filepath, fasttext_model_embedding_weights)

    # Save normalized word embeddings
    fasttext_model_embedding_weights_normalized = None
    if not isfile(tps_fasttext_model_vectors_normalized_filepath):
        fasttext_model_embedding_weights_normalized = (
            fasttext_model_embedding_weights
            / np.linalg.norm(fasttext_model_embedding_weights, axis=1).reshape(-1, 1)
        )
        np.save(
            tps_fasttext_model_vectors_normalized_filepath,
            fasttext_model_embedding_weights_normalized,
        )

    annoy_index_created = isfile(tps_fasttext_model_annoy_index_filepath)
    scann_instance_created = isdir(tps_fasttext_model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_model_embedding_weights_normalized is None:
            fasttext_model_embedding_weights_normalized = np.load(
                tps_fasttext_model_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_model_embedding_weights_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(tps_fasttext_model_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_model_embedding_weights_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(tps_fasttext_model_scann_artifacts_dir)
Example #11
def preprocess_fasttext(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Grave, E., Bojanowski, P., Gupta, P., Joulin, A., & Mikolov, T. (2018).
       Learning Word Vectors for 157 Languages. In Proceedings of the International
       Conference on Language Resources and Evaluation (LREC 2018).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastText")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    fasttext_data_filename = "cc.en.300.vec"
    fasttext_vectors_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{fasttext_data_filename}.gz"
    fasttext_word_vectors_raw_gzip_filepath = join(
        raw_data_dir, f"{fasttext_data_filename}.gz"
    )
    fasttext_word_vectors_raw_txt_filepath = join(raw_data_dir, fasttext_data_filename)
    fasttext_word_vectors_words_filepath = join(
        output_dir, f"{fasttext_data_filename}_words.txt"
    )
    fasttext_word_vectors_filepath = join(output_dir, f"{fasttext_data_filename}.npy")
    fasttext_word_vectors_normalized_filepath = join(
        output_dir, f"{fasttext_data_filename}_normalized.npy"
    )
    fasttext_word_vectors_annoy_index_filepath = join(
        output_dir, f"{fasttext_data_filename}_annoy_index.ann"
    )
    fasttext_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{fasttext_data_filename}_scann_artifacts"
    )

    if not isfile(fasttext_word_vectors_raw_gzip_filepath):
        print(f"Downloading {fasttext_data_filename}...")
        download_from_url(
            url=fasttext_vectors_url,
            destination_filepath=fasttext_word_vectors_raw_gzip_filepath,
        )
        print("Done!")

    if not isfile(fasttext_word_vectors_raw_txt_filepath):
        print(f"Extracting {fasttext_data_filename}...")
        with gzip.GzipFile(
            fasttext_word_vectors_raw_gzip_filepath, "rb"
        ) as gzip_file_raw:
            with open(fasttext_word_vectors_raw_txt_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(fasttext_word_vectors_words_filepath)
        or not isfile(fasttext_word_vectors_filepath)
        or not isfile(fasttext_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        fasttext_word_embeddings, fasttext_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=fasttext_word_vectors_raw_txt_filepath,
            first_line_header=True,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(fasttext_word_vectors_words_filepath):
        with open(fasttext_word_vectors_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(fasttext_word_vectors_filepath):
        np.save(fasttext_word_vectors_filepath, fasttext_word_embeddings)

    # Save normalized word embeddings
    fasttext_word_embeddings_normalized = None
    if not isfile(fasttext_word_vectors_normalized_filepath):
        fasttext_word_embeddings_normalized = fasttext_word_embeddings / np.linalg.norm(
            fasttext_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            fasttext_word_vectors_normalized_filepath,
            fasttext_word_embeddings_normalized,
        )

    annoy_index_created = isfile(fasttext_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(fasttext_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_word_embeddings_normalized is None:
            fasttext_word_embeddings_normalized = np.load(
                fasttext_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(fasttext_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(fasttext_word_vectors_scann_artifacts_dir)
Example #12
def preprocess_glove(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method. Higher scaling => higher precision.

    References
    ----------
    .. [1] Jeffrey Pennington, Richard Socher, & Christopher D. Manning (2014).
       GloVe: Global Vectors for Word Representation. In Empirical Methods in Natural
       Language Processing (EMNLP) (pp. 1532–1543).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GloVe")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    glove_data_filename = "glove.840B.300d"
    glove_word_vectors_url = f"http://nlp.stanford.edu/data/{glove_data_filename}.zip"
    glove_word_vectors_raw_zip_filepath = join(
        raw_data_dir, f"{glove_data_filename}.zip"
    )
    glove_word_vectors_raw_txt_filename = f"{glove_data_filename}.txt"
    glove_word_vectors_raw_txt_filepath = join(
        raw_data_dir, glove_word_vectors_raw_txt_filename
    )
    glove_word_vectors_words_filepath = join(
        output_dir, f"{glove_data_filename}_words.txt"
    )
    glove_word_vectors_filepath = join(output_dir, f"{glove_data_filename}.npy")
    glove_word_vectors_normalized_filepath = join(
        output_dir, f"{glove_data_filename}_normalized.npy"
    )
    glove_word_vectors_annoy_index_filepath = join(
        output_dir, f"{glove_data_filename}_annoy_index.ann"
    )
    glove_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{glove_data_filename}_scann_artifacts"
    )

    if not isfile(glove_word_vectors_raw_zip_filepath):
        print(f"Downloading {glove_data_filename}...")
        download_from_url(
            url=glove_word_vectors_url,
            destination_filepath=glove_word_vectors_raw_zip_filepath,
        )
        print("Done!")

    if not isfile(glove_word_vectors_raw_txt_filepath):
        print(f"Extracting {glove_data_filename}...")
        with zipfile.ZipFile(glove_word_vectors_raw_zip_filepath, "r") as zip_ref:
            zip_ref.extractall(raw_data_dir)
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(glove_word_vectors_words_filepath)
        or not isfile(glove_word_vectors_filepath)
        or not isfile(glove_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        glove_word_embeddings, glove_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=glove_word_vectors_raw_txt_filepath,
            first_line_header=False,
            tqdm_enabled=True,
        )

    # Save words
    if not isfile(glove_word_vectors_words_filepath):
        with open(glove_word_vectors_words_filepath, "w") as file:
            for i, word in enumerate(glove_words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(glove_word_vectors_filepath):
        np.save(glove_word_vectors_filepath, glove_word_embeddings)

    # Save normalized word embeddings
    glove_word_embeddings_normalized = None
    if not isfile(glove_word_vectors_normalized_filepath):
        glove_word_embeddings_normalized = glove_word_embeddings / np.linalg.norm(
            glove_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            glove_word_vectors_normalized_filepath,
            glove_word_embeddings_normalized,
        )

    annoy_index_created = isfile(glove_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(glove_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if glove_word_embeddings_normalized is None:
            glove_word_embeddings_normalized = np.load(
                glove_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=glove_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(glove_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=glove_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(glove_word_vectors_scann_artifacts_dir)
Example #13
 def download(self):
     download_from_url(
         'http://www.kr-kralovehradecky.cz/xml/export/eldeska-zpravy.xml',
         'data/kralovehradecky_kraj.xml')
Example #14
def download_all_files(species_ini_file,
                       base_download_folder,
                       secrets_location=None):
    """
    Reads config INI file for a species, which contains the files (and
    their locations, or URLs) that must be loaded for this species, and calls
    the download_from_url function for each of those files.

    Arguments:
    species_ini_file -- Path to the particular species INI file. This
    is a string.

    base_download_folder -- A string. Path of the root folder where download
    folders for other species will be created and where common downloaded files
    will be saved. This is stored in the main configuration INI file.

    secrets_location -- Optional string of location of the secrets INI
    file.

    Returns:
    Nothing, just downloads and saves files to download_folder

    """
    check_create_folder(base_download_folder)

    species_file = SafeConfigParser()
    species_file.read(species_ini_file)

    sd_folder = species_file.get('species_info', 'SPECIES_DOWNLOAD_FOLDER')
    check_create_folder(sd_folder)

    if species_file.has_section('GO'):
        if species_file.getboolean('GO', 'DOWNLOAD'):

            obo_url = species_file.get('GO', 'GO_OBO_URL')
            download_from_url(obo_url, base_download_folder)

            go_dir = os.path.join(sd_folder, 'GO')
            check_create_folder(go_dir)

            goa_urls = species_file.get('GO', 'ASSOC_FILE_URLS')
            goa_urls = re.sub(r'\s', '', goa_urls).split(',')

            for goa_url in goa_urls:
                download_from_url(goa_url, go_dir)

    if species_file.has_section('KEGG'):
        if species_file.getboolean('KEGG', 'DOWNLOAD'):

            kegg_root_url = species_file.get('KEGG', 'KEGG_ROOT_URL')

            kegg_info_url = kegg_root_url + species_file.get(
                'KEGG', 'DB_INFO_URL')

            download_from_url(kegg_info_url, base_download_folder,
                              'kegg_db_info')

            kegg_dir = os.path.join(sd_folder, 'KEGG')
            check_create_folder(kegg_dir)

            ks_urls = species_file.get('KEGG', 'SETS_TO_DOWNLOAD')
            kegg_urls = [
                kegg_root_url + url.strip() for url in ks_urls.split(',')
            ]

            for kegg_url in kegg_urls:
                download_from_url(kegg_url, kegg_dir)

    if species_file.has_section('DO'):
        if species_file.getboolean('DO', 'DOWNLOAD'):
            do_dir = os.path.join(sd_folder, 'DO')
            check_create_folder(do_dir)

            obo_url = species_file.get('DO', 'DO_OBO_URL')
            download_from_url(obo_url, do_dir)

            mim2gene_url = species_file.get('DO', 'MIM2GENE_URL')
            download_from_url(mim2gene_url, do_dir)

            # The genemap_file needs a special Secret Key, which must be
            # retrieved from the secrets file if the user wishes to download
            # the genemap_file
            genemap_url = species_file.get('DO', 'GENEMAP_URL')

            if not secrets_location:
                logger.error('Secrets file was not passed to '
                             'download_all_files() function. A secrets file '
                             'containing an OMIM API secret key is required to'
                             ' download the genemap file to process Disease '
                             'Ontology.')
                sys.exit(1)

            secrets_file = SafeConfigParser()
            secrets_file.read(secrets_location)

            if not secrets_file.has_section('OMIM API secrets'):
                logger.error('Secrets file has no "OMIM API secrets" section, '
                             'which is required to download the genemap file '
                             'to process Disease Ontology.')
                sys.exit(1)

            omim_secret_key = secrets_file.get('OMIM API secrets',
                                               'SECRET_KEY')
            genemap_url = genemap_url.replace('<SecretKey>', omim_secret_key)

            download_from_url(genemap_url, do_dir)
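A minimal sketch of the kind of species INI file download_all_files() expects, reconstructed only from the sections and keys the function actually reads; every value below is a placeholder:

from configparser import ConfigParser

# Placeholder values; only the sections/keys read by download_all_files are shown.
sample_species_ini = """
[species_info]
SPECIES_DOWNLOAD_FOLDER = download_files/Human

[GO]
DOWNLOAD = TRUE
GO_OBO_URL = http://example.org/go-basic.obo
ASSOC_FILE_URLS = http://example.org/goa_human.gaf.gz

[KEGG]
DOWNLOAD = TRUE
KEGG_ROOT_URL = http://example.org/kegg
DB_INFO_URL = /info/kegg
SETS_TO_DOWNLOAD = /link/hsa/pathway

[DO]
DOWNLOAD = TRUE
DO_OBO_URL = http://example.org/doid.obo
MIM2GENE_URL = http://example.org/mim2gene.txt
GENEMAP_URL = http://example.org/genemap2.txt?apiKey=<SecretKey>
"""

config = ConfigParser()
config.read_string(sample_species_ini)
assert config.getboolean("KEGG", "DOWNLOAD")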
Example #15
def preprocess_topological_polysemy_data(raw_data_dir: str, output_dir: str) -> None:
    """
    Preprocesses data for the topological polysemy paper [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to and extracted from).
    output_dir : str
        Output directory to save processed data.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius. (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    print("Processing TPS paper...")

    # Download data from SemEval-2010 task 14
    semeval_2010_14_data_url = (
        "https://www.cs.york.ac.uk/semeval2010_WSI/files/training_data.tar.gz"
    )
    semeval_2010_14_raw_data_filepath = join(
        raw_data_dir, "semeval_training_data.tar.gz"
    )
    semeval_2010_14_raw_data_dir = join(raw_data_dir, "semeval_training_data")
    semeval_2010_14_nouns_dir = join(
        semeval_2010_14_raw_data_dir, "training_data", "nouns"
    )
    semeval_2010_14_verbs_dir = join(
        semeval_2010_14_raw_data_dir, "training_data", "verbs"
    )
    semeval_2010_14_york_datasets_url = (
        "https://www.cs.york.ac.uk/semeval2010_WSI/datasets.html"
    )
    semeval_2010_14_word_senses_filepath = join(
        output_dir, "semeval_2010_14_word_senses.joblib"
    )
    semeval_2010_14_training_data_sentences_dir = join(
        output_dir, "semeval_2010_14_training_data"
    )

    if not isfile(semeval_2010_14_word_senses_filepath):

        # Scrape website for SemEval gold standard senses
        print("Downloading SemEval 2010 task 14 website...")
        semeval_2010_14_york_datasets_source = get_cached_download_text_file(
            semeval_2010_14_york_datasets_url,
            target_dir=raw_data_dir,
            filename="semeval_2010_14_york_datasets.html",
        )
        semeval_2010_14_york_datasets_soup = BeautifulSoup(
            semeval_2010_14_york_datasets_source, features="lxml"
        )
        semeval_2010_14_york_datasets_tables_soup = (
            semeval_2010_14_york_datasets_soup.find_all("tbody")
        )

        # Scrape tables for word/sense pairs
        semeval_2010_14_word_senses: dict = {"verbs": {}, "nouns": {}, "all": {}}
        for table in semeval_2010_14_york_datasets_tables_soup:
            table_rows = table.find_all("tr")[1:]
            for table_row in table_rows:
                table_cols = table_row.find_all("td")

                # Get word and its GS senses
                target_word = table_cols[0].get_text().strip()
                target_word_is_verb = target_word.endswith(".v")
                target_word = target_word.split(".")[0]
                target_word_senses = int(table_cols[3].get_text().strip())

                if target_word_is_verb:
                    semeval_2010_14_word_senses["verbs"][
                        target_word
                    ] = target_word_senses
                else:
                    semeval_2010_14_word_senses["nouns"][
                        target_word
                    ] = target_word_senses
        semeval_2010_14_word_senses["all"] = {
            **semeval_2010_14_word_senses["verbs"],
            **semeval_2010_14_word_senses["nouns"],
        }

        # Save result
        joblib.dump(semeval_2010_14_word_senses, semeval_2010_14_word_senses_filepath)

    if not isfile(semeval_2010_14_raw_data_filepath):
        print("Downloading training data from SemEval-2010 task 14...")
        download_from_url(semeval_2010_14_data_url, semeval_2010_14_raw_data_filepath)
        print("Done!")

    if not isdir(semeval_2010_14_raw_data_dir):
        print("Extracting raw training data from SemEval-2010 task 14...")
        with tarfile.open(semeval_2010_14_raw_data_filepath) as tar_file:
            tar_file.extractall(semeval_2010_14_raw_data_dir)
        print("Done!")

    if not isdir(semeval_2010_14_training_data_sentences_dir):
        makedirs(semeval_2010_14_training_data_sentences_dir)

        # Default to all CPUs
        num_output_files = cpu_count()

        # Prepare arguments for multiprocessing
        num_output_files_str_len = len(str(num_output_files))
        semeval_2010_14_dirs = [semeval_2010_14_nouns_dir, semeval_2010_14_verbs_dir]
        semeval_2010_14_dir_filepaths = [
            join(semeval_dir, fn)
            for semeval_dir in semeval_2010_14_dirs
            for fn in listdir(semeval_dir)
        ]
        num_xml_files_per_output_file = int(
            len(semeval_2010_14_dir_filepaths) // num_output_files
        )

        print("Processing SemEval-2010 task 14 training data for word2vec...")
        with Pool() as pool:
            for i, mp_args in zip(
                range(num_output_files),
                batch_list_gen(
                    semeval_2010_14_dir_filepaths, num_xml_files_per_output_file
                ),
            ):
                output_filename = f"semeval_2010_task_14-{str(i + 1).zfill(num_output_files_str_len)}.txt"
                output_filepath = join(
                    semeval_2010_14_training_data_sentences_dir, output_filename
                )
                print(f"Writing to {output_filename}...")
                with open(output_filepath, "w", encoding="utf8") as output_semeval_file:
                    for j, result in enumerate(
                        tqdm(
                            pool.imap_unordered(
                                preprocess_semeval_2010_task_14_training_xml_file,
                                mp_args,
                            ),
                            total=num_xml_files_per_output_file,
                        )
                    ):
                        if j > 0:
                            output_semeval_file.write("\n")
                        output_semeval_file.write(result)
        print("Done!")
Example #16
def get_documents(cache_dir):
    ALREADY_DOWNLOADED = []

    def get_documents_for(url, d_type):
        response = requests.get(url)
        items = xmltodict.parse(response.content)['rss']['channel']['item']
        link_with_filenames = []
        for item in items:
            title = item.get('title')
            link = item.get('link')
            filename = os.path.join(cache_dir,
                                    'pdf/{}/{}.pdf'.format(d_type, title))
            if not os.path.isfile(filename):
                link_with_filenames.append([link, filename])
            else:
                ALREADY_DOWNLOADED.append([link, filename])
        return link_with_filenames

    url_with_filenames = []
    logger.info('Retrieving file list...')
    for d_type, url in TYPES:
        documents = get_documents_for(url, d_type)
        url_with_filenames.extend(documents)
    file_meta_path = os.path.join(cache_dir, 'file_meta.json')
    os.makedirs(os.path.dirname(file_meta_path), exist_ok=True)
    with open(file_meta_path, 'w') as fp:
        json.dump(
            {
                '{}__{}'.format(
                    filename.split('/')[-2],
                    filename.split('/')[-1]): link
                for link, filename in url_with_filenames + ALREADY_DOWNLOADED
            }, fp)

    already_downloaded_num = len(ALREADY_DOWNLOADED)
    total_num_docs = len(url_with_filenames) + len(ALREADY_DOWNLOADED)
    async_download(url_with_filenames,
                   headers=HEADERS,
                   exception_handler=exception_handler)

    errored_num_docs = len(ERRORED_URLS)
    logger.warn(
        '{} Already Exists, {} Success, {} Error downloading docs. {}'.format(
            already_downloaded_num,
            total_num_docs - errored_num_docs,
            errored_num_docs,
            ' Retrying....' if errored_num_docs else '',
        ))

    # Retry for errored urls using requests
    for url, filename in ERRORED_URLS:
        try:
            logger.info('Retrying for url {}'.format(url))
            download_from_url(url, filename, HEADERS)
            errored_num_docs -= 1
            logger.info('Success for url {}'.format(url))
        except Exception as e:
            exception_handler(e, url=url, retry=True)

    logger.warn('Total docs: {}'.format(total_num_docs))
    logger.warn('Already existing docs: {}'.format(already_downloaded_num))
    logger.warn('Success downloads: {}'.format(len(url_with_filenames)))
    logger.warn('Error downloads: {}'.format(errored_num_docs))
    logger.warn('Download Complete')
Example #17
def preprocess_country_info(raw_data_dir: str, output_dir: str,
                            geonames_username: str) -> None:
    """
    Downloads and prepares a .csv file containing all countries of the
    world and their capitals.

    Data is fetched from geonames.org (https://www.geonames.org/)
    and is licensed under CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/).

    Parameters
    ----------
    raw_data_dir : str
        Raw data directory
    output_dir : str
        Directory to save output data.
    geonames_username : str
        GeoNames username
    """
    # Constants
    all_countries_combined_data_url = (
        "https://download.geonames.org/export/dump/allCountries.zip")
    all_countries_raw_data_zip_filepath = join(raw_data_dir,
                                               "allCountries.zip")
    all_countries_raw_data_txt_filepath = join(raw_data_dir,
                                               "allCountries.txt")
    country_info_csv_data_url = (
        f"https://secure.geonames.org/countryInfoCSV?username={geonames_username}"
    )
    country_info_raw_data_csv_filepath = join(raw_data_dir, "country-info.csv")
    output_filepath = join(output_dir, "country-info.csv")

    # Download raw data
    if not isfile(all_countries_raw_data_zip_filepath):
        print("Downloading raw country data...")
        download_from_url(all_countries_combined_data_url,
                          all_countries_raw_data_zip_filepath)
        print("Done!")

    if not isfile(all_countries_raw_data_txt_filepath):
        print("Extracting raw data...")
        with zipfile.ZipFile(all_countries_raw_data_zip_filepath,
                             "r") as zip_file:
            zip_file.extractall(raw_data_dir)
        print("Done!")

    if not isfile(country_info_raw_data_csv_filepath):
        print("Downloading country info data...")
        download_from_url(country_info_csv_data_url,
                          country_info_raw_data_csv_filepath)
        print("Done!")

    if not isfile(output_filepath):
        # Load raw data into Pandas DataFrames and join them
        all_countries_info_df = pd.read_csv(
            all_countries_raw_data_txt_filepath,
            sep="\t",
            na_filter=False,
            header=None,
            names=[
                "geonameId",
                "name",
                "asciiname",
                "alternatenames",
                "latitude",
                "longitude",
                "feature class",
                "feature code",
                "country code",
                "cc2",
                "admin1 code",
                "admin2 code",
                "admin3 code",
                "admin4 code",
                "population",
                "elevation",
                "dem",
                "timezone",
                "modification date",
            ],
            usecols=["geonameId", "latitude", "longitude"],
            index_col="geonameId",
        )
        country_info_df = pd.read_csv(
            country_info_raw_data_csv_filepath,
            sep="\t",
            na_filter=False,
            usecols=["name", "capital", "continent", "geonameId"],
        )
        country_info_df = country_info_df.join(all_countries_info_df,
                                               on="geonameId",
                                               how="left")

        # Remove unused GeoNameId column
        country_info_df.drop("geonameId", inplace=True, axis=1)

        # Replace continent codes with names
        continent_code_to_name = {
            "AF": "Africa",
            "AS": "Asia",
            "EU": "Europe",
            "NA": "North America",
            "OC": "Oceania",
            "SA": "South America",
            "AN": "Antarctica",
        }
        country_info_df["continent"] = country_info_df["continent"].apply(
            lambda code: continent_code_to_name[code])

        # Apply preprocessing to country name and capital
        country_info_df["name"] = country_info_df["name"].apply(
            preprocess_name)
        country_info_df["capital"] = country_info_df["capital"].apply(
            preprocess_name)

        # Save to file
        country_info_df.to_csv(output_filepath, index=False)
Example #18
def _maybe_download(inputpath):
    archive_path = os.path.join(inputpath, BOOKS_ARCHIVE)
    if not os.path.isfile(archive_path):
        download_from_url(BOOKS_DOWNLOAD_URL, archive_path)
Example #19
 def download(self):
     download_from_url(
         'https://deska.pardubickykraj.cz/desk_print.aspx', 'data/pardubicky_kraj.xml')
Example #20
def _maybe_download_wikipedia(inputpath):
    wiki_dump_url = f"https://dumps.wikimedia.org/nowiki/latest/{WIKI_DUMP_NAME}"
    archive_path = os.path.join(inputpath, WIKI_DUMP_NAME)
    if not os.path.isfile(archive_path):
        download_from_url(wiki_dump_url, archive_path)
Example #21
def preprocess_msr(raw_data_dir: str, output_dir: str) -> None:
    """
    Downloads and preprocess test data for evaluating a word2vec model
    on the Microsoft Research Syntactic Analogies Dataset (MSR) from
    Mikolov et al. (https://www.aclweb.org/anthology/N13-1090.pdf)

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    """
    print("Processing MSR...")

    # Initialize paths
    dataset_name = "msr"
    raw_data_url = "https://download.microsoft.com/download/A/B/4/AB4F476B-48A6-47CF-9716-5FF9D0D1F7EA/FeatureAugmentedRNNToolkit-v1.1.tgz"
    raw_data_zip_filepath = join(raw_data_dir, f"{dataset_name}.tgz")
    raw_data_extracted_zip_filepath = join(raw_data_dir, dataset_name)
    output_filepath = join(output_dir, f"{dataset_name}.joblib")

    # Download raw data if not present
    if not isfile(raw_data_zip_filepath):
        print(f"Downloading raw {dataset_name} data...")
        download_from_url(raw_data_url, raw_data_zip_filepath)
        print("Done!")

    # Extract raw data if not present
    if not isdir(raw_data_extracted_zip_filepath):
        print("Extracting raw data...")
        with tarfile.open(raw_data_zip_filepath) as tar_file:
            tar_file.extractall(raw_data_extracted_zip_filepath)
        print("Done!")

    # Read content from extracted zip, process them and combine into one test dataset.
    with open(
        join(
            raw_data_extracted_zip_filepath, "test_set", "word_relationship.questions"
        ),
        "r",
    ) as file:
        word_relationship_questions = [
            line.split(" ") for line in file.read().split("\n") if len(line) > 0
        ]
    with open(
        join(raw_data_extracted_zip_filepath, "test_set", "word_relationship.answers"),
        "r",
    ) as file:
        word_relationship_answers = [
            line.split(" ") for line in file.read().split("\n") if len(line) > 0
        ]

    # Combine lists
    print("Combining files...")
    word_relationship_questions_answers: dict = {
        "adjectives": [],
        "nouns": [],
        "verbs": [],
    }
    for i in tqdm(range(len(word_relationship_questions))):
        questions = word_relationship_questions[i]
        qa_label, answer = word_relationship_answers[i]

        # Convert from label to category
        qa_category = None
        if qa_label.startswith("J"):
            qa_category = "adjectives"
        elif qa_label.startswith("N"):
            qa_category = "nouns"
        elif qa_label.startswith("V"):
            qa_category = "verbs"

        # Append pair to category
        word_relationship_questions_answers[qa_category].append(questions + [answer])
    print("Done!")

    # Save list of analogies from MSR to file
    print("Saving to file...")
    joblib.dump(word_relationship_questions_answers, output_filepath)
    print("Done!")
Example #22
def load_and_preprocess_data(
    language: str,
    wiki_name: str,
    wiki_dump_time: str,
    raw_data_dir: str,
    output_dir: str,
    num_output_files: int,
    min_sent_word_count: int,
    max_wikipedia_files: int,
) -> None:
    """
    Loads and preprocesses Wikipedia dump data for training a word2vec model.

    Parameters
    ----------
    language : str
        Language of the wikipedia dump.
    wiki_name : str
        Name of the Wikipedia dump.
    wiki_dump_time : str
        Time of the wikipedia dump.
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to and extracted from).
    output_dir : str
        Output directory to save processed data.
    num_output_files : int
        Number of files to split the output into.
    min_sent_word_count : int
        Minimum sentence word count.
    max_wikipedia_files : int
        Maximum number of wikipedia files to process (-1 denotes all files).
    """
    # Ensure data directories exist
    makedirs(raw_data_dir, exist_ok=True)
    makedirs(output_dir, exist_ok=True)

    # Initialize paths
    dataset_name = f"{wiki_name}-{wiki_dump_time}"
    raw_data_url = (
        f"https://dumps.wikimedia.org/{wiki_name}/{wiki_dump_time}/"
        f"{dataset_name}-pages-articles-multistream.xml.bz2"
    )
    raw_data_bz2_filepath = join(raw_data_dir, f"{dataset_name}.xml.bz2")
    raw_data_bz2_extracted_dir = join(raw_data_dir, f"{dataset_name}_extracted")

    # Download raw data if not present
    if not isfile(raw_data_bz2_filepath):
        print(f"Downloading {wiki_name}-{wiki_dump_time} dump...")
        download_from_url(url=raw_data_url, destination_filepath=raw_data_bz2_filepath)
        print("Done!")

    # Extract raw data if not present
    if not isdir(raw_data_bz2_extracted_dir):
        print(f"Extracting articles from {wiki_name}-{wiki_dump_time} dump...")
        subprocess.run(
            [
                "python",
                "-m",
                "wikiextractor.WikiExtractor",
                "-cb",
                "250K",
                "--no-templates",
                "-o",
                raw_data_bz2_extracted_dir,
                raw_data_bz2_filepath,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )
        print("Done!")

    print("Combining and processing extracted files into text files...")
    wikiextractor_outputs_to_file(
        extracted_dir=raw_data_bz2_extracted_dir,
        language=language,
        dataset_name=dataset_name,
        output_dir=output_dir,
        num_output_files=num_output_files,
        max_num_files=max_wikipedia_files,
        min_sent_word_count=min_sent_word_count,
    )
    print("Done!")
Example #23
import zipfile

from utils import download_from_url

# =================================
# Script purpose:
# Download and unzip all raw files
# =================================

# Word frequency calculations from Beijing Language and Culture University
download_from_url(
    "http://bcc.blcu.edu.cn/downloads/resources/BCC_LEX_Zh.zip",
    "./data/raw/BCC_LEX_Zh.zip",
    overwrite=False,
)

# Word frequency calculations for blogs, converted to UTF-8
download_from_url(
    "https://www.plecoforums.com/download/blogs_wordfreq-release_utf-8-txt.2602/",
    "./data/raw/blogs_wordfreq-release_utf-8.txt",
    overwrite=False,
)

# CEDICT dictionary
download_from_url(
    "https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip",
    "./data/raw/cedict_1_0_ts_utf-8_mdbg.zip",
    overwrite=True,
)

# CJKVI character decompositions
Example #24
def _maybe_download_norsk_aviskorpus(inputpath):
    archive_path = os.path.join(inputpath, AVIS_CORSPUS_ARCHIVE)
    if not os.path.isfile(archive_path):
        download_from_url(AVIS_CORPUS_URL, archive_path)
Example #25
def preprocess_word_cluster_groups(raw_data_dir: str, output_dir: str,
                                   words_filepath: str) -> None:
    """
    Preprocesses word cluster groups

    Parameters
    ----------
    raw_data_dir : str
        Raw data directory
    output_dir : str
        Directory to save output data.
    words_filepath: str
        Filepath of words text file (vocabulary) from word2vec training output
    """
    # Load words from vocabulary
    with open(words_filepath, "r") as words_file:
        words = np.array(words_file.read().split("\n"))
    word_to_int = {word: i
                   for i, word in enumerate(words)
                   }  # Word integer lookup table

    # -- Numbers --
    numbers_list = []
    numbers_list.extend(list(range(100)))
    numbers_list.extend([100, 1000, 1000000, 1000000000, 1000000000000])
    numbers_textual_reprs = []
    for number in numbers_list:
        for num in preprocess_text(str(number)):
            if num != "and" and num not in numbers_textual_reprs:
                numbers_textual_reprs.append(num)
    number_words_in_vocab = [
        num_word for num_word in numbers_textual_reprs
        if num_word in word_to_int
    ]
    with open(join(output_dir, "numbers.txt"), "w") as words_output_file:
        for i, word in enumerate(number_words_in_vocab):
            if i > 0:
                words_output_file.write("\n")
            words_output_file.write(f"{word}")

    # -- Names --
    num_top_names = 1000
    forenames_data_url = "https://www.ssa.gov/oact/babynames/names.zip"
    forenames_raw_zip_filepath = join(raw_data_dir, "forenames.zip")
    forenames_raw_zip_dir = join(raw_data_dir, "forenames")
    forenames_year = 2019
    forenames_raw_filepath = join(forenames_raw_zip_dir,
                                  f"yob{forenames_year}.txt")
    forenames_output_filepath = join(output_dir, "forenames.csv")

    surnames_year = 2010
    surnames_data_url = (
        f"https://www2.census.gov/topics/genealogy/{surnames_year}surnames/names.zip"
    )
    surnames_raw_zip_filepath = join(raw_data_dir, "surnames.zip")
    surnames_raw_zip_dir = join(raw_data_dir, "surnames")
    surnames_raw_filepath = join(surnames_raw_zip_dir,
                                 f"Names_{surnames_year}Census.csv")
    surnames_output_filepath = join(output_dir, "surnames.csv")

    # Download raw data
    if not isfile(forenames_raw_zip_filepath):
        print("Downloading forenames data...")
        download_from_url(forenames_data_url, forenames_raw_zip_filepath)
        print("Done!")

    if not isdir(forenames_raw_zip_dir):
        print("Extracting raw forenames data...")
        with zipfile.ZipFile(forenames_raw_zip_filepath, "r") as zip_file:
            zip_file.extractall(forenames_raw_zip_dir)
        print("Done!")

    if not isfile(surnames_raw_zip_filepath):
        print("Downloading surnames data...")
        download_from_url(surnames_data_url, surnames_raw_zip_filepath)
        print("Done!")

    if not isdir(surnames_raw_zip_dir):
        print("Extracting raw surnames data...")
        with zipfile.ZipFile(surnames_raw_zip_filepath, "r") as zip_file:
            zip_file.extractall(surnames_raw_zip_dir)
        print("Done!")

    # Parse and save forenames/surnames
    word_in_vocab_filter: Callable[[str],
                                   bool] = lambda word: word in word_to_int
    if not isfile(forenames_output_filepath) or not isfile(
            surnames_output_filepath):
        forenames_raw_df = pd.read_csv(
            forenames_raw_filepath,
            header=None,
            names=["name", "gender", "count"],
        )
        forenames_raw_df["name"] = forenames_raw_df["name"].str.lower()
        forenames_raw_df = forenames_raw_df[forenames_raw_df["name"].apply(
            word_in_vocab_filter)]
        forenames_male_raw_df = forenames_raw_df[forenames_raw_df["gender"] ==
                                                 "M"]
        forenames_male_raw_df = forenames_male_raw_df[:num_top_names]
        forenames_female_raw_df = forenames_raw_df[forenames_raw_df["gender"]
                                                   == "F"]
        forenames_female_raw_df = forenames_female_raw_df[:num_top_names]
        forenames_raw_df = pd.concat(
            [forenames_male_raw_df, forenames_female_raw_df])
        forenames_raw_df.to_csv(forenames_output_filepath, index=False)

        surnames_raw_df = pd.read_csv(surnames_raw_filepath,
                                      usecols=["name", "count"])
        surnames_raw_df["name"] = surnames_raw_df["name"].str.lower()
        surnames_raw_df = surnames_raw_df[surnames_raw_df["name"].apply(
            lambda name: word_in_vocab_filter(
                name) and name not in forenames_raw_df["name"])]
        surnames_raw_df = surnames_raw_df[:num_top_names]
        surnames_raw_df.to_csv(surnames_output_filepath, index=False)

    # -- Foods --
    num_top_food_words = 250
    foods_output_filepath = join(output_dir, "foods.txt")
    foods_output_raw_filepath = join(raw_data_dir, "foods.csv")

    # Prepare food ingredient dataframe
    if not isfile(foods_output_raw_filepath):
        food_ingredient_list_csv_url = (
            "https://query.data.world/s/g6zcrqk6kbcks2kadrwdwjvnygbagk")
        food_ingredient_df = pd.read_csv(
            food_ingredient_list_csv_url,
            usecols=["name", "categories", "features.value"],
        )
        food_ingredient_df.rename(columns={"features.value": "ingredients"},
                                  inplace=True)
        food_ingredient_df = food_ingredient_df.astype({
            "name": str,
            "categories": str,
            "ingredients": str
        })

        # Preprocess food words and save to file
        preprocess_sent: Callable[[str], str] = lambda sent: " ".join(
            preprocess_text(
                sent, should_remove_stopwords=True, should_remove_digits=True))
        food_ingredient_df["name"] = food_ingredient_df["name"].apply(
            lambda name: preprocess_sent(name))
        food_ingredient_df["categories"] = food_ingredient_df[
            "categories"].apply(lambda name: preprocess_sent(name))
        food_ingredient_df["ingredients"] = food_ingredient_df[
            "ingredients"].apply(lambda name: preprocess_sent(name))
        food_ingredient_df.to_csv(foods_output_raw_filepath, index=False)

    # Combine food words into one text file sorted by word occurrence.
    if not isfile(foods_output_filepath):
        food_ingredient_df = pd.read_csv(foods_output_raw_filepath)
        food_word_occurrences_counter: Counter = Counter()

        # Count food word frequencies
        for col_name in ["name", "categories", "ingredients"]:
            for col_sent in food_ingredient_df[col_name].values:
                if isinstance(col_sent, str):
                    food_word_occurrences_counter.update(col_sent.split())

        # Only use top `num_top_food_words` which are in the vocabulary.
        most_common_food_words = [
            food_word
            for food_word, _ in food_word_occurrences_counter.most_common()
            if food_word in word_to_int and len(food_word) > 1
        ][:num_top_food_words]

        # Save food words to file
        with open(foods_output_filepath, "w") as foods_file:
            for i, food_word in enumerate(most_common_food_words):
                if i > 0:
                    foods_file.write("\n")
                foods_file.write(f"{food_word}")