def download_kegg_info_files(kegg_set_ids, species_ini_file):
    """
    This is a KEGG-specific function that downloads the files containing
    information about the KEGG sets, such as their title, abstract,
    supporting publications, etc.

    Arguments:
    kegg_set_ids -- List of KEGG set identifiers (e.g. hsa00010) for which
    info files will be downloaded.

    species_ini_file -- Path to the species INI config file. This is a
    string.

    Returns:
    Nothing, just downloads and saves files to the keggset_info folder,
    which will be SPECIES_DOWNLOAD_FOLDER + 'KEGG/keggset_info_folder'
    """
    species_file = SafeConfigParser()
    species_file.read(species_ini_file)

    sd_folder = species_file.get('species_info', 'SPECIES_DOWNLOAD_FOLDER')
    keggset_info_folder = os.path.join(sd_folder, KEGGSET_INFO_FOLDER)
    check_create_folder(keggset_info_folder)

    full_info_url = species_file.get('KEGG', 'KEGG_ROOT_URL') + \
        species_file.get('KEGG', 'SET_INFO_DIR')

    for kegg_id in kegg_set_ids:
        kegg_info_file = full_info_url + kegg_id
        download_from_url(kegg_info_file, keggset_info_folder)
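A hypothetical invocation of the function above, for illustration (the INI path and the second set ID are placeholders; hsa00010 comes from the docstring):

# Download KEGG info files for two glycolysis/TCA pathway sets, reading
# folder locations and URLs from a species config file.
download_kegg_info_files(
    kegg_set_ids=['hsa00010', 'hsa00020'],       # placeholder set IDs
    species_ini_file='config/species/human.ini',  # placeholder path
)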
def _get_or_download_model(self, download: bool) -> Optional[str]:
    """
    Return the downloaded model path. If the model path does not exist and
    `download` is True, download the model and return its path.

    Args:
        download: flag to decide whether to download the model in case it
            does not exist

    Returns:
        str: model path, or None
    """
    home_dir = home_directory()
    downloaded_models_dir = os.path.join(home_dir, MODELS_DIR)

    if not os.path.exists(downloaded_models_dir):
        os.makedirs(downloaded_models_dir)

    model_hashed_name = get_hashed_name(self.embedding + self.model)
    model_path = os.path.join(downloaded_models_dir, model_hashed_name)

    if not os.path.exists(model_path):
        if not download:
            return None

        model_download_path = model_path + '.' + self.embedding_cls.EMBEDDING_MODELS[
            self.model].format
        model_download_url = self.embedding_cls.EMBEDDING_MODELS[
            self.model].download_url

        print(f"Model does not exist, downloading model: {self.model}")
        download_from_url(model_download_url, model_download_path)
        extract_file(model_download_path, model_path)
        os.remove(model_download_path)
        print("Model downloaded successfully!")

    return model_path
def get_blacklist(self):
    source = BLACKLIST_SOURCE[self.ref_name]
    path = os.path.join(self.ref_path, 'regions', 'blacklist.bed')
    utils.download_from_url(source, path)
    # The hg19 blacklist.bed has a coordinate-overflow issue in the chrM row:
    # the entire chrM is blacklisted, but the annotation does not match the
    # contig length.
    regtools.clean_chr_name_file(path, self.config, self.ref_path)
    regtools.sort_and_uniq_bed(path, self.ref_path)
def get_DHS(self):
    """
    Download and clean dnase.bed. Only called when creating standard genomes.
    :return:
    """
    source = DHS_SOURCE[self.ref_name]
    path = os.path.join(self.ref_path, 'regions', 'dnase.bed')
    utils.download_from_url(source, path)
    regtools.clean_chr_name_file(path, self.config, self.ref_path)
    regtools.sort_and_uniq_bed(path, self.ref_path)
def upload_file():
    if request.method == 'POST':
        # Check if the POST request has the file part
        print(request.form)
        if 'text' in request.form:
            print(request.form["text"])
            url = request.form["text"]
            filename = download_from_url(url, app.config['UPLOAD_FOLDER'])
            return redirect(url_for('remove_output', filename=filename))
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # If the user does not select a file, the browser may also
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return redirect(url_for('remove_output', filename=filename))
    return '''
def __init__(self):
    base_path = os.path.dirname(os.path.abspath(__file__))
    jar_path = os.path.join(base_path, METEOR_JAR)
    gz_path = os.path.join(base_path, os.path.basename(METEOR_GZ_URL))
    if not os.path.isfile(jar_path):
        if not os.path.isfile(gz_path):
            download_from_url(METEOR_GZ_URL, gz_path)
        tar = tarfile.open(gz_path, "r")
        tar.extractall(path=base_path)
        tar.close()
        os.remove(gz_path)

    self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR,
                       '-', '-', '-stdio', '-l', 'en', '-norm']
    self.meteor_p = subprocess.Popen(self.meteor_cmd,
                                     cwd=base_path,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
    # Used to guarantee thread safety
    self.lock = threading.Lock()
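For context, the METEOR jar launched above is driven over stdin/stdout via its '-stdio' mode. A minimal, hypothetical scoring helper, assuming the conventional SCORE/EVAL line protocol of that mode (this method is not part of the original snippet):

def _score_sketch(self, hypothesis, references):
    # Hypothetical helper: send 'SCORE ||| ref1 ||| ... ||| hyp' to get the
    # sufficient statistics, then 'EVAL ||| <stats>' to get the final score.
    with self.lock:  # guard the shared subprocess pipes across threads
        score_line = "SCORE ||| {} ||| {}".format(
            " ||| ".join(references), hypothesis)
        self.meteor_p.stdin.write("{}\n".format(score_line).encode())
        self.meteor_p.stdin.flush()
        stats = self.meteor_p.stdout.readline().decode().strip()
        self.meteor_p.stdin.write("EVAL ||| {}\n".format(stats).encode())
        self.meteor_p.stdin.flush()
        return float(self.meteor_p.stdout.readline().decode().strip())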
def download_file(item, path, course, output_dir):
    filepath = create_filepath(course, path)
    description = item["Description"]["Html"]
    topic_type = item["TopicType"]
    title = item["Title"]

    if topic_type == 1:
        filename = create_filename(item)
        full_path = f"{output_dir}/{filepath}/{filename}"
        # These documents are real files that we want to download
        download_from_url(f"""{ufora}{item["Url"]}""", full_path)

        if item["Url"].endswith(".html"):
            # HTML files on Ufora need a little special treatment:
            # we'll prepend a title and a <base> tag and convert them to pdf
            with open(full_path, "r") as f:
                content = f.read()
            filename_without_extension = ".".join(filename.split(".")[:-1])
            description_path = f"{output_dir}/{filepath}/{filename_without_extension}.pdf"
            create_metadata(description_path, content, filename_without_extension)
            new_content = f"<base href={ufora}><h1>{title}</h1>{content}"
            with open(full_path, "w") as f:
                f.write(new_content)
        elif description:
            # Choosing this filename might cause an overlap...
            filename_without_extension = ".".join(filename.split(".")[:-1])
            description_path = f"{output_dir}/{filepath}/{filename_without_extension}.pdf"
            create_metadata(description_path, description, filename_without_extension)
    elif topic_type == 3:
        # These documents are just clickable links; we'll render them in a pdf
        url = item["Url"]
        filename = create_filename_without_extension(item)
        full_path = f"{output_dir}/{filepath}/{filename}"
        create_metadata(f"{full_path}.pdf",
                        f"<a href={url}>{url}</a>{description}",
                        item["Title"])
    else:
        print(f"Don't know this topic type: {topic_type}")
        exit()
def prepare_data(data_dir,
                 filename_gold,
                 filename_negative,
                 remote_url,
                 embeddings_filenames,
                 embeddings_dir,
                 n_docs=None,
                 n_queries=None,
                 add_precomputed=False):
    """
    filename_gold points to a SQuAD-format file.
    filename_negative points to a csv file where the first column is doc_id
    and the second is document text.
    If add_precomputed is True, this function will look in the embeddings
    files for precomputed embeddings to add to each Document.
    """
    logging.getLogger("farm").setLevel(logging.INFO)
    download_from_url(remote_url + filename_gold,
                      filepath=data_dir + filename_gold)
    download_from_url(remote_url + filename_negative,
                      filepath=data_dir + filename_negative)
    if add_precomputed:
        for embedding_filename in embeddings_filenames:
            download_from_url(
                remote_url + str(embeddings_dir) + embedding_filename,
                filepath=data_dir + str(embeddings_dir) + embedding_filename)
    logging.getLogger("farm").setLevel(logging.WARN)

    gold_docs, labels = eval_data_from_json(data_dir + filename_gold)

    # Reduce number of docs
    gold_docs = gold_docs[:n_docs]

    # Remove labels whose gold docs have been removed
    doc_ids = [x.id for x in gold_docs]
    labels = [x for x in labels if x.document_id in doc_ids]

    # Filter labels down to n_queries
    selected_queries = list(
        set(f"{x.document_id} | {x.question}" for x in labels))
    selected_queries = selected_queries[:n_queries]
    labels = [
        x for x in labels
        if f"{x.document_id} | {x.question}" in selected_queries
    ]

    n_neg_docs = max(0, n_docs - len(gold_docs))
    neg_docs = prepare_negative_passages(data_dir, filename_negative,
                                         n_neg_docs)
    docs = gold_docs + neg_docs

    if add_precomputed:
        docs = add_precomputed_embeddings(data_dir + embeddings_dir,
                                          embeddings_filenames, docs)

    return docs, labels
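A hypothetical call, for illustration (the URL and all filenames below are placeholders, not the benchmark's real locations):

docs, labels = prepare_data(
    data_dir="data/",
    filename_gold="gold_questions.json",        # placeholder SQuAD-format file
    filename_negative="negative_passages.tsv",  # placeholder doc_id/text file
    remote_url="https://example.com/datasets/",  # placeholder host
    embeddings_filenames=["passages_embeddings.pkl"],  # placeholder
    embeddings_dir="embeddings/",
    n_docs=1000,
    n_queries=100,
    add_precomputed=False,
)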
def preprocess_google_news(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees
        => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method.
        Higher scaling => higher precision.

    References
    ----------
    .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and
       Jeffrey Dean. Distributed Representations of Words and Phrases and
       their Compositionality (https://arxiv.org/pdf/1310.4546.pdf). In
       Proceedings of NIPS, 2013.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GoogleNews")
    makedirs(output_dir, exist_ok=True)

    # Define filepaths
    google_news_vectors_zip_raw_download_url = "https://filesender.uninett.no/download.php?token=b0aea55e-72a7-4ac0-9409-8d5dbb322505&files_ids=645861"
    google_news_vectors_zip_raw_filename = "GoogleNews-vectors-negative300.bin.gz"
    google_news_vectors_zip_raw_filepath = join(
        raw_data_dir, google_news_vectors_zip_raw_filename
    )
    google_news_vectors_bin_raw_filepath = join(
        raw_data_dir, "GoogleNews-vectors-negative300.bin"
    )
    google_news_words_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_words.txt"
    )
    google_news_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300.npy"
    )
    google_news_normalized_vectors_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_normalized.npy"
    )
    google_news_vectors_annoy_index_filepath = join(
        output_dir, "GoogleNews-vectors-negative300_annoy_index.ann"
    )
    google_news_vectors_scann_artifacts_dir = join(
        output_dir, "GoogleNews-vectors-negative300_scann_artifacts"
    )

    # -- GoogleNews-vectors-negative300.bin.gz --
    if not isfile(google_news_vectors_zip_raw_filepath):
        print(f"Downloading {google_news_vectors_zip_raw_filename}...")
        download_from_url(
            url=google_news_vectors_zip_raw_download_url,
            destination_filepath=google_news_vectors_zip_raw_filepath,
        )
        print("Done!")

    if not isfile(google_news_vectors_bin_raw_filepath):
        print(f"Extracting {google_news_vectors_zip_raw_filename}...")
        with gzip.GzipFile(google_news_vectors_zip_raw_filepath, "rb") as gzip_file_raw:
            with open(google_news_vectors_bin_raw_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from binary file and save result
    should_load_vectors = (
        not isfile(google_news_words_filepath)
        or not isfile(google_news_vectors_filepath)
        or not isfile(google_news_normalized_vectors_filepath)
    )
    if should_load_vectors:
        google_news_word_embeddings, google_news_words = load_word2vec_binary_format(
            word2vec_filepath=google_news_vectors_bin_raw_filepath,
            tqdm_enabled=True,
        )

        # Save words
        if not isfile(google_news_words_filepath):
            with open(google_news_words_filepath, "w") as file:
                for i, word in enumerate(google_news_words):
                    if i > 0:
                        file.write("\n")
                    file.write(word)

        # Save word embeddings
        if not isfile(google_news_vectors_filepath):
            np.save(google_news_vectors_filepath, google_news_word_embeddings)

    # Save normalized word embeddings
    google_news_word_embeddings_normalized = None
    if not isfile(google_news_normalized_vectors_filepath):
        google_news_word_embeddings_normalized = (
            google_news_word_embeddings
            / np.linalg.norm(google_news_word_embeddings, axis=1).reshape(-1, 1)
        )
        np.save(
            google_news_normalized_vectors_filepath,
            google_news_word_embeddings_normalized,
        )

    annoy_index_created = isfile(google_news_vectors_annoy_index_filepath)
    scann_instance_created = isdir(google_news_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if google_news_word_embeddings_normalized is None:
            google_news_word_embeddings_normalized = np.load(
                google_news_normalized_vectors_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=google_news_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(google_news_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=google_news_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(google_news_vectors_scann_artifacts_dir)
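The load_word2vec_binary_format helper used above is not shown in this section. A minimal sketch of what such a parser might look like, assuming the standard word2vec .bin layout (an ASCII 'vocab_size dim' header, then space-terminated words each followed by dim float32 values):

import numpy as np

def load_word2vec_binary_format_sketch(word2vec_filepath: str):
    # Hypothetical stand-in for the helper above; the real signature and
    # behavior (e.g. tqdm progress) may differ.
    words = []
    with open(word2vec_filepath, "rb") as f:
        vocab_size, dim = map(int, f.readline().split())
        vectors = np.zeros((vocab_size, dim), dtype=np.float32)
        for i in range(vocab_size):
            word_bytes = b""
            while True:
                ch = f.read(1)
                if ch == b" ":
                    break
                if ch != b"\n":  # skip newlines that precede some words
                    word_bytes += ch
            words.append(word_bytes.decode("utf-8", errors="replace"))
            vectors[i] = np.frombuffer(f.read(4 * dim), dtype=np.float32)
    return vectors, words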
def preprocess_fasttext_tps(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees
        => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method.
        Higher scaling => higher precision.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius. (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastTextTPS")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    env_config = dotenv_values(join("..", ".env"))
    tps_fasttext_model_filesender_token = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN"
    ]
    tps_fasttext_model_filesender_token_files_ids = env_config[
        "TPS_FASTTEXT_MODEL_FILESENDER_TOKEN_FILES_IDS"
    ]
    tps_fasttext_model_url = f"https://filesender.uninett.no/download.php?token={tps_fasttext_model_filesender_token}&files_ids={tps_fasttext_model_filesender_token_files_ids}"
    tps_fasttext_model_name = "fastText.TPS.300d"
    tps_fasttext_model_raw_filepath = join(
        raw_data_dir, f"{tps_fasttext_model_name}.bin"
    )
    tps_fasttext_model_words_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_words.txt"
    )
    tps_fasttext_model_vectors_filepath = join(
        output_dir, f"{tps_fasttext_model_name}.npy"
    )
    tps_fasttext_model_vectors_normalized_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_normalized.npy"
    )
    tps_fasttext_model_annoy_index_filepath = join(
        output_dir, f"{tps_fasttext_model_name}_annoy_index.ann"
    )
    tps_fasttext_model_scann_artifacts_dir = join(
        output_dir, f"{tps_fasttext_model_name}_scann_artifacts"
    )

    if not isfile(tps_fasttext_model_raw_filepath):
        print(f"Downloading {tps_fasttext_model_name}...")
        download_from_url(
            url=tps_fasttext_model_url,
            destination_filepath=tps_fasttext_model_raw_filepath,
        )
        print("Done!")

    # Load output from trained fastText model
    fasttext_model = fasttext.load_model(tps_fasttext_model_raw_filepath)
    fasttext_model_words = fasttext_model.words
    fasttext_model_embedding_weights = np.zeros(
        (len(fasttext_model_words), fasttext_model.get_dimension())
    )
    for i, word in enumerate(fasttext_model.words):
        fasttext_model_embedding_weights[i] = fasttext_model.get_word_vector(word)

    # Save words
    if not isfile(tps_fasttext_model_words_filepath):
        with open(tps_fasttext_model_words_filepath, "w") as file:
            for i, word in enumerate(fasttext_model.words):
                if i > 0:
                    file.write("\n")
                file.write(word)

    # Save word embeddings
    if not isfile(tps_fasttext_model_vectors_filepath):
        np.save(tps_fasttext_model_vectors_filepath, fasttext_model_embedding_weights)

    # Save normalized word embeddings
    fasttext_model_embedding_weights_normalized = None
    if not isfile(tps_fasttext_model_vectors_normalized_filepath):
        fasttext_model_embedding_weights_normalized = (
            fasttext_model_embedding_weights
            / np.linalg.norm(fasttext_model_embedding_weights, axis=1).reshape(-1, 1)
        )
        np.save(
            tps_fasttext_model_vectors_normalized_filepath,
            fasttext_model_embedding_weights_normalized,
        )

    annoy_index_created = isfile(tps_fasttext_model_annoy_index_filepath)
    scann_instance_created = isdir(tps_fasttext_model_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_model_embedding_weights_normalized is None:
            fasttext_model_embedding_weights_normalized = np.load(
                tps_fasttext_model_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_model_embedding_weights_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(tps_fasttext_model_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_model_embedding_weights_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(tps_fasttext_model_scann_artifacts_dir)
def preprocess_fasttext(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees
        => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method.
        Higher scaling => higher precision.

    References
    ----------
    .. [1] Grave, E., Bojanowski, P., Gupta, P., Joulin, A., & Mikolov, T.
       (2018). Learning Word Vectors for 157 Languages. In Proceedings of
       the International Conference on Language Resources and Evaluation
       (LREC 2018).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "fastText")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    fasttext_data_filename = "cc.en.300.vec"
    fasttext_vectors_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{fasttext_data_filename}.gz"
    fasttext_word_vectors_raw_gzip_filepath = join(
        raw_data_dir, f"{fasttext_data_filename}.gz"
    )
    fasttext_word_vectors_raw_txt_filepath = join(raw_data_dir, fasttext_data_filename)
    fasttext_word_vectors_words_filepath = join(
        output_dir, f"{fasttext_data_filename}_words.txt"
    )
    fasttext_word_vectors_filepath = join(output_dir, f"{fasttext_data_filename}.npy")
    fasttext_word_vectors_normalized_filepath = join(
        output_dir, f"{fasttext_data_filename}_normalized.npy"
    )
    fasttext_word_vectors_annoy_index_filepath = join(
        output_dir, f"{fasttext_data_filename}_annoy_index.ann"
    )
    fasttext_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{fasttext_data_filename}_scann_artifacts"
    )

    if not isfile(fasttext_word_vectors_raw_gzip_filepath):
        print(f"Downloading {fasttext_data_filename}...")
        download_from_url(
            url=fasttext_vectors_url,
            destination_filepath=fasttext_word_vectors_raw_gzip_filepath,
        )
        print("Done!")

    if not isfile(fasttext_word_vectors_raw_txt_filepath):
        print(f"Extracting {fasttext_data_filename}...")
        with gzip.GzipFile(
            fasttext_word_vectors_raw_gzip_filepath, "rb"
        ) as gzip_file_raw:
            with open(fasttext_word_vectors_raw_txt_filepath, "wb") as gzip_file_output:
                gzip_file_output.write(gzip_file_raw.read())
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(fasttext_word_vectors_words_filepath)
        or not isfile(fasttext_word_vectors_filepath)
        or not isfile(fasttext_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        fasttext_word_embeddings, fasttext_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=fasttext_word_vectors_raw_txt_filepath,
            first_line_header=True,
            tqdm_enabled=True,
        )

        # Save words
        if not isfile(fasttext_word_vectors_words_filepath):
            with open(fasttext_word_vectors_words_filepath, "w") as file:
                for i, word in enumerate(fasttext_words):
                    if i > 0:
                        file.write("\n")
                    file.write(word)

        # Save word embeddings
        if not isfile(fasttext_word_vectors_filepath):
            np.save(fasttext_word_vectors_filepath, fasttext_word_embeddings)

    # Save normalized word embeddings
    fasttext_word_embeddings_normalized = None
    if not isfile(fasttext_word_vectors_normalized_filepath):
        fasttext_word_embeddings_normalized = fasttext_word_embeddings / np.linalg.norm(
            fasttext_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            fasttext_word_vectors_normalized_filepath,
            fasttext_word_embeddings_normalized,
        )

    annoy_index_created = isfile(fasttext_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(fasttext_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if fasttext_word_embeddings_normalized is None:
            fasttext_word_embeddings_normalized = np.load(
                fasttext_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=fasttext_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(fasttext_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=fasttext_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(fasttext_word_vectors_scann_artifacts_dir)
def preprocess_glove(
    raw_data_dir: str,
    output_dir: str,
    annoy_index_n_trees: int,
    scann_num_leaves_scaling: int,
) -> None:
    """
    Downloads and preprocesses external word embeddings from [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    annoy_index_n_trees : int
        Number of trees to pass to Annoy's build method. More trees
        => higher precision.
    scann_num_leaves_scaling : int
        Number of leaves scaling to pass to ScaNN's build method.
        Higher scaling => higher precision.

    References
    ----------
    .. [1] Jeffrey Pennington, Richard Socher, & Christopher D. Manning
       (2014). GloVe: Global Vectors for Word Representation. In Empirical
       Methods in Natural Language Processing (EMNLP) (pp. 1532–1543).
    """
    # Ensure output directory exists
    output_dir = join(output_dir, "GloVe")
    makedirs(output_dir, exist_ok=True)

    # Define constants
    glove_data_filename = "glove.840B.300d"
    glove_word_vectors_url = f"http://nlp.stanford.edu/data/{glove_data_filename}.zip"
    glove_word_vectors_raw_zip_filepath = join(
        raw_data_dir, f"{glove_data_filename}.zip"
    )
    glove_word_vectors_raw_txt_filename = f"{glove_data_filename}.txt"
    glove_word_vectors_raw_txt_filepath = join(
        raw_data_dir, glove_word_vectors_raw_txt_filename
    )
    glove_word_vectors_words_filepath = join(
        output_dir, f"{glove_data_filename}_words.txt"
    )
    glove_word_vectors_filepath = join(output_dir, f"{glove_data_filename}.npy")
    glove_word_vectors_normalized_filepath = join(
        output_dir, f"{glove_data_filename}_normalized.npy"
    )
    glove_word_vectors_annoy_index_filepath = join(
        output_dir, f"{glove_data_filename}_annoy_index.ann"
    )
    glove_word_vectors_scann_artifacts_dir = join(
        output_dir, f"{glove_data_filename}_scann_artifacts"
    )

    if not isfile(glove_word_vectors_raw_zip_filepath):
        print(f"Downloading {glove_data_filename}...")
        download_from_url(
            url=glove_word_vectors_url,
            destination_filepath=glove_word_vectors_raw_zip_filepath,
        )
        print("Done!")

    if not isfile(glove_word_vectors_raw_txt_filepath):
        print(f"Extracting {glove_data_filename}...")
        with zipfile.ZipFile(glove_word_vectors_raw_zip_filepath, "r") as zip_ref:
            zip_ref.extractall(raw_data_dir)
        print("Done!")

    # Parse vectors from text file and save result
    should_load_vectors = (
        not isfile(glove_word_vectors_words_filepath)
        or not isfile(glove_word_vectors_filepath)
        or not isfile(glove_word_vectors_normalized_filepath)
    )
    if should_load_vectors:
        glove_word_embeddings, glove_words = load_word_embeddings_text_format(
            word_embeddings_text_filepath=glove_word_vectors_raw_txt_filepath,
            first_line_header=False,
            tqdm_enabled=True,
        )

        # Save words
        if not isfile(glove_word_vectors_words_filepath):
            with open(glove_word_vectors_words_filepath, "w") as file:
                for i, word in enumerate(glove_words):
                    if i > 0:
                        file.write("\n")
                    file.write(word)

        # Save word embeddings
        if not isfile(glove_word_vectors_filepath):
            np.save(glove_word_vectors_filepath, glove_word_embeddings)

    # Save normalized word embeddings
    glove_word_embeddings_normalized = None
    if not isfile(glove_word_vectors_normalized_filepath):
        glove_word_embeddings_normalized = glove_word_embeddings / np.linalg.norm(
            glove_word_embeddings, axis=1
        ).reshape(-1, 1)
        np.save(
            glove_word_vectors_normalized_filepath,
            glove_word_embeddings_normalized,
        )

    annoy_index_created = isfile(glove_word_vectors_annoy_index_filepath)
    scann_instance_created = isdir(glove_word_vectors_scann_artifacts_dir)
    if not annoy_index_created or not scann_instance_created:
        if glove_word_embeddings_normalized is None:
            glove_word_embeddings_normalized = np.load(
                glove_word_vectors_normalized_filepath
            )

        if not annoy_index_created:
            ann_index_annoy = ApproxNN(ann_alg="annoy")
            ann_index_annoy.build(
                data=glove_word_embeddings_normalized,
                annoy_n_trees=annoy_index_n_trees,
                distance_measure="euclidean",
            )
            ann_index_annoy.save(glove_word_vectors_annoy_index_filepath)

        if not scann_instance_created:
            ann_index_scann = ApproxNN(ann_alg="scann")
            ann_index_scann.build(
                data=glove_word_embeddings_normalized,
                scann_num_leaves_scaling=scann_num_leaves_scaling,
            )
            ann_index_scann.save(glove_word_vectors_scann_artifacts_dir)
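Both the fastText and GloVe functions above rely on a load_word_embeddings_text_format helper that is not shown here. A minimal sketch of such a parser, assuming each line holds a word followed by its space-separated vector components (with fastText files carrying a 'vocab_size dim' header line and GloVe files not):

import numpy as np

def load_word_embeddings_text_format_sketch(
    word_embeddings_text_filepath: str, first_line_header: bool
):
    # Hypothetical stand-in for the helper above; the real signature and
    # behavior (e.g. tqdm progress) may differ.
    words, vectors = [], []
    with open(word_embeddings_text_filepath, "r", encoding="utf-8") as f:
        if first_line_header:
            f.readline()  # skip the 'vocab_size dim' header line
        for line in f:
            parts = line.rstrip().split(" ")
            words.append(parts[0])
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return np.vstack(vectors), words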
def download(self):
    download_from_url(
        'http://www.kr-kralovehradecky.cz/xml/export/eldeska-zpravy.xml',
        'data/kralovehradecky_kraj.xml')
def download_all_files(species_ini_file, base_download_folder,
                       secrets_location=None):
    """
    Reads the config INI file for a species, which contains the files (and
    their locations, or URLs) that must be loaded for this species, and
    calls the download_from_url function for each of those files.

    Arguments:
    species_ini_file -- Path to the particular species INI file. This is
    a string.

    base_download_folder -- A string. Path of the root folder where download
    folders for other species will be created and where common downloaded
    files will be saved. This is stored in the main configuration INI file.

    secrets_location -- Optional string of location of the secrets INI file.

    Returns:
    Nothing, just downloads and saves files to download_folder.
    """
    check_create_folder(base_download_folder)

    species_file = SafeConfigParser()
    species_file.read(species_ini_file)

    sd_folder = species_file.get('species_info', 'SPECIES_DOWNLOAD_FOLDER')
    check_create_folder(sd_folder)

    if species_file.has_section('GO'):
        if species_file.getboolean('GO', 'DOWNLOAD'):
            obo_url = species_file.get('GO', 'GO_OBO_URL')
            download_from_url(obo_url, base_download_folder)

            go_dir = os.path.join(sd_folder, 'GO')
            check_create_folder(go_dir)

            goa_urls = species_file.get('GO', 'ASSOC_FILE_URLS')
            goa_urls = re.sub(r'\s', '', goa_urls).split(',')
            for goa_url in goa_urls:
                download_from_url(goa_url, go_dir)

    if species_file.has_section('KEGG'):
        if species_file.getboolean('KEGG', 'DOWNLOAD'):
            kegg_root_url = species_file.get('KEGG', 'KEGG_ROOT_URL')
            kegg_info_url = kegg_root_url + species_file.get(
                'KEGG', 'DB_INFO_URL')
            download_from_url(kegg_info_url, base_download_folder,
                              'kegg_db_info')

            kegg_dir = os.path.join(sd_folder, 'KEGG')
            check_create_folder(kegg_dir)

            ks_urls = species_file.get('KEGG', 'SETS_TO_DOWNLOAD')
            kegg_urls = [
                kegg_root_url + url.strip() for url in ks_urls.split(',')
            ]
            for kegg_url in kegg_urls:
                download_from_url(kegg_url, kegg_dir)

    if species_file.has_section('DO'):
        if species_file.getboolean('DO', 'DOWNLOAD'):
            do_dir = os.path.join(sd_folder, 'DO')
            check_create_folder(do_dir)

            obo_url = species_file.get('DO', 'DO_OBO_URL')
            download_from_url(obo_url, do_dir)

            mim2gene_url = species_file.get('DO', 'MIM2GENE_URL')
            download_from_url(mim2gene_url, do_dir)

            # The genemap file needs a special secret key, which must be
            # retrieved from the secrets file if the user wishes to download
            # the genemap file
            genemap_url = species_file.get('DO', 'GENEMAP_URL')

            if not secrets_location:
                logger.error('Secrets file was not passed to '
                             'download_all_files() function. A secrets file '
                             'containing an OMIM API secret key is required '
                             'to download the genemap file to process '
                             'Disease Ontology.')
                sys.exit(1)

            secrets_file = SafeConfigParser()
            secrets_file.read(secrets_location)

            if not secrets_file.has_section('OMIM API secrets'):
                logger.error('Secrets file has no "OMIM API secrets" '
                             'section, which is required to download the '
                             'genemap file to process Disease Ontology.')
                sys.exit(1)

            omim_secret_key = secrets_file.get('OMIM API secrets',
                                               'SECRET_KEY')
            genemap_url = genemap_url.replace('<SecretKey>', omim_secret_key)
            download_from_url(genemap_url, do_dir)
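For reference, a minimal species INI sketch showing only the sections and keys this function reads (all values below are placeholders, not real URLs or paths):

# [species_info]
# SPECIES_DOWNLOAD_FOLDER = /path/to/downloads/human
#
# [GO]
# DOWNLOAD = true
# GO_OBO_URL = http://example.org/go.obo
# ASSOC_FILE_URLS = http://example.org/goa_human.gaf.gz
#
# [KEGG]
# DOWNLOAD = true
# KEGG_ROOT_URL = http://example.org/kegg
# DB_INFO_URL = /info
# SETS_TO_DOWNLOAD = /sets/pathway
#
# [DO]
# DOWNLOAD = true
# DO_OBO_URL = http://example.org/doid.obo
# MIM2GENE_URL = http://example.org/mim2gene.txt
# GENEMAP_URL = http://example.org/genemap?apiKey=<SecretKey>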
def preprocess_topological_polysemy_data(raw_data_dir: str, output_dir: str) -> None:
    """
    Preprocesses data for the topological polysemy paper [1].

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to
        and extracted from).
    output_dir : str
        Output directory to save processed data.

    References
    ----------
    .. [1] Alexander Jakubowski, Milica Gašić, & Marcus Zibrowius. (2020).
       Topology of Word Embeddings: Singularities Reflect Polysemy.
    """
    print("Processing TPS paper...")

    # Download data from SemEval-2010 task 14
    semeval_2010_14_data_url = (
        "https://www.cs.york.ac.uk/semeval2010_WSI/files/training_data.tar.gz"
    )
    semeval_2010_14_raw_data_filepath = join(
        raw_data_dir, "semeval_training_data.tar.gz"
    )
    semeval_2010_14_raw_data_dir = join(raw_data_dir, "semeval_training_data")
    semeval_2010_14_nouns_dir = join(
        semeval_2010_14_raw_data_dir, "training_data", "nouns"
    )
    semeval_2010_14_verbs_dir = join(
        semeval_2010_14_raw_data_dir, "training_data", "verbs"
    )
    semeval_2010_14_york_datasets_url = (
        "https://www.cs.york.ac.uk/semeval2010_WSI/datasets.html"
    )
    semeval_2010_14_word_senses_filepath = join(
        output_dir, "semeval_2010_14_word_senses.joblib"
    )
    semeval_2010_14_training_data_sentences_dir = join(
        output_dir, "semeval_2010_14_training_data"
    )

    if not isfile(semeval_2010_14_word_senses_filepath):

        # Scrape website for SemEval gold standard senses
        print("Downloading SemEval 2010 task 14 website...")
        semeval_2010_14_york_datasets_source = get_cached_download_text_file(
            semeval_2010_14_york_datasets_url,
            target_dir=raw_data_dir,
            filename="semeval_2010_14_york_datasets.html",
        )
        semeval_2010_14_york_datasets_soup = BeautifulSoup(
            semeval_2010_14_york_datasets_source, features="lxml"
        )
        semeval_2010_14_york_datasets_tables_soup = (
            semeval_2010_14_york_datasets_soup.find_all("tbody")
        )

        # Scrape tables for word/sense pairs
        semeval_2010_14_word_senses: dict = {"verbs": {}, "nouns": {}, "all": {}}
        for table in semeval_2010_14_york_datasets_tables_soup:
            table_rows = table.find_all("tr")[1:]
            for table_row in table_rows:
                table_cols = table_row.find_all("td")

                # Get word and its GS senses
                target_word = table_cols[0].get_text().strip()
                target_word_is_verb = target_word.endswith(".v")
                target_word = target_word.split(".")[0]
                target_word_senses = int(table_cols[3].get_text().strip())

                if target_word_is_verb:
                    semeval_2010_14_word_senses["verbs"][
                        target_word
                    ] = target_word_senses
                else:
                    semeval_2010_14_word_senses["nouns"][
                        target_word
                    ] = target_word_senses
        semeval_2010_14_word_senses["all"] = {
            **semeval_2010_14_word_senses["verbs"],
            **semeval_2010_14_word_senses["nouns"],
        }

        # Save result
        joblib.dump(semeval_2010_14_word_senses, semeval_2010_14_word_senses_filepath)

    if not isfile(semeval_2010_14_raw_data_filepath):
        print("Downloading training data from SemEval-2010 task 14...")
        download_from_url(semeval_2010_14_data_url, semeval_2010_14_raw_data_filepath)
        print("Done!")

    if not isdir(semeval_2010_14_raw_data_dir):
        print("Extracting raw training data from SemEval-2010 task 14...")
        with tarfile.open(semeval_2010_14_raw_data_filepath) as tar_file:
            tar_file.extractall(semeval_2010_14_raw_data_dir)
        print("Done!")

    if not isdir(semeval_2010_14_training_data_sentences_dir):
        makedirs(semeval_2010_14_training_data_sentences_dir)

        # Default to all CPUs
        num_output_files = cpu_count()

        # Prepare arguments for multiprocessing
        num_output_files_str_len = len(str(num_output_files))
        semeval_2010_14_dirs = [semeval_2010_14_nouns_dir, semeval_2010_14_verbs_dir]
        semeval_2010_14_dir_filepaths = [
            join(semeval_dir, fn)
            for semeval_dir in semeval_2010_14_dirs
            for fn in listdir(semeval_dir)
        ]
        num_xml_files_per_output_file = int(
            len(semeval_2010_14_dir_filepaths) // num_output_files
        )

        print("Processing SemEval-2010 task 14 training data for word2vec...")
        with Pool() as pool:
            for i, mp_args in zip(
                range(num_output_files),
                batch_list_gen(
                    semeval_2010_14_dir_filepaths, num_xml_files_per_output_file
                ),
            ):
                output_filename = f"semeval_2010_task_14-{str(i + 1).zfill(num_output_files_str_len)}.txt"
                output_filepath = join(
                    semeval_2010_14_training_data_sentences_dir, output_filename
                )
                print(f"Writing to {output_filename}...")
                with open(output_filepath, "w", encoding="utf8") as output_semeval_file:
                    for j, result in enumerate(
                        tqdm(
                            pool.imap_unordered(
                                preprocess_semeval_2010_task_14_training_xml_file,
                                mp_args,
                            ),
                            total=num_xml_files_per_output_file,
                        )
                    ):
                        if j > 0:
                            output_semeval_file.write("\n")
                        output_semeval_file.write(result)
        print("Done!")
def get_documents(cache_dir):
    ALREADY_DOWNLOADED = []

    def get_documents_for(url, d_type):
        response = requests.get(url)
        items = xmltodict.parse(response.content)['rss']['channel']['item']
        link_with_filenames = []
        for item in items:
            title = item.get('title')
            link = item.get('link')
            filename = os.path.join(cache_dir,
                                    'pdf/{}/{}.pdf'.format(d_type, title))
            if not os.path.isfile(filename):
                link_with_filenames.append([link, filename])
            else:
                ALREADY_DOWNLOADED.append([link, filename])
        return link_with_filenames

    url_with_filenames = []
    logger.info('Retrieving file list...')
    for d_type, url in TYPES:
        documents = get_documents_for(url, d_type)
        url_with_filenames.extend(documents)

    file_meta_path = os.path.join(cache_dir, 'file_meta.json')
    os.makedirs(os.path.dirname(file_meta_path), exist_ok=True)
    with open(file_meta_path, 'w') as fp:
        json.dump(
            {
                '{}__{}'.format(
                    filename.split('/')[-2],
                    filename.split('/')[-1]): link
                for link, filename in url_with_filenames + ALREADY_DOWNLOADED
            }, fp)

    already_downloaded_num = len(ALREADY_DOWNLOADED)
    total_num_docs = len(url_with_filenames) + len(ALREADY_DOWNLOADED)

    async_download(url_with_filenames,
                   headers=HEADERS,
                   exception_handler=exception_handler)

    errored_num_docs = len(ERRORED_URLS)
    logger.warn(
        '{} already exist, {} succeeded, {} errored while downloading docs.{}'.format(
            already_downloaded_num,
            total_num_docs - errored_num_docs,
            errored_num_docs,
            ' Retrying...' if errored_num_docs else '',
        ))

    # Retry errored urls using requests
    for url, filename in ERRORED_URLS:
        try:
            logger.info('Retrying for url {}'.format(url))
            download_from_url(url, filename, HEADERS)
            errored_num_docs -= 1
            logger.info('Success for url {}'.format(url))
        except Exception as e:
            exception_handler(e, url=url, retry=True)

    logger.warn('Total docs: {}'.format(total_num_docs))
    logger.warn('Already existing docs: {}'.format(already_downloaded_num))
    logger.warn('Success downloads: {}'.format(len(url_with_filenames)))
    logger.warn('Error downloads: {}'.format(errored_num_docs))
    logger.warn('Download Complete')
def preprocess_country_info(raw_data_dir: str, output_dir: str,
                            geonames_username: str) -> None:
    """
    Downloads and prepares a .csv file containing all countries of the
    world and their capitals. Data is fetched from GeoNames
    (https://www.geonames.org/) and is licensed under CC BY 4.0
    (https://creativecommons.org/licenses/by/4.0/).

    Parameters
    ----------
    raw_data_dir : str
        Raw data directory
    output_dir : str
        Directory to save output data.
    geonames_username : str
        GeoNames username
    """
    # Constants
    all_countries_combined_data_url = (
        "https://download.geonames.org/export/dump/allCountries.zip")
    all_countries_raw_data_zip_filepath = join(raw_data_dir, "allCountries.zip")
    all_countries_raw_data_txt_filepath = join(raw_data_dir, "allCountries.txt")
    country_info_csv_data_url = (
        f"https://secure.geonames.org/countryInfoCSV?username={geonames_username}"
    )
    country_info_raw_data_csv_filepath = join(raw_data_dir, "country-info.csv")
    output_filepath = join(output_dir, "country-info.csv")

    # Download raw data
    if not isfile(all_countries_raw_data_zip_filepath):
        print("Downloading raw country data...")
        download_from_url(all_countries_combined_data_url,
                          all_countries_raw_data_zip_filepath)
        print("Done!")

    if not isfile(all_countries_raw_data_txt_filepath):
        print("Extracting raw data...")
        with zipfile.ZipFile(all_countries_raw_data_zip_filepath, "r") as zip_file:
            zip_file.extractall(raw_data_dir)
        print("Done!")

    if not isfile(country_info_raw_data_csv_filepath):
        print("Downloading country info data...")
        download_from_url(country_info_csv_data_url,
                          country_info_raw_data_csv_filepath)
        print("Done!")

    if not isfile(output_filepath):

        # Load raw data into Pandas DataFrames and join them
        all_countries_info_df = pd.read_csv(
            all_countries_raw_data_txt_filepath,
            sep="\t",
            na_filter=False,
            header=None,
            names=[
                "geonameId",
                "name",
                "asciiname",
                "alternatenames",
                "latitude",
                "longitude",
                "feature class",
                "feature code",
                "country code",
                "cc2",
                "admin1 code",
                "admin2 code",
                "admin3 code",
                "admin4 code",
                "population",
                "elevation",
                "dem",
                "timezone",
                "modification date",
            ],
            usecols=["geonameId", "latitude", "longitude"],
            index_col="geonameId",
        )
        country_info_df = pd.read_csv(
            country_info_raw_data_csv_filepath,
            sep="\t",
            na_filter=False,
            usecols=["name", "capital", "continent", "geonameId"],
        )
        country_info_df = country_info_df.join(all_countries_info_df,
                                               on="geonameId",
                                               how="left")

        # Remove unused geonameId column
        country_info_df.drop("geonameId", inplace=True, axis=1)

        # Replace continent codes with names
        continent_code_to_name = {
            "AF": "Africa",
            "AS": "Asia",
            "EU": "Europe",
            "NA": "North America",
            "OC": "Oceania",
            "SA": "South America",
            "AN": "Antarctica",
        }
        country_info_df["continent"] = country_info_df["continent"].apply(
            lambda code: continent_code_to_name[code])

        # Apply preprocessing to country name and capital
        country_info_df["name"] = country_info_df["name"].apply(preprocess_name)
        country_info_df["capital"] = country_info_df["capital"].apply(
            preprocess_name)

        # Save to file
        country_info_df.to_csv(output_filepath, index=False)
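A hypothetical invocation, for illustration (the username below is a placeholder; GeoNames requires a registered account for its web services):

preprocess_country_info(
    raw_data_dir="data/raw",                      # placeholder directory
    output_dir="data/processed",                  # placeholder directory
    geonames_username="your_geonames_username",   # placeholder username
)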
def _maybe_download(inputpath):
    archive_path = os.path.join(inputpath, BOOKS_ARCHIVE)
    if not os.path.isfile(archive_path):
        download_from_url(BOOKS_DOWNLOAD_URL, archive_path)
def download(self):
    download_from_url(
        'https://deska.pardubickykraj.cz/desk_print.aspx',
        'data/pardubicky_kraj.xml')
def _maybe_download_wikipedia(inputpath):
    wiki_dump_url = f"https://dumps.wikimedia.org/nowiki/latest/{WIKI_DUMP_NAME}"
    archive_path = os.path.join(inputpath, WIKI_DUMP_NAME)
    if not os.path.isfile(archive_path):
        download_from_url(wiki_dump_url, archive_path)
def preprocess_msr(raw_data_dir: str, output_dir: str) -> None:
    """
    Downloads and preprocesses test data for evaluating a word2vec model on
    the Microsoft Research Syntactic Analogies Dataset (MSR) from Mikolov
    et al. (https://www.aclweb.org/anthology/N13-1090.pdf)

    Parameters
    ----------
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to).
    output_dir : str
        Output directory to save processed data.
    """
    print("Processing MSR...")

    # Initialize paths
    dataset_name = "msr"
    raw_data_url = "https://download.microsoft.com/download/A/B/4/AB4F476B-48A6-47CF-9716-5FF9D0D1F7EA/FeatureAugmentedRNNToolkit-v1.1.tgz"
    raw_data_zip_filepath = join(raw_data_dir, f"{dataset_name}.tgz")
    raw_data_extracted_zip_filepath = join(raw_data_dir, dataset_name)
    output_filepath = join(output_dir, f"{dataset_name}.joblib")

    # Download raw data if not present
    if not isfile(raw_data_zip_filepath):
        print(f"Downloading raw {dataset_name} data...")
        download_from_url(raw_data_url, raw_data_zip_filepath)
        print("Done!")

    # Extract raw data if not present
    if not isdir(raw_data_extracted_zip_filepath):
        print("Extracting raw data...")
        with tarfile.open(raw_data_zip_filepath) as tar_file:
            tar_file.extractall(raw_data_extracted_zip_filepath)
        print("Done!")

    # Read content from the extracted archive, process it and combine into
    # one test dataset.
    with open(
        join(
            raw_data_extracted_zip_filepath, "test_set", "word_relationship.questions"
        ),
        "r",
    ) as file:
        word_relationship_questions = [
            line.split(" ") for line in file.read().split("\n") if len(line) > 0
        ]
    with open(
        join(raw_data_extracted_zip_filepath, "test_set", "word_relationship.answers"),
        "r",
    ) as file:
        word_relationship_answers = [
            line.split(" ") for line in file.read().split("\n") if len(line) > 0
        ]

    # Combine lists
    print("Combining files...")
    word_relationship_questions_answers: dict = {
        "adjectives": [],
        "nouns": [],
        "verbs": [],
    }
    for i in tqdm(range(len(word_relationship_questions))):
        questions = word_relationship_questions[i]
        qa_label, answer = word_relationship_answers[i]

        # Convert from label to category
        qa_category = None
        if qa_label.startswith("J"):
            qa_category = "adjectives"
        elif qa_label.startswith("N"):
            qa_category = "nouns"
        elif qa_label.startswith("V"):
            qa_category = "verbs"

        # Append pair to category
        word_relationship_questions_answers[qa_category].append(questions + [answer])
    print("Done!")

    # Save list of analogies from MSR to file
    print("Saving to file...")
    joblib.dump(word_relationship_questions_answers, output_filepath)
    print("Done!")
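A hypothetical consumer of the processed MSR file, for illustration (the path is a placeholder; the dict structure follows from the code above, with each category holding [a, b, c, d] analogy rows):

import joblib

msr_analogies = joblib.load("data/processed/msr.joblib")  # placeholder path
for category in ("adjectives", "nouns", "verbs"):
    a, b, c, d = msr_analogies[category][0]
    print(f"{category}: {a} : {b} :: {c} : {d}")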
def load_and_preprocess_data(
    language: str,
    wiki_name: str,
    wiki_dump_time: str,
    raw_data_dir: str,
    output_dir: str,
    num_output_files: int,
    min_sent_word_count: int,
    max_wikipedia_files: int,
) -> None:
    """
    Loads and preprocesses Wikipedia data for training a word2vec model.

    Parameters
    ----------
    language : str
        Language of the Wikipedia dump.
    wiki_name : str
        Name of the Wikipedia dump.
    wiki_dump_time : str
        Time of the Wikipedia dump.
    raw_data_dir : str
        Path to the raw data directory (where files will be downloaded to
        and extracted from).
    output_dir : str
        Output directory to save processed data.
    num_output_files : int
        Number of files to split the output into.
    min_sent_word_count : int
        Minimum sentence word count.
    max_wikipedia_files : int
        Maximum number of Wikipedia files to process (-1 denotes all files).
    """
    # Ensure data directories exist
    makedirs(raw_data_dir, exist_ok=True)
    makedirs(output_dir, exist_ok=True)

    # Initialize paths
    dataset_name = f"{wiki_name}-{wiki_dump_time}"
    raw_data_url = (
        f"https://dumps.wikimedia.org/{wiki_name}/{wiki_dump_time}/"
        f"{dataset_name}-pages-articles-multistream.xml.bz2"
    )
    raw_data_bz2_filepath = join(raw_data_dir, f"{dataset_name}.xml.bz2")
    raw_data_bz2_extracted_dir = join(raw_data_dir, f"{dataset_name}_extracted")

    # Download raw data if not present
    if not isfile(raw_data_bz2_filepath):
        print(f"Downloading {wiki_name}-{wiki_dump_time} dump...")
        download_from_url(url=raw_data_url, destination_filepath=raw_data_bz2_filepath)
        print("Done!")

    # Extract raw data if not present
    if not isdir(raw_data_bz2_extracted_dir):
        print(f"Extracting articles from {wiki_name}-{wiki_dump_time} dump...")
        subprocess.run(
            [
                "python",
                "-m",
                "wikiextractor.WikiExtractor",
                "-cb",
                "250K",
                "--no-templates",
                "-o",
                raw_data_bz2_extracted_dir,
                raw_data_bz2_filepath,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
        )
        print("Done!")

    print("Combining and processing extracted files into text files...")
    wikiextractor_outputs_to_file(
        extracted_dir=raw_data_bz2_extracted_dir,
        language=language,
        dataset_name=dataset_name,
        output_dir=output_dir,
        num_output_files=num_output_files,
        max_num_files=max_wikipedia_files,
        min_sent_word_count=min_sent_word_count,
    )
    print("Done!")
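A hypothetical invocation, for illustration (argument values are placeholders; "nowiki" follows the Norwegian-Wikipedia naming used elsewhere in this section):

load_and_preprocess_data(
    language="no",
    wiki_name="nowiki",
    wiki_dump_time="20210101",     # placeholder dump date
    raw_data_dir="data/raw",       # placeholder directory
    output_dir="data/processed",   # placeholder directory
    num_output_files=8,
    min_sent_word_count=5,
    max_wikipedia_files=-1,        # -1 processes all extracted files
)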
import zipfile

from utils import download_from_url

# =================================
# Script purpose:
# Download and unzip all raw files
# =================================

# Word frequency calculations from Beijing Language and Culture University
download_from_url(
    "http://bcc.blcu.edu.cn/downloads/resources/BCC_LEX_Zh.zip",
    "./data/raw/BCC_LEX_Zh.zip",
    overwrite=False,
)

# Word frequency calculations for blogs, converted to UTF-8
download_from_url(
    "https://www.plecoforums.com/download/blogs_wordfreq-release_utf-8-txt.2602/",
    "./data/raw/blogs_wordfreq-release_utf-8.txt",
    overwrite=False,
)

# CEDICT dictionary
download_from_url(
    "https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip",
    "./data/raw/cedict_1_0_ts_utf-8_mdbg.zip",
    overwrite=True,
)

# CJKVI character decompositions
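The download_from_url helper imported above is used throughout this section with slightly varying signatures. A minimal sketch of what a version matching this script's calls might look like (this is a hypothetical stand-in, not the project's actual implementation):

import os
import requests

def download_from_url_sketch(url: str, destination_filepath: str,
                             overwrite: bool = False) -> None:
    # Skip the download when the file already exists and overwrite=False.
    if os.path.isfile(destination_filepath) and not overwrite:
        return
    os.makedirs(os.path.dirname(destination_filepath), exist_ok=True)
    # Stream the response to disk to avoid holding large files in memory.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(destination_filepath, "wb") as output_file:
        for chunk in response.iter_content(chunk_size=1 << 20):
            output_file.write(chunk)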
def _maybe_download_norsk_aviskorpus(inputpath):
    archive_path = os.path.join(inputpath, AVIS_CORSPUS_ARCHIVE)
    if not os.path.isfile(archive_path):
        download_from_url(AVIS_CORPUS_URL, archive_path)
def preprocess_word_cluster_groups(raw_data_dir: str, output_dir: str,
                                   words_filepath: str) -> None:
    """
    Preprocesses word cluster groups.

    Parameters
    ----------
    raw_data_dir : str
        Raw data directory
    output_dir : str
        Directory to save output data.
    words_filepath : str
        Filepath of words text file (vocabulary) from word2vec training output
    """
    # Load words from vocabulary
    with open(words_filepath, "r") as words_file:
        words = np.array(words_file.read().split("\n"))
    word_to_int = {word: i
                   for i, word in enumerate(words)}  # Word integer lookup table

    # -- Numbers --
    numbers_list = []
    numbers_list.extend(list(range(100)))
    numbers_list.extend([100, 1000, 1000000, 1000000000, 1000000000000])
    numbers_textual_reprs = []
    for number in numbers_list:
        for num in preprocess_text(str(number)):
            if num != "and" and num not in numbers_textual_reprs:
                numbers_textual_reprs.append(num)
    number_words_in_vocab = [
        num_word for num_word in numbers_textual_reprs
        if num_word in word_to_int
    ]
    with open(join(output_dir, "numbers.txt"), "w") as words_output_file:
        for i, word in enumerate(number_words_in_vocab):
            if i > 0:
                words_output_file.write("\n")
            words_output_file.write(f"{word}")

    # -- Names --
    num_top_names = 1000
    forenames_data_url = "https://www.ssa.gov/oact/babynames/names.zip"
    forenames_raw_zip_filepath = join(raw_data_dir, "forenames.zip")
    forenames_raw_zip_dir = join(raw_data_dir, "forenames")
    forenames_year = 2019
    forenames_raw_filepath = join(forenames_raw_zip_dir,
                                  f"yob{forenames_year}.txt")
    forenames_output_filepath = join(output_dir, "forenames.csv")

    surnames_year = 2010
    surnames_data_url = (
        f"https://www2.census.gov/topics/genealogy/{surnames_year}surnames/names.zip"
    )
    surnames_raw_zip_filepath = join(raw_data_dir, "surnames.zip")
    surnames_raw_zip_dir = join(raw_data_dir, "surnames")
    surnames_raw_filepath = join(surnames_raw_zip_dir,
                                 f"Names_{surnames_year}Census.csv")
    surnames_output_filepath = join(output_dir, "surnames.csv")

    # Download raw data
    if not isfile(forenames_raw_zip_filepath):
        print("Downloading forenames data...")
        download_from_url(forenames_data_url, forenames_raw_zip_filepath)
        print("Done!")
    if not isdir(forenames_raw_zip_dir):
        print("Extracting raw forenames data...")
        with zipfile.ZipFile(forenames_raw_zip_filepath, "r") as zip_file:
            zip_file.extractall(forenames_raw_zip_dir)
        print("Done!")
    if not isfile(surnames_raw_zip_filepath):
        print("Downloading surnames data...")
        download_from_url(surnames_data_url, surnames_raw_zip_filepath)
        print("Done!")
    if not isdir(surnames_raw_zip_dir):
        print("Extracting raw surnames data...")
        with zipfile.ZipFile(surnames_raw_zip_filepath, "r") as zip_file:
            zip_file.extractall(surnames_raw_zip_dir)
        print("Done!")

    # Parse and save forenames/surnames
    word_in_vocab_filter: Callable[[str], bool] = lambda word: word in word_to_int
    if not isfile(forenames_output_filepath) or not isfile(
            surnames_output_filepath):
        forenames_raw_df = pd.read_csv(
            forenames_raw_filepath,
            header=None,
            names=["name", "gender", "count"],
        )
        forenames_raw_df["name"] = forenames_raw_df["name"].str.lower()
        forenames_raw_df = forenames_raw_df[forenames_raw_df["name"].apply(
            word_in_vocab_filter)]
        forenames_male_raw_df = forenames_raw_df[forenames_raw_df["gender"] == "M"]
        forenames_male_raw_df = forenames_male_raw_df[:num_top_names]
        forenames_female_raw_df = forenames_raw_df[forenames_raw_df["gender"] == "F"]
        forenames_female_raw_df = forenames_female_raw_df[:num_top_names]
        forenames_raw_df = pd.concat(
            [forenames_male_raw_df, forenames_female_raw_df])
        forenames_raw_df.to_csv(forenames_output_filepath, index=False)

        surnames_raw_df = pd.read_csv(surnames_raw_filepath,
                                      usecols=["name", "count"])
        surnames_raw_df["name"] = surnames_raw_df["name"].str.lower()
        surnames_raw_df = surnames_raw_df[surnames_raw_df["name"].apply(
            lambda name: word_in_vocab_filter(
                name) and name not in forenames_raw_df["name"])]
        surnames_raw_df = surnames_raw_df[:num_top_names]
        surnames_raw_df.to_csv(surnames_output_filepath, index=False)

    # -- Foods --
    num_top_food_words = 250
    foods_output_filepath = join(output_dir, "foods.txt")
    foods_output_raw_filepath = join(raw_data_dir, "foods.csv")

    # Prepare food ingredient dataframe
    if not isfile(foods_output_raw_filepath):
        food_ingredient_list_csv_url = (
            "https://query.data.world/s/g6zcrqk6kbcks2kadrwdwjvnygbagk")
        food_ingredient_df = pd.read_csv(
            food_ingredient_list_csv_url,
            usecols=["name", "categories", "features.value"],
        )
        food_ingredient_df.rename(columns={"features.value": "ingredients"},
                                  inplace=True)
        food_ingredient_df = food_ingredient_df.astype({
            "name": str,
            "categories": str,
            "ingredients": str
        })

        # Preprocess food words and save to file
        preprocess_sent: Callable[[str], str] = lambda sent: " ".join(
            preprocess_text(
                sent, should_remove_stopwords=True, should_remove_digits=True))
        food_ingredient_df["name"] = food_ingredient_df["name"].apply(
            lambda name: preprocess_sent(name))
        food_ingredient_df["categories"] = food_ingredient_df[
            "categories"].apply(lambda name: preprocess_sent(name))
        food_ingredient_df["ingredients"] = food_ingredient_df[
            "ingredients"].apply(lambda name: preprocess_sent(name))
        food_ingredient_df.to_csv(foods_output_raw_filepath, index=False)

    # Combine food words into one text file, sorted by word occurrence.
    if not isfile(foods_output_filepath):
        food_ingredient_df = pd.read_csv(foods_output_raw_filepath)
        food_word_occurrences_counter: Counter = Counter()

        # Count food word frequencies
        for col_name in ["name", "categories", "ingredients"]:
            for col_sent in food_ingredient_df[col_name].values:
                if isinstance(col_sent, str):
                    food_word_occurrences_counter.update(col_sent.split())

        # Only use top `num_top_food_words` which are in the vocabulary.
        most_common_food_words = [
            food_word
            for food_word, _ in food_word_occurrences_counter.most_common()
            if food_word in word_to_int and len(food_word) > 1
        ][:num_top_food_words]

        # Save food words to file
        with open(foods_output_filepath, "w") as foods_file:
            for i, food_word in enumerate(most_common_food_words):
                if i > 0:
                    foods_file.write("\n")
                foods_file.write(f"{food_word}")