def extract_from_config(config, params=None):
    """This function extracts some useful information from the `config`
    and `params` dictionaries

    Arguments:
        config {dict} -- a configuration dictionary loaded from `config.json`
        params {dict} -- the feature engineering part of the configuration

    Returns:
        train_root {string} -- the string path to the train data
        genres {list} -- the list of genre names
        n_train_per_genre {int} -- the number of training examples per genre
        n_pieces {int} -- the number of pieces per track
        n_pieces_per_genre {int} -- the total number of train pieces per genre
        sampling_rate {int} -- the sampling rate for reading the audio data
    """
    data_root = git_root("data", "sample_data")
    train_root = os.path.join(data_root, "train")

    genres = config["genres"]
    total_n_genres = len(genres)

    n_train = config["train_percent"] * config["sample_data_per_genre"] // 10
    n_train_per_genre = n_train // total_n_genres

    n_pieces = params["divide"]["number_pieces"] if params else None
    n_train_pieces = n_train * n_pieces if params else None
    n_pieces_per_genre = n_train_pieces // total_n_genres if params else None
    sampling_rate = params["sampling_rate"] if params else None

    return (train_root, genres, n_train_per_genre, n_pieces,
            n_pieces_per_genre, sampling_rate)
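# A minimal usage sketch for extract_from_config, assuming the load_config()
# and load_params() helpers defined elsewhere in this codebase are in scope.
config = load_config()
params = load_params()
(train_root, genres, n_train_per_genre, n_pieces,
 n_pieces_per_genre, sampling_rate) = extract_from_config(config, params)
print(f"{len(genres)} genres, {n_train_per_genre} train tracks per genre")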
def data_to_json():
    """Collect every code-jam file into a single labeled JSON file."""
    data = []
    DATA_SOURCE = "code-jam"
    data_path = git_root("data", DATA_SOURCE, "files", "*")
    JSON_OUTPUT = git_root("data", DATA_SOURCE, "code-jam_data.json")
    LABEL = 0

    for filename in tqdm(glob.iglob(data_path)):
        print_filename = os.path.basename(filename)
        # replace undecodable bytes instead of failing on odd encodings
        with open(filename, 'r', errors='replace') as f:
            content = f.read()
        entry = {"file_name": print_filename, "content": content}
        data.append(entry)

    json_to_save = {"data_source": DATA_SOURCE, "label": LABEL, "data": data}
    with open(JSON_OUTPUT, 'w') as fp:
        json.dump(json_to_save, fp)
def load_credentials():
    """Helper function to read the credentials from the credentials.json file

    Returns:
        credentials {dict} containing whatever credentials are in the file
    """
    with open(git_root("credentials.json"), "r") as credentials_file:
        credentials = json.load(credentials_file)
    return credentials
def load_config():
    """Helper function to read the entire config file

    Returns:
        config {dict} containing the entire content of the config file
    """
    with open(git_root("config", "config.json"), "r") as config_file:
        config = json.load(config_file)
    return config
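# Hedged example: reading one value out of the loaded configuration; the
# "genres" key is assumed from its use elsewhere in this codebase.
config = load_config()
print(config["genres"])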
def execute_etl(
    data_file="../data/nmdc_merged_data.tsv.zip",
    etl_modules=[
        "gold_study",
        "gold_omics_processing",
        "gold_biosample",
        "emsl_omics_processing",
        "emsl_data_object",
        "jgi_data_object",
    ],
    sssom_map_file=git_root("schema/mappings/gold-to-mixs.sssom.tsv"),
    spec_file="lib/nmdc_data_source.yaml",
):
    nmdc_etl = NMDC_ETL(
        merged_data_file=data_file,
        data_source_spec_file=spec_file,
        sssom_file=sssom_map_file,
    )

    if "gold_study" in etl_modules:
        nmdc_etl.transform_study()
        # nmdc_etl.transform_study(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_study(file_path="output/nmdc_etl/gold_study.json")

    if "gold_omics_processing" in etl_modules:
        nmdc_etl.transform_omics_processing()
        # nmdc_etl.transform_omics_processing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_omics_processing(
            file_path="output/nmdc_etl/gold_omics_processing.json"
        )

    if "gold_biosample" in etl_modules:
        nmdc_etl.transform_biosample()
        # nmdc_etl.transform_biosample(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_biosample("output/nmdc_etl/gold_biosample.json")
        # align_nmdc_datatypes.align_gold_biosample()  ########### currently broken

    if "emsl_omics_processing" in etl_modules:
        nmdc_etl.transform_emsl_omics_processing()
        # nmdc_etl.transform_emsl_omics_processing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_omics_processing(
            "output/nmdc_etl/emsl_omics_processing.json"
        )

    if "emsl_data_object" in etl_modules:
        nmdc_etl.transform_emsl_data_object()
        # nmdc_etl.transform_emsl_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_data_object("output/nmdc_etl/emsl_data_objects.json")

    if "jgi_data_object" in etl_modules:
        nmdc_etl.transform_jgi_data_object()
        # nmdc_etl.transform_jgi_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_jgi_data_object("output/nmdc_etl/jgi_fastq_data_objects.json")
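# Example invocation running only the GOLD study transform; the default
# input paths above are assumed to exist relative to the repository root.
if __name__ == "__main__":
    execute_etl(etl_modules=["gold_study"])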
def load_params():
    """Helper function to read the parameters from the config file

    Returns:
        params {dict} containing the feature engineering parameters
    """
    with open(git_root("config", "config.json"), "r") as config_file:
        config = json.load(config_file)
    params = config["feature_engineering"]
    return params
def get_all_results(detector_results):
    # add all the other files
    all_data_path = git_root("data", "full_data.json")
    all_data = pd.read_json(all_data_path)[["data_source", "label", "file_name"]]
    all_data["prediction"] = 0

    all_results = pd.concat(
        [detector_results, all_data], axis=0, sort=True
    ).drop_duplicates(subset=["data_source", "file_name"], keep="first")

    # sanity check
    assert all_results.shape == all_data.shape

    return all_results
def read_in_data(sampling_rate, sample_data=True):
    """Function to load the data in memory

    Arguments:
        sampling_rate {int} -- the sampling rate with which to read the .wav
        sample_data {boolean} -- if True reads data from `sample_data`
            subfolder else reads data from `full_data` subfolder

    Returns:
        data {dict} -- keys in ('train', 'test'), values are lists of tuples
            ('file_name', 'numpy_representation', 'genre')
    """
    data_root = git_root("data")
    metadata = pd.read_csv(
        os.path.join(data_root, "metadata", "train_test_split.csv")
    )

    train_metadata = metadata.loc[metadata["split"] == "train"]
    test_metadata = metadata.loc[metadata["split"] == "test"]

    if sample_data:
        train_metadata = train_metadata.loc[train_metadata["sample"]]
        test_metadata = test_metadata.loc[test_metadata["sample"]]

    def load_file(metadata_row):
        data_folder = "sample_data" if sample_data else "full_data"
        file_path = os.path.join(
            data_root, data_folder, metadata_row["split"],
            metadata_row["genre"], metadata_row["file_name"]
        )
        file_numpy_representation, _ = librosa.load(file_path, sr=sampling_rate)
        return (
            metadata_row["file_name"],
            file_numpy_representation,
            metadata_row["genre"]
        )

    train_records = train_metadata.apply(load_file, axis=1).tolist()
    test_records = test_metadata.apply(load_file, axis=1).tolist()

    return {'train': train_records, 'test': test_records}
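# Usage sketch, assuming the feature-engineering parameters expose a
# "sampling_rate" key as in extract_from_config above.
params = load_params()
data = read_in_data(params["sampling_rate"], sample_data=True)
file_name, signal, genre = data["train"][0]
print(file_name, signal.shape, genre)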
def display_audio(track_name):
    """This function displays an audio player in a notebook to listen
    to a track

    Arguments:
        track_name {string} -- name of the track
    """
    data_root = git_root("data", "sample_data")
    train_root = os.path.join(data_root, "train")

    print(track_name)
    IPython.display.display(
        IPython.display.Audio(
            os.path.join(train_root, track_name.split(".")[0], track_name)
        )
    )
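# Example call; the file name pattern <genre>.<index>.wav follows the
# track_name.split(".") logic above.
display_audio("blues.00098.wav")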
def main(data_file='../data/nmdc_merged_data.tsv.zip',
         etl_modules=['gold_study', 'gold_omics_processing', 'gold_biosample',
                      'emsl_omics_processing', 'emsl_data_object',
                      'jgi_data_object'],
         sssom_map_file=git_root('schema/mappings/gold-to-mixs.sssom.tsv'),
         spec_file='lib/nmdc_data_source.yaml'):
    nmdc_etl = NMDC_ETL(merged_data_file=data_file,
                        data_source_spec_file=spec_file,
                        sssom_file=sssom_map_file)

    if 'gold_study' in etl_modules:
        nmdc_etl.transform_study()
        # nmdc_etl.transform_study(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_study(file_path='output/nmdc_etl/gold_study.json')

    if 'gold_omics_processing' in etl_modules:
        nmdc_etl.transform_omics_processing()
        # nmdc_etl.transform_omics_processing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_omics_processing(file_path='output/nmdc_etl/gold_omics_processing.json')

    if 'gold_biosample' in etl_modules:
        nmdc_etl.transform_biosample()
        # nmdc_etl.transform_biosample(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_biosample('output/nmdc_etl/gold_biosample.json')
        # align_nmdc_datatypes.align_gold_biosample()  ########### currently broken

    if 'emsl_omics_processing' in etl_modules:
        nmdc_etl.transform_emsl_omics_processing()
        # nmdc_etl.transform_emsl_omics_processing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_omics_processing('output/nmdc_etl/emsl_omics_processing.json')

    if 'emsl_data_object' in etl_modules:
        nmdc_etl.transform_emsl_data_object()
        # nmdc_etl.transform_emsl_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_data_object('output/nmdc_etl/emsl_data_objects.json')

    if 'jgi_data_object' in etl_modules:
        nmdc_etl.transform_jgi_data_object()
        # nmdc_etl.transform_jgi_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_jgi_data_object('output/nmdc_etl/jgi_fastq_data_objects.json')
def load_data():
    detector_output = git_root("models", "benchmark", "crypto-detector_output")

    def join_path(source):
        return os.path.join(detector_output, f"{source}_output.crypto")

    # `sources` is expected to be defined at module level
    filenames = {source: join_path(source) for source in sources}
    outputs = {source: None for source in sources}

    for source in filenames:
        with open(filenames[source]) as data_file:
            # JsonComment tolerates comments and trailing commas in the
            # crypto-detector output files
            parser = JsonComment()
            outputs[source] = parser.load(data_file)

    return outputs
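# load_data reads a module-level `sources` iterable; a minimal sketch of
# what it might contain (the names below are illustrative).
sources = ["code-jam", "crypto-competitions", "crypto-library"]
outputs = load_data()
for source, output in outputs.items():
    print(source, type(output))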
import os, sys, click, pickle
from git_root import git_root

# add path nmdc schema files and modules
sys.path.append(os.path.abspath(git_root("schema")))
sys.path.append(os.path.abspath(git_root("metadata-translation/src/bin")))
sys.path.append(os.path.abspath(git_root("metadata-translation/src/bin/lib")))

from lib.nmdc_etl_class import NMDC_ETL
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
from collections import namedtuple
from pprint import pprint
import pandas as pds
import jsonasobj
import nmdc
import lib.data_operations as dop
import nmdc_dataframes
# import align_nmdc_datatypes
import jq


def get_json(file_path, replace_single_quote=False):
    ## load json
    with open(file_path, "r") as in_file:
        if replace_single_quote:
            # the JSON standard requires double quotes; the replacement below
            # is an assumed completion (the original body is truncated here)
            text = in_file.read()
            return json.loads(text.replace("'", '"'))
        return json.load(in_file)
import json
import os
import fnmatch

from git_root import git_root

items_to_exclude = [".DS_Store", ".gitignore", "README.md"]
file_list = []
data_folder = os.path.join(git_root(), "data", "crypto-library")

# Read each crypto-library folder
for crypto_library in os.listdir(os.path.join(data_folder, "files")):
    library_file_counter = 0
    # Make sure we only look at the library folders
    if (crypto_library not in items_to_exclude and
            not os.path.isfile(os.path.join(data_folder, "files", crypto_library))):
        root = os.path.join(data_folder, "files", crypto_library)
        pattern = "*.c"
        print("<--- Processing new library --->")
        print(crypto_library)
        # Go through the entire list of files (embedded in folders or not)
        for path, subdirs, files in os.walk(root):
            for name in files:
                # assumed completion: the original loop body is truncated;
                # collecting the matching .c files is inferred from `pattern`
                # and `library_file_counter` above
                if fnmatch.fnmatch(name, pattern):
                    file_list.append(os.path.join(path, name))
                    library_file_counter += 1
## author: Bill Duncan
## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.

## add ./lib directory to sys.path so that local modules can be found
import pickle
from git_root import git_root
import os, sys
sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("./lib"))
sys.path.append(os.path.abspath(git_root("schema")))  # add path nmdc schema files and modules
# print(sys.path)
import transform_nmdc_data as tx
import extract_nmdc_data as ex
import load_nmdc_data as lx
import nmdc_dataframes as nmdc_dfs
import nmdc

## system level modules
import pandas as pds
import jq
import jsonasobj
import json
import zipfile
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
from dotted_dict import DottedDict
from collections import namedtuple
import os
import re
from functools import partial

import pandas as pd
from git_root import git_root

BASE_DIR = os.path.join(git_root(), "feature_engineering")
DATA_DIR = os.path.join(git_root(), "data")

train = pd.read_json(os.path.join(DATA_DIR, "train.json"))
content = train["content"]
train = train.drop(columns=["content"])

feature_dict = {
    "non-regex": {
        "proxy_line": "\n",  # newline character, used to count lines
        "proxy_comment": "/*",
        "proxy_int": "int",
        "proxy_long": "long",
        "proxy_while_loops": "while",
        "proxy_for_loops": "for",
        "proxy_include": "#include",
        "proxy_bit_left_shift": "<<",
        "proxy_bit_right_shift": ">>",
        "proxy_bitwise_and": "&",
        "proxy_bitwise_or": "|",
        "proxy_complement": "~",
import os
import json
import re

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from git_root import git_root

data_folder = os.path.join(git_root(), "data")
dataframe = pd.read_json(os.path.join(data_folder, 'full_data_v2.json'))


def remove_comments(string):
    # remove all occurrences of streamed comments (/* COMMENT */) from string
    string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "", string)
    # remove all occurrences of single-line comments (// COMMENT\n) from string
    string = re.sub(re.compile(r"//.*?\n"), "", string)
    return string


def get_docs_and_labels(df):
    _docs = []
    _labels = []
    for index in df.index:
        # assumed completion: the original body is truncated here; collecting
        # the comment-stripped content and its label is inferred from the
        # accumulators above
        _docs.append(remove_comments(df["content"][index]))
        _labels.append(df["label"][index])
    return _docs, _labels
def setUp(self):
    with open(Path(git_root()) / 'index.md', "r") as index:
        indexpage = index.read()
    self.soup = BeautifulSoup(indexpage, 'html.parser')
## author: Bill Duncan
## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.

## add ./lib directory to sys.path so that local modules can be found
from git_root import git_root
import os, sys
sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("./lib"))
sys.path.append(os.path.abspath(git_root('schema')))  # add path nmdc schema files and modules
# print(sys.path)
import transform_nmdc_data as tx
import extract_nmdc_data as ex
import load_nmdc_data as lx
import nmdc_dataframes as nmdc_dfs
import nmdc

## system level modules
import pandas as pds
import jq
import jsonasobj
import json
import zipfile
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
from dotted_dict import DottedDict
from collections import namedtuple


class NMDC_ETL:
import numpy as np
import librosa
import sys
import os
from git_root import git_root

# <---- For importing a .py file from another module ---->
sys.path.append(os.path.join(git_root(), "utils"))
from utils import read_in_data, generate_short_term_piece
from utils import quantize, load_params, load_config


def generate_MFCC(array, n_mfcc, frame_length, overlap, sampling_rate, n_windows):
    """This function generates a MFCC from a numpy representation of mono .wav files

    <---- WARNING: the number of windows computed is a FIXED parameter
    from the config file ---->

    Arguments:
        array {np.array} -- float np.array
        n_mfcc {int} -- the number of MFCC coefficients to compute
        frame_length {float} -- the duration in seconds of each analysis window
        overlap {float} -- in [0, 1) the fraction of overlap for each window
        sampling_rate {int} -- the sampling rate of the audio
        n_windows {int} -- the fixed number of analysis windows to keep

    Returns:
        np.array with dimensions (n_mfcc, t)
    """
    window_length = int(frame_length * sampling_rate)
    hop_length = int(window_length * (1 - overlap))
    # assumed completion: the original body is truncated here; the call below
    # is inferred from the docstring and the window parameters above
    mfcc = librosa.feature.mfcc(
        y=array, sr=sampling_rate, n_mfcc=n_mfcc,
        n_fft=window_length, hop_length=hop_length,
    )
    return mfcc[:, :n_windows]
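# Usage sketch for generate_MFCC with illustrative values: a 25 ms window,
# 50% overlap, 13 coefficients; the zero signal stands in for a track
# loaded via read_in_data.
params = load_params()
signal = np.zeros(10 * params["sampling_rate"], dtype=np.float32)
mfcc = generate_MFCC(
    signal, n_mfcc=13, frame_length=0.025, overlap=0.5,
    sampling_rate=params["sampling_rate"], n_windows=128,
)
print(mfcc.shape)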
# IMPORTANT: train_test_split should have been run before
import os
import shutil
import json

import numpy as np
import pandas as pd
from git_root import git_root

with open(git_root("config", "config.json"), 'r') as config_file:
    config = json.load(config_file)

data_root = git_root("data")
train_test_split_path = os.path.join(data_root, "metadata", "train_test_split.csv")
train_test_split = pd.read_csv(train_test_split_path)

random_state = 11


def sample_18_2(block):
    """Sample 18 tracks from a train block and 2 from a test block."""
    sample_size = 2
    if block["split"].unique()[0] == "train":
        sample_size = 18
    return block.sample(sample_size, random_state=random_state)
## author: Bill Duncan
## summary: Contains methods for transforming data in NMDC ETL pipeline.

## add ./lib and root directory to sys.path so that local modules can be found
import os, sys, git_root
sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("./lib"))
sys.path.append(git_root.git_root("./schema"))
# print(sys.path)

## system level modules
import pandas as pds
import jq
import jsonasobj
import json
import zipfile
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc_dataframes

## add all classes for local nmdc.py
## this is the file of python classes generated by biolinkml
import nmdc


def has_raw_value(obj, attribute: str) -> bool:
    """
import os
import json

from git_root import git_root

data_source = "crypto-competitions"
data_folder = os.path.join(git_root(), "data", data_source)
files_folder = os.path.join(data_folder, "files")

data_dict = {"data_source": data_source, "label": 1, "data": []}

# fill up the dictionary with the files content
for filename in os.listdir(files_folder):
    file_path = os.path.join(files_folder, filename)
    with open(file_path, 'r') as file:
        file_string = file.read()
    # strip opening comment for libObfuscate
    if file_string.startswith(47 * "/"):
        file_string = file_string[546:]
    data_dict["data"].append({"file_name": filename, "content": file_string})

# write resulting json
json_path = os.path.join(data_folder, data_source + "_data.json")
with open(json_path, 'w') as json_to_write:
    json.dump(data_dict, json_to_write, indent=4)
import os
import json

import pandas as pd
from sklearn.model_selection import train_test_split
from git_root import git_root

BASE_DIR = os.path.join(git_root(), "data")
json_path = os.path.join(BASE_DIR, "full_data.json")

df = pd.read_json(json_path)

train, test, _, _ = train_test_split(
    df, df.loc[:, "data_source"], test_size=.15,
    stratify=df.loc[:, "data_source"]
)

train_source_counts = train["data_source"].value_counts()
test_source_counts = test["data_source"].value_counts()

for t in ("train", "test"):
    if t == "train":
        props = train_source_counts / train.shape[0]
    else:
        props = test_source_counts / test.shape[0]
    print(f"""
    The proportions of files from each source in {t} are:
    {props.round(3)}
    """)
import os
from git_root import git_root
from google.cloud import storage
import librosa
import pandas as pd
import shutil
import sys
import json
from tqdm import tqdm

from preprocessing_pipeline import preprocess_data

# Load the utils module
sys.path.append(os.path.join(git_root(), 'utils'))
from utils import load_config, load_credentials

# Load the config file
config = load_config()
credentials = load_credentials()

### Read data from Google cloud storage
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials['PATH']
storage_client = storage.Client("Music-Genre-Classification")
bucket = storage_client.get_bucket("deep-music-classification")

# Create a temporary folder for storing the current file
temp_dir_path = os.path.join(git_root(), 'temp')
try:
    os.mkdir(temp_dir_path)
except FileExistsError:  # assumed handler: the original except clause is truncated
    pass
import sys

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D
from tensorflow.keras.layers import AveragePooling2D, Dense
from git_root import git_root

# Load the utils module
sys.path.append(git_root("utils"))
from utils import load_config, load_params
from affine_scalar_layer import AffineScalar


def setup_model(M=1):
    """Set up the convolutional network used for genre classification.

    Returns:
        Sequential -- the Keras model
    """
    config = load_config()
    params = load_params()

    input_dim = params["quantization"]["n_levels"] - 1
    n_genres = len(config["genres"])

    model = Sequential()
    model.add(
        Conv2D(
            12,
import os
import shutil
import json

import numpy as np
import pandas as pd
from git_root import git_root

with open(git_root("config", "config.json"), 'r') as config_file:
    config = json.load(config_file)

output = git_root("data", "metadata", "train_test_split.csv")

# writes csv with three columns in `data/metadata`:
# - file_name: e.g. blues.00098.wav
# - genre: e.g. blues
# - split: train or test
output_dict = {
    "file_name": [],
    "genre": [],
    "split": [],
}

random_state = 11

for genre in config["genres"]:
    R = np.random.RandomState(random_state)
import csv
import os

import pandas as pd
from git_root import git_root


def process_raw_isear_data(data_path='data/isear.csv',
                           save_path='data/isear_processed.csv'):
    texts = []
    emotions = []

    print("Reading from {}".format(data_path))
    with open(data_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='|')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # the header row gives the column indices of the text
                # ('SIT') and emotion ('Field1') fields
                line_count += 1
                text_id = row.index('SIT')
                emotion_id = row.index('Field1')
            else:
                texts.append(row[text_id])
                emotions.append(row[emotion_id])
                line_count += 1
    print(f'Processed {line_count} lines.')

    df = pd.DataFrame(list(zip(texts, emotions)))
    print("Writing to {}".format(save_path))
    df.to_csv(save_path)


if __name__ == '__main__':
    process_raw_isear_data(os.path.join(git_root(), 'data', 'isear.csv'))
from git_root import git_root
import os
import sys
import json

import numpy as np

from generate_melmap import generate_mel_map
from generate_spectrogram import generate_spectrogram
from generate_GLCM import generate_glcm
from generate_MFCC import generate_MFCC

# <---- For importing a .py file from another module ---->
sys.path.append(os.path.join(git_root(), "utils"))
from utils import read_in_data, generate_short_term_piece
from utils import quantize, load_params, load_config


def pad_from_dict(data):
    """This function pads tracks that are too short to produce 14 chunks
    in the later short term pieces generation

    Arguments:
        data {dict} -- keys in ('train', 'test'), values are lists of tuples
            ('file_name', 'numpy_representation', 'genre')

    Returns:
        data_padded {dict} -- keys in ('train', 'test'), values are lists of
            tuples ('file_name', 'numpy_representation', 'genre') and the
            numpy representations are padded with zeros
    """
import os
import re
import logging

from git_root import git_root
import pandas as pd

pd.set_option("display.max_columns", 100)

# clone it from https://github.com/pcm-dpc/COVID-19
data_folder = "C:\\Users\\matth\\repos\\COVID-19\\dati-regioni"
output_file = os.path.join(git_root(), "data", "df_it.csv")

dict_renaming = {
    "data": "date",
    "stato": "country",
    "codice_regione": "region_code",
    "denominazione_regione": "region_name",
    "lat": "region_latitude",
    "long": "region_longitude",
    "ricoverati_con_sintomi": "cases_pos_hospitalized_non_icu",
    "terapia_intensiva": "cases_pos_hospitalized_icu",
    "totale_ospedalizzati": "cases_pos_hospitalized",
    "isolamento_domiciliare": "cases_pos_in_home_isolation",
    "totale_attualmente_positivi": "cases_pos_total",
    "nuovi_attualmente_positivi": "cases_pos_new",
    "dimessi_guariti": "cases_recovered",
    "deceduti": "cases_deceased",
    "totale_casi": "cases_total",
    "tamponi": "tests_total",