def extract_from_config(config, params=None):
    """"This function extracts some useful information from the `config` and
    `params` dictionaries
    
    Arguments:
        config {dict} -- a configuration dictionary loaded from `config.json`
        params {dict} -- the feature engineering part of the configuration

    Returns:
        train_root {string} -- the string path to the train data
        genres {list} -- the list of genre names
        n_train_per_genre {int} -- the number of training examples per genre
        n_pieces {int} -- the number of pieces per track
        n_pieces_per_genre {int} -- the total number of train pieces per genre
        sampling_rate {int} -- the sampling rate for reading the audio data
    """
    data_root = git_root("data", "sample_data")
    train_root = os.path.join(data_root, "train")

    genres = config["genres"]
    total_n_genres = len(genres)
    n_train = config["train_percent"] * config["sample_data_per_genre"] // 10

    n_train_per_genre = n_train // total_n_genres

    n_pieces = params["divide"]["number_pieces"] if params else None

    n_train_pieces = n_train * n_pieces if params else None
    n_pieces_per_genre = n_train_pieces // total_n_genres if params else None

    sampling_rate = params["sampling_rate"] if params else None

    return (train_root, genres, n_train_per_genre, n_pieces,
            n_pieces_per_genre, sampling_rate)
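A minimal usage sketch (not from the original repository), assuming the load_config() and load_params() helpers shown further down in this collection are importable:

config = load_config()    # full config.json contents
params = load_params()    # the "feature_engineering" section of the config

(train_root, genres, n_train_per_genre, n_pieces,
 n_pieces_per_genre, sampling_rate) = extract_from_config(config, params)

print(f"{n_train_per_genre} training tracks per genre under {train_root}")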
Example #2
def data_to_json():
    data = []
    DATA_SOURCE = "code-jam"
    data_path = git_root("data", DATA_SOURCE, "files", "*")
    JSON_OUTPUT = git_root("data", DATA_SOURCE, "code-jam_data.json")
    LABEL = 0
    for filename in tqdm(glob.iglob(data_path)):
        print_filename = os.path.basename(filename)
        with open(filename,'r', errors='replace') as f:
            content = f.read()

        dentry = {"file_name": print_filename, "content": content}
        data.append(dentry)

    json_to_save = {"data_source": DATA_SOURCE, "label": LABEL, "data": data}
    with open(JSON_OUTPUT, 'w') as fp:
        json.dump(json_to_save, fp)
Example #3
def load_credentials():
    """Helper function to read the credentials from the credentials.json file

    Returns:
        data {dict} containing whatever entries are in the credentials file
    """
    with open(git_root("credentials.json"), "r") as credentials_file:
        credentials = json.load(credentials_file)

    return credentials
Example #4
def load_config():
    """Helper function to read the entire config file

    Returns:
        data {dict} containing the entire content of the config file
    """
    with open(git_root("config", "config.json"), "r") as config_file:
        config = json.load(config_file)

    return config
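Illustrative use of the two helpers above; the "genres" and "PATH" keys are taken from other snippets in this collection:

credentials = load_credentials()
config = load_config()
print(config["genres"])       # list of genre names from config.json
print(credentials["PATH"])    # path to the service-account key file used elsewhere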
Example #5
def execute_etl(
    data_file="../data/nmdc_merged_data.tsv.zip",
    etl_modules=[
        "gold_study",
        "gold_omics_processing",
        "gold_biosample",
        "emsl_omics_processing",
        "emsl_data_object",
        "jgi_data_object",
    ],
    sssom_map_file=git_root("schema/mappings/gold-to-mixs.sssom.tsv"),
    spec_file="lib/nmdc_data_source.yaml",
):

    nmdc_etl = NMDC_ETL(
        merged_data_file=data_file,
        data_source_spec_file=spec_file,
        sssom_file=sssom_map_file,
    )

    if "gold_study" in etl_modules:
        nmdc_etl.transform_study()
        # nmdc_etl.transform_study(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_study(file_path="output/nmdc_etl/gold_study.json")

    if "gold_omics_processing" in etl_modules:
        nmdc_etl.transform_omics_processing()
        # nmdc_etl.transform_omics_proccessing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_omics_processing(
            file_path="output/nmdc_etl/gold_omics_processing.json"
        )

    if "gold_biosample" in etl_modules:
        nmdc_etl.transform_biosample()
        # nmdc_etl.transform_biosample(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_biosample("output/nmdc_etl/gold_biosample.json")

        # align_nmdc_datatypes.align_gold_biosample() ########### currently broken

    if "emsl_omics_processing" in etl_modules:
        nmdc_etl.transform_emsl_omics_processing()
        # nmdc_etl.transform_emsl_omics_processing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_omics_processing(
            "output/nmdc_etl/emsl_omics_processing.json"
        )

    if "emsl_data_object" in etl_modules:
        nmdc_etl.transform_emsl_data_object()
        # nmdc_etl.transform_emsl_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_data_object("output/nmdc_etl/emsl_data_objects.json")

    if "jgi_data_object" in etl_modules:
        nmdc_etl.transform_jgi_data_object()
        # nmdc_etl.transform_jgi_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_jgi_data_object("output/nmdc_etl/jgi_fastq_data_objects.json")
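A hedged usage sketch (not from the original source); it assumes the default input files exist and that the output/nmdc_etl/ directory has already been created:

# run only the GOLD study and biosample transforms, keeping the default
# merged-data, SSSOM mapping and spec files from the signature above
execute_etl(etl_modules=["gold_study", "gold_biosample"])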
Example #6
def load_params():
    """Helper function to read the parameters from the config file

    Returns:
        data {dict} containing whatever parameters are in the config file
    """
    with open(git_root("config", "config.json"), "r") as config_file:
        config = json.load(config_file)

    params = config["feature_engineering"]
    return params
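load_params() simply narrows load_config() down to the feature_engineering block; a quick sketch (the key name is taken from extract_from_config above):

params = load_params()
print(params["sampling_rate"])    # sampling rate used when reading the .wav files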
Example #7
def get_all_results(detector_results):

    # add all the other files
    all_data_path = git_root("data", "full_data.json")
    all_data = pd.read_json(all_data_path)[["data_source", "label", "file_name"]]
    all_data["prediction"] = 0

    all_results = pd.concat([detector_results, all_data], axis=0, sort=True).\
        drop_duplicates(subset=["data_source", "file_name"], keep="first")

    # sanity check
    assert(all_results.shape == all_data.shape)

    return all_results
Example #8
def read_in_data(sampling_rate, sample_data=True):
    """Function to load the data in memory 

    Arguments:
        sampling_rate {int} -- the sampling rate with which to read the .wav
        sample_data {boolean} -- if True reads data from `sample_data` subfolder
            else reads data from `full_data` subfolder
    Returns:
        data {dict} -- keys in ('train', 'test'), values are lists of tuples
            ('file_name', 'numpy_representation', 'genre')
    """
    
    data_root = git_root("data")

    metadata = pd.read_csv(
        os.path.join(data_root, "metadata", "train_test_split.csv")
    )

    train_metadata = metadata.loc[metadata["split"] == "train", :]
    test_metadata = metadata.loc[metadata["split"] == "test", :]
    if sample_data:
        train_metadata = train_metadata.loc[train_metadata["sample"], :]
        test_metadata = test_metadata.loc[test_metadata["sample"], :]

    train_records = []
    test_records = []

    def load_file(metadata_row):
        data_folder = "sample_data" if sample_data else "full_data"
        file_path = os.path.join(
            data_root,
            data_folder,
            metadata_row["split"],
            metadata_row["genre"],
            metadata_row["file_name"]
        )
        file_numpy_representation, _ = librosa.load(file_path, sr=sampling_rate)
        return (
            metadata_row["file_name"], 
            file_numpy_representation, 
            metadata_row["genre"]
        )

    train_records = train_metadata.apply(load_file, axis=1).tolist()
    test_records = test_metadata.apply(load_file, axis=1).tolist()

    print(len(train_records))

    return {'train': train_records, 'test': test_records}
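A brief usage sketch; the sampling rate here is illustrative (in the pipeline it comes from the feature_engineering parameters):

data = read_in_data(sampling_rate=22050, sample_data=True)
file_name, waveform, genre = data["train"][0]
print(file_name, genre, waveform.shape)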
Example #9
def display_audio(track_name):
    """This function displays an audio player in a notebook to
    listen to a track
    
    Arguments:
        track_name {string} -- name of the track
    """
    data_root = git_root("data", "sample_data")
    train_root = os.path.join(data_root, "train")

    print(track_name)
    IPython.display.display(
        IPython.display.Audio(
            os.path.join(train_root,
                         track_name.split(".")[0], track_name)))
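An illustrative call; the file name is hypothetical but follows the <genre>.<id>.wav pattern implied by track_name.split(".")[0]:

display_audio("blues.00042.wav")   # would play data/sample_data/train/blues/blues.00042.wav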
Example #10
def main(data_file='../data/nmdc_merged_data.tsv.zip',
         etl_modules=['gold_study', 
                      'gold_omics_processing', 
                      'gold_biosample', 
                      'emsl_omics_processing',
                      'emsl_data_object', 
                      'jgi_data_object'],
         sssom_map_file=git_root('schema/mappings/gold-to-mixs.sssom.tsv'),
         spec_file='lib/nmdc_data_source.yaml'):

    
    nmdc_etl = NMDC_ETL(merged_data_file=data_file, data_source_spec_file=spec_file, sssom_file=sssom_map_file)
    
    if 'gold_study' in etl_modules:
        nmdc_etl.transform_study()
        # nmdc_etl.transform_study(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_study(file_path='output/nmdc_etl/gold_study.json')
    
    if 'gold_omics_processing' in etl_modules:
        nmdc_etl.transform_omics_proccessing()
        # nmdc_etl.transform_omics_proccessing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_omics_proccessing(file_path='output/nmdc_etl/gold_omics_processing.json')

    if 'gold_biosample' in etl_modules:
        nmdc_etl.transform_biosample()
        # nmdc_etl.transform_biosample(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_biosample('output/nmdc_etl/gold_biosample.json')
        
        # align_nmdc_datatypes.align_gold_biosample() ########### currently broken

    if 'emsl_omics_processing' in etl_modules:
        nmdc_etl.transform_emsl_omics_processing()
        # nmdc_etl.transform_emsl_omics_processing(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_omics_processing('output/nmdc_etl/emsl_omics_processing.json')
        
    if 'emsl_data_object' in etl_modules:
        nmdc_etl.transform_emsl_data_object()
        # nmdc_etl.transform_emsl_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_emsl_data_object('output/nmdc_etl/emsl_data_objects.json')
        
    if 'jgi_data_object' in etl_modules:
        nmdc_etl.transform_jgi_data_object()
        # nmdc_etl.transform_jgi_data_object(test_rows=1, print_df=True, print_dict=True)
        nmdc_etl.save_jgi_data_object('output/nmdc_etl/jgi_fastq_data_objects.json')
Example #11
def load_data():

    detector_output = git_root("models", "benchmark", "crypto-detector_output")
    
    def join_path(source):
        return os.path.join(
            detector_output, f"{source}_output.crypto"
        )

    filenames = {source: join_path(source) for source in sources}

    outputs = {source: None for source in sources}

    for source in filenames:
        with open(filenames[source]) as data_file:    
            parser = JsonComment()
            outputs[source] = parser.load(data_file)
    
    return outputs
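sources is referenced but never defined in this snippet; it is presumably a module-level list of data-source names. A hedged sketch of how the function might be driven:

# hypothetical module-level definition; the real list of sources comes from the project
sources = ["crypto-library", "crypto-competitions", "code-jam"]

outputs = load_data()
print({source: len(output) for source, output in outputs.items()})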
Example #12
import os, sys, click, pickle
from git_root import git_root

sys.path.append(
    os.path.abspath(git_root("schema"))
)  # add path nmdc schema files and modules
sys.path.append(os.path.abspath(git_root("metadata-translation/src/bin")))
sys.path.append(os.path.abspath(git_root("metadata-translation/src/bin/lib")))

from lib.nmdc_etl_class import NMDC_ETL
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
from collections import namedtuple
from pprint import pprint
import pandas as pds
import jsonasobj
import nmdc
import lib.data_operations as dop
import nmdc_dataframes

# import align_nmdc_datatypes
import jq
from git_root import git_root


def get_json(file_path, replace_single_quote=False):
    ## load json
    with open(file_path, "r") as in_file:
        if replace_single_quote:  # json
            text = in_file.read()
Example #13
import json
import os
import fnmatch

from git_root import git_root

items_to_exclude = [".DS_Store", ".gitignore", "README.md"]

file_list = []

data_folder = os.path.join(git_root(), "data", "crypto-library")

#Read each crypto_library folder
for crypto_library in os.listdir(os.path.join(data_folder, "files")):

    library_file_counter = 0

    #Make sure we only look at the library folders
    if (crypto_library not in items_to_exclude
            and not os.path.isfile(os.path.join(data_folder, "files", crypto_library))):

        root = os.path.join(data_folder, "files", crypto_library)
        pattern = "*.c"

        print("<--- Processing new library --->")
        print(crypto_library)
        #Go through the entire list of files (embedded in folders or not)

        for path, subdirs, files in os.walk(root):
            for name in files:
Example #14
## author: Bill Duncan
## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.

## add ./lib directory to sys.path so that local modules can be found
import pickle
from git_root import git_root
import os, sys

sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("./lib"))
sys.path.append(os.path.abspath(
    git_root("schema")))  # add path nmdc schema files and modules
# print(sys.path)

import transform_nmdc_data as tx
import extract_nmdc_data as ex
import load_nmdc_data as lx
import nmdc_dataframes as nmdc_dfs
import nmdc

## system level modules
import pandas as pds
import jq
import jsonasobj
import json
import zipfile
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
from dotted_dict import DottedDict
from collections import namedtuple
import os
import re

Example #15
from functools import partial

import pandas as pd

from git_root import git_root

BASE_DIR = os.path.join(git_root(), "feature_engineering")
DATA_DIR = os.path.join(git_root(), "data")

train = pd.read_json(os.path.join(DATA_DIR, "train.json"))

content = train["content"]
train = train.drop(columns=["content"])

feature_dict = {
    "non-regex": {
        "proxy_line": "/n",
        "proxy_comment": "/*",
        "proxy_int": "int",
        "proxy_long": "long",
        "proxy_while_loops": "while",
        "proxy_for_loops": "for",
        "proxy_include": "#include",
        "proxy_bit_left_shift": "<<",
        "proxy_bit_right_shift": ">>",
        "proxy_bitwise_and": "&",
        "proxy_bitwise_or": "|",
        "proxy_complement": "~",
Example #16
import os
import json
import re                # needed by remove_comments() below

import pandas as pd      # needed to read full_data_v2.json below

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split

from git_root import git_root

data_folder = os.path.join(git_root(), "data")
dataframe = pd.read_json(os.path.join(data_folder, 'full_data_v2.json'))


def remove_comments(string):
    # remove all occurrences streamed comments (/*COMMENT */) from string
    string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "", string)
    # remove all occurrence single-line comments (//COMMENT\n ) from string
    string = re.sub(re.compile(r"//.*?\n"), "", string)
    return string
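A quick illustration of the comment stripping on a toy input (not from the dataset):

snippet = "int x = 1; /* counter */\nx += 1; // bump\nreturn x;\n"
print(remove_comments(snippet))
# -> "int x = 1; \nx += 1; return x;\n"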


def get_docs_and_labels(df):
    _docs = []
    _labels = []
    for index in df.index:
Example #17
 def setUp(self):
     with open(Path(git_root()) / 'index.md', "r") as index:
         indexpage = index.read()
     self.soup = BeautifulSoup(indexpage, 'html.parser')
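A hedged sketch of the scaffolding this fragment appears to assume; the test-case and test-method names are illustrative, not from the original:

import unittest
from pathlib import Path

from bs4 import BeautifulSoup
from git_root import git_root


class IndexPageTests(unittest.TestCase):  # hypothetical class name
    def setUp(self):
        with open(Path(git_root()) / "index.md", "r") as index:
            indexpage = index.read()
        self.soup = BeautifulSoup(indexpage, "html.parser")

    def test_parses(self):  # illustrative smoke test only
        self.assertIsInstance(self.soup, BeautifulSoup)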
Example #18
## author: Bill Duncan
## summary: Contains class with methods and properties for transforming data in NMDC ETL pipeline.

## add ./lib directory to sys.path so that local modules can be found
from git_root import git_root
import os, sys
sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("./lib"))
sys.path.append(os.path.abspath(
    git_root('schema')))  # add path nmdc schema files and modules
# print(sys.path)

import transform_nmdc_data as tx
import extract_nmdc_data as ex
import load_nmdc_data as lx
import nmdc_dataframes as nmdc_dfs
import nmdc

## system level modules
import pandas as pds
import jq
import jsonasobj
import json
import zipfile
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
from dotted_dict import DottedDict
from collections import namedtuple


class NMDC_ETL():
Example #19
import numpy as np

import librosa
import sys
import os
from git_root import git_root

# <---- For importing a .py file from another module ---->
sys.path.append(os.path.join(git_root(), "utils"))
from utils import read_in_data, generate_short_term_piece
from utils import quantize, load_params, load_config


def generate_MFCC(array, n_mfcc, frame_length, overlap, sampling_rate,
                  n_windows):
    """This function generates a MFCC 
    from a numpy representation of mono .wav files

    <---- WARNING: the number of windows computed is a FIXED parameter from the 
    config file ---->
    
    Arguments:
        array {np.array} -- float np.array
        frame_length {int} -- the number of samples in each analysis window
        overlap {float} -- in [0, 1) the fraction of overlap for each window

    returns: np.array with dimensions (n_mfcc, t)
    """

    window_length = int(frame_length * sampling_rate)
    hop_length = int(window_length * (1 - overlap))
Example #20
# IMPORTANT: train_test_split should have been run before

import os
import shutil

import json

import numpy as np
import pandas as pd

from git_root import git_root

with open(git_root("config", "config.json"), 'r') as config:
    config = json.load(config)

data_root = git_root("data")

train_test_split_path = os.path.join(data_root, "metadata",
                                     "train_test_split.csv")

train_test_split = pd.read_csv(train_test_split_path)

random_state = 11


def sample_18_2(block):
    sample_size = 2
    if block["split"].unique()[0] == "train":
        sample_size = 18
    return block.sample(sample_size, random_state=random_state)
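A hedged sketch of how this sampler is presumably applied; the groupby columns are inferred from the train_test_split.csv layout (file_name, genre, split) described in another snippet:

# keep 18 train files and 2 test files per genre
sample = (
    train_test_split
    .groupby(["genre", "split"], group_keys=False)
    .apply(sample_18_2)
)
print(sample.groupby(["genre", "split"]).size())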
Example #21
## author: Bill Duncan
## summary: Contains methods for transforming data in NMDC ETL pipeline.

## add ./lib and root directory to sys.path so that local modules can be found
import os, sys, git_root

sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("./lib"))
sys.path.append(git_root.git_root("./schema"))
# print(sys.path)

## system level modules
import pandas as pds
import jq
import jsonasobj
import json
import jq
import zipfile
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc_dataframes

## add all classes for local nmdc.py
## this is the file of python classes generated by biolinkml
import nmdc


def has_raw_value(obj, attribute: str) -> bool:
    """
Example #22
import os
import json

from git_root import git_root

data_source = "crypto-competitions"
data_folder = os.path.join(git_root(), "data", data_source)
files_folder = os.path.join(data_folder, "files")

data_dict = {"data_source": data_source, "label": 1, "data": []}

# fill up the dictionary with the files content
for filename in os.listdir(files_folder):

    file_path = os.path.join(files_folder, filename)

    with open(file_path, 'r') as file:
        file_string = file.read()
        # strip opening comment for libObfuscate
        if file_string.startswith(47 * "/"):
            file_string = file_string[546:]

    data_dict["data"].append({"file_name": filename, "content": file_string})

# write resulting json
json_path = os.path.join(data_folder, data_source + "_data.json")
with open(json_path, 'w') as json_to_write:
    json.dump(data_dict, json_to_write, indent=4)
Example #23
import os
import json
import pandas as pd

from sklearn.model_selection import train_test_split

from git_root import git_root

BASE_DIR = os.path.join(git_root(), "data")
json_path = os.path.join(BASE_DIR, "full_data.json")

df = pd.read_json(json_path)

train, test, _, _ = train_test_split(df,
                                     df.loc[:, "data_source"],
                                     test_size=.15,
                                     stratify=df.loc[:, "data_source"])

train_source_counts = train["data_source"].value_counts()
test_source_counts = test["data_source"].value_counts()

for t in ("train", "test"):
    if t == "train":
        props = train_source_counts / train.shape[0]
    else:
        props = test_source_counts / test.shape[0]
    print(f"""
The proportions of files from each source in {t} is:
{props.round(3)}
    """)
Example #24
import os
from git_root import git_root
from google.cloud import storage
import librosa
import pandas as pd
import shutil
import sys
import json
from tqdm import tqdm

from preprocessing_pipeline import preprocess_data

#Load the utils module
sys.path.append(os.path.join(git_root(), 'utils'))
from utils import load_config, load_credentials

#Load the config file
config = load_config()

credentials = load_credentials()

### Read data from Google cloud storage
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials['PATH']
storage_client = storage.Client("Music-Genre-Classification")
bucket = storage_client.get_bucket("deep-music-classification")

#Create a temporary folder for storing the current file
temp_dir_path = os.path.join(git_root(), 'temp')

try:
    os.mkdir(temp_dir_path)
Example #25
import sys

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D
from tensorflow.keras.layers import AveragePooling2D, Dense

from git_root import git_root

#Load the utils module
sys.path.append(git_root("utils"))
from utils import load_config, load_params
from affine_scalar_layer import AffineScalar



def setup_model(M=1):
    """[summary]
    
    Returns:
        [type] -- [description]
    """

    config = load_config()
    params = load_params()
    input_dim = params["quantization"]["n_levels"] - 1
    n_genres = len(config["genres"])

    model = Sequential()
    model.add(
        Conv2D(
            12, 
Example #26
import os
import shutil

import json

import numpy as np
import pandas as pd

from git_root import git_root

with open(git_root("config", "config.json"), 'r') as config:
    config = json.load(config)

output = git_root("data", "metadata", "train_test_split.csv")

# writes csv with three columns in `data/metadata`:
# - file_name: e.g. blues.00098.wav
# - genre: e.g. blues
# - split: train or test

output_dict = {
    "file_name": [],
    "genre": [],
    "split": [],
}

random_state = 11

for genre in config["genres"]:

    R = np.random.RandomState(random_state)
Example #27
import csv
import os

import pandas as pd

from git_root import git_root


def process_raw_isear_data(data_path='data/isear.csv',
                           save_path='data/isear_processed.csv'):
    texts = []
    emotions = []
    print("Reading from {}".format(data_path))
    with open(data_path) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='|')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
                text_id = row.index('SIT')
                emotion_id = row.index('Field1')
            else:
                texts.append(row[text_id])
                emotions.append(row[emotion_id])
                line_count += 1
        print(f'Processed {line_count} lines.')

    df = pd.DataFrame(list(zip(texts, emotions)))
    print("Writing to {}".format(save_path))
    df.to_csv(save_path)
    return


if __name__ == '__main__':
    process_raw_isear_data(os.path.join(git_root(), 'data', 'isear.csv'))
Example #28
from git_root import git_root
import os
import sys
import json

import numpy as np

from generate_melmap import generate_mel_map
from generate_spectrogram import generate_spectrogram
from generate_GLCM import generate_glcm
from generate_MFCC import generate_MFCC

# <---- For importing a .py file from another module ---->
sys.path.append(os.path.join(git_root(), "utils"))
from utils import read_in_data, generate_short_term_piece
from utils import quantize, load_params, load_config


def pad_from_dict(data):
    """This function pads tracks that are too short to produce 14 chunks in the
    later short term pieces generation
    
    Arguments:
        data {dict} -- keys in ('train', 'test'), values are lists of tuples
            ('file_name', 'numpy_representation', 'genre')

    Returns:
        data_padded {dict} -- keys in ('train', 'test'), values are lists of 
            tuples ('file_name', 'numpy_representation', 'genre')
            and the numpy representations are padded with zeros
    """
Example #29
import os
import re
import logging

from git_root import git_root
import pandas as pd

pd.set_option("display.max_columns", 100)

# clone it from https://github.com/pcm-dpc/COVID-19
data_folder = "C:\\Users\\matth\\repos\\COVID-19\\dati-regioni"

output_file = os.path.join(git_root(), "data", "df_it.csv")

dict_renaming = {
    "data": "date",
    "stato": "country",
    "codice_regione": "region_code",
    "denominazione_regione": "region_name",
    "lat": "region_latitude",
    "long": "region_longitude",
    "ricoverati_con_sintomi": "cases_pos_hospitalized_non_icu",
    "terapia_intensiva": "cases_pos_hospitalized_icu",
    "totale_ospedalizzati": "cases_pos_hospitalized",
    "isolamento_domiciliare": "cases_pos_in_home_isolation",
    "totale_attualmente_positivi": "cases_pos_total",
    "nuovi_attualmente_positivi": "cases_pos_new",
    "dimessi_guariti": "cases_recovered",
    "deceduti": "cases_deceased",
    "totale_casi": "cases_total",
    "tamponi": "tests_total",