def mirnaid_fix(fname: str):
    """Re-derive the 'miRNA ID' column of *fname* from the latest miRBase entry.

    The organism prefix (e.g. 'hsa') is inferred from keywords in the file
    name; the newest miRBase record per miRNA sequence is left-joined onto
    the input by sequence, and the result is written to MIRNA_SEQ_PATH/fname.

    Raises:
        ValueError: if no known organism keyword appears in *fname*.
            (ValueError subclasses Exception, so callers catching the old
            bare Exception still work.)
    """
    # Organism keyword in the file name -> miRBase species prefix.
    # NOTE(review): 'fly' -> 'aga' is the Anopheles gambiae prefix, not
    # Drosophila ('dme') — confirm this mapping is intentional.
    prefix_by_keyword = {
        "mouse": "mmu",
        "human": "hsa",
        "elegans": "cel",
        "cattle": "bta",
        "fly": "aga",
    }
    prefix = None
    for keyword, mirbase_prefix in prefix_by_keyword.items():
        if keyword in fname:
            prefix = mirbase_prefix
            break  # first match wins; no need to keep scanning
    if prefix is None:
        raise ValueError("unrecognized mirbase prefix")

    # Keep only the newest miRBase version for each distinct sequence.
    mirbase_df: DataFrame = pd.read_csv(MIRBASE_FILE).query("prefix==@prefix")
    mirbase_df.sort_values(by="version", ascending=False, inplace=True)
    mirbase_df.drop_duplicates("miRNA sequence", keep="first", inplace=True)

    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname
    # Renamed from 'd' so it no longer shadows the prefix dict above.
    df: DataFrame = read_csv(fin_full_path)
    join_df = df.merge(mirbase_df, how="left",
                       left_on="miRNA sequence", right_on="miRNA sequence")
    # Both frames carry a 'miRNA ID' column, so the merge suffixes them;
    # take the miRBase side ('_y') as the authoritative ID.
    df['miRNA ID'] = join_df['miRNA ID_y']
    to_csv(df, fout_full_path)
def get_site_from_extended_site(fin: str, fout: str):
    """Compute chimera start/end coordinates and extract the site sequence.

    Reads *fin*, locates the extended mRNA sequence inside the region
    sequence, derives 1-based start/end coordinates, extracts the site
    (with SITE_EXTRA_CHARS flanking characters) and writes *fout*.
    """

    def calc_chimera_start(seq: str, subseq: str) -> int:
        # 1-based start of subseq within seq; -1 when absent or when seq
        # is not a string (e.g. NaN read from the CSV).
        try:
            pos = seq.find(subseq)  # scan once instead of twice
        except AttributeError:
            return -1
        return -1 if pos == -1 else pos + 1

    def calc_chimera_end(chimera_start: int, seq_extended: str) -> int:
        # Propagate the "not found" sentinel.
        if chimera_start == -1:
            return -1
        # Trim the HUMAN_SITE_EXTENDED_LEN extension off the extended length.
        return chimera_start + len(seq_extended) - 1 - HUMAN_SITE_EXTENDED_LEN

    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))
    df["chimera_start"] = df.apply(
        func=get_wrapper(calc_chimera_start,
                         'region sequence', 'mRNA_seq_extended'),
        axis=1)
    df["chimera_end"] = df.apply(
        func=get_wrapper(calc_chimera_end,
                         'chimera_start', 'mRNA_seq_extended'),
        axis=1)
    df["site"] = df.apply(
        func=get_wrapper(get_subsequence_by_coordinates,
                         "region sequence", "chimera_start", "chimera_end",
                         extra_chars=SITE_EXTRA_CHARS),
        axis=1)
    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def human_mapping_merge_by_name(fin: Path, fout: Path):
    """Join human biomart region sequences onto the interactions in *fin*.

    The join key is the first two underscore-separated tokens of 'mRNA ID'
    (pipe-joined) plus the region; the biomart 'sequence' column is renamed
    to 'region sequence' and each join is verified by checking that the
    extended mRNA sequence occurs inside it ('join_ok').
    """

    def verify_sequence(seq: str, subseq: str) -> bool:
        # True when subseq occurs in seq; False for missing/non-string values.
        try:
            return seq.find(subseq) != -1
        except AttributeError:
            return False

    in_df: DataFrame = read_csv(fin)
    # "tok1|tok2" — first two underscore-separated tokens of the mRNA ID.
    in_df["join_key"] = in_df["mRNA ID"].apply(
        lambda x: "|".join(x.split("_")[0:2]))
    mRNA_df = concatenate_biomart_df("human")
    in_df = in_df.merge(mRNA_df, how="left",
                        left_on=["region", "join_key"],
                        right_on=["region", "ID"])
    in_df = in_df.rename(columns={"sequence": "region sequence"})
    # BUG FIX: the selection previously listed 'region_sequence' (underscore),
    # a column that does not exist after the rename above, which raises
    # KeyError. Select the renamed 'region sequence' column instead — the
    # same column verify_sequence reads below.
    in_df = in_df[[
        'key', 'paper name', 'miRNA ID', 'miRNA sequence', 'mRNA ID',
        'mRNA_seq_extended', 'region', 'region sequence', 'mRNA_start',
        'mRNA_end_extended'
    ]]
    in_df["join_ok"] = in_df.apply(
        func=get_wrapper(verify_sequence,
                         'region sequence', 'mRNA_seq_extended'),
        axis=1)
    to_csv(in_df, fout)
def get_model_output_10_classes(filename):
    """Extract model activations for the 10-class audio set.

    Loads the frozen TF graph in *filename*, runs every qualifying .wav
    under Constants.AUDIO_DATA_FOLDER through the 'Minimum_3' op, then
    pads/truncates, PCA-reduces and flattens the activations before
    writing them (with labels and file names) to audio10classes.csv.
    """
    with tf.gfile.GFile(filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        # Import with an empty name scope so tensor names stay unchanged.
        tf.import_graph_def(graph_def, name='')
    with tf.Session(graph=graph) as sess:
        output_op = graph.get_operation_by_name('Minimum_3').outputs[0]
        xs = []
        ys = []
        filename_list = []
        # Loop variable renamed from 'filename' so it no longer shadows
        # the graph-file parameter.
        for idx, wav_path in enumerate(
                glob.glob(os.path.join(Constants.AUDIO_DATA_FOLDER,
                                       '*wav*', '*.wav'))):
            name = wav_path.split('/')[-2:]
            # Files numbered below 1000 are not part of the 10-class set.
            # NOTE(review): str.strip('.wav') strips a *character set*, not
            # the suffix — safe only while basenames are purely numeric.
            if int(name[-1].strip('.wav')) < 1000:
                continue
            y = name[-1].strip('.wav')
            y = str(int(y) - 1000)  # shift labels down to start at 0
            name = '/'.join(name)
            name = name.replace('.wav', '')
            filename_list.append(name)
            if idx % 50 == 0:
                print(name)  # progress indicator
            fs, audio = wav.read(wav_path)
            x = audioToInputVector(audio, fs, N_FEATURES, N_CONTEXT)
            out = sess.run(output_op, {'input_node:0': [x],
                                       'input_lengths:0': [len(x)]})
            xs.append(out)
            ys.append(y)
        xs = fix_seq_length(xs, length=20)
        xs = apply_pca(xs, n_components=25)
        xs = np.array([np.ravel(x) for x in xs])
        to_csv(xs, ys,
               os.path.join(Constants.DATA_FOLDER, 'audio10classes.csv'),
               filename_list=filename_list)
def feature_extraction(fin: str, fout: str):
    """Compute duplex features for the valid rows of *fin* and write the
    left-joined result to *fout*.

    Rows failing the validity filter keep NaN in the feature columns.
    """
    interactions: DataFrame = read_csv(Path(fin))
    # NOTE(review): duplex_valid is compared to the *string* 'True' —
    # presumably the column round-trips through CSV as text; confirm.
    valid_rows = interactions.query("valid_row & duplex_valid=='True'")
    features = df_feature_extractor(valid_rows)
    merged = pd.merge(left=interactions, right=features,
                      left_index=True, right_index=True, how='left')
    to_csv(merged, Path(fout))
def fast_blast_file(fin: Path, fout: Path, db_title: str):
    """Cheap blast substitute: look each row's 'site' up directly in the
    <db_title> biomart CSV and record the result in 'blast sequence'."""
    logger.info(f"fast blast file {fin} against {db_title}")
    interactions: DataFrame = read_csv(fin)
    seq_file = BIOMART_DATA_PATH / f"{db_title}.csv"
    db_df = pd.read_csv(seq_file)
    lookup = partial(df_contains, df=db_df)
    interactions["blast sequence"] = interactions.apply(
        func=get_wrapper(lookup, "site"), axis=1)
    to_csv(interactions, fout)
def rna_insertion(fin_full_path: Path, fout_full_path: Path,
                  rna_df: DataFrame):
    """Left-join the mRNA sequences in *rna_df* onto the interactions file
    by 'mRNA ID' and write the result.

    validate="many_to_one" asserts each mRNA ID appears at most once in
    rna_df.
    """
    logger.info(f"Insert rna sequence to {fin_full_path}")
    interactions: DataFrame = read_csv(fin_full_path)
    merged = interactions.merge(rna_df, how="left",
                                left_on="mRNA ID", right_on="mRNA ID",
                                validate="many_to_one")
    to_csv(merged, fout_full_path)
    logger.info(f"Finish the rna sequence insertion to {fin_full_path}")
def insert_site_by_coordinates(fin: str, fout: str):
    """Cut the binding site out of the 'mRNA sequence' column using the
    chimera coordinates already in the file, plus SITE_EXTRA_CHARS flanks."""
    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))
    site_extractor = get_wrapper(
        get_subsequence_by_coordinates,
        "mRNA sequence", "chimera_start", "chimera_end",
        extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=site_extractor, axis=1)
    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def blast_file(fin: Path, fout: Path, db_title: str):
    """Run blastn on every row's 'site' against <db_title> and append the
    resulting columns to the input frame before writing *fout*."""
    logger.info(f"blast file {fin} against {db_title}")
    interactions: DataFrame = read_csv(fin)
    blast_runner = get_wrapper(run_blastn, "site", db_title=db_title)
    blastn_df: DataFrame = interactions.apply(func=blast_runner, axis=1)
    combined = pd.concat([interactions, blastn_df], axis=1)
    to_csv(combined, fout)
def df_col_rename(fname: Path):
    """Normalize column names in *fname* in place and add empty blast
    bookkeeping columns.

    'region_sequence' is renamed to 'sequence', then NaN placeholder
    columns are inserted at the front (final order: coverage, identity,
    region count). The file is rewritten at the same path.
    """
    df: DataFrame = read_csv(fname)
    df.rename(columns={'region_sequence': 'sequence'}, inplace=True)
    # Use the module logger instead of a leftover debug print.
    logger.info(f"columns after rename: {list(df.columns)}")
    # Placeholders filled by later blast steps.
    df.insert(0, 'region count', np.nan)
    df.insert(0, 'identity', np.nan)
    df.insert(0, 'coverage', np.nan)
    to_csv(df, fname)
def gambiae_run(fin: str, fout: str):
    """Build the Anopheles gambiae dataset: attach region information and
    convert chimera coordinates to region-relative start/end.

    start/end = chimera coordinate minus the offset of the region sequence
    inside the full mRNA sequence.
    """
    df: DataFrame = add_gambiae_region_information(Path(fin))
    df = insert_gambiae_region(df)
    df = insert_gambiae_region_sequence(df)

    # Offset of the region within the mRNA, computed ONCE per row — the
    # original scanned the mRNA twice per row (once for start, once for end).
    offsets = df.apply(
        lambda row: row['mRNA sequence'].find(row['region_sequence']),
        axis=1)
    df["start"] = df["chimera_start"] - offsets
    df["end"] = df["chimera_end"] - offsets

    # BUG FIX: the target column name was ',Gene_ID' — stray leading comma.
    df.rename(columns={"TRANSCRIPT_ID": "Gene_ID"}, inplace=True)
    # Drop the raw gambiae-information columns from the output.
    cols = [c for c in df.columns if c not in GAMBIAE_INFORMATION_USECOLS]
    to_csv(df[cols], Path(fout))
def duplex(method: str, fin: str, fout: str):
    """Fold miRNA/site duplexes for every valid row of *fin* with the
    requested backend and write the augmented frame to *fout*.

    Invalid rows keep NaN duplex columns; 'duplex_method' records which
    backend was used.
    """
    duplex_cls: Duplex = DUPLEX_DICT[method]
    logger.info(f"{method} do_duplex to {fin}")
    interactions: DataFrame = read_csv(Path(fin))
    folder = get_wrapper(do_duplex, "miRNA sequence", "site", cls=duplex_cls)
    # Only rows flagged valid_row are folded.
    duplex_df = interactions.query("valid_row").apply(func=folder, axis=1)
    merged = pd.merge(left=interactions, right=duplex_df,
                      left_index=True, right_index=True, how='left')
    merged["duplex_method"] = method
    to_csv(merged, Path(fout))
def insert_site_from_chromosome(fin: str, fout: str, chr_dir: str):
    """Extract each row's site directly from the chromosome files in
    *chr_dir* using chr/start/end/strand, upper-cased."""
    logger.info(f"Insert site from chromosome to {fin}")
    df: DataFrame = read_csv(Path(fin))
    extractor = get_wrapper(extract_seq_from_chromosome,
                            'chr', 'start', 'end', 'strand',
                            directory=Path(chr_dir))
    df["site"] = df.apply(func=extractor, axis=1)
    # Chromosome files may contain lower-case (soft-masked) bases.
    df["site"] = df["site"].apply(lambda seq: seq.upper())
    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def mirna_seq_insertion(fname: str):
    """Attach the miRBase 'miRNA sequence' column to READ_PATH/<fname> via a
    left join on 'miRNA ID', replacing any pre-existing sequence column, and
    write the result to MIRNA_SEQ_PATH/<fname>."""
    logger.info(f"Insert mirna sequence to {fname}")
    src = READ_PATH / fname
    dst = MIRNA_SEQ_PATH / fname
    interactions: DataFrame = read_csv(src)
    # Drop any stale sequence column; the join re-creates it from miRBase.
    interactions.drop(columns=["miRNA sequence"], inplace=True,
                      errors='ignore')
    mirbase_df: DataFrame = pd.read_csv(
        MIRBASE_FILE, usecols=["miRNA ID", "miRNA sequence"])
    merged = interactions.merge(mirbase_df, how="left",
                                left_on="miRNA ID", right_on="miRNA ID")
    to_csv(merged, dst)
    logger.info(f"Finish the mirna sequence insertion to {fname}")
def finalize(fin: str, fout: str):
    """Final normalization pass: extract the flanked site, widen start/end
    by SITE_EXTRA_CHARS, convert DNA to RNA (T->U), add the seed family and
    a validity flag, then project onto NORMALIZATION_COLUMNS.
    """
    df: DataFrame = read_csv(Path(fin))

    logger.info("extract the site")
    df["site"] = df[df["sequence"].notnull()].apply(
        func=get_wrapper(get_subsequence_by_coordinates_no_exception,
                         "sequence", "start", "end",
                         extra_chars=SITE_EXTRA_CHARS),
        axis=1)

    # Widen the coordinates by the flank size; start is clamped at 1
    # (coordinates are 1-based).
    # (Removed: unused inner helper 'eta' that duplicated this shift with
    # a debugging print and a blank re-raise — dead code.)
    df["start"] = df[df["start"].notnull()]["start"].apply(
        lambda x: int(x) - SITE_EXTRA_CHARS if int(x) > SITE_EXTRA_CHARS else 1)
    df["end"] = df[df["end"].notnull()]["end"].apply(
        lambda x: int(x) + SITE_EXTRA_CHARS)

    logger.info("replace T with U")
    seq_cols = ['miRNA sequence', 'site', 'sequence']
    df[seq_cols] = df[seq_cols].replace(to_replace='T', value='U', regex=True)

    logger.info("Add seed family")
    df["seed_family"] = df['miRNA sequence'].apply(extract_seed_family)

    logger.info("Add valid/invalid flag")
    # A row is valid only when none of these problems is present.
    invalid_conditions = [
        pd.isna(df["miRNA sequence"]),
        pd.isna(df["site"]),
        df["miRNA sequence"].str.contains('X'),
        df["miRNA sequence"].str.contains('N'),
        df["site"].str.contains("N"),
        df["site"].str.contains("Error"),
        df["sequence"].str.contains('N'),
        df["sequence"].str.contains('X'),
        df["sequence"].str.contains("Error"),
        df["sequence"].str.contains("None"),
    ]
    df["valid_row"] = ~reduce((lambda x, y: x | y), invalid_conditions)

    df = df[NORMALIZATION_COLUMNS]
    to_csv(df, Path(fout))
def qclash_melanoma_mirna_seq_insertion(fname: str):
    """Resolve miRNA sequences for the qCLASH melanoma files, using the
    human (hsa) subset of miRBase together with the file's temporary
    'mirna_seq_tmp' column, and write to MIRNA_SEQ_PATH/<fname>."""
    logger.info(f"Insert mirna sequence to {fname}")
    src = READ_PATH / fname
    dst = MIRNA_SEQ_PATH / fname
    df: DataFrame = read_csv(src)
    mirbase_df: DataFrame = pd.read_csv(
        MIRBASE_FILE, usecols=["miRNA ID", "miRNA sequence", "prefix"])
    hsa_only = mirbase_df.query("prefix == 'hsa'")
    resolver = get_wrapper(qclash_mirna_func, 'miRNA ID', 'mirna_seq_tmp',
                           mirbase_hsa=hsa_only)
    df["miRNA sequence"] = df.apply(func=resolver, axis=1,
                                    result_type="expand")
    to_csv(df, dst)
    logger.info(f"Finish the mirna sequence insertion to {fname}")
def concat_blast_result(directory: Path, fname: str,
                        blast_prev_step_file: Path, fout: Path):
    """Merge the per-chunk blast result files for *fname* into one frame.

    Each key is annotated with its hit count ('region count'); interactions
    from the previous step that produced no blast hit are re-attached with
    region "None" and count 0, and the combined, key-sorted result is
    written to *fout* with the blast subject coordinates as start/end.
    """
    # All chunk files produced for this input: "<anything>fname_<i>.csv".
    blast_result_list = [
        read_blast_result_file(f) for f in directory.glob(f"*{fname}_*.csv")
    ]
    logger.info("Finish read the files. start to concatenate")
    blast_result_df = pd.concat(blast_result_list, axis=0, ignore_index=True)
    # Number of blast hits per interaction key.
    vc = blast_result_df["key"].value_counts()
    # NOTE(review): this merge relies on pandas suffixing the count column
    # as "key_y" (the value_counts Series is named "key"); in pandas >= 2.0
    # value_counts names its result "count", which would break this —
    # confirm against the pinned pandas version.
    blast_result_df["region count"] = \
        blast_result_df.merge(vc, how="left", left_on="key",
                              right_index=True)["key_y"]
    blast_result_inx = blast_result_df["key"].unique()
    # Interactions without any blast hit: region "None", zero count.
    all_interactions: DataFrame = read_csv(blast_prev_step_file)
    all_interactions["region"] = "None"
    all_interactions["region count"] = 0
    all_interactions.query("key not in @blast_result_inx", inplace=True)
    unite = pd.concat([blast_result_df, all_interactions], axis=0)
    unite.sort_values(by="key", ignore_index=True, inplace=True)
    # Replace the old coordinates with the blast subject coordinates.
    unite.drop(columns=["start", "end"], inplace=True, errors="ignore")
    unite = unite.rename(columns={"s.start": "start", "s.end": "end"})
    to_csv(unite, fout)
""" Train an auditive som, test it alongside the visual one """
# Checkpoint directories for the visual and auditory SOMs, plus the CSV of
# pre-extracted audio features.
somv_path = os.path.join(Constants.DATA_FOLDER, '10classes', 'visual_model')
somu_path = os.path.join(Constants.DATA_FOLDER, '10classes', 'audio_model')
audio_data_path = os.path.join(Constants.DATA_FOLDER, '10classes',
                               'audio_data.csv')

if __name__ == '__main__':
    xs, ys, filenames = from_csv_with_filenames(audio_data_path)
    vect_size = len(xs[0])
    # Scale every feature to [0, 1] before SOM training.
    xs = MinMaxScaler().fit_transform(xs)
    # 20x30 map over the audio feature space.
    audio_som = SOM(20, 30, vect_size, n_iterations=100,
                    checkpoint_dir=somu_path)
    # Labels are stored offset by 1000 — shift back before prototyping.
    proto = get_prototypes(xs, [int(y) - 1000 for y in ys])
    to_csv(proto.T, ys,
           os.path.join(Constants.DATA_FOLDER, '10classes',
                        'audio_prototypes.csv'))
    audio_som.train(xs)
    # Joint audio/visual training using both checkpoint directories.
    iterativeTraining(somv_path, somu_path)
def save(df: DataFrame, file_name: str):
    """Write *df* to READ_PATH/<file_name> using the project CSV writer."""
    to_csv(df, READ_PATH / file_name)