# --------------------- UNPACK COMMAND-LINE ARGUMENTS -------------------- #
binary = args.binary
n_jobs = args.jobs
balanced = args.balanced
bp = args.biological_process
mf = args.molecular_function
cc = args.cellular_component
permute = args.permute
scale = args.scale
folder = args.output_folder
file_suffix = args.output_file_suffix

# ----------------------------- SETUP ----------------------------------- #
# Timestamp used to make output directory names unique; replace every
# character that is unsafe in a path component with '-'.
date = str(datetime.now())
for unsafe in (" ", ":", "."):
    date = date.replace(unsafe, "-")

log = open("tmp/training_log.txt", "w")
dag = load_go_dag('data/gene_ontology.1_2.obo')

# Bail out early on an unsupported vectorizer choice.
if vectorizer_method not in ['count', 'tf-idf']:
    print('Vectorizer Method must select from: count | tf-idf')
    sys.exit(1)

# Results go either into a user-named, timestamped folder or into a
# freshly created temporary directory under results/.
if folder:
    direc = 'results/{}-{}'.format(folder, date)
    su_make_dir(direc)
else:
    direc = tempfile.mkdtemp(prefix='{}-{}-'.format(method, date), dir='results/')

# Feature selections / ontologies accumulate below based on the flags.
selection = []
ontologies = []
if pfam:
    selection.append('pfam')
def build_data_frame(ppi_file, obo_file, accession_to_feature_file, induce, fill_na, cache, n_jobs):
    """
    Loads each tsv file containing a feature set (such as float similarity
    scores or accessions) into a pandas dataframe and then attempts to create
    binary vector/bag of words representations of textual accession features.
    Finally combines each binary/numerical vector into a single feature vector
    along with its label.

    @param ppi_file: Tsv of interactions: p1, p2, label(s), cc/bp/mf sims.
    @param obo_file: Path to obo file.
    @param accession_to_feature_file: Path to accession-feature map stored in tsv.
    @param induce: True to induce GO terms.
    @param fill_na: Value to fill NA with. Best to use np.NaN.
    @param cache: File to save (pickle) the dataframe to.
    @param n_jobs: Number of parallel workers for per-line feature building.
    @return: DataFrame with one row per interaction.
    """
    print("Building dataframe...")
    dag = ontology.load_go_dag(obo_file)

    # Fix the column ordering up front: feature columns first, then one
    # binary column per label (for BR methods). list(...) is required on
    # Python 3, where dict.keys() is a view that cannot be concatenated.
    labels = get_labels_from_file('data/labels.tsv')
    feature_columns = Od({
        'uniprot': [], 'uniprot_a': [], 'uniprot_b': [],
        'go': [], 'go_cc': [], 'go_bp': [], 'go_mf': [],
        'induced_go': [], 'induced_go_cc': [], 'induced_go_bp': [],
        'induced_go_mf': [],
        'ipr': [], 'pfam': [], 'sim': [], 'label': []
    })
    columns = list(feature_columns.keys()) + labels

    def do_line(line):
        # Build a single-row DataFrame for one interaction line.
        xs = line.strip().split('\t')
        p1 = xs[0].strip()
        p2 = xs[1].strip()
        reaction_type = xs[2].strip()
        reaction_types = [x.lower() for x in reaction_type.split(',')]
        # Pre-computed semantic similarities. Maybe use resnik or something.
        cc_ss = float(xs[3].strip())
        bp_ss = float(xs[4].strip())
        mf_ss = float(xs[5].strip())
        terms = compute_features([p1, p2], induce, accession_to_feature_file, fill_na, dag)
        row = Od({
            'uniprot': [(p1, p2)],
            'uniprot_a': [p1],
            'uniprot_b': [p2],
            'go': [terms['go']],
            'go_cc': [terms['go_cc']],
            'go_bp': [terms['go_bp']],
            'go_mf': [terms['go_mf']],
            'induced_go': [terms['induced_go']],
            'induced_go_cc': [terms['induced_go_cc']],
            'induced_go_bp': [terms['induced_go_bp']],
            'induced_go_mf': [terms['induced_go_mf']],
            'ipr': [terms['ipr']],
            'pfam': [terms['pfam']],
            'sim': [csr_matrix([cc_ss, bp_ss, mf_ss])],
            'label': [reaction_type]
        })
        # Mark which labels are present in reaction_type.
        # Order of traversal is important here.
        for l in labels:
            row[l] = 1 if l.lower() in reaction_types else 0
        return pd.DataFrame(row, dtype='object', columns=columns)

    # Iterate through each ppi in the supplied file, in parallel.
    with open(ppi_file, 'r') as fp:
        fp.readline()  # assumes header exists for internal format
        try:
            df_rows = parallel_map(do_line, fp, n_jobs=n_jobs)
        except KeyboardInterrupt:
            sys.exit(0)

    df = pd.concat(df_rows, ignore_index=True).reset_index(drop=True)
    # Pickle writes binary data, so the cache file must be opened in 'wb';
    # the 'with' guarantees the handle is flushed and closed.
    with open(cache, 'wb') as cache_fp:
        pickle.dump(df, cache_fp)
    return df
def compute_ss(ppi_tuples):
    """
    Compute GO semantic-similarity scores for each protein pair by writing
    the pairs' namespace-separated GO annotations to a tsv, running the
    external `semantic_sim.r` Rscript over it, and parsing its tsv output.

    @param ppi_tuples: Iterable of (p1, p2) uniprot accession pairs.
    @return: List of (p1, p2, cc_ss, bp_ss, mf_ss) tuples; scores are the
             raw string fields produced by the R script.
    @raise RuntimeError: If the Rscript exits with a non-zero status.
    """
    # mkstemp (unlike the race-prone, deprecated mktemp) creates the files
    # securely; only the paths are needed, so close the fds immediately.
    fd_in, r_file_in = tempfile.mkstemp(suffix='.tsv', prefix='r_in_', dir='tmp')
    os.close(fd_in)
    fd_out, r_file_out = tempfile.mkstemp(suffix='.tsv', prefix='r_out_', dir='tmp')
    os.close(fd_out)

    dag = ontology.load_go_dag(OBO_FILE)
    feature_df = load_data_frame(ACCESSION_FEATURES_FILE, fill_na=np.NaN)

    def terms_in_namespace(go_terms, namespace):
        # Keep only the GO ids whose DAG node belongs to `namespace`.
        return set(t for t in go_terms
                   if ontology.id_to_node(t, dag).namespace == namespace)

    try:
        # Write the namespace-separated GO columns to the r_input_file.
        with open(r_file_in, 'w') as fp:
            fp.write("p1\tp2\tp1_go_cc\tp2_go_cc\tp1_go_bp\tp2_go_bp\tp1_go_mf\tp2_go_mf\n")
            for p1, p2 in ppi_tuples:
                p1_go = get_feature_for_accession(feature_df, p1, 'uniprot', 'go')
                p2_go = get_feature_for_accession(feature_df, p2, 'uniprot', 'go')
                fields = [p1, p2]
                # Column order must match the header: cc, bp, mf; p1 then p2.
                for ns in ('cellular_component', 'biological_process',
                           'molecular_function'):
                    fields.append(','.join(terms_in_namespace(p1_go, ns)))
                    fields.append(','.join(terms_in_namespace(p2_go, ns)))
                fp.write('\t'.join(fields) + '\n')

        # Run the R script, then collect its output from the tmp file.
        cmd = [
            'Rscript',
            'semantic_sim.r',
            '--file={}'.format(r_file_in),
            '--out={}'.format(r_file_out)
        ]
        retcode = subprocess.Popen(cmd).wait()
        if retcode != 0:
            raise RuntimeError(
                "semantic_sim.r exited with status {}".format(retcode))

        # Parse the r_output into a list of tuples.
        sims_tuple = []
        with open(r_file_out, 'r') as fp:
            for line in fp:
                p1, p2, cc_ss, bp_ss, mf_ss = line.strip().split('\t')
                sims_tuple.append((p1, p2, cc_ss, bp_ss, mf_ss))
    finally:
        # Always remove the scratch files, even when the R step fails.
        os.remove(r_file_in)
        os.remove(r_file_out)

    return sims_tuple
def depths(df, column):
    """
    Compute the mean and standard deviation of GO-term DAG depths for each
    row of `column`.

    @param df: DataFrame whose `column` holds comma-delimited accession
               strings (GO mixed with other accession types).
    @param column: Name of the column to read.
    @return: Tuple (mean_depths, std_depths) of lists, one value per row.
             Non-GO accessions in a row are ignored; a row with no GO terms
             yields NaN (numpy's result for an empty input).
    """
    dag = ontology.load_go_dag('data/gene_ontology.1_2.obo')
    mean_depths = []
    std_depths = []
    for value in df[column].values:
        # Compute each row's depth list once (the original computed it twice,
        # once for the mean and once for the std). The 'go' prefix test
        # filters out non-GO accessions such as ipr/pfam ids.
        row_depths = [dag[x].depth for x in value.split(',') if 'go' in x.lower()]
        mean_depths.append(np.mean(row_depths))
        std_depths.append(np.std(row_depths))
    # Return concrete lists (Python 3's map() would be a one-shot iterator).
    return mean_depths, std_depths