def create_amylpred_data(n):
    """Create the amylpred n-peptide training set file.

    Builds ``data/temp/amylpred<n>set.txt``: labeled positive/negative
    peptides followed by their computed feature values. A temporary file
    with *all* features is written first so that optimal-feature selection
    can run on it; the final file keeps only the selected feature ids.

    Skips all work when the final dataset file already exists.
    """
    if os.path.exists("data/temp/amylpred" + str(n) + "set.txt"):
        print("Using existing data.")
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()
    create_amylpred_npeptide_data(n)
    # Label the sequences: "1" for positives, "0" for negatives.
    with open("data/temp/amylpred" + str(n) + "peptides.txt") as fp:
        data = [line.rstrip() + " 1" for line in fp]  # Positive data
    with open("data/temp/neg-" + str(n) + "peptides.txt") as fn:
        data.extend(line.rstrip() + " 0" for line in fn)  # Negative data
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)
    # Creating a dataset with all features. The `with` block guarantees the
    # file is flushed and closed before feature selection reads it.
    with open("data/temp/temp_amylpred" + str(n) + "set.txt", "w") as temp_set:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0]))
            temp_set.write(entry + " " + seq_features + "\n")
    # Run feature selection once; it caches its result as a .csv file.
    if not os.path.exists("data/temp/amylpred_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(n, "amylpred")
    feature_dataframe = pd.read_csv("data/temp/amylpred_feature_dataframe.csv",
                                    index_col=0, header=0)
    feature_ids = list(feature_dataframe["id"])
    # Append n extra consecutive ids (presumably the per-residue features
    # that follow the selected global ones — TODO confirm against
    # compute_features).
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))
    # Compute the selected features for each sequence and write the final set.
    with open("data/temp/amylpred" + str(n) + "set.txt", "w") as out:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0], feature_ids))
            out.write(entry + " " + seq_features + "\n")
    print("The amylprednset.txt has been created.")
def create_zipper_data(n):
    """Create the zipper hexapeptide training set file.

    Reads ``data/test/zipper_dataset.txt`` (lines starting with "+" are
    positives, everything else negative), shuffles, writes a temporary file
    with all features for optimal-feature selection, then writes
    ``data/temp/zipper_hexpepset.txt`` with only the selected features.

    Skips all work when the final dataset file already exists.
    """
    if os.path.exists("data/temp/zipper_hexpepset.txt"):
        print("Using existing data.")
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()
    data = []
    with open("data/test/zipper_dataset.txt") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Guard: a blank line would otherwise crash on stripped[0].
                continue
            if stripped[0] == "+":
                data.append(line.split()[1] + " 1")  # Positive data
            else:
                data.append(line.split()[1] + " 0")  # Negative data
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)
    # Creating a dataset with all features. The `with` block guarantees the
    # file is flushed and closed before feature selection reads it.
    with open("data/temp/temp_zipper_hexpepset.txt", "w") as temp_set:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0]))
            temp_set.write(entry + " " + seq_features + "\n")
    # Run feature selection once; it caches its result as a .csv file.
    # NOTE(review): the hard-coded 6 (hexapeptides) is kept as-is even though
    # the function takes n — confirm whether n was intended here.
    if not os.path.exists("data/temp/zipper_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(6, "zipper")
    feature_dataframe = pd.read_csv("data/temp/zipper_feature_dataframe.csv",
                                    index_col=0, header=0)
    feature_ids = list(feature_dataframe["id"])
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))
    # Compute the selected features for each sequence and write the final set.
    with open("data/temp/zipper_hexpepset.txt", "w") as out:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0], feature_ids))
            out.write(entry + " " + seq_features + "\n")
    print("The zipper_hexpepset.txt has been created.")
def create_amylpred_data(n):
    """Create the amylpred hexapeptide training set file.

    NOTE(review): this is a second definition of ``create_amylpred_data``
    in the same module and silently shadows the earlier one — confirm
    which definition callers actually need and rename one of them.

    Reads ``data/test/amylpred_dataset.txt`` (lines starting with "+" are
    positives, everything else negative), shuffles, writes a temporary file
    with all features for optimal-feature selection, then writes
    ``data/temp/amylpred_hexpepset.txt`` with only the selected features.
    """
    if os.path.exists("data/temp/amylpred_hexpepset.txt"):
        print("Using existing data.")
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()
    data = []
    with open("data/test/amylpred_dataset.txt") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Guard: a blank line would otherwise crash on stripped[0].
                continue
            if stripped[0] == "+":
                data.append(line.split()[1] + " 1")  # Positive data
            else:
                data.append(line.split()[1] + " 0")  # Negative data
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)
    # Creating a dataset with all features. The `with` block guarantees the
    # file is flushed and closed before feature selection reads it.
    with open("data/temp/temp_amylpred_hexpepset.txt", "w") as temp_set:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0]))
            temp_set.write(entry + " " + seq_features + "\n")
    # Run feature selection once; it caches its result as a .csv file.
    if not os.path.exists("data/temp/amylpred_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(6, "amylpred")
    feature_dataframe = pd.read_csv("data/temp/amylpred_feature_dataframe.csv",
                                    index_col=0, header=0)
    feature_ids = list(feature_dataframe["id"])
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))
    # Compute the selected features for each sequence and write the final set.
    with open("data/temp/amylpred_hexpepset.txt", "w") as out:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0], feature_ids))
            out.write(entry + " " + seq_features + "\n")
    print("The amylpred_hexpepset.txt has been created.")
def create_amylnset(n):
    """Create the amyl n-peptide training set file.

    Builds ``data/temp/amyl<n>set.txt``: labeled positive/negative peptides
    followed by their computed feature values. A temporary file with *all*
    features is written first so that optimal-feature selection can run on
    it; the final file keeps only the selected feature ids.

    Skips all work when the final dataset file already exists.
    """
    if os.path.exists("data/temp/amyl" + str(n) + "set.txt"):
        return
    if not os.path.exists("data/temp/amino_acid_index.txt"):
        create_aaindex()
    create_npeptide_data(n)
    # Label the sequences: "1" for positives, "0" for negatives.
    with open("data/temp/" + str(n) + "peptides.txt") as fp:
        data = [line.rstrip() + " 1" for line in fp]  # Positive data
    with open("data/temp/neg-" + str(n) + "peptides.txt") as fn:
        data.extend(line.rstrip() + " 0" for line in fn)  # Negative data
    # Shuffle the data randomly so that we can do cross-validation
    random.shuffle(data)
    # Creating dataset with all features. The `with` block guarantees the
    # file is flushed and closed before feature selection reads it.
    with open("data/temp/temp_amyl" + str(n) + "set.txt", "w") as temp_set:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0]))
            temp_set.write(entry + " " + seq_features + "\n")
    # Create the .csv file of the sorted scores of features if it does not exist
    if not os.path.exists("data/temp/amylnset_feature_dataframe.csv"):
        import optimal_feature_selection as ofs
        ofs.select_optimal_features(6, "amylnset")
    feature_dataframe = pd.read_csv("data/temp/amylnset_feature_dataframe.csv",
                                    index_col=0, header=0)
    feature_ids = list(feature_dataframe["id"])
    feature_ids.extend(range(len(feature_ids), len(feature_ids) + n))
    # Creating dataset with optimal features. Using `with` fixes the original
    # bug where this handle was never closed (buffered rows could be lost).
    with open("data/temp/amyl" + str(n) + "set.txt", "w") as out:
        for entry in data:
            seq_features = " ".join(
                str(e) for e in compute_features(entry.split()[0], feature_ids))
            out.write(entry + " " + seq_features + "\n")