示例#1
0
from Bio import SeqIO
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
import csv
from featuresetup_module import transcript_info, transcript_info_dict
from sklearn.externals import joblib
from sklearn import preprocessing
import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
#from collections import Counter
from treeinterpreter import treeinterpreter as ti

# Load the three training sets used by this script.  Every loader call is
# given the FASTA file, the CPAT output (".cpat.txt") and the ".fa.tab" table
# for one set, and returns three values that are unpacked here as
# (<set>_info, <set>_dict, <set>_names).

# Human transcripts.
hsapiens_info, hsapiens_dict, hsapiens_names = transcript_info_dict(
    "../data/training_files/h_sapiens_random4500.fa",
    "../data/training_files/h_sapiens_random4500.cpat.txt",
    "../data/training_files/h_sapiens_random4500.fa.tab")
print("imported human info")

# Arabidopsis transcripts (no progress message is printed for this set).
atha_info, atha_dict, atha_names = transcript_info_dict(
    "../data/training_files/a_thaliana_random4500.fa",
    "../data/training_files/a_thaliana_random4500.cpat.txt",
    "../data/training_files/a_thaliana_random4500.fa.tab")

# The top-of-file import only brings in transcript_info and
# transcript_info_dict, so trans_info_dict_cc has to be imported before it can
# be called.
# NOTE(review): assumes trans_info_dict_cc lives in featuresetup_module, the
# only project module this script imports -- confirm.
from featuresetup_module import trans_info_dict_cc

# lncRNA transcripts.  The CPAT file name indicates it was produced with the
# human-trained model.  The fourth argument names a single transcript;
# presumably the loader treats that entry specially -- TODO confirm against
# trans_info_dict_cc.
lncRNA_info, lncRNA_dict, lncRNA_names = trans_info_dict_cc(
    "../data/training_files/all_lncRNA_nodup.fa",
    "../data/training_files/all_lncRNA_nodup.humantrained.cpat.txt",
    "../data/training_files/all_lncRNA_nodup.fa.tab",
    'ips1_arabidopsisthaliana_1')
print("imported lncRNA info")

wanted_keys = [
示例#2
0
from Bio import SeqIO
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
import csv
from featuresetup_module import transcript_info, transcript_info_dict
from sklearn.externals import joblib
from sklearn import preprocessing
import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
#from collections import Counter
from treeinterpreter import treeinterpreter as ti

# Training sets: human, mouse and rice.  For each set the three input files
# (FASTA, CPAT output, ".fa.tab" table) are listed first and then passed
# positionally, in that order, to transcript_info_dict.
_human_files = (
    "../data/training_files/h_sapiens_random3000.fa",
    "../data/training_files/h_sapiens_random3000.cpat.txt",
    "../data/training_files/h_sapiens_random3000.fa.tab",
)
hsapiens_info, hsapiens_dict, hsapiens_names = transcript_info_dict(
    *_human_files)
print("imported human info")

_mouse_files = (
    "../data/training_files/m_musculus_random1000.fa",
    "../data/training_files/m_musculus_random1000.cpat.txt",
    "../data/training_files/m_musculus_random1000.fa.tab",
)
mmus_info, mmus_dict, mmus_names = transcript_info_dict(*_mouse_files)
print("imported mouse info")

_rice_files = (
    "../data/training_files/o_sativa_random3000.fa",
    "../data/training_files/o_sativa_random3000.cpat.txt",
    "../data/training_files/o_sativa_random3000.fa.tab",
)
osat_info, osat_dict, osat_names = transcript_info_dict(*_rice_files)
print("imported rice info")
#lncRNA_info, lncRNA_dict, lncRNA_names = transcript_info_dict("fasta_files/subset_lncrna_V2.fa","fasta_files/subset_lncrna_v2.cpat.txt", "fasta_files/subset_lncrna_V2.fa.tab")
#lncRNA_info, lncRNA_dict, lncRNA_names = trans_info_dict_cc("fasta_files/all_lncRNA_nodup.fa","fasta_files/all_lncRNA_nodup.humantrained.cpat.txt", "fasta_files/all_lncRNA_nodup.fa.tab")
lncRNA_info, lncRNA_dict, lncRNA_names = trans_info_dict(
示例#3
0
# Load the persisted classifiers model2.pkl .. model8.pkl from
# <CURRENT_DIR>/../updated_gb_models.
#
# The directory is assembled component-wise with os.path.join rather than by
# string concatenation: concatenating '/../...' onto an empty CURRENT_DIR
# would produce a path rooted at the filesystem root, and a single-argument
# os.path.join call does not join anything.
# NOTE(review): joblib.load unpickles the file, so these model files must come
# from a trusted source.
_MODEL_DIR = os.path.join(CURRENT_DIR, '..', 'updated_gb_models')

clf2 = joblib.load(os.path.join(_MODEL_DIR, 'model2.pkl'))
clf3 = joblib.load(os.path.join(_MODEL_DIR, 'model3.pkl'))
clf4 = joblib.load(os.path.join(_MODEL_DIR, 'model4.pkl'))
clf5 = joblib.load(os.path.join(_MODEL_DIR, 'model5.pkl'))
clf6 = joblib.load(os.path.join(_MODEL_DIR, 'model6.pkl'))
clf7 = joblib.load(os.path.join(_MODEL_DIR, 'model7.pkl'))
clf8 = joblib.load(os.path.join(_MODEL_DIR, 'model8.pkl'))

# Read the features for the transcripts supplied on the command line: the
# transcript FASTA, the CPAT output and the file given as args.diam_out.
trans_info, trans_dict, trans_names = transcript_info_dict(
    args.trans_fasta,
    args.cpat_out,
    args.diam_out,
)

# Feature names in the exact order they must appear for every transcript.
key_order = [
    "align_perc_len",
    "align_perc_ORF",
    "align_length",
    "ORF",
    "length",
    "GC",
    "fickett",
    "hexamer",
    "identity",
]

# Per-transcript feature mapping restricted to key_order and inserted in that
# order; only transcripts listed in trans_names are included.
trans_dict_order = {
    name: {key: trans_dict[name][key] for key in key_order}
    for name in trans_names
}

# changed this from trans_dict to trans_dict_order
trans_array = np.array([[
    trans_dict_order[gene][feature]
示例#4
0
from Bio import SeqIO
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
import csv
from featuresetup_module import transcript_info, transcript_info_dict
from sklearn.externals import joblib
from sklearn import preprocessing
import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
#from collections import Counter
from treeinterpreter import treeinterpreter as ti

# Training sets: human, mouse and arabidopsis.  For each set the three input
# files (FASTA, CPAT output, ".fa.tab" table) are listed first and then passed
# positionally, in that order, to transcript_info_dict.
_human_files = (
    "../data/training_files/h_sapiens_random2000.fa",
    "../data/training_files/h_sapiens_random2000.cpat.txt",
    "../data/training_files/h_sapiens_random2000.fa.tab",
)
hsapiens_info, hsapiens_dict, hsapiens_names = transcript_info_dict(
    *_human_files)
print("imported human info")

_mouse_files = (
    "../data/training_files/m_musculus_random1000.fa",
    "../data/training_files/m_musculus_random1000.cpat.txt",
    "../data/training_files/m_musculus_random1000.fa.tab",
)
mmus_info, mmus_dict, mmus_names = transcript_info_dict(*_mouse_files)
print("imported mouse info")

_arabidopsis_files = (
    "../data/training_files/a_thaliana_random3000.fa",
    "../data/training_files/a_thaliana_random3000.cpat.txt",
    "../data/training_files/a_thaliana_random3000.fa.tab",
)
atha_info, atha_dict, atha_names = transcript_info_dict(*_arabidopsis_files)
print("imported arabidopsis info")
lncRNA_info, lncRNA_dict, lncRNA_names = transcript_info_dict(
示例#5
0
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
import csv
from featuresetup_module import transcript_info, transcript_info_dict
from sklearn.externals import joblib
from sklearn import preprocessing
import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
#from collections import Counter
from treeinterpreter import treeinterpreter as ti
import featuresetup_module

# Training sets: human, mouse and rice.  For each set the three input files
# (FASTA, CPAT output, ".fa.tab" table) are listed first and then passed
# positionally, in that order, to transcript_info_dict.
_human_files = (
    "../data/training_files/h_sapiens_random3000.fa",
    "../data/training_files/h_sapiens_random3000.cpat.txt",
    "../data/training_files/h_sapiens_random3000.fa.tab",
)
hsapiens_info, hsapiens_dict, hsapiens_names = transcript_info_dict(
    *_human_files)
print("imported human info")

_mouse_files = (
    "../data/training_files/m_musculus_random1000.fa",
    "../data/training_files/m_musculus_random1000.cpat.txt",
    "../data/training_files/m_musculus_random1000.fa.tab",
)
mmus_info, mmus_dict, mmus_names = transcript_info_dict(*_mouse_files)
print("imported mouse info")

_rice_files = (
    "../data/training_files/o_sativa_random3000.fa",
    "../data/training_files/o_sativa_random3000.cpat.txt",
    "../data/training_files/o_sativa_random3000.fa.tab",
)
osat_info, osat_dict, osat_names = transcript_info_dict(*_rice_files)
print("imported rice info")
#lncRNA_info, lncRNA_dict, lncRNA_names = transcript_info_dict("fasta_files/subset_lncrna_V2.fa","fasta_files/subset_lncrna_v2.cpat.txt", "fasta_files/subset_lncrna_V2.fa.tab")
#lncRNA_info, lncRNA_dict, lncRNA_names = trans_info_dict_cc("fasta_files/all_lncRNA_nodup.fa","fasta_files/all_lncRNA_nodup.humantrained.cpat.txt", "fasta_files/all_lncRNA_nodup.fa.tab")
lncRNA_info, lncRNA_dict, lncRNA_names = transcript_info_dict(
示例#6
0
from Bio import SeqIO
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
import csv
from featuresetup_module import transcript_info, transcript_info_dict
from sklearn.externals import joblib
from sklearn import preprocessing
import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
#from collections import Counter
from treeinterpreter import treeinterpreter as ti

# Load the three training sets used by this script.  Every loader call is
# given the FASTA file, the CPAT output (".cpat.txt") and the ".tab" table for
# one set, and returns three values that are unpacked here as
# (<set>_info, <set>_dict, <set>_names).

# Human transcripts (files from the "old_files" directory).
hsapiens_info, hsapiens_dict, hsapiens_names = transcript_info_dict(
    "../data/training_files/old_files/hsapiens_random_3000.fasta",
    "../data/training_files/old_files/hsapiens_random_3000.fasta.cpat.txt",
    "../data/training_files/old_files/hsapiens_random_3000.fasta.tab")
print("imported human info")

# Arabidopsis transcripts (no progress message is printed for this set).
# NOTE(review): the CPAT file here is "<stem>.cpat.txt" while the human set
# uses "<stem>.fasta.cpat.txt" -- confirm the file name on disk.
atha_info, atha_dict, atha_names = transcript_info_dict(
    "../data/training_files/old_files/arabidopsis_random_3000.fasta",
    "../data/training_files/old_files/arabidopsis_random_3000.cpat.txt",
    "../data/training_files/old_files/arabidopsis_random_3000.fasta.tab")

# The top-of-file import only brings in transcript_info and
# transcript_info_dict, so trans_info_dict_cc has to be imported before it can
# be called.
# NOTE(review): assumes trans_info_dict_cc lives in featuresetup_module, the
# only project module this script imports -- confirm.
from featuresetup_module import trans_info_dict_cc

# lncRNA transcripts.  The CPAT file name indicates it was produced with the
# human-trained model.  The fourth argument names a single transcript;
# presumably the loader treats that entry specially -- TODO confirm against
# trans_info_dict_cc.
lncRNA_info, lncRNA_dict, lncRNA_names = trans_info_dict_cc(
    "../data/training_files/all_lncRNA_nodup.fa",
    "../data/training_files/all_lncRNA_nodup.humantrained.cpat.txt",
    "../data/training_files/all_lncRNA_nodup.fa.tab",
    'ips1_arabidopsisthaliana_1')
print("imported lncRNA info")

wanted_keys = [
示例#7
0
from Bio import SeqIO
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
import csv
from featuresetup_module import transcript_info, transcript_info_dict
from sklearn.externals import joblib
from sklearn import preprocessing
import datetime
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
#from collections import Counter
from treeinterpreter import treeinterpreter as ti

# Training sets: human, arabidopsis and lncRNA.  For each set the three input
# files (FASTA, CPAT output, ".tab" table) are listed first and then passed
# positionally, in that order, to transcript_info_dict.
_human_files = (
    "../data/training_files/hsapiens_random_3000.fasta",
    "../data/training_files/hsapiens_random_3000.fasta.cpat.txt",
    "../data/training_files/hsapiens_random_3000.fasta.tab",
)
hsapiens_info, hsapiens_dict, hsapiens_names = transcript_info_dict(
    *_human_files)
print("imported human info")

# NOTE(review): the CPAT file here is "<stem>.cpat.txt" while the human set
# uses "<stem>.fasta.cpat.txt" -- confirm the file name on disk.
_arabidopsis_files = (
    "../data/training_files/arabidopsis_random_3000.fasta",
    "../data/training_files/arabidopsis_random_3000.cpat.txt",
    "../data/training_files/arabidopsis_random_3000.fasta.tab",
)
atha_info, atha_dict, atha_names = transcript_info_dict(*_arabidopsis_files)
print("imported arabidopsis info")

# The CPAT file name indicates it was produced with the human-trained model.
_lncrna_files = (
    "../data/training_files/all_lncRNA_nodup.fa",
    "../data/training_files/all_lncRNA_nodup.humantrained.cpat.txt",
    "../data/training_files/all_lncRNA_nodup.fa.tab",
)
lncRNA_info, lncRNA_dict, lncRNA_names = transcript_info_dict(*_lncrna_files)
print("imported lncRNA info")

wanted_keys = [