Exemplo n.º 1
0
def draw_one(filepath, output_path, score_col=3, label_col=4, proportion=None):
    """Rank (score, label) pairs from a file and write the top slice to disk.

    The score and label columns are read with ``single_column``, scores are
    passed through ``normalization``, and the pairs are sorted by label and
    then by score, both in descending order. The first
    ``proportion * sum(labels)`` pairs are kept, written to ``output_path``
    as ``"<score> <label>"`` lines, and returned.

    Args:
        filepath: Input file handed to ``single_column``.
        output_path: Destination text file (overwritten, UTF-8).
        score_col: Column index of the scores (parsed with ``float``).
        label_col: Column index of the labels (parsed with ``int``).
        proportion: Multiplier applied to the label sum to get the number of
            rows kept. When omitted, a module-level ``proportion`` is used if
            one exists (the behaviour of the original code), otherwise 1.

    Returns:
        Tuple ``(score, label)`` of two equally long lists.
    """
    if proportion is None:
        # The original body read ``proportion`` as an undeclared global and
        # raised NameError without it; keep honouring that global if present.
        proportion = globals().get('proportion', 1)

    all_score = [float(v) for v in single_column(filepath, col_num=score_col)]
    all_label = [int(v) for v in single_column(filepath, col_num=label_col)]

    # Normalize the scores (semantics defined by the external helper).
    all_score = normalization(all_score)

    # Sum of the labels; equals the number of positives for 0/1 labels.
    long_ = sum(all_label)

    # Sort by label first, then by score for equal labels (descending).
    ranked = sorted(zip(all_score, all_label),
                    key=lambda pair: (pair[1], pair[0]),
                    reverse=True)

    # int() keeps the slice bound valid when ``proportion`` is a float.
    top = ranked[:int(proportion * long_)]
    score = [s for s, _ in top]
    label = [lab for _, lab in top]

    with open(output_path, 'w', encoding='utf-8') as output:
        output.writelines('{} {}\n'.format(s, lab) for s, lab in top)

    return score, label
Exemplo n.º 2
0
def sample(filepath_pos, filepath_neg, seed, proportion=1):
    """Randomly sub-sample negatives to ``proportion`` times the positives.

    Columns 0, 1 and 2 of both files are read with ``single_column``
    (``removetitle=False``); the code below treats them as name, "max" and
    "ave" values. ``len(positives) * proportion`` negative rows are drawn
    without replacement after seeding ``random`` with ``seed``.

    Args:
        filepath_pos: File holding the positive samples.
        filepath_neg: File holding the negative samples.
        seed: Seed passed to ``random.seed`` for reproducibility.
        proportion: Number of negatives drawn per positive.

    Returns:
        Tuple ``(neg_name_sample, labels, ave_values)``: the sampled negative
        names, a label list (1 per positive followed by 0 per sampled
        negative), and the positive "ave" column extended with the sampled
        negatives' "ave" values.

    Raises:
        ValueError: From ``random.sample`` when more negatives are requested
            than are available.
    """
    random.seed(seed)
    pos_name = single_column(filepath_pos, 0, removetitle=False)
    pos_max = single_column(filepath_pos, 1, removetitle=False)
    pos_ave = single_column(filepath_pos, 2, removetitle=False)

    neg_name = single_column(filepath_neg, 0, removetitle=False)
    neg_max = single_column(filepath_neg, 1, removetitle=False)
    neg_ave = single_column(filepath_neg, 2, removetitle=False)

    # int() so a float proportion (e.g. 2.0) is accepted by random.sample.
    neg_num = int(len(pos_name) * proportion)

    # Sample row positions rather than names: looking names up again with
    # list.index() was quadratic and always hit the first occurrence, pairing
    # duplicated names with the wrong max/ave values.
    # NOTE(review): CPython's sample() appears to choose positions from the
    # population size only, so the drawn names should match the old code for
    # a given seed - confirm if exact reproducibility matters.
    picked = random.sample(range(len(neg_name)), neg_num)
    neg_name_sample = [neg_name[i] for i in picked]
    neg_max_sample = [neg_max[i] for i in picked]
    neg_ave_sample = [neg_ave[i] for i in picked]

    labels = [1] * len(pos_name) + [0] * len(neg_name_sample)

    # NOTE(review): the merged "max" column is built but never returned.
    pos_max.extend(neg_max_sample)
    pos_ave.extend(neg_ave_sample)
    return neg_name_sample, labels, pos_ave
Exemplo n.º 3
0
def r_c(filepath):
    """Print a report comparing a matrix file's row titles with its column titles.

    The row titles are the first column as returned by
    ``single_column(filepath, 0, removetitle=True)``; the column titles are
    the whitespace-separated tokens of the first line, minus the first token.
    The function prints both lengths, the sizes of the union and intersection
    of the two title sets, and - if the titles stop matching position by
    position - where the first mismatch occurs.

    Args:
        filepath: Path to the UTF-8 text file holding the matrix.

    Returns:
        None. All results are printed.
    """
    col = single_column(filepath, 0, removetitle=True)
    print('第一列长度:', len(col))

    # Only the header line is needed, so do not read the whole file.
    with open(filepath, 'r', encoding='utf-8') as file:
        line_1 = file.readline().split()[1:]
    print('第一行长度: ', len(line_1))

    col_set, header_set = set(col), set(line_1)
    print("并集的个数", len(col_set | header_set))
    print('交集的个数', len(col_set & header_set))

    # Compare the titles position by position.
    yes = 0
    for v, k in enumerate(col):
        # A header shorter than the column counts as a mismatch instead of
        # raising IndexError.
        if v < len(line_1) and k == line_1[v]:
            yes += 1
        else:
            print('行和列从第:' + str(v + 1) + '个数开始不匹配')
            print('连续匹配的个数', yes)
            break
Exemplo n.º 4
0
from extract import single_column
"这个可以根据文件中的某一列数字,选择出TOP,相比于版本1,这里利用了数字出现的顺序,跑起来会更快。但是要注意几个参数"
# Script parameters and data loading for selecting the top entries of a
# numeric column. The selection loop itself follows this section.
top = 20  # how many top entries to keep
col = 2  # index (0-based) of the numeric column used for sorting
genenum = 116  # number of proteins matched between dictionary and file; used when writing the output
drugnum = 2953  # number of drugs, i.e. how many times each protein occurs; used as the loop bound
# File paths. Note that the input lives in the result folder rather than the
# data folder, because it is the output of an earlier processing step.
filepath = 'C:\\Users\\hw\\Desktop\\test\\list_only_cov.txt'
filepath_dic = 'H:\\Data\\dict\\Covs-protein_dict127.txt'  # COVID-19 protein dictionary
output_path = 'C:\\Users\\hw\\Desktop\\test\\list_top20.txt'  # where the result is written

# Mind the column arguments in the next two lines when adapting the script.
covid19_protein_dic = single_column(filepath_dic, 1, removetitle=True)
only_procol = single_column(filepath, 1, removetitle=False)

# Turn every line into a list of tokens and convert the column that will be
# compared from string to float up front.
contents = []
with open(filepath, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        content = line.strip('\n').split()
        a = float(content[col])
        content[col] = a
        contents.append(content)
print("------------数据转换为列表阶段结束---------------")
results = []
count = 0
for i in range(genenum):
    temps = []
    # 每个蛋白(temp)都有2953个
    while len(temps) < drugnum:
Exemplo n.º 5
0
from extract import single_column

# Split the rows of a list file into positives and negatives, depending on
# whether the name in column 0 appears in the positive file, and write the
# positive (name, score) pairs to disk.
filepath_list = 'C:\\Users\\hw\\Desktop\\list_only_morethan0.txt'
filepath_pos = 'C:\\Users\\hw\\Desktop\\positive.txt'

line_0 = single_column(filepath_list, 0, removetitle=False)
line_2 = single_column(filepath_list, 2, removetitle=False)
line_pos = single_column(filepath_pos, 0, removetitle=False)

# A set makes each membership test O(1) instead of scanning the whole
# positive list for every row (assumes the entries are hashable strings).
pos_lookup = set(line_pos)

pos_name, neg_name = [], []
pos_score, neg_score = [], []
for loc, drug in enumerate(line_0):
    if drug in pos_lookup:
        pos_name.append(drug)
        pos_score.append(line_2[loc])
    else:
        neg_name.append(drug)
        neg_score.append(line_2[loc])

# One "<name> <score>" line per positive entry.
with open('C:\\Users\\hw\\Desktop\\数据\\positive_data3.txt',
          'w',
          encoding='utf-8') as f:
    for i, j in zip(pos_name, pos_score):
        rowtext = '{} {}'.format(i, j)
        f.write(rowtext)
        f.write('\n')

with open('C:\\Users\\hw\\Desktop\\数据\\negative_data3.txt',
          'w',
          encoding='utf-8') as ll:
    for k, l in zip(neg_name, neg_score):
Exemplo n.º 6
0
from extract import single_column
#这是排序

# Count how many values of column 2 are >= 1 and how many fall in [0.5, 1),
# then show the 20 largest values.
filepath = 'H:\\PyCharm_Projects\\DataProcessing\\test_list.txt'

# Named ``score_column`` rather than ``list`` so the builtin is not shadowed.
score_column = single_column(filepath, 2, removetitle=False)

# Parse to float and sort in descending order.
nums = sorted((float(i) for i in score_column), reverse=True)

count_1 = sum(1 for num in nums if num >= 1)
count_05 = sum(1 for num in nums if 0.5 <= num < 1)

print(count_05)
print(count_1)
print(nums[0:20])
Exemplo n.º 7
0
from extract import single_column
# Sort the first column of a file and dump it, one entry per line.
filepath = 'C:\\Users\\hw\\Desktop\\drug_wrong.txt'

# Column 0 of the file, read without dropping a title row.
names = single_column(filepath, 0, removetitle=False)
names.sort()

with open('hahahaha.txt', 'w', encoding='utf-8') as out:
    out.writelines(name + '\n' for name in names)
Exemplo n.º 8
0
from extract import single_column
'''这是从数据中挑出是新冠蛋白的,需要修改的参数就是下面三行文件路径代码'''
# 下面第一行是新冠蛋白词典路径、第二行是数组路径、第三行是输出的文件名(文件会保存在代码所在文件夹)
# Paths: COVID-19 protein dictionary, the list file to filter, and the
# output file name.
filepath_covid19_dic = 'H:\\Data\\dict\\Covs-protein_dict127.txt'
filepath_list = 'C:\\Users\\hw\\Desktop\\test\\test_list.txt'
file_output = 'C:\\Users\\hw\\Desktop\\test\\list_only_cov.txt'

# Load the data.
covid19_dic = single_column(filepath_covid19_dic, 0, removetitle=True)
proteins = single_column(filepath_list, 1, removetitle=False)

# Built once so each membership test below is O(1) instead of a list scan
# (assumes the entries are hashable strings).
covid19_lookup = set(covid19_dic)

print(len(covid19_lookup))
print(len(set(proteins)))
# Record the row numbers whose protein is in the COVID-19 dictionary.
locs = []
for key, value in enumerate(proteins):
    if value in covid19_lookup:
        locs.append(key)
# Keep only the matching rows, split into tokens.
contents = []
with open(filepath_list, 'r', encoding='utf-8') as f:
    l_temp = f.readlines()
    for r in locs:
        temp = l_temp[r].strip('\n').split()
        contents.append(temp)
# Count how many of the kept rows have '1' in column 3.
cc_num = 0
for cc in contents:
    if cc[3] == '1':
        cc_num += 1
print('最后的结果中1的个数为:', cc_num)
Exemplo n.º 9
0
import numpy as np
from extract import single_column
from tomatrix import tomatrix
from check import check
'''这个主要是调用写好的函数(主要是tomatrix),把一个矩阵,根据字典,去除无用的行和列'''


#文件路径,file_1表示字典,file_2表示需要修改的矩阵
# Inputs: file_1 is the dictionary, file_2 the matrix to be filtered by it.
file_1 = 'H:\\Data\\data422\\data\\dict\\gene_dict.txt'
file_2 = 'H:\\Data\\data422\\data\\protein similarity\\protein_sequence_similarity.txt'
matrix_output = 'C:\\Users\\hw\\Desktop\\2\\Sim_prosequence.txt'

# Dictionary entries: first column of file_1 with the title row removed.
dict_names = single_column(file_1, 0, removetitle=True)

# Build the matrix restricted to the dictionary. The two flags ask tomatrix
# to drop the matrix file's first row and first column.
pure_mat = tomatrix(dict_names, file_2, removetitle=True, removefirstcol=True)

# Sanity-check the format of the resulting matrix.
check(pure_mat)

# Binarize a copy (negatives -> 0, positives -> 1) and sum it, which gives
# the number of strictly positive entries.
binary_mat = pure_mat.copy()
binary_mat[binary_mat < 0] = 0
binary_mat[binary_mat > 0] = 1
positive_total = np.sum(binary_mat)
print("非0个数", positive_total)

# Write the (non-binarized) matrix as text.
np.savetxt(matrix_output, pure_mat, fmt="%f")

print('end')
Exemplo n.º 10
0
from extract import single_column
import random

#参数的

# Report how many drug and target identifiers an association file contains,
# before and after removing duplicates.
file_path_rel = '/Users/huangwei/Downloads/drug_disease_association.txt'

drug_name_col = single_column(file_path_rel, col_num=0, removetitle=False)
target_name_col = single_column(file_path_rel, col_num=1, removetitle=False)

# Drug identifiers: the text before the first ':' of every entry.
temp_drug_col = [entry.split(':')[0] for entry in drug_name_col]  # with duplicates
drug_col = set(temp_drug_col)  # duplicates removed

print('没有去重,药物总共有:' + str(len(temp_drug_col)))
print('去重后, 药物总共有:' + str(len(drug_col)))
print('----------------------------------------------')

# Target identifiers, extracted the same way from column 1.
temp_target_col = [entry.split(':')[0] for entry in target_name_col]  # with duplicates
target_col = set(temp_target_col)  # duplicates removed

print('没有去重,靶点总共有:' + str(len(temp_target_col)))
print('去重后, 靶点总共有:' + str(len(target_col)))
print('----------------------------------------------')
Exemplo n.º 11
0
from extract import single_column
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Collect the vectors of the proteins listed in the COVID-19 dictionary.
file_dic = 'H:\\Data\\dict\\Covs-protein_dict127.txt'
file_protein = 'H:\\Data\\dict\\protein5390_dict.txt'
dic_temp = single_column(file_dic, 0, removetitle=True)
pro = single_column(file_protein, 0, removetitle=True)
# Remove duplicates from the dictionary. dict.fromkeys keeps the first-seen
# order, so the result is the same on every run (list(set(...)) was not).
dic = list(dict.fromkeys(dic_temp))

# First position of every protein name, matching what list.index() returned,
# but built once instead of scanning the list for every dictionary entry.
pro_index = {}
for pos, pro_name in enumerate(pro):
    pro_index.setdefault(pro_name, pos)

locs = []
for name in dic:
    if name in pro_index:
        locs.append(pro_index[name])

# Pick the lines of the vector file at the recorded positions.
# NOTE(review): this presumes line i of all_protein.txt belongs to entry i of
# the protein dictionary - confirm against how that file is produced.
data_pro = []
file_protein = 'C:\\Users\\hw\\Desktop\\all_protein.txt'
with open(file_protein, 'r', encoding='utf-8') as f:
    contents = f.readlines()
    for loc in locs:
        data_pro.append(contents[loc].split())

# Convert the string tokens of each selected vector to floats.
data_protein = []
for li in data_pro:
    temp = list(map(float, li))
    data_protein.append(temp)
Exemplo n.º 12
0
from extract import single_column
'''这是一个把交互矩阵转换成列表,第一列药物,第二列蛋白,第三列数字'''
# 需要注意两个位置的参数:1、第11行filepath的路径 2、39行输出的路径

# Set-up for converting an interaction matrix into a list: containers,
# input paths, dictionaries and the raw interaction rows.
results = []
contents = []
count = 0  # total number of interactions found, kept for verification
# Paths of the dictionaries and matrix data to load.
filepath_drug_dic = 'H:\\Data\\dict\\drug2953_dict.txt'
filepath_protein_dic = 'H:\\Data\\dict\\protein5390_dict.txt'
filepath = 'C:\\Users\\hw\\Desktop\\test\\Zscore.txt'
filepath_interaction = 'C:\\Users\\hw\\Desktop\\test\\interaction.txt'

# Extract the first column of the drug and protein dictionaries.
drug_list = single_column(filepath_drug_dic, 0, removetitle=True)
protein_list = single_column(filepath_protein_dic, 0, removetitle=True)

# Print the dictionary sizes so they can be checked.
print("药物词典的长度:", len(drug_list))
print("蛋白词典的长度:", len(protein_list))


# Interaction data: every line split into whitespace-separated tokens.
rows = []
with open(filepath_interaction, 'r', encoding='utf-8') as f:
    tempts = f.readlines()
    for tempt in tempts:
        row = tempt.strip('\n').split()
        rows.append(row)

Exemplo n.º 13
0
def _to_float(token):
    """Parse *token* as a float; unparsable values (e.g. "NA") become 0.0."""
    try:
        return float(token)
    except ValueError:
        return float(0)


def tomatrix(file_dic, filepath, removetitle=False, removefirstcol=False):
    """Re-order a labelled square matrix file to follow a dictionary.

    The row labels are taken from the file's first column through
    ``single_column(filepath, 0, removetitle=True)``. Rows and columns whose
    label is not in ``file_dic`` are dropped, and the remaining values are
    copied into a ``len(file_dic) x len(file_dic)`` zero matrix at the
    positions the labels have in ``file_dic``. Progress and sanity-check
    figures are printed along the way.

    NOTE(review): the labels are always read with ``removetitle=True``, even
    when this function is called with ``removetitle=False`` - confirm that
    callers only use files that have a title row.

    Args:
        file_dic: List of labels defining the order and size of the result
            (assumed to be hashable, e.g. strings).
        filepath: Path of the UTF-8, whitespace-separated matrix file.
        removetitle: Drop the first line of the file before parsing.
        removefirstcol: Drop the first token of every line before parsing.

    Returns:
        ``numpy.ndarray`` of shape ``(len(file_dic), len(file_dic))``.
    """
    dic_1 = file_dic
    col_1 = single_column(filepath, 0, removetitle=True)

    # All-zero target matrix.
    long_mat = len(dic_1)
    mat_a = np.zeros((long_mat, long_mat))

    # First position of each dictionary label (what list.index() returned),
    # built once so later lookups are O(1).
    dic_pos = {}
    for pos, label in enumerate(dic_1):
        dic_pos.setdefault(label, pos)

    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if removetitle:
        del lines[0]

    # Parse each line into floats, optionally skipping the label column;
    # missing values such as "NA" are replaced by 0.0.
    first = 1 if removefirstcol else 0
    final_list = [[_to_float(token) for token in line.strip('\n').split()[first:]]
                  for line in lines]
    yes1 = sum(len(row) for row in final_list)
    print("正确的元素的个数", yes1)
    print('总的行数', len(final_list))

    # Drop, by position, every row and matching column whose label is not in
    # the dictionary. Working on positions (not label lookups) also removes
    # every occurrence of a duplicated unwanted label, keeping the data
    # aligned with the filtered label list below.
    drop = {pos for pos, label in enumerate(col_1) if label not in dic_pos}
    final_list = [[value for j, value in enumerate(row) if j not in drop]
                  for i, row in enumerate(final_list) if i not in drop]
    col_1 = [label for label in col_1 if label in dic_pos]

    # The filtered data should be square.
    print('*****删除后的行数:', len(final_list))
    if any(len(row) != len(final_list) for row in final_list):
        print('删除个数不对')
        print('------------------------')

    # Number of zeros left in the data, printed for cross-checking.
    ehe_num = sum(row.count(float(0)) for row in final_list)
    print('*****最后数据中有' + str(ehe_num) + '个0')

    # Copy the values to their dictionary positions. For a repeated label the
    # first occurrence is used, as list.index() did in the nested loops this
    # replaces.
    src_of = {}
    for pos, label in enumerate(col_1):
        src_of.setdefault(label, pos)
    dst_idx = [dic_pos[label] for label in src_of]
    src_idx = list(src_of.values())
    for a, aa in zip(dst_idx, src_idx):
        row = final_list[aa]
        mat_a[a, dst_idx] = [row[cc] for cc in src_idx]

    return mat_a
Exemplo n.º 14
0
from extract import single_column
from combine import combine
import numpy as np
from check import check, precheck
'''这个文件主要是调用几个写好的函数,把表示交互反应的文件,根据字典中的顺序,转换成一个邻接矩阵(只有0/1)'''

#输入两个词典和一个交互信息的数据文件路径,file_1,file_2是file_1x2中两个属性的字典
# Inputs: file_1 and file_2 are the dictionaries for the two attributes that
# appear in the interaction file file_1x2.
file_1 = 'H:\\Data\\data422\\data\\dict\\gene_dict.txt'
file_2 = 'H:\\Data\\data422\\data\\dict\\gene_dict.txt'
file_1x2 = 'H:\\Data\\data422\\data\\interaction\\PPI.txt'
matrix_output = 'C:\\Users\\hw\\Desktop\\protein_protein_mat.txt'

# single_column(file, column index, removetitle) pulls one column out of a
# data file; removetitle controls whether the first (title) row is dropped.
dict_a = single_column(file_1, 0, removetitle=True)
dict_b = single_column(file_2, 0, removetitle=True)
pairs_a = single_column(file_1x2, 0, removetitle=True)
pairs_b = single_column(file_1x2, 1, removetitle=True)

# Validate the extracted columns before combining them. Arguments 1 and 3 are
# the dictionaries, arguments 2 and 4 the two interaction columns.
precheck(dict_a, pairs_a, dict_b, pairs_b)

# Assemble the matrix and check it.
adjacency = combine(dict_a, pairs_a, dict_b, pairs_b)
check(adjacency)

# Save as text with integer formatting.
np.savetxt(matrix_output, adjacency, fmt="%d")

print("end")
Exemplo n.º 15
0
from extract import single_column
'''这是从数据中挑出大于阈值的'''
# Keep the rows of a list file whose column-2 value exceeds a threshold.
filepath_list = 'C:\\Users\\hw\\Desktop\\test\\list_only_cov.txt'
file_output = 'C:\\Users\\hw\\Desktop\\test\\list_only_morethan0.txt'
# Threshold: rows are kept only when their value is strictly greater.
door_num = 0

# Load the column of values to compare (no title row is removed).
nums = single_column(filepath_list, 2, removetitle=False)

# Record the positions of the values above the threshold.
locs = []
count = 0
for key, value in enumerate(nums):
    if float(value) > door_num:
        locs.append(key)
        count += 1
print("大于阈值的个数为:", count)

# Keep the rows at the recorded positions, split into tokens.
contents = []
with open(filepath_list, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for loc in locs:
        content = lines[loc].strip('\n').split()
        contents.append(content)

#把数据写入文件
with open(file_output, 'w', encoding='utf-8') as output:
    for content in contents:
        rowtext = '{} {} {} {}'.format(content[0], content[1], content[2],
                                       content[3])