def test_scrub_pii_preserves_participants(self, db_session, zip_path, cleanup):
    """Check that a ``scrub_pii=True`` export still lists every participant.

    The fixture archive at ``zip_path`` is ingested (4 participants are
    expected), exported locally with PII scrubbing enabled, and the
    ``data/participant.csv`` member of the exported zip is counted line by
    line.
    """
    dallinger.data.ingest_zip(zip_path)
    assert len(dallinger.models.Participant.query.all()) == 4

    path = dallinger.data.export('test_export', local=True, scrub_pii=True)

    # Hold both the archive and the wrapped member in a ``with`` so the
    # handles are released even when the assertion below fails.
    with ZipFile(path) as archive:
        with io.TextIOWrapper(
            archive.open('data/participant.csv'), encoding='utf8', newline=''
        ) as p_file:
            rows = p_file.readlines()

    assert len(rows) == 5  # 4 Participants + header row
def plugin_loaded():
    """Load the commit messages and install them on ``CommitmentCommand``.

    The messages file (``commitMessages``) is read either from the packed
    ``Commitment.sublime-package`` archive, when that file exists in
    Sublime's installed-packages directory, or from the directory that
    contains this module. Every line is stored in the module-level
    ``messages`` mapping, keyed by the MD5 hex digest of its UTF-8 bytes.
    """
    from zipfile import ZipFile

    # Is there an easier way to access files inside a .sublime-package
    # file in Sublime Text 3?

    # Find commitMessages file.
    package_path = path.join(sublime.installed_packages_path(),
                             'Commitment.sublime-package')
    if path.isfile(package_path):
        # Sublime Text 3 preferred way: read the member straight from the
        # package archive. Lines come back as bytes here.
        with ZipFile(package_path) as package:
            with package.open(commitMessages) as messages_file:
                for line in messages_file:
                    messages[md5(line).hexdigest()] = line.decode('utf-8')
    else:
        # Somebody unzipped this badass.
        unpacked_path = path.join(path.dirname(__file__), commitMessages)
        with open(unpacked_path, encoding='utf-8') as messages_file:
            for line in messages_file:
                # Lines are text here; hash the UTF-8 bytes.
                messages[md5(line.encode('utf-8')).hexdigest()] = line

    CommitmentCommand.randomMessages = RandomCommitment(messages)
def _get_data_raw(self):
    """Download observations matching the time range.

    The zip archive at ``ftpsite + site_id + suffix + '.zip'`` is fetched,
    its ``site_id + suffix`` member is decoded as UTF-8 lines, and those
    lines are handed to ``self._select_date_range``.

    Returns the four values produced by ``_select_date_range``:
    ``(body, header, dates_long, dates)``.
    """
    address = self.ftpsite + self.site_id + self.suffix + '.zip'
    with closing(urlopen(address)) as url:
        payload = BytesIO(url.read())

    # Close the archive and its member explicitly instead of relying on
    # garbage collection.
    with ZipFile(payload, 'r') as archive:
        with archive.open(self.site_id + self.suffix) as f:
            lines = [line.decode('utf-8') for line in f.readlines()]

    body, header, dates_long, dates = self._select_date_range(lines)
    return body, header, dates_long, dates
def _get_data_raw(self):
    """Download observations matching the time range.

    The zip archive at ``ftpsite + site_id + suffix + '.zip'`` is fetched,
    its ``site_id + suffix`` member is decoded as UTF-8 lines, and those
    lines are handed to ``self._select_date_range``.

    Returns the four values produced by ``_select_date_range``:
    ``(body, header, dates_long, dates)``.
    """
    # Import need to be here so we can monkeypatch urlopen for testing and
    # avoid downloading live data for testing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib2 import urlopen

    address = self.ftpsite + self.site_id + self.suffix + '.zip'
    with closing(urlopen(address)) as url:
        payload = BytesIO(url.read())

    # Close the archive and its member explicitly instead of relying on
    # garbage collection.
    with ZipFile(payload, 'r') as archive:
        with archive.open(self.site_id + self.suffix) as f:
            lines = [line.decode('utf-8') for line in f.readlines()]

    body, header, dates_long, dates = self._select_date_range(lines)
    return body, header, dates_long, dates
def _get_data_raw(self):
    """Download observations matching the time range.

    The archive at ``folder + site_id + suffix + '.zip'`` is requested
    through ``self.get_path``; the first member of that archive is decoded
    as UTF-8 lines and handed to ``self._select_date_range``.

    Returns the four values produced by ``_select_date_range``:
    ``(body, header, dates_long, dates)``.

    Raises ``ValueError`` when ``get_path`` raises ``HTTPError``.
    """
    path = self.folder + self.site_id + self.suffix + '.zip'

    # Get the data and handle if there is none matching what was requested
    try:
        resp = self.get_path(path)
    except HTTPError:
        raise ValueError('No data available for {time:%Y-%m-%d %HZ} '
                         'for station {stid}.'.format(time=self.begin_date,
                                                      stid=self.site_id))

    # Parse the payload once (it used to be wrapped in two separate
    # ZipFile objects) and close the archive and member when done.
    with ZipFile(BytesIO(resp.content)) as archive:
        file_info = archive.infolist()[0]
        with archive.open(file_info) as f:
            lines = [line.decode('utf-8') for line in f.readlines()]

    body, header, dates_long, dates = self._select_date_range(lines)
    return body, header, dates_long, dates
def script(directory, project_path):
    """Compare the node/edge tables found in ``directory`` and write reports.

    Steps performed, in order:

    1. Extract and delete every ``.zip`` in ``directory``; rename every entry,
       replacing ``-`` with ``_``.
    2. Read each ``<name>_nodes.txt`` / ``<name>_edges.txt`` (tab separated)
       into a DataFrame and split them per chain, keyed ``<name>(<chain>)``.
    3. For every unordered pair of chains, compute node differences, degree
       differences and edge differences, and write them to text reports in a
       new ``<directory>/<pair>/`` sub-directory.
    4. Write the CSV files consumed by the charts (degree heatmap, top degree
       differences, interaction differences, edge-difference heatmap).
    5. Build a per-position table of degrees over all chains, write it to
       ``df_degree_analysis.txt``, plot its standard deviation to
       ``plot.svg``.
    6. Run ``Rscript <project_path>/scripts_corins/code.R ../<directory>`` and
       wait for it to finish.

    Args:
        directory: Directory used both as input and as output.
            NOTE(review): several paths are built as ``input_directory + name``
            without a separator, so this presumably must end with ``/`` -
            confirm against the caller.
        project_path: Base path under which ``scripts_corins/code.R`` lives.

    Returns:
        None. All results are written to files.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of the source; spots where the nesting could
    not be derived from data flow are flagged individually.
    NOTE(review): ``DataFrame.append`` is used throughout; it was removed in
    pandas 2.0, so this presumably targets an older pandas - confirm.
    """
    print ("ddddddddd", directory)  # I wanted to know why this is not being printed

    # SPECIFY INPUT AND OUTPUT DIRECTORIES
    # input and output receive the same value
    input_directory= directory
    output_directory= directory
    print('eeeee', os.listdir(input_directory))

    # EXTRACT AND REMOVE THE .ZIPs
    for file in os.listdir(input_directory):
        # print(file)
        if file.endswith('.zip'):
            path = input_directory + '/' +file
            # The loop variable is rebound to the archive object here.
            file = ZipFile(path, 'r')
            file.extractall(input_directory)
            file.close()
            os.remove(path)

    arq = os.listdir(directory)

    # CHECK FOR AND REPLACE THE '-' CHARACTER IN FILE NAMES
    for i in arq:
        aux = i.replace('-', '_')
        os.rename(input_directory+i, input_directory+aux)

    param = os.listdir(input_directory)

    # CREATE THE DICTIONARIES OF NODES AND EDGES FILES
    dict_protein_nodes_files = {}
    dict_protein_edges_files = {}

    def populate_chain(chain):
        # First ':'-separated field of the id.
        aux = str(chain).split(":")
        chain = aux[0]
        return chain

    def populate_aux_target(aux_chain):
        # The id without its first ':'-separated field, e.g. 'A:1:_:ALA' -> ':1:_:ALA'.
        aux = str(aux_chain).split(":")
        aux_target = ':' + aux[1] + ':' + aux[2] + ':' + aux[3]
        return aux_target

    def populate_residue(NodeId):
        # Fourth ':'-separated field of the id.
        aux = str(NodeId).split(":")
        residue = aux[3]
        return residue

    arq_name = []
    for i in param:
        # Split '<name>_nodes.txt' / '<name>_edges.txt' into ['<name>', 'nodes' | 'edges'].
        arq_name.clear()
        arq_name.append(i[:-10])
        arq_name.append(i[-9:-4])

        if arq_name[1] == "nodes":
            # READ THE FILE AND CREATE THE DATAFRAME
            dict_protein_nodes_files[arq_name[0]] = pd.read_csv(input_directory+arq_name[0]+ "_nodes.txt", sep="\t", header= 0)

            # Add a zero-filled 'Accessibility' column at position 13 when the file lacks it.
            if 'Accessibility' not in dict_protein_nodes_files[arq_name[0]].columns:
                dict_protein_nodes_files[arq_name[0]].insert(13, 'Accessibility', 0)

            # CREATE THE NEW AUX_NODEID (FROM NODEID) AND AUX_DEGREE COLUMNS
            dict_protein_nodes_files[arq_name[0]]['aux_nodeId'] = '-'
            dict_protein_nodes_files[arq_name[0]]['aux_degree'] = '-'

            # POPULATE THE AUX_NODEID COLUMN
            dict_protein_nodes_files[arq_name[0]]['aux_nodeId'] = dict_protein_nodes_files[arq_name[0]]['NodeId'].apply(populate_aux_target)

            # POPULATE THE AUX_DEGREE COLUMN: chain-less node id + '_D' + degree
            dict_protein_nodes_files[arq_name[0]]['aux_degree'] = dict_protein_nodes_files[arq_name[0]]['NodeId'].apply(populate_aux_target)
            dict_protein_nodes_files[arq_name[0]]['aux_degree'] = dict_protein_nodes_files[arq_name[0]].apply(lambda row: row.aux_degree + "_D" + str(row.Degree), axis=1)

            # POPULATE THE RESIDUE COLUMN (PDBs WITH A NAN RESIDUE WERE FOUND)
            dict_protein_nodes_files[arq_name[0]]['Residue'] = dict_protein_nodes_files[arq_name[0]]['NodeId'].apply(populate_residue)

        if arq_name[1] == "edges":
            # READ THE FILE AND CREATE THE DATAFRAME
            dict_protein_edges_files[arq_name[0]] = pd.read_csv(input_directory+arq_name[0]+ "_edges.txt", sep="\t", header= 0)

            # POPULATE THE AUX_CHAIN 1, 2 COLUMNS
            dict_protein_edges_files[arq_name[0]]['aux_chain_1'] = dict_protein_edges_files[arq_name[0]]['NodeId1'].apply(populate_chain)
            dict_protein_edges_files[arq_name[0]]['aux_chain_2'] = dict_protein_edges_files[arq_name[0]]['NodeId2'].apply(populate_chain)

    # CREATE THE PER-CHAIN DICTIONARIES, keyed '<name>(<chain>)'
    dict_protein_nodes_chain = {}
    for i in dict_protein_nodes_files:
        chains = pd.unique(dict_protein_nodes_files[i]["Chain"])  # not used afterwards
        df = dict_protein_nodes_files[i]
        quant_chain= pd.unique(df["Chain"])
        for j in quant_chain:
            key_dict = i + '('+ j + ')'
            dict_protein_nodes_chain[key_dict] = (df[df.Chain == j])

    dict_protein_edges_chain = {}
    for i in dict_protein_edges_files:
        chains = pd.unique(dict_protein_edges_files[i]["aux_chain_1"])  # not used afterwards
        df = dict_protein_edges_files[i]
        quant_chain= pd.unique(df["aux_chain_1"])
        for j in quant_chain:
            key_dict = i + '('+ j + ')'
            dict_protein_edges_chain[key_dict] = (df[df.aux_chain_1 == j])

    # NODES
    # NODES PRESENT ONLY IN PROTEIN 1
    dict_result_nodes_1 = {}
    # NODES PRESENT ONLY IN PROTEIN 2
    dict_result_nodes_2 = {}
    # POSITIONS WHOSE AMINO ACID CHANGED
    dict_result_nodes_change = {}
    # AMINO ACIDS WHOSE DEGREE CHANGED - (INCLUDES THE DEGREE OF AMINO_CHANGE)
    dict_result_degree_change = {}
    # AMINO ACIDS PRESENT IN BOTH PROTEINS
    dict_equal_nodes = {}

    # 'vet' remembers the 'i-j' pairs already processed so 'j-i' is skipped.
    vet = []
    for i in dict_protein_nodes_chain:
        for j in dict_protein_nodes_chain:
            if(i != j) and (j + '-' + i not in vet):
                dif = dict_protein_nodes_chain[i].merge(dict_protein_nodes_chain[j], on="aux_nodeId", how='outer', suffixes=['', '_'], indicator=True)

                # NOTE(review): the positional column drops below assume a fixed
                # column layout of the merged frame - confirm against the input files.
                dif_nodes_protein_1 = dif[dif["_merge"] == "left_only"]
                dif_nodes_protein_1 = dif_nodes_protein_1.drop(dif.columns[[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]], axis=1)
                dict_result_nodes_1[i+'-'+j] = dif_nodes_protein_1

                equal_nodes = dif[dif["_merge"] == "both"]
                dict_equal_nodes[i+'-'+j] = equal_nodes

                dif_nodes_protein_2 = dif[dif["_merge"] == "right_only"]
                dif_nodes_protein_2 = dif_nodes_protein_2.drop(dif.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]], axis=1)
                dif_nodes_protein_2.columns = dict_protein_nodes_chain[j].columns
                dif_nodes_protein_2.drop(columns=['aux_nodeId', 'aux_degree'], inplace = True)
                dict_result_nodes_2[i+'-'+j] = dif_nodes_protein_2

                # POSITIONS WHOSE AMINO ACID CHANGED
                amino_change = pd.DataFrame(columns=dif_nodes_protein_1.columns)
                dict_result_nodes_change[i+"-"+j] = amino_change
                for w, row in dif_nodes_protein_1.iterrows():
                    if row["Position"] in dif_nodes_protein_2["Position"].values:
                        # Row from protein 1 followed by the row(s) at the same position in protein 2.
                        amino_change = amino_change.append(row)
                        aux = dif_nodes_protein_2[dif_nodes_protein_2.Position == row["Position"]]
                        amino_change = amino_change.append(aux)
                        dict_result_nodes_change[i+"-"+j] = amino_change

                # AMINO ACIDS WHOSE DEGREE CHANGED
                dif_degree = dict_protein_nodes_chain[i].merge(dict_protein_nodes_chain[j], on="aux_degree", how='outer', suffixes=['', '_'], indicator=True)

                dif_degree_protein_1 = dif_degree[dif_degree["_merge"] == "left_only"]
                dif_degree_protein_1 = dif_degree_protein_1.drop(dif_degree.columns[[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]], axis=1)

                dif_degree_protein_2 = dif_degree[dif_degree["_merge"] == "right_only"]
                dif_degree_protein_2 = dif_degree_protein_2.drop(dif_degree.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]], axis=1)
                dif_degree_protein_2.columns = dict_protein_nodes_chain[j].columns
                dif_degree_protein_2.drop(columns=['aux_nodeId', 'aux_degree'], inplace = True)

                degree_change = pd.DataFrame(columns=dif_degree_protein_1.columns)
                dict_result_degree_change[i+"-"+j] = degree_change
                for w, row in dif_degree_protein_1.iterrows():
                    if row["Position"] in dif_degree_protein_2["Position"].values:
                        degree_change = degree_change.append(row)
                        aux = dif_degree_protein_2[dif_degree_protein_2.Position == row["Position"]]
                        degree_change = degree_change.append(aux)
                        dict_result_degree_change[i+"-"+j] = degree_change

                vet.append(i + '-' + j)
    vet.clear()

    # EDGES
    # CREATE THE EDGE-COUNT DICTIONARY
    dict_cont_edges = {}

    def populate_aux_position(node_id1):
        # Second ':'-separated field of the node id, as an int.
        aux = str(node_id1).split(':')
        return(int(aux[1]))

    # Same definition as the populate_aux_target declared earlier in this function.
    def populate_aux_target(aux_chain):
        aux = str(aux_chain).split(":")
        aux_target = ':' + aux[1] + ':' + aux[2] + ':' + aux[3]
        return aux_target

    def populate_interaction(interaction):
        # Identity: returns its argument unchanged.
        return(interaction)

    for i in dict_protein_edges_chain:
        # COUNT REPEATED INTERACTIONS
        aux = dict_protein_edges_chain[i].pivot_table(index=['NodeId1', 'Interaction', 'NodeId2', 'aux_chain_1', 'aux_chain_2'], aggfunc='size')
        test=pd.DataFrame(aux)
        # reset_index(inplace=True) returns None; 'check' is not used afterwards.
        check=test.reset_index(inplace=True)
        dict_cont_edges[i] = test

        # RENAME COLUMNS
        dict_cont_edges[i].columns = ['NodeId1', 'Interaction', 'NodeId2', 'aux_chain_1', 'aux_chain_2', 'Quant']

        # CREATE AUX_NODEID_1 AND 2, INTERACTION
        dict_cont_edges[i]['aux_nodeId_1'] = dict_cont_edges[i]['NodeId1'].apply(populate_aux_target)
        dict_cont_edges[i]['aux_nodeId_2'] = dict_cont_edges[i]['NodeId2'].apply(populate_aux_target)
        dict_cont_edges[i]['aux_interaction'] = dict_cont_edges[i]['Interaction'].apply(populate_interaction)

        # SORT BY NODEID1 ONLY
        dict_cont_edges[i]['aux_position'] = dict_cont_edges[i]['NodeId1'].apply(populate_aux_position)
        dict_cont_edges[i] = dict_cont_edges[i].sort_values('aux_position')

    # THE RESULTS ARE EXPRESSED IN TERMS OF QUANTITY. EACH INTERACTION HAS THE QUANTITY FIELD,
    # WHICH TELLS HOW MANY TIMES THAT INTERACTION OCCURS
    # EDGES PRESENT ONLY IN PROTEIN 1
    dict_result_edges_1 = {}
    # EDGES PRESENT ONLY IN PROTEIN 2
    dict_result_edges_2 = {}

    # MERGE THE EDGE-COUNT DICTIONARIES
    for i in dict_cont_edges:
        for j in dict_cont_edges:
            if(i != j) and (j + '-' + i not in vet):
                dif = dict_cont_edges[i].merge(dict_cont_edges[j], on=['aux_nodeId_1','aux_interaction','aux_nodeId_2'], how='outer', suffixes=['', '_'], indicator=True)

                # ONLY PRESENT IN PROTEIN 1 - 1B55.A
                dif_edges_protein_1 = dif[dif["_merge"] == "left_only"]
                dif_edges_protein_1 = dif_edges_protein_1.drop(dif.columns[[6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17]], axis=1)

                # ONLY PRESENT IN PROTEIN 2 - 2Z0P.A
                dif_edges_protein_2 = dif[dif["_merge"] == "right_only"]
                dif_edges_protein_2 = dif_edges_protein_2.drop(dif.columns[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 17]], axis=1)

                # PRESENT IN BOTH PROTEINS
                dif_edges_protein_3 = dif[dif["_merge"] == "both"]
                if not dif_edges_protein_3.empty:
                    # REALLOCATE THE INTERACTIONS TO CORRECT THE QUANTITY
                    dif_edges_protein_3['dif_quant'] = dif_edges_protein_3.apply(lambda x: x['Quant'] - x['Quant_'], axis=1)
                    maiorquezero = dif_edges_protein_3[dif_edges_protein_3.dif_quant > 0]
                    menorquezero = dif_edges_protein_3[dif_edges_protein_3.dif_quant < 0]
                    maiorquezero.drop(columns=['NodeId1_', 'Interaction_','NodeId2_', 'aux_position_', '_merge', 'aux_nodeId_1','aux_nodeId_2', 'aux_chain_1_','aux_chain_2_','Quant_', 'aux_interaction'], inplace = True)
                    menorquezero.drop(columns=['NodeId1', 'Interaction','NodeId2','aux_chain_1', 'aux_chain_2', 'Quant', 'aux_nodeId_1', 'aux_nodeId_2', 'Quant', 'aux_position', '_merge','aux_interaction'], inplace = True)

                    if not maiorquezero.empty:
                        # INSERT THE ROW INTO THE RIGHT DATAFRAME ACCORDING TO THE QUANTITY - PROTEIN 1
                        for w, row in maiorquezero.iterrows():
                            row.Quant = int(row.dif_quant)
                            dif_edges_protein_1 = dif_edges_protein_1.append(row)

                    if not menorquezero.empty:
                        # INSERT THE ROW INTO THE RIGHT DATAFRAME ACCORDING TO THE QUANTITY - PROTEIN 2
                        for w, row in menorquezero.iterrows():
                            row.Quant_ = abs(int(row.dif_quant))
                            dif_edges_protein_2 = dif_edges_protein_2.append(row)

                dif_edges_protein_1 = dif_edges_protein_1.sort_values('aux_position')
                dif_edges_protein_2 = dif_edges_protein_2.sort_values('aux_position_')

                # Both branches end by renaming the protein-2 columns to the unsuffixed names.
                if 'dif_quant'in dif_edges_protein_2.columns:
                    dif_edges_protein_2.drop(columns=['dif_quant'], inplace = True)
                    dif_edges_protein_2.columns = ['NodeId1', 'Interaction', 'NodeId2','aux_chain_1', 'aux_chain_2', 'Quant', 'aux_position']
                else:
                    dif_edges_protein_2.columns = ['NodeId1', 'Interaction', 'NodeId2','aux_chain_1', 'aux_chain_2', 'Quant', 'aux_position']

                dict_result_edges_1[i+'-'+j] = dif_edges_protein_1
                dict_result_edges_2[i+'-'+j] = dif_edges_protein_2
                vet.append(i + '-' + j)
    vet.clear()

    # FILE OUTPUT
    # NODES REPORT
    for i in dict_result_nodes_1.keys():
        # One sub-directory per compared pair; later reports for the pair are written into it.
        os.mkdir(output_directory+'/'+i)
        file = open(output_directory+"/"+i+"/report_dif_nodes_"+i+".txt", "a", newline='')
        file.write(i + "\n\n")
        protein_1_name = i.split("-")[0]
        protein_2_name = i.split("-")[1]
        file.write("- AMINOACIDS ONLY PRESENT IN PROTEIN -> " + protein_1_name + "\n\n")
        dict_result_nodes_1[i].to_csv(file, sep="\t", index=False)
        file.write("\n- AMINOACIDS ONLY PRESENT IN PROTEIN -> " + protein_2_name + "\n\n")
        dict_result_nodes_2[i].to_csv(file, sep="\t", index=False)
        file.write("\n- POSITIONS THAT HAVE CHANGED AMINOACIDS -> " + i + "\n\n")
        dict_result_nodes_change[i].to_csv(file, sep="\t", index=False)
        file.close()

    # DEGREE REPORT
    for i in dict_result_degree_change.keys():
        file = open(output_directory+"/"+i+"/report_dif_degree_"+i+".txt", "a", newline='')
        dict_result_degree_change[i].to_csv(file, sep="\t", index=False)
        file.close()

    # EDGES REPORT
    for i in dict_result_edges_1.keys():
        file = open(output_directory+"/"+i+"/report_dif_edges_"+i+".txt", "a", newline='')
        file.write(i + "\n\n")
        protein_1_name = i.split("-")[0]
        protein_2_name = i.split("-")[1]
        file.write("- EDGES ONLY PRESENT IN PROTEIN -> " + protein_1_name + "\n\n")
        dict_result_edges_1[i].to_csv(file, sep="\t", index=False)
        file.write("\n- EDGES ONLY PRESENT IN PROTEIN -> " + protein_2_name + "\n\n")
        dict_result_edges_2[i].to_csv(file, sep="\t", index=False)
        file.close()

    # GENERATION OF THE CHART FILES
    # SORT THE DATAFRAMES
    def populate_position(position):
        return(int(position))

    def populate_aux_ord(node_id1):
        # Second ':'-separated field of the node id, as an int.
        aux = str(node_id1).split(':')
        return(int(aux[1]))

    # NOTE(review): the edges dictionary is indexed with the keys of the nodes
    # dictionary; this presumably relies on both having the same keys - confirm.
    for i in dict_protein_nodes_chain:
        dict_protein_nodes_chain[i]['Position'] = dict_protein_nodes_chain[i]['Position'].apply(populate_position)
        dict_protein_nodes_chain[i] = dict_protein_nodes_chain[i].sort_values('Position')
        dict_protein_edges_chain[i]['aux_ord'] = dict_protein_edges_chain[i]['NodeId1'].apply(populate_aux_ord)
        dict_protein_edges_chain[i] = dict_protein_edges_chain[i].sort_values('aux_ord')

    # DEGREE HEATMAP CHART
    for i in dict_protein_nodes_chain:
        heatmap_degree = dict_protein_nodes_chain[i]
        file = open(output_directory+"/heatmap_degree_"+i+".csv", "a", newline='')
        file.write("group,variable,idnode,degree,color\n")
        maior = heatmap_degree['Position'].max()

        # AUXILIARY DATAFRAME (positions 1..maior) USED TO DETECT GAPS
        cont_df_aux = pd.DataFrame()
        cont_df_aux['Position'] = ''
        cont = 1
        for w in range(0, int(maior)):
            cont_df_aux.loc[w, 'Position'] = cont
            cont+=1

        heatmap_degree = heatmap_degree.merge(cont_df_aux, on="Position", how='outer', suffixes=['', '_'], indicator=True)
        for w, row in heatmap_degree.iterrows():
            # Positions that exist only in the auxiliary frame get degree 0 and a '-' id.
            if row._merge == 'right_only':
                heatmap_degree.loc[w,'Degree'] = 0
                heatmap_degree.loc[w,'NodeId'] = '-'
        heatmap_degree = heatmap_degree.sort_values('Position')

        col = 0
        lin = 1
        #dim_max = 9
        dim_max = int(sqrt(maior))
        pos = 1  # incremented below but never read
        cont = 0
        for w, row in heatmap_degree.iterrows():
            cont+=1
            degree = 0  # not used afterwards
            color = int(row['Degree']) * 1.9 + 75
            file.write(str(col) + ',' + str(lin) + ',' + str(row['NodeId']) + ',' + str(row['Degree']) + ',' + str(color) + '\n')
            pos+=1
            if col <= dim_max:
                col+=1
            else:
                lin+=1
                col = 0
            # NOTE(review): whether this check sits at loop level or inside the
            # 'else' above could not be determined from the collapsed source - confirm.
            if col == dim_max:
                lin = cont + 1
                col = 0
        file.close()

    # DIF_DEGREE CHART
    for i in dict_equal_nodes:
        # CHECK WHETHER THERE IS ANY EQUAL NODE
        if not dict_equal_nodes[i].empty:
            dict_equal_nodes[i]['dif_degree'] = dict_equal_nodes[i].apply(lambda x: abs(x['Degree'] - x['Degree_']), axis=1)
            # Keep the 10 nodes with the largest absolute degree difference.
            dif_degree_ord = dict_equal_nodes[i].sort_values('dif_degree', ascending=[False]).head(10)

        file = open(output_directory+"/"+i+"/graphic_dif_degree_"+i+".csv", "a", newline='')
        file.write('nodes\tdifferences\n')
        # CHECK WHETHER THERE IS ANY EQUAL NODE
        if not dict_equal_nodes[i].empty:
            for w, row in dif_degree_ord.iterrows():
                file.write(str(row['NodeId']) + '\t' + str(row['dif_degree']) + '\n')
        file.close()

    # DIF_INTERACTION CHART
    # Chart showing the difference in the number of interactions for the comparison.
    # Only interactions exclusive to the first minus interactions exclusive to the second
    for i in dict_result_edges_1:
        # TOTAL OF INTERACTIONS PRESENT ONLY IN 1B55.A
        aux_1 = dict_result_edges_1[i]
        for w, row in aux_1.iterrows():
            # Repeat each row (Quant - 1) extra times so value_counts reflects the quantity.
            if row.Quant > 1:
                for j in range(1, int(row.Quant)):
                    aux_1 = aux_1.append(row, ignore_index=True)
        cont_int_1_series = aux_1.Interaction.value_counts()

        # TOTAL OF INTERACTIONS PRESENT ONLY IN 2Z0P.A
        aux_2 = dict_result_edges_2[i]
        for w, row in aux_2.iterrows():
            if row.Quant > 1:
                for j in range(1, int(row.Quant)):
                    aux_2 = aux_2.append(row, ignore_index=True)
        cont_int_2_series = aux_2.Interaction.value_counts()

        # UNION OF THE COLUMN NAMES
        index_list = list(set().union(cont_int_1_series.index, cont_int_2_series.index))
        cont_int = pd.DataFrame()
        cont_int = cont_int.assign(**dict.fromkeys(index_list, 0))
        cont_int = cont_int.append(cont_int_1_series)
        cont_int = cont_int.append(cont_int_2_series)
        cont_int.reset_index(drop=True, inplace=True)
        cont_int = cont_int.fillna(0)
        # Row 0 becomes |row 0 - row 1|; row 1 is then dropped.
        cont_int = abs(cont_int.diff(-1))
        cont_int = cont_int.drop([1])
        cont_int = cont_int.sort_values(by=0, axis=1, ascending=[False])
        cont_int = cont_int.transpose()
        #cont_int['id2'] = cont_int.index
        #cont_int.set_index(0,inplace=True)
        #cont_int.T
        file = open(output_directory+"/"+i+"/graphic_dif_interaction_"+i+".csv", "a", newline='')
        file.write("interactions\tlosses\n")
        cont_int.to_csv(file, sep="\t")
        file.close()

        # REMOVE THE 0: re-read the file and drop its second line
        # (presumably the header row written by to_csv - confirm).
        file = open(output_directory+"/"+i+"/graphic_dif_interaction_"+i+".csv", "r")
        contents = file.readlines()
        file.close()
        contents.pop(1)  # remove the line item from list, by line number, starts from 0
        file = open(output_directory+"/"+i+"/graphic_dif_interaction_"+i+".csv", "w")
        contents = "".join(contents)
        file.write(contents)
        file.close()

    # HEATMAP CHART - number of edge differences for each node
    # Present only in the first + present only in the second
    dict_heatmap = {}
    for i in dict_result_edges_1:
        # CHECK WHETHER THERE IS ANY EQUAL NODE
        if not dict_equal_nodes[i].empty:
            aux_1 = dict_result_edges_1[i]
            aux_2 = dict_result_edges_2[i]

            # FIND THE LARGEST POSITION OF THE PROTEIN
            maior_pos_protein_1 = dict_result_edges_1[i]['aux_position'].max()
            maior_pos_protein_2 = dict_result_edges_2[i]['aux_position'].max()
            maior = max(maior_pos_protein_1, maior_pos_protein_2)

            # COUNT THE NUMBER OF INTERACTIONS OF EACH NODE
            aux_1_series = aux_1['NodeId1'].value_counts()
            aux_1_dataframe = pd.DataFrame({'NodeId1': aux_1_series.index, 'Quant': aux_1_series.values})
            aux_2_series = aux_2['NodeId1'].value_counts()
            aux_2_dataframe = pd.DataFrame({'NodeId1': aux_2_series.index, 'Quant': aux_2_series.values})

            # POPULATE THE AUX_NODEID AND AUX_POSITION COLUMNS
            aux_1_dataframe['aux_nodeId1'] = aux_1_dataframe['NodeId1'].apply(populate_aux_target)
            aux_1_dataframe['aux_position'] = aux_1_dataframe['NodeId1'].apply(populate_aux_position)
            aux_2_dataframe['aux_nodeId1'] = aux_2_dataframe['NodeId1'].apply(populate_aux_target)
            aux_2_dataframe['aux_position'] = aux_2_dataframe['NodeId1'].apply(populate_aux_position)

            sum_merge = aux_1_dataframe.merge(aux_2_dataframe, on="aux_nodeId1", how='outer', suffixes=['', '_'], indicator=True)
            sum_merge['Comentary'] = ''
            sum_merge['Differences'] = 0
            sum_merge['Color'] = -1
            for w, row in sum_merge.iterrows():
                if row._merge == 'both':
                    sum_merge.loc[w,'Differences'] = int(row.Quant) + int(row.Quant_)
                    sum_merge.loc[w,'Comentary'] = 'Have changes'
                    sum_merge.loc[w,'Color'] = sum_merge.loc[w,'Differences'] + 60  # color varies with the difference
                if row._merge == 'left_only':
                    sum_merge.loc[w,'Differences'] = row.Quant
                    sum_merge.loc[w,'Comentary'] = 'Have changes'
                    #sum_merge.loc[w,'Comentary'] = 'AA not assigned in Protein 2'
                    #sum_merge.loc[w,'Color'] = 30  # only present in protein 1
                    sum_merge.loc[w,'Color'] = sum_merge.loc[w,'Differences'] + 60  # color varies with the difference
                if row._merge == 'right_only':
                    sum_merge.loc[w,'Differences'] = row.Quant_
                    #sum_merge.loc[w,'Comentary'] = 'AA not assigned in Protein 1'
                    sum_merge.loc[w,'Comentary'] = 'Have changes'
                    sum_merge.loc[w,'aux_position'] = row.aux_position_
                    sum_merge.loc[w,'NodeId1'] = row.NodeId1_
                    #sum_merge.loc[w,'Color'] = 1  # only present in protein 2
                    sum_merge.loc[w,'Color'] = sum_merge.loc[w,'Differences'] + 60  # color varies with the difference
            sum_merge.drop(columns=['Quant', 'NodeId1_', 'Quant_', 'aux_position_', '_merge'], inplace = True)

            # GET THE POSITIONS OF THE EQUAL NODES
            # NOTE(review): positional drop; assumes a fixed column layout of dict_equal_nodes[i] - confirm.
            equal_nodes_heatmap = dict_equal_nodes[i].drop(dict_equal_nodes[i].columns[[3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32]], axis=1)
            equal_nodes_heatmap.columns = ['NodeId1', 'Chain', 'Position','aux_nodeId1']

            # MERGE WITH THE EQUAL-NODES DATAFRAME TO GET THE POSITION AND NAME OF NODES PRESENT IN BOTH PROTEINS
            # THAT HAD NO DIFFERENCES IN THEIR EDGES
            sum_merge = sum_merge.merge(equal_nodes_heatmap, on="aux_nodeId1", how='outer', suffixes=['', '_'], indicator=True)
            for w, row in sum_merge.iterrows():
                if row._merge == 'right_only':
                    sum_merge.loc[w,'NodeId1'] = row.NodeId1_
                    sum_merge.loc[w,'Differences'] = 0
                    sum_merge.loc[w,'Comentary'] = 'No changed'
                    sum_merge.loc[w,'Color'] = 60  # no differences
                    sum_merge.loc[w,'aux_position'] = row.Position
            sum_merge.drop(columns=['NodeId1_', 'Chain', 'Position', '_merge'], inplace = True)

            # FILL IN THE POSITIONS NOT ASSIGNED BY RING
            cont_df = pd.DataFrame()
            cont_df['aux_position'] = ''
            cont = 1
            for w in range(0, int(maior)):
                cont_df.loc[w, 'aux_position'] = cont
                cont+=1
            sum_merge = sum_merge.merge(cont_df, on="aux_position", how='outer', suffixes=['', '_'], indicator=True)
            for w, row in sum_merge.iterrows():
                if row._merge == 'right_only':
                    sum_merge.loc[w,'NodeId1'] = '-'
                    sum_merge.loc[w,'aux_nodeId1'] = '-'
                    sum_merge.loc[w,'Comentary'] = 'AA not assigned in RIN'
                    sum_merge.loc[w,'Differences'] = 0
                    sum_merge.loc[w,'Color'] = 50  # not assigned by RING
            sum_merge.drop(columns=['_merge'], inplace = True)
            dict_heatmap[i] = sum_merge.sort_values('aux_position')

            # CHECK FOR AMINO ACIDS EXCLUSIVE TO ONE PROTEIN
            exclusive_1 = {}
            exclusive_2 = {}
            dict_aux_exclusive_1 = dict_result_nodes_1[i]
            dict_aux_exclusive_2 = dict_result_nodes_2[i]
            for w, row in dict_aux_exclusive_1.iterrows():
                key_dict = row.Position
                exclusive_1[key_dict] = row.Residue
            for w in exclusive_1:
                index_row = dict_heatmap[i][dict_heatmap[i].aux_position == w].index.tolist()
                dict_heatmap[i].loc[index_row,'Comentary'] = 'AA not assigned in Protein 2'
                dict_heatmap[i].loc[index_row,'Color'] = 30
            for w, row in dict_aux_exclusive_2.iterrows():
                key_dict = row.Position
                exclusive_2[key_dict] = row.Residue
            for w in exclusive_2:
                index_row = dict_heatmap[i][dict_heatmap[i].aux_position == w].index.tolist()
                dict_heatmap[i].loc[index_row,'Comentary'] = 'AA not assigned in Protein 1'
                dict_heatmap[i].loc[index_row,'Color'] = 1

            # CHECK FOR AN AMINO ACID CHANGE AT THE POSITION
            dict_aux_change = dict_result_nodes_change[i]
            dict_aux_change['aux_nodeId'] = dict_aux_change['NodeId'].apply(populate_aux_target)
            a = {}
            for w, row in dict_aux_change.iterrows():
                if row.Position not in a:
                    key_dict = row.Position
                    a[key_dict] = row.Residue
                else:
                    # NOTE(review): 'key_dict' is the position stored by the last
                    # first-seen row, not necessarily row.Position; this is only
                    # equivalent while rows of one position are adjacent - confirm.
                    a[key_dict] = a[key_dict] + '-' + str(row.Residue)
            for w in a:
                index_row = dict_heatmap[i][dict_heatmap[i].aux_position == w].index.tolist()
                dict_heatmap[i].loc[index_row,'Comentary'] = 'Amino Changed (' + a[w] + ')'
                dict_heatmap[i].loc[index_row,'Color'] = 800000000
            dict_heatmap[i].drop(columns=['aux_nodeId1', 'aux_position'], inplace = True)

            # HEATMAP FILE OUTPUT
            file = open(output_directory+"/"+i+'/heatmap-'+i+'.csv', 'a', newline='')
            file.write("group,variable,color,idnode,diff,comentary\n")
            col = 0
            lin = 1
            #dim_max = 9
            dim_max = int(sqrt(maior))
            pos = 1  # not used afterwards
            cont = 0
            for w, row in dict_heatmap[i].iterrows():
                cont+=1
                file.write(str(col) + ',' + str(lin) + ',' + str(row['Color']) + ',' + str(row['NodeId1']) + ',' + str(int(row['Differences'])) + ',' + str(row['Comentary']) + '\n')
                if col <= dim_max:
                    col+=1
                else:
                    lin+=1
                    col = 0
                # NOTE(review): whether this check sits at loop level or inside the
                # 'else' above could not be determined from the collapsed source - confirm.
                if col == dim_max:
                    lin = cont + 1
                    col = 0
            file.close()

    # ANALYSIS
    # CHART 1 - OUTPUT: DATAFRAME WHERE EACH ROW IS THE DEGREE OF A POSITION OVER ALL COMPARISONS
    amino_name = ['ALA','ARG','ASN','ASP','CYS','GLU','GLN','GLY','HIS','ILE','LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL']

    # CREATE A DATAFRAME WITH ONE COLUMN PER PDB NAME
    df_degree = pd.DataFrame()
    df_degree['Position'] = ''
    for i in dict_protein_nodes_chain.keys():
        df_degree[i]= ''

    # FILL THE POSITION COLUMN WITH THE POSITION NUMBERS, WITHOUT REPETITIONS
    for i in dict_protein_nodes_chain:
        for w, row in dict_protein_nodes_chain[i].iterrows():
            if row["Position"] not in df_degree["Position"].values:
                if row['Residue'] in amino_name:
                    df_degree = df_degree.append({'Position': row["Position"]}, ignore_index = True)

    # SORT BY THE POSITION COLUMN AND USE IT AS INDEX
    df_degree = df_degree.sort_values('Position')
    df_degree.set_index('Position', inplace=True)

    # FILL IN THE DEGREES OF EACH PDB
    for i in dict_protein_nodes_chain:
        for w, row in dict_protein_nodes_chain[i].iterrows():
            if row['Residue'] in amino_name:
                df_degree.loc[row['Position'], i] = row['Degree']

    # REMOVE POSITIONS BASED ON THE NUMBER OF NAN VALUES
    sum_nan = df_degree.isnull().sum(axis=1)
    df_sum_nan = sum_nan.to_frame()
    for w, row in df_sum_nan.iterrows():
        # Drop a position when at least 80% of its columns are NaN, or when the
        # NaN count differs from the column count by exactly one.
        if (row[0] >= len(df_degree.columns)*80/100) or (abs(row[0] - len(df_degree.columns)) == 1):
            df_degree.drop(w, inplace=True)

    # CREATE THE MEAN AND MEDIAN COLUMNS
    #df_degree['mean'] = df_degree.mean(axis=1)
    #df_degree['median'] = df_degree.median(axis=1)
    df_degree['std'] = df_degree.std(axis = 1, skipna = True)
    df_degree['Position'] = df_degree.index
    file = open(output_directory+ '/df_degree_analysis.txt', 'a', newline='')
    df_degree.to_csv(file, sep="\t", index=False)
    file.close()

    # Data for plotting
    x = df_degree.index
    y = df_degree['std']
    y1 = - df_degree['std']

    # Create the figure and its axes
    fig,ax = plt.subplots()
    # ticks for the chart points
    ax.set_xticks(df_degree.index)
    # size of the generated chart, in inches
    fig.set_size_inches(130, 10)
    # plot the dataframe data
    ax.plot(x, y, -y, color='xkcd:azure')
    # plot the horizontal lines
    line1 = plt.axhline(y=0.5, color='xkcd:sand', linestyle='--')
    line2 = plt.axhline(y=-0.5, color='xkcd:sand', linestyle='--')
    line3 = plt.axhline(y=1.5, color='xkcd:peach', linestyle='--')
    line4 = plt.axhline(y=-1.5, color='xkcd:peach', linestyle='--')
    # color fill between the two curves
    ax.fill_between(x, y, y1, color='xkcd:powder blue')

    # plot the colored points (upper curve): red >= 1.5, brown in [0.5, 1.5), blue otherwise
    for i in x:
        aux_y = y[i]
        if aux_y >= 1.5:
            plt.plot(i, aux_y, 'ro', color = 'red')
            ax.annotate(' '+str(int(i)), xy=(i, aux_y), xytext=(1, 0), textcoords="offset points", ha='left', va='center', color='red', clip_on=True)
        elif aux_y >= 0.5 and aux_y < 1.5:
            plt.plot(i, aux_y, 'ro', color = 'xkcd:orangey brown')
            ax.annotate(' '+str(int(i)), xy=(i, aux_y), xytext=(1, 0), textcoords="offset points", ha='left', va='center', color='xkcd:orangey brown', clip_on=True)
        else:
            plt.plot(i, aux_y, 'ro', color = 'xkcd:cerulean')
            ax.annotate(' '+str(int(i)), xy=(i, aux_y), xytext=(1, 0), textcoords="offset points", ha='left', va='center', color='xkcd:cerulean', clip_on=True)

    # same coloring for the mirrored (negative) curve
    for i in x:
        aux_y = y1[i]
        if aux_y <= -1.5:
            plt.plot(i, aux_y, 'ro', color = 'red')
            ax.annotate(' '+str(int(i)), xy=(i, aux_y), xytext=(1, 0), textcoords="offset points", ha='left', va='center', color='red', clip_on=True)
        elif aux_y <= -0.5 and aux_y > -1.5:
            plt.plot(i, aux_y, 'ro', color = 'xkcd:orangey brown')
            ax.annotate(' '+str(int(i)), xy=(i, aux_y), xytext=(1, 0), textcoords="offset points", ha='left', va='center', color='xkcd:orangey brown', clip_on=True)
        else:
            plt.plot(i, aux_y, 'ro', color = 'xkcd:cerulean')
            ax.annotate(' '+str(int(i)), xy=(i, aux_y), xytext=(1, 0), textcoords="offset points", ha='left', va='center', color='xkcd:cerulean', clip_on=True)

    # title
    ax.set(title='Titulo', ylabel='Standart Deviation')
    # adjust the chart borders so they touch the axes
    plt.axis(xmin=df_degree.index.min() - 0.2, xmax=df_degree.index.max())
    # add the grid
    ax.grid()
    plt.savefig(output_directory+'/plot.svg')

    # Hand the directory (prefixed with '../') to the R script and wait for it to finish.
    directory = "../" + directory
    r_script_path = os.path.join(project_path, "scripts_corins", "code.R")
    pipe = subprocess.Popen(["Rscript", r_script_path, directory])
    pipe.wait()
## copy everything before original building instructions
## print the new ones
## close
##
##
ext = os.path.splitext(args.infile[0])[1]

# An .lxf file is a zip archive whose IMAGE100.LXFML member holds the XML;
# an .lxfml file is the XML itself.
archive = None
if ext == '.lxf':
    archive = ZipFile(args.infile[0])
    fhandler = archive.open('IMAGE100.LXFML')
elif ext == '.lxfml':
    fhandler = open(args.infile[0], 'r')
else:
    raise IOError("Can only handle .lxf or .lxfml files!")

# Read everything up front and release the handles right away, so they are
# also closed on the early-exit path below (the archive itself used to be
# left open in every case).
try:
    al = fhandler.readlines()
finally:
    fhandler.close()
    if archive is not None:
        archive.close()

## parse XML
a = xmltodict.parse(string.join(al, "\n"))
mpl = check_integrity(a)
if mpl is not None:
    # Parenthesised so the statement parses on both Python 2 and Python 3.
    print("Am not generating building instructions!")
    exit(1)

## find line with original building instructions
li = ['<BuildingInstruction name' in l for l in al]
idx = li.index(True)  # line where building instructions start