def fill_protein_parameters(protein_records):
    """Attach a ``ProteinParameters`` object to every protein record.

    For each record, ``ProteinParameters`` is built from
    ``record.protein.sequence`` and stored in ``record.protein_parameters``.
    Progress is reported through ``show_progress``.

    :param protein_records: sized iterable of records exposing ``protein``
    """
    progress_label = 'Filling protein parameters: '
    show_progress(progress_label, 35, 0.0)

    for position, record in enumerate(protein_records, start=1):
        record.protein_parameters = ProteinParameters(record.protein.sequence)
        show_progress(progress_label, 35, position / len(protein_records))

    # terminate the progress line
    print()
def save_proteins_to_csv(proteins, file_name):
    """Write proteins to a semicolon-separated text file.

    The file starts with the header ``id;name;sequence`` followed by one line
    per protein. Fields are written verbatim (no quoting or escaping).

    :param proteins: sized iterable of objects with string attributes
        ``id``, ``name`` and ``sequence``
    :param file_name: path of the file to (over)write
    """
    progress_label = 'Saving proteins to \'{0}\': '.format(file_name)
    show_progress(progress_label, 40, 0.0)

    with open(file_name, 'w') as output:
        output.write('id;name;sequence\n')
        for position, protein in enumerate(proteins, start=1):
            row = ';'.join((protein.id, protein.name, protein.sequence))
            output.write(row + '\n')
            show_progress(progress_label, 40, position / len(proteins))

    # terminate the progress line
    print()
def load_proteins_from_csv(file_name):
    """Load proteins from a semicolon-separated file with a header row.

    The file is parsed with ``genfromtxt`` using the header as field names;
    the ``id``, ``name`` and ``sequence`` fields of every row are passed
    through ``b2str`` and used to build a ``Protein``.

    :param file_name: path of the file to read
    :return: list of ``Protein`` objects, one per data row, in file order
    """
    # Local import: the module's import block is not visible from this
    # function, and numpy is already a dependency (genfromtxt).
    from numpy import atleast_1d

    label = 'Loading proteins from \'{0}\': '.format(file_name)
    show_progress(label, 40, 0.0)

    # genfromtxt returns a 0-d array when the file holds exactly one data
    # row; a 0-d array supports neither len() nor iteration, so normalize
    # the result to one dimension before looping.
    data = atleast_1d(genfromtxt(file_name, dtype=None, delimiter=';', names=True))
    total = len(data)

    proteins = []
    for index, line in enumerate(data, start=1):
        proteins.append(Protein(id=b2str(line['id']),
                                name=b2str(line['name']),
                                sequence=b2str(line['sequence'])))
        show_progress(label, 40, index / total)

    # terminate the progress line
    print()
    return proteins
def save_protein_records_to_folder(protein_records, folder='results/'):
    """Write every protein record to ``<folder>/<protein id>.txt``.

    The file content is ``str(protein_record)``. A trailing ``/`` is appended
    to ``folder`` when missing. The folder is created if it does not exist.
    An empty ``folder`` means the current working directory.

    :param protein_records: sized iterable of records exposing ``protein.id``
    :param folder: destination directory (default ``'results/'``)
    """
    # Local import: the module's import block is not visible from this function.
    import os

    # ``folder[-1]`` would raise IndexError on an empty string; treat '' as
    # "current directory" and leave it untouched.
    if folder and not folder.endswith('/'):
        folder += '/'
    if folder:
        # open(..., 'w') does not create missing parent directories.
        os.makedirs(folder, exist_ok=True)

    label = 'Saving protein records to \'{0}\': '.format(folder)
    show_progress(label, 40, 0.0)

    for index, protein_record in enumerate(protein_records, start=1):
        with open(folder + protein_record.protein.id + '.txt', 'w') as file:
            file.write(str(protein_record))
        show_progress(label, 40, index / len(protein_records))

    # terminate the progress line
    print()
def fill_peptide_parameters(protein_records):
    """Attach ``PeptideParameters`` to every received and missed peptide record.

    For each protein record the received peptide records are processed first
    (always, even when the list is empty), then the missed peptide records
    (skipped entirely when there are none). Progress is printed per protein
    record and per peptide list.

    :param protein_records: sized iterable of records exposing
        ``received_peptide_records`` and ``missed_peptide_records``
    """

    def fill_records(label_template, peptide_records):
        # One progress bar per peptide list; the label is right-aligned to
        # 25 characters so that consecutive bars line up.
        bar_label = '{0:>25}: '.format(label_template.format(len(peptide_records)))
        show_progress(bar_label, 40, 0.0)
        for position, peptide_record in enumerate(peptide_records, start=1):
            peptide_record.peptide_parameters = PeptideParameters(peptide_record.peptide.sequence)
            show_progress(bar_label, 40, position / len(peptide_records))
        print()

    print('Filling peptide parameters:')

    for record_number, protein_record in enumerate(protein_records, start=1):
        print('Processing protein record #{0} of {1}:'.format(record_number, len(protein_records)))
        stdout.flush()

        # 1. received peptide records come first
        fill_records('Received peptides ({0})', protein_record.received_peptide_records)

        # 2. then missed peptide records, if there are any
        if len(protein_record.missed_peptide_records) != 0:
            fill_records('Missed peptides ({0})', protein_record.missed_peptide_records)

    print()
    print('Filling peptide parameters: done.')
def _build_peptide_parameters_frame(peptide_records, label):
    """Build one DataFrame row of scalar parameters per peptide record."""
    total = len(peptide_records)
    column_positions = {name: position for position, name in enumerate(peptide_parameter_names)}

    # Fill a plain float64 buffer and wrap it once at the end. Chained
    # assignment of the form ``frame[column][row] = value`` is not guaranteed
    # to write through to the frame (and never does under copy-on-write).
    values = zeros((total, len(peptide_parameter_names)), dtype=float64)

    show_progress(label, 32, 0.0)
    for row, peptide_record in enumerate(peptide_records):
        parameters = peptide_record.peptide_parameters
        scalar_values = (
            ('Sequence length', parameters.sequence_length),
            ('Aromaticity', parameters.aromaticity),
            ('Instability', parameters.instability),
            ('Isoelectric point', parameters.isoelectric_point),
            ('Molecular weight', parameters.molecular_weight),
            ('Kyte plot', parameters.kyte_plot),
            ('Aliphatic index', parameters.aliphatic_index),
            ('Boman index', parameters.boman_index),
            ('Hydrophobicity', parameters.hydrophobicity),
        )
        # An unknown column name raises KeyError, as ``frame[name]`` did before.
        for name, value in scalar_values:
            values[row, column_positions[name]] = value
        for kidera_factor in parameters.kidera_factors:
            name = 'Kidera factor: {0}'.format(kidera_factor['name'])
            values[row, column_positions[name]] = kidera_factor['value']

        show_progress(label, 32, (row + 1) / total)

    # terminate the progress line
    print()
    return DataFrame(values, columns=peptide_parameter_names)


def fill_parameter_lists(protein_records):
    """Collect scalar peptide parameters into two DataFrames.

    Each returned frame has one row per peptide record (in protein-record
    order, then peptide order) and one float64 column per entry of
    ``peptide_parameter_names``.

    Rows are zero-based. The previous implementation started the row index
    at 1, which left row 0 as zeros and dropped the last peptide's values.

    :param protein_records: iterable of records exposing
        ``received_peptide_records`` and ``missed_peptide_records``, whose
        items carry a filled ``peptide_parameters`` attribute
    :return: tuple ``(received_parameters, missed_parameters)``
    """
    received_peptide_records = [peptide_record
                                for protein_record in protein_records
                                for peptide_record in protein_record.received_peptide_records]
    missed_peptide_records = [peptide_record
                              for protein_record in protein_records
                              for peptide_record in protein_record.missed_peptide_records]

    received_parameters = _build_peptide_parameters_frame(
        received_peptide_records, 'Filling received peptides parameter lists: ')
    missed_parameters = _build_peptide_parameters_frame(
        missed_peptide_records, 'Filling missed peptides parameter lists: ')

    return received_parameters, missed_parameters
def construct_protein_records(proteins, main_data):
    """Group the peptide matches found in ``main_data`` into protein records.

    Every row of ``main_data`` yields a ``Peptide`` and a ``PeptideMatch``.
    The row's accession number selects a protein (via ``find_protein_with_id``)
    and the match is attached to that protein's record, creating the protein
    record and/or the peptide record when they do not exist yet. Finally the
    received peptide records of each protein record are sorted by sequence
    length, longest first.

    :param proteins: collection searched by ``find_protein_with_id``
    :param main_data: sized iterable of rows indexable by field name
    :return: list of ``ProteinRecord`` objects
    """
    progress_label = 'Constructing protein records: '
    show_progress(progress_label, 40, 0.0)

    protein_records = []

    # 1. walk through all rows of the main data
    for row_number, row in enumerate(main_data, start=1):
        # 1.1. peptide and peptide match described by the current row
        peptide = Peptide(sequence=b2str(row['sequence']))
        match = PeptideMatch(analysis_name=b2str(row['filename']),
                             score=row['score'],
                             reverse_score=row['reverseScore'],
                             percent_of_scored_peak_intensity=row['percent_scored_peak_intensity'],
                             total_intensity=row['totalIntensity'],
                             precursor_averagine_chi_squared=row['precursorAveragineChiSquared'],
                             retention_time_min=row['retentionTimeMin'],
                             chromatographic_peak_width_in_seconds=row['chromatographicPeakWidthSec'])

        # 1.2. protein referenced by the current row
        protein_id = b2str(row['accession_number'])
        protein = find_protein_with_id(proteins, protein_id)
        # TODO: if such a protein does not exist, extract a Protein object and add it to proteins

        # 1.3. record that already holds this protein, if any
        owner_record = find_protein_record_with_protein(protein_records, protein)

        if owner_record is None:
            # 1.4. no record for this protein yet: create one holding this single peptide record
            first_peptide_record = PeptideRecord(peptide, [match])
            owner_record = ProteinRecord(protein, received_peptide_records=[first_peptide_record])
            protein_records.append(owner_record)
        else:
            # 1.5. record exists: attach the match to it
            known_peptide_record = find_peptide_record_with_peptide(owner_record.received_peptide_records,
                                                                    peptide)
            if known_peptide_record is None:
                # 1.5.1. peptide not received yet: add a peptide record with this one match
                owner_record.received_peptide_records.append(PeptideRecord(peptide, [match]))
            else:
                # 1.5.2. peptide already received: just add the match
                known_peptide_record.matches.append(match)

        show_progress(progress_label, 40, row_number / len(main_data))
    print()

    # 2. sort peptide records by sequence length (longest first)
    progress_label = 'Filling received peptide records: '
    show_progress(progress_label, 35, 0.0)

    def sequence_length(peptide_record):
        return len(peptide_record.peptide.sequence)

    for record_number, record in enumerate(protein_records, start=1):
        record.received_peptide_records = sorted(record.received_peptide_records,
                                                 key=sequence_length,
                                                 reverse=True)
        show_progress(progress_label, 35, record_number / len(protein_records))
    print()

    return protein_records
def construct_proteins(main_data):
    """Build the list of distinct proteins mentioned in ``main_data``.

    A ``Protein`` is created from the ``accession_number`` and ``entry_name``
    fields of every row and kept only if an equal protein (per the ``in``
    operator) is not already in the result.

    :param main_data: sized iterable of rows indexable by field name
    :return: list of ``Protein`` objects in order of first appearance
    """
    unique_proteins = []

    progress_label = 'Constructing proteins from main data: '
    show_progress(progress_label, 35, 0.0)

    for row_number, row in enumerate(main_data, start=1):
        candidate = Protein(id=b2str(row['accession_number']),
                            name=b2str(row['entry_name']))

        # keep only the first occurrence of each protein
        if candidate not in unique_proteins:
            unique_proteins.append(candidate)

        show_progress(progress_label, 35, row_number / len(main_data))

    # terminate the progress line
    print()
    return unique_proteins
def fill_missed_peptide_records(protein_records):
    """Populate ``missed_peptide_records`` of every protein record.

    The sequences of the received peptides are passed, together with the
    protein sequence, to ``cut_received_peptide_sequences``; every resulting
    sequence is passed to ``trypsinolize_sequence``, and each fragment it
    yields becomes a ``PeptideRecord`` in ``missed_peptide_records``
    (replacing any previous value).

    :param protein_records: sized iterable of records exposing ``protein``
        and ``received_peptide_records``
    """
    progress_label = 'Filling missed peptide records: '
    show_progress(progress_label, 40, 0.0)

    for position, record in enumerate(protein_records, start=1):
        # 1. sequences of the peptides that were received
        received = [item.peptide.sequence for item in record.received_peptide_records]

        # 2. remaining sequence pieces, each split further into fragments
        leftovers = cut_received_peptide_sequences(record.protein.sequence, received)
        fragment_lists = [trypsinolize_sequence(piece) for piece in leftovers]

        # 3. one peptide record per fragment
        record.missed_peptide_records = [
            PeptideRecord(peptide=Peptide(sequence=fragment))
            for fragments in fragment_lists
            for fragment in fragments
        ]

        show_progress(progress_label, 40, position / len(protein_records))

    # terminate the progress line
    print()
def calculate_simple_statistics(parameters, per_peptide_correlations=None):
    """Compute per-column statistics for one or two column-oriented tables.

    ``calculate_simple_statistics_for_serie`` is applied to every column of
    ``parameters`` and, when given, of ``per_peptide_correlations``. If both
    tables share a column name, the value from ``per_peptide_correlations``
    wins because it is processed last.

    :param parameters: object exposing ``columns`` and column access by name
    :param per_peptide_correlations: optional second table of the same kind
    :return: dict mapping column name to its statistics
    """
    progress_label = 'Calculating simple statistics: '
    show_progress(progress_label, 40, 0.0)

    tables = [parameters]
    if per_peptide_correlations is not None:
        tables.append(per_peptide_correlations)

    # the progress denominator covers the columns of every table
    expected_columns = sum(len(table.columns) for table in tables)

    statistics_by_name = {}
    processed = 0
    for table in tables:
        for column_name in table.columns:
            statistics_by_name[column_name] = calculate_simple_statistics_for_serie(table[column_name])
            processed += 1
            show_progress(progress_label, 40, processed / expected_columns)

    # terminate the progress line
    print()
    return statistics_by_name
def _collect_array_like_parameters(peptide_records, label):
    """Gather the vector-valued parameters of every peptide into matrices.

    Each returned DataFrame has one column per peptide and one row per vector
    component, so ``DataFrame.corr`` yields peptide-to-peptide correlations.

    Values are written into numpy buffers and wrapped at the end, rather
    than through chained ``frame[column][row] = value`` assignment, which is
    not guaranteed to update the frame. A parameter vector longer than its
    matrix has rows now raises IndexError instead of being silently dropped.

    :param peptide_records: list of records with filled ``peptide_parameters``
    :param label: progress-bar label
    :return: dict mapping parameter-group name to its DataFrame
    """
    amino_acids = 'AGVMDYNSWLFIKPQCERTH'
    secondary_structure_fraction_names = ['Helix', 'Turn', 'Sheet']
    total = len(peptide_records)

    kidera_factors = zeros((len(kidera_factor_names), total), dtype=float64)
    acid_percents = zeros((len(amino_acids), total), dtype=float64)
    acid_compounds = zeros((len(amino_acid_group_names), total), dtype=float64)
    hydrophobic_moments = zeros((len(hydrophobic_moments_names), total), dtype=float64)
    secondary_structure_fractions = zeros((len(secondary_structure_fraction_names), total), dtype=float64)

    show_progress(label, 35, 0.0)
    for column, peptide_record in enumerate(peptide_records):
        parameters = peptide_record.peptide_parameters

        for row, kidera_factor in enumerate(parameters.kidera_factors):
            kidera_factors[row, column] = kidera_factor['value']

        for row, acid in enumerate(amino_acids):
            acid_percents[row, column] = parameters.amino_acid_percents[acid]

        for row, group in enumerate(parameters.amino_acids_composition):
            acid_compounds[row, column] = group['percent']

        # NOTE: per-peptide charges were collected here for a Kendall
        # correlation in an earlier revision; that code was disabled.

        # 'Polygly-polypro helix' is excluded and the remaining moments are
        # packed into consecutive rows. The previous code incremented the
        # amino-acid group counter here instead of the moment counter, so
        # every moment overwrote row 0.
        # NOTE(review): assumes hydrophobic_moments_names has at least as
        # many entries as there are non-excluded moments - confirm.
        moment_row = 0
        for moment in parameters.hydrophobic_moments:
            if moment['name'] != 'Polygly-polypro helix':
                hydrophobic_moments[moment_row, column] = moment['moment']
                moment_row += 1

        for row, fraction in enumerate(parameters.secondary_structure_fraction):
            secondary_structure_fractions[row, column] = fraction['value']

        show_progress(label, 35, (column + 1) / total)

    # terminate the progress line
    print()

    return {
        'Kidera factors': DataFrame(kidera_factors),
        'Amino acid percents': DataFrame(acid_percents),
        'Amino acid compositions': DataFrame(acid_compounds),
        'Hydrophobic moments': DataFrame(hydrophobic_moments),
        'Secondary structure fractions': DataFrame(secondary_structure_fractions),
    }


def fill_per_peptide_correlations(protein_records):
    """Compute pairwise (peptide vs. peptide) Pearson correlations.

    For both the received and the missed peptides of all protein records,
    the vector-valued parameters (Kidera factors, amino acid percents, amino
    acid compositions, hydrophobic moments, secondary structure fractions)
    are arranged with one column per peptide. Each correlation matrix from
    ``DataFrame.corr(method='pearson')`` is passed to
    ``convert_correlation_matrix_to_serie`` and stored as the column
    ``'<name> per peptide correlation (Pearson)'``.

    Both result frames are pre-allocated with ``n * (n - 1) // 2`` rows
    (the number of unordered peptide pairs) and one zero-filled column per
    entry of ``per_peptide_correlation_parameter_names``.

    :param protein_records: iterable of records exposing
        ``received_peptide_records`` and ``missed_peptide_records``, whose
        items carry a filled ``peptide_parameters`` attribute
    :return: tuple ``(received_per_peptide_correlations,
        missed_per_peptide_correlations)``
    """
    per_peptide_correlation_parameter_labels = ['{0} per peptide correlation (Pearson)'.format(name)
                                                for name in per_peptide_correlation_parameter_names]

    received_peptide_records = [peptide_record
                                for protein_record in protein_records
                                for peptide_record in protein_record.received_peptide_records]
    missed_peptide_records = [peptide_record
                              for protein_record in protein_records
                              for peptide_record in protein_record.missed_peptide_records]

    def empty_correlations(peptides_number):
        # one row per unordered pair of peptides
        pairs_number = peptides_number * (peptides_number - 1) // 2
        return DataFrame(zeros((pairs_number, len(per_peptide_correlation_parameter_labels)), dtype=float64),
                         columns=per_peptide_correlation_parameter_labels)

    received_per_peptide_correlations = empty_correlations(len(received_peptide_records))
    missed_per_peptide_correlations = empty_correlations(len(missed_peptide_records))

    received_matrices = _collect_array_like_parameters(
        received_peptide_records, 'Filling received peptides array-like parameter lists: ')
    missed_matrices = _collect_array_like_parameters(
        missed_peptide_records, 'Filling missed peptides array-like parameter lists: ')

    # (name as printed in the progress message, name used for the column and the converter)
    parameter_groups = (
        ('Kidera factors', 'Kidera factors'),
        ('amino acid percents', 'Amino acid percents'),
        ('amino acid compositions', 'Amino acid compositions'),
        ('hydrophobic moments', 'Hydrophobic moments'),
        ('secondary structure fractions', 'Secondary structure fractions'),
    )
    peptide_sets = (
        ('received', received_matrices, received_per_peptide_correlations),
        ('missed', missed_matrices, missed_per_peptide_correlations),
    )

    for printed_name, group_name in parameter_groups:
        column_label = '{0} per peptide correlation (Pearson)'.format(group_name)
        for set_name, matrices, correlations in peptide_sets:
            print('Calculating {0} per peptide Pearson correlation ({1} peptides): '.format(printed_name, set_name),
                  end='')
            correlations[column_label] = convert_correlation_matrix_to_serie(
                matrices[group_name].corr(method='pearson'), group_name)
            print('done')

    return received_per_peptide_correlations, missed_per_peptide_correlations