Exemplo n.º 1
0
def fetch_fastas_for_DUDE():
    """Write the FASTA sequences of all DUD-E targets to one file.

    The output is ``fastas_from_dude.txt`` inside ``BLAST_MAIN_FOLDER``.
    If that file already exists, nothing is fetched. Otherwise, for every
    UniProt ID returned by ``DUDE_uniID_from_folder(DUDE_PATH)`` the PDB
    IDs from ``get_pdbs_from_unicode`` are filtered with
    ``check_if_pdb_xray`` and their sequences are downloaded with
    ``get_pdb_fasta``. Within one UniProt ID, only the first PDB entry of
    each distinct sequence is kept. Records are written as
    ``>{uniprot_id}-{pdb}`` followed by the sequence.

    PDB entries whose lookup raises ``HTTPError`` are logged and skipped.
    """
    output_path = os.path.join(BLAST_MAIN_FOLDER, 'fastas_from_dude.txt')
    if os.path.exists(output_path):
        log('DUD-E fastas already exists.')
        return

    log('Starting fetching FASTA\'s for DUDE')
    dude_uni_ids = DUDE_uniID_from_folder(DUDE_PATH)

    # Records are collected in memory and written only once every fetch has
    # finished: the existence of the file is the "already done" marker, so
    # it must never be left half-written by an unexpected error.
    records = []
    for uniprot_id in dude_uni_ids:
        seen_sequences = set()  # sequences already emitted for this ID
        for pdb in get_pdbs_from_unicode(uniprot_id):
            try:
                if not check_if_pdb_xray(pdb):
                    continue
                sequence = get_pdb_fasta(pdb).strip('"')
            except HTTPError:
                log(f'Failed fetching FASTA sequence for: {pdb}')
                continue
            if sequence in seen_sequences:
                continue
            seen_sequences.add(sequence)
            records.append(f'>{uniprot_id}-{pdb}\n{sequence}\n')

    with open(output_path, 'w') as handle:
        handle.writelines(records)
Exemplo n.º 2
0
def fetch_fastas_for_DEKOIS():
    """Write the FASTA sequences of all DEKOIS targets to one file.

    The output is ``fastas_from_dekois.txt`` inside ``BLAST_MAIN_FOLDER``.
    If that file already exists, nothing is fetched. Otherwise the
    whitespace-separated ``'Uniprot_ID'`` values of every entry returned by
    ``DEKOIS_uniID_from_folder(DEKOIS_PATH)`` are collected; entries
    without that key are logged and ignored. For each UniProt ID the PDB
    IDs from ``get_pdbs_from_unicode`` are filtered with
    ``check_if_pdb_xray`` and their sequences are downloaded with
    ``get_pdb_fasta``. Within one UniProt ID, only the first PDB entry of
    each distinct sequence is kept. Records are written as
    ``>{uniprot_id}-{pdb}`` followed by the sequence.

    PDB entries whose lookup raises ``HTTPError`` are logged and skipped.
    """
    output_path = os.path.join(BLAST_MAIN_FOLDER, 'fastas_from_dekois.txt')
    if os.path.exists(output_path):
        log('DEKOIS fastas already exists.')
        return

    log('Starting fetching FASTA\' s for DEKOIS')
    dekois_ligands, _ = DEKOIS_uniID_from_folder(DEKOIS_PATH)

    dekois_ligands_uni_ids = set()
    for ligand in dekois_ligands:
        try:
            uniprot_field = dekois_ligands[ligand]['Uniprot_ID']
        except KeyError:
            log(f'No Uniprot ID for {ligand}')
            continue
        dekois_ligands_uni_ids.update(uniprot_field.split())

    # Records are collected in memory and written only once every fetch has
    # finished: the existence of the file is the "already done" marker, so
    # it must never be left half-written by an unexpected error.
    records = []
    # A set has no stable iteration order; sorting keeps the output file
    # identical from one run to the next.
    for uniprot_id in sorted(dekois_ligands_uni_ids):
        seen_sequences = set()  # sequences already emitted for this ID
        for pdb in get_pdbs_from_unicode(uniprot_id):
            try:
                if not check_if_pdb_xray(pdb):
                    continue
                sequence = get_pdb_fasta(pdb).strip('"')
            except HTTPError:
                log(f'Failed fetching FASTA sequence for: {pdb}')
                continue
            if sequence in seen_sequences:
                continue
            seen_sequences.add(sequence)
            records.append(f'>{uniprot_id}-{pdb}\n{sequence}\n')

    with open(output_path, 'w') as handle:
        handle.writelines(records)
Exemplo n.º 3
0
def fetch_fastas_for_chembl(csv_path, output):
    """Download the FASTA sequence of each target's main PDB structure.

    Args:
        csv_path: CSV file read with its first column as index; it must
            provide the columns ``'ChEMBL ID'`` and
            ``'main_PDB_structure'``.
        output: Path of the FASTA file to create. If it already exists,
            nothing is downloaded.

    Each record is written as ``>{ChEMBL ID}-{PDB}`` followed by the value
    returned by ``get_pdb_fasta`` for that PDB. Rows whose
    ``'main_PDB_structure'`` is missing or empty are logged and skipped.
    """
    if os.path.exists(output):
        log(f'{output} already exists!')
        return

    main_table = pd.read_csv(csv_path, index_col=0)
    log(f'Downloading fastas for {len(main_table)} targets')

    # Records are collected first and written in one go, so a failure while
    # downloading cannot leave behind a partial file that later runs would
    # mistake for a finished one.
    records = []
    for _, row in main_table.iterrows():
        chembl_name = row['ChEMBL ID']
        pdb_name = row['main_PDB_structure']
        # An empty cell comes back from read_csv as NaN, not as ''.
        if pd.isna(pdb_name) or pdb_name == '':
            log(f'No main PDB structure for {chembl_name}, skipping.')
            continue
        fasta = get_pdb_fasta(pdb_name)
        records.append(f'>{chembl_name}-{pdb_name}\n{fasta}\n')

    with open(output, 'w') as handle:
        handle.writelines(records)
Exemplo n.º 4
0
def choose_primary_pdb_for_chembl(csv_path):
    """Pick one PDB structure per row and store it back into the CSV.

    The CSV at ``csv_path`` is read with its first column as index. For
    every row the whitespace-separated IDs in ``'PDB_entry'`` are passed
    to ``get_pdb_fasta`` and the ID with the longest returned sequence
    (the first one on ties) is stored in a new ``'main_PDB_structure'``
    column. Rows without any PDB ID, or whose sequences are all empty,
    keep an empty string. The table is then written back to ``csv_path``.

    Args:
        csv_path: Path of the CSV file to update in place.
    """
    main_table = pd.read_csv(csv_path, index_col=0)
    main_table['main_PDB_structure'] = ''
    for index, row in main_table.iterrows():
        entry = row['PDB_entry']
        # An empty cell is read back as NaN (a float), which has no
        # .split(); treat anything that is not text as "no PDB IDs".
        pdbs = entry.split() if isinstance(entry, str) else []
        if not pdbs:
            continue
        sequences = [get_pdb_fasta(pdb) for pdb in pdbs]
        # max() returns the first position among equally long sequences.
        best_pos = max(range(len(pdbs)), key=lambda pos: len(sequences[pos]))
        if sequences[best_pos]:
            main_table.at[index, 'main_PDB_structure'] = pdbs[best_pos]
    main_table.to_csv(csv_path)
def choose_primary_pdb_for_chembl(main_table):
    """Pick one PDB structure per row and keep one row per structure.

    For every row the whitespace-separated IDs in ``'PDB_entry'`` are
    passed to ``get_pdb_fasta`` and the ID with the longest returned
    sequence (the first one on ties) is stored in a
    ``'main_PDB_structure'`` column, which is added to the passed table
    itself. The result is sorted by ``'Active_compounds'`` in descending
    order, reduced to the first row per ``'main_PDB_structure'`` and
    stripped of rows that got no structure.

    Args:
        main_table: DataFrame with the columns ``'PDB_entry'`` and
            ``'Active_compounds'``.

    Returns:
        A new DataFrame holding the filtered rows.
    """
    main_table['main_PDB_structure'] = ''
    for index, row in main_table.iterrows():
        entry = row['PDB_entry']
        # A missing value is NaN (a float), which has no .split(); treat
        # anything that is not text as "no PDB IDs".
        pdbs = entry.split() if isinstance(entry, str) else []
        log(f'Fetching PDBs for {index}, {len(pdbs)} PDBs to fetch.')
        if not pdbs:
            continue
        sequences = [get_pdb_fasta(pdb) for pdb in pdbs]
        # max() returns the first position among equally long sequences.
        best_pos = max(range(len(pdbs)), key=lambda pos: len(sequences[pos]))
        if sequences[best_pos]:
            main_table.at[index, 'main_PDB_structure'] = pdbs[best_pos]
    main_table = main_table.sort_values(
        'Active_compounds',
        ascending=False).drop_duplicates(subset='main_PDB_structure')
    main_table = main_table[main_table['main_PDB_structure'] != '']
    return main_table
Exemplo n.º 6
0
def make_blast_csv(master_path, blasts=None):
    """Summarise the first BLAST hit of every query into one CSV file.

    The table at ``master_path`` is read with its first column as index
    and reduced to the ChEMBL ID, main PDB structure and compound-count
    columns. For every BLAST result file in ``blasts`` six columns are
    added, suffixed with a name parsed from the file name, and filled from
    the first row of each consecutive run of rows sharing a query. The
    result is written to ``chembl_blast_results.csv`` inside
    ``BLAST_MAIN_FOLDER``.

    Args:
        master_path: Path of the master CSV table.
        blasts: Iterable of BLAST result paths. The file name must look
            like ``<x>-<name>_<rest>``; ``<name>`` becomes the column
            suffix. ``None`` (the default) adds no BLAST columns.
    """
    main_table = pd.read_csv(master_path, index_col=0)
    # Copy the slice so that adding columns below never touches main_table.
    output_df = main_table.loc[:, [
        'ChEMBL ID', 'main_PDB_structure', 'Active_compounds',
        'Inactive_compounds'
    ]].copy()
    output_file = os.path.join(BLAST_MAIN_FOLDER, 'chembl_blast_results.csv')

    # Sequence length per query PDB, so that each one is fetched only once
    # even when several BLAST result files are processed.
    query_lengths = {}

    for db in blasts or ():
        # Requested columns, in unpacking order: query ID, hit ID,
        # identity %, query start, query end, e-value.
        blast_results = load_blast_results(db, [0, 1, 2, 6, 7, 10])
        db_name = os.path.basename(db).split('_')[0].split('-')[1]
        for column in ('identity%', 'evalue', 'target_name',
                       'query_alignment_length', 'total_query_length',
                       'alignment_to_total_ratio'):
            output_df[f'{column}_{db_name}'] = ''

        previous_name = None
        for chembl_id, db_target, identity, q_start, q_end, evalue in blast_results:
            name = chembl_id.split('-')[0]
            # Only the first row of each query is used. The IDs are compared
            # exactly: a substring test would treat "CHEMBL12" as already
            # seen in "CHEMBL123-..." and drop that query's first hit.
            if name == previous_name:
                continue
            previous_name = name

            query_pdb = chembl_id.split('-')[-1]
            if query_pdb not in query_lengths:
                query_lengths[query_pdb] = len(get_pdb_fasta(query_pdb))
            query_pdb_smile_len = query_lengths[query_pdb]
            # NOTE(review): if q_start/q_end are 1-based inclusive BLAST
            # coordinates this is one residue short - confirm.
            query_alignment_length = q_end - q_start

            # NOTE(review): rows are addressed by ChEMBL ID, which assumes
            # the table index holds ChEMBL IDs - confirm; otherwise .at
            # appends new rows instead of filling existing ones.
            output_df.at[name, f'identity%_{db_name}'] = identity
            output_df.at[name, f'evalue_{db_name}'] = evalue
            output_df.at[name, f'target_name_{db_name}'] = db_target
            output_df.at[
                name,
                f'query_alignment_length_{db_name}'] = query_alignment_length
            output_df.at[
                name,
                f'total_query_length_{db_name}'] = query_pdb_smile_len
            output_df.at[
                name,
                f'alignment_to_total_ratio_{db_name}'] = query_alignment_length / query_pdb_smile_len
    output_df.to_csv(output_file)