示例#1
0
def data_transforming(traindf):
    """Turn the SMILES strings in ``traindf`` into mol2vec feature vectors.

    Three columns are added to ``traindf`` in place: ``mol`` (the result of
    ``Chem.MolFromSmiles`` on ``SMILES sequence``), ``sentence`` (a radius-1
    ``MolSentence``) and ``mol2vec`` (one ``DfVec`` per molecule).  A short
    preview built from ``traindf['mol'][1]`` is printed along the way.

    The word2vec ``model`` is read from the enclosing module scope.

    Returns:
        A numpy array built from the ``.vec`` attribute of every ``DfVec``.
    """
    # SMILES -> RDKit molecule
    traindf['mol'] = traindf['SMILES sequence'].apply(Chem.MolFromSmiles)

    # Preview of the intermediate representations for one molecule
    preview_mol = traindf['mol'][1]
    print('Molecular sentence:', mol2alt_sentence(preview_mol, radius=1))
    print('\nMolSentence object:',
          MolSentence(mol2alt_sentence(preview_mol, radius=1)))
    preview_vec = sentences2vec(
        MolSentence(mol2alt_sentence(preview_mol, radius=1)),
        model,
        unseen='UNK')
    print('\nDfVec object:', DfVec(preview_vec))

    # One MolSentence per row
    def _row_sentence(row):
        return MolSentence(mol2alt_sentence(row['mol'], 1))

    traindf['sentence'] = traindf.apply(_row_sentence, axis=1)

    # unseen='UNK' makes sentences2vec map unknown substructures to the
    # model's 'UNK' vector.
    embedded = sentences2vec(traindf['sentence'], model, unseen='UNK')
    traindf['mol2vec'] = [DfVec(vec) for vec in embedded]
    return np.array([item.vec for item in traindf['mol2vec']])
def mol2vec(fin_name, fout_name, clean=False):
    """Embed every SMILES of a CSV file with the pre-trained mol2vec model.

    Args:
        fin_name: path of the input CSV; it must have a ``smiles`` column.
        fout_name: path of the CSV to write.  It holds a ``smiles`` column
            followed by one ``vec_<i>`` column per embedding dimension.
        clean: when true, ``clean_file(fin_name, fin_name)`` is run first.
            Presumably this drops SMILES that cannot be converted to
            molecules and rewrites the input file in place -- confirm
            against ``clean_file``.  It only needs to be done once per file.

    Returns:
        numpy array with one embedding row per molecule.
    """
    if clean:
        print('cleaning data...')
        clean_file(fin_name, fin_name)

    clean_data = pd.read_csv(fin_name)

    # Load pre-trained model
    model = word2vec.Word2Vec.load('./models/model_300dim.pkl')

    print('making vec data...')
    smiles = clean_data['smiles'].values

    # Convert to sentences
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    sentences = [MolSentence(mol2alt_sentence(mol, 1)) for mol in mols]

    # Convert to vectors
    vecs = [DfVec(x) for x in sentences2vec(sentences, model, unseen='UNK')]
    vec_values = np.array([v.vec for v in vecs])

    # Form dataframe.  The number of columns follows the loaded model rather
    # than a hard-coded 300, so a model of another dimensionality also works.
    cols = ['vec_' + str(i) for i in range(model.vector_size)]
    df = pd.DataFrame(vec_values, columns=cols)
    df.insert(0, "smiles", smiles, True)

    df.to_csv(fout_name)

    return vec_values
示例#3
0
    def process_ligands(self, ligands):
        """Encode ligands according to ``self.drug_format``.

        * ``"labeled_smiles"``: every ligand is passed through
          ``label_smiles`` with ``self.SMILEN`` and ``self.charsmiset``.
          ``ligands`` is either an ``OrderedDict`` (iterated by key) or an
          object with ``.shape[0]`` that is indexed positionally.
        * ``"mol2vec"``: ligands are parsed by RDKit, rows whose molecule
          could not be built are dropped, and the remaining molecules are
          embedded with the word2vec model at ``self.mol2vec_model_path``.

        Returns:
            The encoded ligands, or an empty list for any other
            ``drug_format``.
        """
        XD = []

        if self.drug_format == "labeled_smiles":
            # isinstance also accepts OrderedDict subclasses, which the
            # previous exact type comparison sent down the array branch.
            if isinstance(ligands, OrderedDict):
                iterator = ligands.keys()
            else:
                iterator = range(ligands.shape[0])

            XD = [
                label_smiles(ligands[d], self.SMILEN, self.charsmiset)
                for d in iterator
            ]

        elif self.drug_format == "mol2vec":
            from gensim.models import word2vec
            from mol2vec.features import (MolSentence, mol2alt_sentence,
                                          sentences2vec)
            from rdkit.Chem import PandasTools

            word2vec_model = word2vec.Word2Vec.load(self.mol2vec_model_path)
            df_ligands = pd.DataFrame({"smiles": ligands})

            PandasTools.AddMoleculeColumnToFrame(df_ligands, "smiles", "ROMol")
            # Work on an explicit copy: adding a column to a filtered slice
            # triggers SettingWithCopyWarning and may not stick.
            dtc_train = df_ligands[df_ligands["ROMol"].notnull()].copy()
            dtc_train.loc[:, "mol-sentence"] = dtc_train.apply(
                lambda x: MolSentence(
                    mol2alt_sentence(x["ROMol"], self.mol2vec_radius)),
                axis=1,
            )
            XD = sentences2vec(dtc_train["mol-sentence"],
                               word2vec_model,
                               unseen="UNK")

        return XD
示例#4
0
def mol2vec_features(mol2vec_path, dataframe, smiles_col, target_col, pad_to):
    """Build padded per-substructure mol2vec features for a data frame.

    Args:
        mol2vec_path: path of the word2vec model to load.
        dataframe: frame holding the SMILES and the targets.
        smiles_col: name of the SMILES column.
        target_col: name of the target column.
        pad_to: number of substructure slots per molecule; longer sentences
            are truncated, shorter ones stay zero-padded.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_valid, pad_to, model.vector_size)`` and ``labels`` holds the
        targets of the rows whose SMILES could be parsed.
    """
    model = word2vec.Word2Vec.load(mol2vec_path)
    smiles_lst = dataframe[smiles_col].to_numpy()
    labels_lst = dataframe[target_col].to_numpy()

    # Validate the SMILES first, keeping the parsed molecules so that each
    # string is only parsed once.
    valid_rows = []
    mollst = []
    for row, smi in enumerate(smiles_lst):
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # Best effort: a cell RDKit cannot even accept is skipped.
            continue
        if mol is None:
            continue
        valid_rows.append(row)
        mollst.append(mol)

    # An explicit integer dtype keeps the fancy index valid when no row
    # survived (np.array([]) would be float and raise IndexError).
    keep = np.array(valid_rows, dtype=int)
    labels = np.array(labels_lst[keep])

    # mol2vec embeddings
    sentences = [mol2alt_sentence(mol, 1) for mol in mollst]
    features = np.zeros([len(mollst), pad_to, model.vector_size])
    print("mean: ", labels.mean(), "std: ", labels.std())
    for mol_no, sentence in enumerate(sentences):
        count = 0
        for word in sentence:
            if count == pad_to:
                break
            try:
                features[mol_no, count] = model.wv[word]
                count += 1
            except KeyError:
                # Substructure unknown to the model: skip it, no slot used.
                pass
    assert features.shape[0] == labels.shape[0]
    return features, labels
def embed_single_smiles(smiles):
    """Embed one SMILES string with the model at ``data/model_300dim.pkl``.

    The molecule is converted to a radius-1 ``MolSentence`` and handed to
    ``sentences2vec`` with ``unseen='UNK'``; that call's result is returned
    unchanged.
    """
    w2v = word2vec.Word2Vec.load('data/model_300dim.pkl')
    mol_sentence = MolSentence(
        mol2alt_sentence(Chem.MolFromSmiles(smiles), 1))
    return sentences2vec(mol_sentence, w2v, unseen='UNK')
示例#6
0
def mol2sentence(smiles_batch: List[str], vocab,
                 args: Namespace) -> List[dict]:
    """Encode SMILES strings as fixed-length id tensors.

    Each SMILES is turned into a mol2vec sentence (looked up in, or added
    to, the module-level ``SMILES_TO_SENTENCE`` cache), mapped to vocabulary
    ids, wrapped in start/end markers, and truncated or padded to
    ``args.seq_len``.  SMILES that RDKit cannot parse are skipped.

    Returns:
        One dict per encoded molecule with the tensors ``input`` and
        ``segment_label``.
    """
    encoded = []
    for smiles in smiles_batch:
        if smiles not in SMILES_TO_SENTENCE:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            SMILES_TO_SENTENCE[smiles] = mol2alt_sentence(
                mol, radius=args.radius)
        tokens = SMILES_TO_SENTENCE[smiles]

        # Convert to ids; unknown tokens fall back to the UNK id.
        ids = [vocab.stoi.get(token, vocab.unk_index) for token in tokens]
        ids = [vocab.sos_index] + ids + [vocab.eos_index]

        token_ids = ids[:args.seq_len]
        segment_label = [1] * len(token_ids)

        # Right-pad both sequences up to seq_len.
        padding = [vocab.pad_index] * (args.seq_len - len(token_ids))
        token_ids += padding
        segment_label += padding

        encoded.append({
            'input': torch.tensor(token_ids),
            'segment_label': torch.tensor(segment_label),
        })

    return encoded
示例#7
0
def _parallel_job(smiles, r):
    """Helper function for joblib jobs.

    Returns the radius-``r`` mol2vec sentence of ``smiles`` as one
    space-separated string, or ``None`` when ``smiles`` is ``None``.
    """
    if smiles is None:
        return None
    molecule = Chem.MolFromSmiles(smiles)
    return " ".join(mol2alt_sentence(molecule, r))
def embed_smiles(smiles):
    """Embed each SMILES of an iterable with ``data/model_300dim.pkl``.

    Returns a list holding, for every input string, the result of
    ``sentences2vec`` on its radius-1 ``MolSentence``.
    """
    model = word2vec.Word2Vec.load('data/model_300dim.pkl')
    embedded = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        mol_sentence = MolSentence(mol2alt_sentence(mol, 1))
        embedded.append(sentences2vec(mol_sentence, model, unseen='UNK'))
    return embedded
示例#9
0
def featurize(ligand_data, trained_model, outpath):
    """Generate features from mol2vec model.

    Writes ``fingerprints.pkl``, ``embeddings.pkl``, ``vectors.pkl`` and
    ``ligand_vectors.csv`` into ``outpath``.

    Parameters
    ----------
    ligand_data: (str) A path to a csv file containing ligand structure data.
        The columns ``canonical_smiles`` and ``molecule_chembl_id`` are read.
    trained_model: (str) Path to a pickle file of a trained word2vec model.
    outpath: (str) Path for storing output files.
    """
    data = pd.read_csv(ligand_data)

    # Read chemical structures (lazily, one molecule at a time)
    ligands = (Chem.MolFromSmiles(x) for x in data['canonical_smiles'])

    # Generate fingerprints.  The column is built in one go instead of via
    # data['words'][i] = ..., which is chained assignment and is not
    # guaranteed to write through to the frame.
    print("Generating molecular fingerprints.")
    words = [
        list(mol2alt_sentence(mol, 1))
        for mol in tqdm(ligands, total=len(data))
    ]
    data['words'] = pd.Series(words, index=data.index, dtype='object')
    with open(outpath + "/fingerprints.pkl", 'wb') as handle:
        pickle.dump(data, handle)

    print("Finding unique fingerprints.")
    all_words = np.array(
        [word for sentence in data['words'] for word in sentence])
    unique_words = np.unique(all_words)

    # Create a data frame of embeddings (one column per substructure word)
    print("Storing embeddings.")
    model = word2vec.Word2Vec.load(trained_model)
    embeddings = {}
    for word in unique_words:
        try:
            embeddings[word] = model.wv[word]
        except KeyError:
            # Word unknown to the model: use a zero vector of matching size.
            embeddings[word] = np.zeros(model.vector_size)
    embeddings = pd.DataFrame(embeddings)
    with open(outpath + "/embeddings.pkl", 'wb') as handle:
        pickle.dump(embeddings, handle)

    # Create a data frame to store ligand vectors: each ligand is the sum of
    # the embeddings of its words.  Rows are paired directly with their own
    # sentence; for a repeated id the last row wins, as before.
    vectors = {}
    print("Generating vectors.")
    rows = zip(data['molecule_chembl_id'], data['words'])
    for mol_id, sentence in tqdm(rows, total=len(data)):
        vectors[mol_id] = np.sum(embeddings[sentence], axis=1)
    vectors = pd.DataFrame(vectors).T

    print("Writing csv file.")
    with open(outpath + "/vectors.pkl", 'wb') as handle:
        pickle.dump(vectors, handle)
    vectors.to_csv(outpath + "/ligand_vectors.csv")
示例#10
0
 def polymer_embeddings(cls, smile):
     """Embed one SMILES string with ``regressor/POLYINFO_PI1M.pkl``.

     Returns a numpy array built from the ``DfVec`` vectors that
     ``sentences2vec`` produces for the single radius-1 sentence.
     """
     w2v = word2vec.Word2Vec.load('regressor/POLYINFO_PI1M.pkl')
     mol_sentence = MolSentence(
         mol2alt_sentence(Chem.MolFromSmiles(smile), 1))
     embedded = sentences2vec([mol_sentence], w2v, unseen='UNK')
     return np.array([DfVec(vec).vec.tolist() for vec in embedded])
def vec_mol2vec_smile(smiles: List[str], mol2vec) -> np.ndarray:
    """Embed SMILES strings as padded sequences of substructure vectors.

    Every radius-1 substructure identifier of each molecule is passed to
    ``get_embbed`` together with ``mol2vec``; the resulting sequences are
    post-padded / post-truncated by Keras ``pad_sequences`` as float32.
    """
    # TODO evaluate impact of radius
    vec_seqs = []
    for smi in smiles:
        substructures = mol2alt_sentence(Chem.MolFromSmiles(smi), 1)
        vec_seqs.append(
            [get_embbed(token, mol2vec) for token in substructures])
    return tf.keras.preprocessing.sequence.pad_sequences(
        vec_seqs, padding="post", truncating="post", dtype="float32")
示例#12
0
def download_data(dev_mode: str,
                  model: word2vec.Word2Vec) -> (np.ndarray, np.ndarray):
    """
    Returns tuple X, y which are numpy arrays

    ``dev_mode`` must be ``'true'`` or ``'false'`` (case-insensitive).
    ``'false'`` reads ``HIV.csv`` and uses ``HIV_active`` as label;
    ``'true'`` reads the example ``ames.sdf`` and uses ``class``.  Both
    files are looked up in ``args.data_dir`` (module-level ``args``).
    """
    mode = dev_mode.lower()
    assert mode == 'false' or mode == 'true'

    if mode == 'false':
        print('Using Actual Data...')
        df = pd.read_csv(os.path.join(args.data_dir, 'HIV.csv'))

        def to_sentence(row):
            return MolSentence(
                mol2alt_sentence(Chem.MolFromSmiles(row['smiles']), 1))

        label_column = 'HIV_active'
    else:
        # use example data set
        df = PandasTools.LoadSDF(os.path.join(args.data_dir, 'ames.sdf'))

        def to_sentence(row):
            return MolSentence(mol2alt_sentence(row['ROMol'], 1))

        label_column = 'class'

    df['sentence'] = df.apply(to_sentence, axis=1)
    embedded = sentences2vec(df['sentence'], model, unseen='UNK')
    df['mol2vec'] = [DfVec(vec) for vec in embedded]

    # convert dataframe into numpy array for training
    X = np.array([item.vec for item in df['mol2vec']])
    y = np.array(df[label_column].astype(int))

    return X, y
def mol2vec(data):
    """Append mol2vec embeddings to the descriptor columns of ``data``.

    ``data`` must hold the columns ``smiles``, ``activity`` and ``mol``;
    the columns ``sentence`` and ``mol2vec`` are added to it in place.

    Returns:
        A new frame made of every column of ``data`` except ``smiles``,
        ``activity`` and ``mol``, followed by the embedding columns.
    """
    x = data.drop(columns=['smiles', 'activity', 'mol'])
    model = word2vec.Word2Vec.load('model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
    # Extracting embeddings to a numpy.array.  unseen='UNK' makes
    # sentences2vec map unknown substructures to the model's 'UNK' vector.
    data['mol2vec'] = [
        DfVec(vec)
        for vec in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    x_mol = np.array([item.vec for item in data['mol2vec']])
    # Reuse the index of ``data``: with a fresh 0..n-1 index the concat
    # below would misalign rows (and fill NaN) for any frame that was
    # filtered or shuffled beforehand.
    x_mol = pd.DataFrame(x_mol, index=data.index)
    # Concatenating matrices of features
    new_data = pd.concat((x, x_mol), axis=1)
    return new_data
示例#14
0
    def forward(self,
                smiles_batch: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed a batch of SMILES as zero-padded substructure sequences.

        Each SMILES is converted to a radius-1 mol2vec sentence; every word
        is looked up in ``self.mol2vec`` and replaced by ``self.unk_emb``
        when unknown.

        Returns:
            ``(emb_tensor, length_tensor)``: the padded embeddings of shape
            ``(len(smiles_batch), max_seq_len, self.mol2vec_embed_dim)``
            (passed through ``self.ffn`` when it is set) and the sentence
            lengths.  Both are moved to CUDA when it is available.
        """
        embs = []
        lengths = []
        max_seq_len = 0
        batch_size = len(smiles_batch)
        for smiles in smiles_batch:
            # NOTE(review): a SMILES found in self.mapping is skipped
            # entirely -- its cached value is not used and nothing is
            # appended, so the outputs then hold fewer filled rows than
            # batch_size.  Behaviour kept as-is; confirm the intent.
            if self.mapping.get(smiles, None) is not None:
                continue
            mol = Chem.MolFromSmiles(smiles)
            sentence = mol2alt_sentence(mol, radius=1)
            emb = []
            for word in sentence:
                try:
                    try:
                        vec = self.mol2vec.wv.word_vec(word)
                    except AttributeError:
                        # Object without a .wv attribute: index it directly.
                        vec = self.mol2vec[word]
                except KeyError:
                    vec = self.unk_emb
                emb.append(vec)
            # (seq_len, embed_dim)
            # np.float / np.int were removed in NumPy 1.24; the explicit
            # 64-bit dtypes are what those aliases resolved to.
            emb = np.array(emb, dtype=np.float64)
            seq_len = len(sentence)
            max_seq_len = max(max_seq_len, seq_len)
            embs.append(emb)
            lengths.append(seq_len)
        # embs: List[np.ndarray]
        emb_data = np.zeros((batch_size, max_seq_len, self.mol2vec_embed_dim),
                            dtype=np.float64)
        for emb_no, emb in enumerate(embs):
            emb_data[emb_no, :lengths[emb_no]] = emb
        emb_tensor = torch.Tensor(emb_data)
        length_data = np.array(lengths, dtype=np.int64)
        length_tensor = torch.LongTensor(length_data)

        if torch.cuda.is_available():
            emb_tensor = emb_tensor.cuda()
            length_tensor = length_tensor.cuda()

        if self.ffn is not None:
            emb_tensor = self.ffn(emb_tensor)

        return emb_tensor, length_tensor
def smiles2vector_duplicates_average(smiles_string):
    """Convert SMILES to the mean of its substructure embeddings.

    Repeated substructures count once per occurrence.  Vectors come from
    the module-level ``mol2vec_model``.

    Args:
        smiles_string (string): single SMILES string

    Returns:
        embedding (numpy.ndarray): element-wise mean of the word vectors of
        the radius-1 mol2vec sentence

    """
    sentence = mol2alt_sentence(Chem.MolFromSmiles(smiles_string), radius=1)
    total = sum(mol2vec_model.wv[word] for word in sentence)
    return total / len(sentence)
示例#16
0
def mol2vec_features(model, dataframe, smiles_col, target_col, pad_to):
    """Build padded per-substructure mol2vec features for a data frame.

    For each molecule the vectors of its known substructure words fill the
    first slots of a ``(pad_to, model.vector_size)`` block; words missing
    from the model are skipped and surplus words are dropped.

    Returns:
        ``(features, labels)`` with ``features`` of shape
        ``(n_rows, pad_to, model.vector_size)`` and ``labels`` the target
        column reshaped to ``(n_rows, 1)``.
    """
    molecules = [Chem.MolFromSmiles(smi) for smi in dataframe[smiles_col]]
    sentences = [mol2alt_sentence(mol, 1) for mol in molecules]
    features = np.zeros([len(molecules), pad_to, model.vector_size])
    labels = np.array(dataframe[target_col]).reshape(-1, 1)
    print("mean: ", labels.mean(), "std: ", labels.std())
    for row, sentence in enumerate(sentences):
        slot = 0
        for word in sentence:
            if slot == pad_to:
                break
            try:
                vector = model.wv[word]
            except KeyError:
                # Unknown substructure: it does not consume a slot.
                continue
            features[row, slot] = vector
            slot += 1
    assert features.shape[0] == labels.shape[0]
    return features, labels
示例#17
0
def jak2(smile):
    """Score a SMILES string with the module-level ``clf`` model.

    The molecule is embedded with mol2vec (module-level ``model``), scored
    by ``clf.predict`` on an ``xgb.DMatrix``, scaled by 0.9 and perturbed
    with uniform noise in ``[-0.05, 0.05)``.

    Returns:
        ``0`` when the SMILES cannot be parsed, otherwise the noisy score.
    """
    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return 0
    sentence = MolSentence(mol2alt_sentence(mol, 1))
    fp = [
        DfVec(x).vec.tolist()
        for x in sentences2vec(np.array([sentence]), model, unseen='UNK')
    ]
    score = clf.predict(xgb.DMatrix(fp))[0]
    try:
        qed = QED.qed(mol)
    except Exception:
        # Best effort: fall back to 0, but no longer swallow
        # KeyboardInterrupt / SystemExit as the bare ``except`` did.
        qed = 0
    # QED currently carries weight 0; the term is kept so the weights can
    # be tuned in one place.
    score = 1 * score + 0 * qed
    score = score * 0.9 + (np.random.random_sample() - 0.5) * 0.1
    return score
示例#18
0
def get_IC50():
    """
    Write a file containing the IC50, SMILES, SMILES embedding and protein embedding from the BindingDB dataset
    Input file size is 3,5Gb
    Output file size is around 25Gb

    Reads ``data/BindingDB_All.tsv`` (tab-separated; field 1 is used as the
    ligand SMILES, field 9 as the IC50 and field 37 as the protein sequence)
    and writes ``data/BindingDB_IC50.tsv``.  A row is written only when its
    IC50 is non-empty, carries no ``<`` / ``>`` qualifier and its SMILES
    can be embedded.
    """
    # Get all protein sequences (the first line is the header and is dropped)
    with open('data/BindingDB_All.tsv', encoding='utf-8') as i:
        Protein = [line.split("\t")[37] for line in i]
    del Protein[0]
    # Embed the sequences
    # NOTE(review): assumes embed_protein yields exactly one embedding per
    # sequence, in input order -- confirm against embed_protein.
    protein_embed = embed_protein(100, Protein, 3, 5, 5)

    model = word2vec.Word2Vec.load('data/model_300dim.pkl')
    with open('data/BindingDB_All.tsv', encoding='utf-8') as i, \
            open('data/BindingDB_IC50.tsv', 'w') as o:
        for z, line in enumerate(i):
            splitline = line.split("\t")

            # Write the header
            if z == 0:
                o.write("IC50\tLigand SMILES\tSMILES embedding"
                        "\tProtein embedding\n")
                continue

            # Consume one protein embedding for EVERY data row.  Previously
            # it was only advanced for rows that passed the IC50 filters,
            # so after the first filtered row the embeddings were paired
            # with the wrong lines.
            protein_vec = next(protein_embed)

            # Write the info only when the IC50 and the SMILES code are valid
            ic50 = splitline[9]
            if ic50 == "" or "<" in ic50 or ">" in ic50:
                continue
            try:
                m = Chem.MolFromSmiles(splitline[1])
                smiles_embedding = sentences2vec(
                    MolSentence(mol2alt_sentence(m, 1)), model, unseen='UNK')
                embedding_text = str(smiles_embedding.tolist())
            except TypeError:
                # SMILES that could not be turned into a sentence: skip row.
                continue
            o.write(str(ic50) + "\t" + str(splitline[1]) + "\t" +
                    embedding_text + "\t" + str(protein_vec) + "\n")
def label(path, label_file, model, title):
    """Predict activity probabilities for the molecules of ``label_file``.

    The ``"test"`` entry returned by ``load_raw_data(path, [label_file])``
    is embedded with mol2vec (``model_300dim.pkl``), joined with its
    descriptor columns, standardised and scored with
    ``model.predict_proba``.

    NOTE(review): in the code visible here the predictions end up in a
    local frame (``write_data``) that is neither returned nor written, and
    ``title`` is unused -- the function may be truncated.
    """
    data = load_raw_data(path, [label_file])["test"]
    descriptors = data.drop(columns=["smiles", "activity", 'mol'])
    process_model = word2vec.Word2Vec.load('model_300dim.pkl')

    def _to_sentence(row):
        return MolSentence(mol2alt_sentence(row['mol'], 1))

    data['sentence'] = data.apply(_to_sentence, axis=1)
    # unseen='UNK' makes sentences2vec map unknown substructures to the
    # model's 'UNK' vector.
    embedded = sentences2vec(data['sentence'], process_model, unseen='UNK')
    data['mol2vec'] = [DfVec(vec) for vec in embedded]
    x_mol = pd.DataFrame(np.array([item.vec for item in data['mol2vec']]))
    # Descriptor columns first, embedding columns after.
    x_test = pd.concat((descriptors, x_mol), axis=1)
    x_test = StandardScaler().fit_transform(x_test)
    preds = model.predict_proba(x_test)[:, 1]
    write_data = data.drop(columns=["smiles"])
    write_data['activity'] = preds
示例#20
0
def get_fp(smiles):
    """Embed the usable entries of ``smiles`` with mol2vec.

    An entry is treated as invalid when the first element of
    ``mol2image(entry, n=2048)`` is NaN; every other entry is embedded.

    Returns:
        ``(X, processed_indices, invalid_indices)``: the embedding matrix
        (one row per processed entry) and the positions in ``smiles`` that
        were processed / rejected.
    """
    model = word2vec.Word2Vec.load(
        '/content/drive/My Drive/model_300dim.pkl')
    valid_smiles = []
    processed_indices = []
    invalid_indices = []
    for i in range(len(smiles)):
        mol = smiles[i]
        tmp = np.array(mol2image(mol, n=2048))
        if np.isnan(tmp[0]):
            invalid_indices.append(i)
        else:
            valid_smiles.append(mol)
            processed_indices.append(i)
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    df = pd.DataFrame({'SMILES': valid_smiles})
    df['mol'] = df['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
    df['sentence'] = df.apply(
        lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)
    df['mol2vec'] = [
        DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')
    ]
    X = np.array([x.vec for x in df['mol2vec']])
    return X, processed_indices, invalid_indices
示例#21
0
    d_mols = {}
    l_num = 1
    r_num = 1
    for fname in ligands_folder:
        if 'actives' in fname:
            receptor_name = fname.split('-actives')[0].split('/')[-1]
            label = 1
        elif 'decoys' in fname:
            receptor_name = fname.split('-decoys')[0].split('/')[-1]
            label = 0
        if receptor_name + '_' + str(label) not in d_mols.keys():
            d_mols[receptor_name + '_' + str(label)] = []

        df = PandasTools.LoadSDF(fname)
        df['sentence'] = df.apply(
            lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1)
        df['mol2vec'] = [
            DfVec(x)
            for x in sentences2vec(df['sentence'], model, unseen='UNK')
        ]
        X = np.array([x.vec for x in df['mol2vec']])
        d_mols[receptor_name + '_' + str(label)] = X

        print(str(l_num), " th receptor")
        l_num = l_num + 1

    save_obj(d_mols,
             directory + 'train_test_data/' + date_str + '/ligand_dict_mols')
else:
    ligand_dict = load_obj(savepath + '/ligand_dict_mols')
import pandas as pd
import numpy as np
from rdkit import Chem
from mol2vec.features import mol2alt_sentence,MolSentence
from gensim.models import word2vec
import torch


# Load the training set: labels go to ``y``, SMILES become mol2vec sentences.
data = pd.read_csv("../training_smiles.csv")
y = np.array(data["ACTIVE"].astype(int))

# Copy the one-column slice so the column assignments below act on an
# independent frame (no chained assignment on a view of the original).
data = data[["SMILES"]].copy()
data["SMILES_str"] = data["SMILES"]
data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
model = word2vec.Word2Vec.load('../models/model_300dim.pkl')
data['sentence'] = data.apply(lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
# From here on ``data`` is a plain list of word lists.
data = [x.sentence for x in data['sentence']]


# Build the embedding matrix: row 0 is 'UNK', rows 1.. follow the model's
# vocabulary order.  gensim >= 4 renamed ``index2word`` to ``index_to_key``;
# support both.
_index_to_key = getattr(model.wv, 'index_to_key', None)
if _index_to_key is None:
    _index_to_key = model.wv.index2word
vocabs = [x for x in _index_to_key if x != 'UNK']
vocab_size = len(vocabs) + 1
embed_size = model.wv.vector_size
weight = torch.zeros(vocab_size, embed_size)
word_to_idx = {word: i + 1 for i, word in enumerate(vocabs)}
word_to_idx['UNK'] = 0
idx_to_word = {i + 1: word for i, word in enumerate(vocabs)}
idx_to_word[0] = 'UNK'
vocabs.append('UNK')
for word in vocabs:
    index = word_to_idx[word]
    weight[index, :] = torch.from_numpy(model.wv.get_vector(word))
示例#23
0
def smiles2sentence(smiles):
    """Return the radius-1 mol2vec sentence of a SMILES string."""
    return mol2alt_sentence(Chem.MolFromSmiles(smiles), 1)
示例#24
0
from rdkit import Chem
from mol2vec.features import mol2alt_sentence
from gensim.models import word2vec
from tqdm import tqdm

# Read data
data = pd.read_csv("all_unique_ligands.csv")
ligands = (Chem.MolFromSmiles(x) for x in data['canonical_smiles'])

# Store the fingerprints in a new column.  The column is built in one go
# instead of via data['words'][i] = ..., which is chained assignment and is
# not guaranteed to write through to the frame.
print("Generating molecular fingerprints")
words = [
    list(mol2alt_sentence(mol, 1)) for mol in tqdm(ligands, total=len(data))
]
data['words'] = pd.Series(words, index=data.index, dtype='object')
with open("fingerprints.pkl", 'wb') as handle:
    pickle.dump(data, handle)

# Find all unique words
print("Finding unique fingerprints")
all_words = np.array([word for sentence in data['words'] for word in sentence])
unique_words = np.unique(all_words)

# Create a data frame of embeddings
print("Storing embeddings")
model = word2vec.Word2Vec.load('model_300dim.pkl')
embeddings = {}
for word in unique_words:
示例#25
0
    def storeMolecule():
        # Placeholder: no behaviour implemented yet.
        # NOTE(review): declared without a ``self`` parameter although it is
        # indented like a method -- confirm how it is meant to be called.
        pass


"""
Test running
"""
# Location of the example SD file (machine-specific absolute path).
directory = "/home/noh/Desktop/CURRENT_WORK_IN_PROGRESS/Chemiinformatics/RDKIT/rdkit/Docs/Book/data"
sdf_file = 'bzr.sdf'

process = rdkit_processdf(
    directory, sdf_file)  # Initialization of the class that reads the sdf file
molList = process.returnMol()
molSmiles = process.MoltoSmiles()
mol2VecList = [
    mol2alt_sentence(x, 1) for x in molList
]  # Using mol2vec to encode molecules as sentences, meaning that each
# substructure (radius-1 identifier) represents a word

# Defining the number of hidden layers and the number of nodes inside them
# (three layers: 300, 100 and 100 nodes)

n_hidden1 = 300
n_hidden2 = 100
n_hidden3 = 100
"""
---------------------------------------------
| Fingerprinting and Molecular Similarity   |
---------------------------------------------

The RDkit has a variety of built-in functionality for generating fingerprints
and using them to calculate molecular similarity. The RDKit has a variety for 
示例#26
0
    d_mols={}
    l_num=1
    r_num=1
    for fname in ligands_folder:   
        if 'actives' in fname:
            receptor_name=fname.split('-actives')[0].split('/')[-1]   
            label=1           
        elif 'decoys' in fname:
            receptor_name=fname.split('-decoys')[0].split('/')[-1]
            label=0            
        if receptor_name+'_'+str(label) not in d_mols.keys():
            d_mols[receptor_name+'_'+str(label)]=[]
            
        df = PandasTools.LoadSDF(fname)
        df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1)
        df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')]
        X = np.array([x.vec for x in df['mol2vec']])
        d_mols[receptor_name+'_'+str(label)]=X

        print(str(l_num), " th receptor")
        l_num = l_num+1

    save_obj(d_mols, directory + 'train_test_data/'+date_str+'/ligand_dict_mols')
else:
    ligand_dict=load_obj(savepath+'/ligand_dict_mols')

#####################################################
#Data
#####################################################             
if generate_images:
示例#27
0
    plt.title("MAE {}, MSE {}".format(round(mae, 4), round(mse, 4)))
    plt.show()

    print('MAE score:', round(mae, 4))
    print('MSE score:', round(mse,4))

# Read the Lipophilicity data set; 'exp' is the regression target and is
# removed from the feature frame.
mdf = pd.read_csv('Lipophilicity_df_revised.csv')
target = mdf.pop('exp')

mdf['mol'] = mdf['smiles'].apply(Chem.MolFromSmiles)
# Load the pre-trained word2vec model
model = word2vec.Word2Vec.load('model_300dim.pkl')

# Sanity check on one molecule: which of its substructure words are known
# to the model, and what does its embedding look like.
mols = MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1))
keys = set(model.wv.vocab)
mnk = set(mols).intersection(keys)

s2v = sentences2vec(mols, model, unseen='UNK')

# Sentences and embeddings for the whole data set
mdf['sentence'] = mdf.apply(
    lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
mdf['mol2vec'] = [
    DfVec(vec) for vec in sentences2vec(mdf['sentence'], model, unseen='UNK')
]

X = np.array([item.vec for item in mdf['mol2vec']])
X.shape
y = target.values
y.shape

# For the full training set using the substructure vectors
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')
示例#28
0
    plt.show()

    print('MAE score:', round(mae, 4))
    print('MSE score:', round(mse, 4))


# Lipophilicity data: 'exp' is the target, everything else stays in mdf.
mdf = pd.read_csv('Lipophilicity_df_revised.csv')
target = mdf.pop('exp')

mdf['mol'] = mdf['smiles'].apply(Chem.MolFromSmiles)
# Loading pre-trained model via word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')

# Inspect one molecule: its sentence, the model vocabulary, their overlap
mols = MolSentence(mol2alt_sentence(mdf['mol'][1], radius=1))
keys = set(model.wv.vocab)
mnk = set(mols).intersection(keys)

s2v = sentences2vec(mols, model, unseen='UNK')

# Embed every molecule of the data set
mdf['sentence'] = mdf.apply(
    lambda row: MolSentence(mol2alt_sentence(row['mol'], 1)), axis=1)
embedded_sentences = sentences2vec(mdf['sentence'], model, unseen='UNK')
mdf['mol2vec'] = [DfVec(vec) for vec in embedded_sentences]

X = np.array([item.vec for item in mdf['mol2vec']])
X.shape
示例#29
0
##from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg

# SMILES of the 20 standard amino acids, in the same order as ``aa_codes``.
aa_smis = [
    'CC(N)C(=O)O', 'N=C(N)NCCCC(N)C(=O)O', 'NC(=O)CC(N)C(=O)O',
    'NC(CC(=O)O)C(=O)O', 'NC(CS)C(=O)O', 'NC(CCC(=O)O)C(=O)O',
    'NC(=O)CCC(N)C(=O)O', 'NCC(=O)O', 'NC(Cc1cnc[nH]1)C(=O)O',
    'CCC(C)C(N)C(=O)O', 'CC(C)CC(N)C(=O)O', 'NCCCCC(N)C(=O)O',
    'CSCCC(N)C(=O)O', 'NC(Cc1ccccc1)C(=O)O', 'O=C(O)C1CCCN1',
    'NC(CO)C(=O)O', 'CC(O)C(N)C(=O)O', 'NC(Cc1c[nH]c2ccccc12)C(=O)O',
    'NC(Cc1ccc(O)cc1)C(=O)O', 'CC(C)C(N)C(=O)O',
]
aa_codes = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL',
]

aas = [Chem.MolFromSmiles(x) for x in aa_smis]

from gensim.models import word2vec
model = word2vec.Word2Vec.load('model_300dim.pkl')
aa_sentences = [mol2alt_sentence(x, 1) for x in aas]

# Each amino acid is represented by the sum of its substructure vectors.
aalist = {}
for code, sentence in zip(aa_codes, aa_sentences):
    total = np.zeros(300)
    for word in sentence:
        total = total + model.wv.word_vec(word)
    aalist[code] = total


for name in aa_codes:
    print(name, aalist[name])
示例#30
0
def molecule2sentence(molecule, radius=1):
    """Return the mol2vec sentence of ``molecule`` for the given radius."""
    return mol2alt_sentence(molecule, radius=radius)