def overlappingNgramWord2VecEncode(self, n=None, windowSize=None, vectorSize=None, fileName=None, sc=None):
    '''Encode protein sequences as Word2Vec vectors of overlapping n-grams.

    The sequences in ``self.data`` are converted into n-grams, which are
    then transformed by a Word2Vec model. The model is either trained on
    those n-grams (``windowSize`` and ``vectorSize`` given) or read from a
    pre-trained model file (``fileName`` and ``sc`` given). If both sets of
    arguments are supplied, a new model is trained. The model is stored in
    ``self.model``.

    Attribute:
        n (int): The number of words in an n-gram; always forwarded to the
            n-gram helper, also when a pre-trained model is used
        windowSize (int): width of the window used to slide across the
            sequence, context words from [-window, window]
        vectorSize (int): dimension of the feature vector
        fileName (str): filename of Word2Vec model
        sc (SparkContext): spark context used to load the model file

    Returns:
        dataset with features vector added to original dataset

    Raises:
        ValueError: if neither (windowSize + vectorSize) nor
            (fileName + sc) is completely specified
    '''
    # Decide the mode up front so that incomplete arguments fail before any
    # n-gram work is set up. Training needs BOTH hyper-parameters; a partial
    # set must not shadow a request to load a pre-trained model.
    train_model = windowSize is not None and vectorSize is not None
    load_model = fileName is not None and sc is not None

    if not (train_model or load_model):
        raise ValueError(
            "Either provide word2Vec file (fileName) + SparkContext (sc), "
            "or number of words (n) + window size (windowSize) "
            "+ vector size (vectorSize), for function parameters")

    # Create n-grams out of the sequence
    # E.g., 2-gram IDCGH, ... => [ID, DC, CG, GH, ...]
    data = sequenceNgrammer.ngram(self.data, n, "ngram")

    if train_model:
        # Convert n-grams to W2V feature vector
        # [ID, DC, CG, GH, ...] => [0.1234, 0.2394, ...]
        word2Vec = Word2Vec()
        word2Vec.setInputCol("ngram") \
                .setOutputCol(self.outputCol) \
                .setNumPartitions(8) \
                .setWindowSize(windowSize) \
                .setVectorSize(vectorSize)

        self.model = word2Vec.fit(data)
    else:
        reader = Word2VecModel()
        self.model = reader.load(sc, fileName)

        print(f"model file : {fileName} \n "
              f"inputCol : {self.model.getInputCol()} \n "
              f"windowSize : {self.model.getWindowSize()} \n "
              f"vectorSize : {self.model.getVectorSize()}")

        # Write the vectors to the encoder's configured output column
        self.model.setOutputCol(self.outputCol)

    return self.model.transform(data)
def shifted_3gram_word2vec_encode(self, data=None, inputCol=None, outputCol=None, windowSize=None, vectorSize=None, fileName=None, sc=None):
    '''Encode protein sequences by averaging Word2Vec vectors of shifted 3-grams.

    Each sequence is split into three sets of 3-grams (offsets 0, 1 and 2).
    A Word2Vec model is trained on the union of the three sets
    (``windowSize`` and ``vectorSize`` given) or read from a pre-trained
    model file (``fileName`` and ``sc`` given). If both sets of arguments
    are supplied, a new model is trained. The three resulting feature
    vectors are then averaged. The model is stored in ``self.model``.

    Attribute
    ---------
        data (DataFrame): input data to be encoded [None]
        inputCol (str): name of the input column [None]
        outputCol (str): name of the output column [None]
        windowSize (int): width of the window used to slide across the
            sequence, context words from -window to window [None]
        vectorSize (int): dimension of the feature vector [None]
        fileName (string): filename of Word2VecModel [None]
        sc (SparkContext): spark context [None]

    Returns
    -------
        dataset with features vector added to original dataset

    Raises
    ------
        ValueError: if no data is available, or if neither
            (windowSize + vectorSize) nor (fileName + sc) is completely
            specified

    References
    ----------
        Asgari E, Mofrad MRK (2015) Continuous Distributed Representation
        of Biological Sequences for Deep Proteomics and Genomics.
        PLOS ONE 10(11): e0141287. doi:
        https://doi.org/10.1371/journal.pone.0141287
    '''
    # Arguments passed here override the values stored on the encoder
    if data is not None:
        self.data = data

    if inputCol is not None:
        self.inputCol = inputCol

    if outputCol is not None:
        self.outputCol = outputCol

    if self.data is None:
        raise ValueError("Class variable data is not defined, please pass "
                         "in a dataframe into the data parameter")

    # Decide the mode up front so that incomplete arguments fail before any
    # n-gram work is set up. Training needs BOTH hyper-parameters; a partial
    # set must not shadow a request to load a pre-trained model.
    train_model = windowSize is not None and vectorSize is not None
    load_model = fileName is not None and sc is not None

    if not (train_model or load_model):
        raise ValueError(
            "Either provide word2Vec file (fileName) + SparkContext (sc), "
            "or window size (windowSize) + vector size (vectorSize), "
            "for function parameters")

    # Create the three shifted 3-gram columns out of the sequence
    data = sequenceNgrammer.shifted_ngram(self.data, 3, 0, "ngram0")
    data = sequenceNgrammer.shifted_ngram(data, 3, 1, "ngram1")
    data = sequenceNgrammer.shifted_ngram(data, 3, 2, "ngram2")

    if train_model:
        # Train on the union of all three shifted n-gram columns
        ngram0 = data.select("ngram0").withColumnRenamed("ngram0", "ngram")
        ngram1 = data.select("ngram1").withColumnRenamed("ngram1", "ngram")
        ngram2 = data.select("ngram2").withColumnRenamed("ngram2", "ngram")
        ngrams = ngram0.union(ngram1).union(ngram2)

        # Convert n-grams to W2V feature vector
        word2Vec = Word2Vec()
        word2Vec.setInputCol("ngram") \
                .setOutputCol("feature") \
                .setMinCount(10) \
                .setNumPartitions(8) \
                .setWindowSize(windowSize) \
                .setVectorSize(vectorSize)

        self.model = word2Vec.fit(ngrams)
    else:
        reader = Word2VecModel()
        self.model = reader.load(sc, fileName)

        print(f"model file : {fileName} \n "
              f"inputCol : {self.model.getInputCol()} \n "
              f"windowSize : {self.model.getWindowSize()} \n "
              f"vectorSize : {self.model.getVectorSize()}")
        # NOTE(review): the code below expects the model to read "ngram" and
        # write "feature"; a loaded model keeps its own column names -
        # confirm the saved model was created with these names.

    for i in reversed(range(3)):
        ngram_col = "ngram" + str(i)
        # Transform only the DISTINCT n-gram lists: this keeps the join key
        # unique on the right-hand side, so input rows that share an n-gram
        # list (e.g. identical sequences) are not multiplied by the join.
        distinct_ngrams = data.select(ngram_col) \
                              .distinct() \
                              .withColumnRenamed(ngram_col, "ngram")
        feature = self.model.transform(distinct_ngrams)
        data = data.join(feature.withColumnRenamed("ngram", ngram_col),
                         ngram_col)
        data = data.withColumnRenamed("feature", "feature" + str(i))

    data = self._average_feature_vectors(data, self.outputCol)
    data.printSchema()

    # NOTE(review): the selected columns are hard-coded, including
    # 'features' for the averaged vector; presumably this only matches when
    # self.outputCol == 'features' - confirm against the averaging helper.
    cols = ['structureChainId', 'sequence', 'labelQ8', 'labelQ3',
            'ngram0', 'ngram1', 'ngram2',
            'feature0', 'feature1', 'feature2', 'features']
    data = data.select(cols)

    return data
def overlapping_ngram_word2vec_encode(self, data=None, inputCol=None, outputCol=None, n=None, windowSize=None, vectorSize=None, fileName=None, sc=None):
    '''Encode protein sequences as Word2Vec vectors of overlapping n-grams.

    The sequences are converted into n-grams, which are then transformed
    by a Word2Vec model. The model is either trained on those n-grams
    (``windowSize`` and ``vectorSize`` given) or read from a pre-trained
    model file (``fileName`` and ``sc`` given). If both sets of arguments
    are supplied, a new model is trained. The model is stored in
    ``self.model``.

    Attribute
    ---------
        data (DataFrame): input data to be encoded [None]
        inputCol (str): name of the input column [None]
        outputCol (str): name of the output column [None]
        n (int): The number of words in an n-gram; always forwarded to the
            n-gram helper, also when a pre-trained model is used [None]
        windowSize (int): width of the window used to slide across the
            sequence, context words from -window to window [None]
        vectorSize (int): dimension of the feature vector [None]
        fileName (str): filename of Word2Vec model [None]
        sc (SparkContext): spark context used to load the model file [None]

    Returns
    -------
        dataset with features vector added to original dataset

    Raises
    ------
        ValueError: if no data is available, or if neither
            (windowSize + vectorSize) nor (fileName + sc) is completely
            specified
    '''
    # Arguments passed here override the values stored on the encoder
    if data is not None:
        self.data = data

    if inputCol is not None:
        self.inputCol = inputCol

    if outputCol is not None:
        self.outputCol = outputCol

    if self.data is None:
        raise ValueError("Class variable data is not defined, please pass "
                         "in a dataframe into the data parameter")

    # Decide the mode up front so that incomplete arguments fail before any
    # n-gram work is set up. Training needs BOTH hyper-parameters; a partial
    # set must not shadow a request to load a pre-trained model.
    train_model = windowSize is not None and vectorSize is not None
    load_model = fileName is not None and sc is not None

    if not (train_model or load_model):
        raise ValueError(
            "Either provide word2Vec file (fileName) + SparkContext (sc), "
            "or number of words (n) + window size (windowSize) "
            "+ vector size (vectorSize), for function parameters")

    # Create n-grams out of the sequence
    # E.g., 2-gram IDCGH, ... => [ID, DC, CG, GH, ...]
    ngrams = sequenceNgrammer.ngram(self.data, n, "ngram")

    if train_model:
        # Convert n-grams to W2V feature vector
        # [ID, DC, CG, GH, ...] => [0.1234, 0.2394, ...]
        word2Vec = Word2Vec()
        word2Vec.setInputCol("ngram") \
                .setOutputCol(self.outputCol) \
                .setNumPartitions(8) \
                .setWindowSize(windowSize) \
                .setVectorSize(vectorSize)

        self.model = word2Vec.fit(ngrams)
    else:
        reader = Word2VecModel()
        self.model = reader.load(sc, fileName)

        print(f"model file : {fileName} \n "
              f"inputCol : {self.model.getInputCol()} \n "
              f"windowSize : {self.model.getWindowSize()} \n "
              f"vectorSize : {self.model.getVectorSize()}")

        # Write the vectors to the encoder's configured output column
        self.model.setOutputCol(self.outputCol)

    return self.model.transform(ngrams)
def shifted3GramWord2VecEncode(self, windowSize=None, vectorSize=None, fileName=None, sc=None):
    '''Encode protein sequences by averaging Word2Vec vectors of shifted 3-grams.

    Each sequence in ``self.data`` is split into three sets of 3-grams
    (offsets 0, 1 and 2). A Word2Vec model is trained on the union of the
    three sets (``windowSize`` and ``vectorSize`` given) or read from a
    pre-trained model file (``fileName`` and ``sc`` given). If both sets of
    arguments are supplied, a new model is trained. The three resulting
    feature vectors are then averaged. The model is stored in
    ``self.model``.

    Reference:
        Asgari E, Mofrad MRK (2015) Continuous Distributed Representation
        of Biological Sequences for Deep Proteomics and Genomics.
        PLOS ONE 10(11): e0141287. doi:
        https://doi.org/10.1371/journal.pone.0141287

    Attribute:
        windowSize (int): width of the window used to slide across the
            sequence, context words from [-window, window]
        vectorSize (int): dimension of the feature vector
        fileName (string): filename of Word2VecModel
        sc (SparkContext): spark context

    Return:
        dataset with features vector added to original dataset

    Raises:
        ValueError: if neither (windowSize + vectorSize) nor
            (fileName + sc) is completely specified
    '''
    # Decide the mode up front so that incomplete arguments fail before any
    # n-gram work is set up. Training needs BOTH hyper-parameters; a partial
    # set must not shadow a request to load a pre-trained model.
    train_model = windowSize is not None and vectorSize is not None
    load_model = fileName is not None and sc is not None

    if not (train_model or load_model):
        raise ValueError(
            "Either provide word2Vec file (fileName) + SparkContext (sc), "
            "or window size (windowSize) + vector size (vectorSize), "
            "for function parameters")

    # Create the three shifted 3-gram columns out of the sequence
    # TODO set input column
    data = sequenceNgrammer.shiftedNgram(self.data, 3, 0, "ngram0")
    data = sequenceNgrammer.shiftedNgram(data, 3, 1, "ngram1")
    data = sequenceNgrammer.shiftedNgram(data, 3, 2, "ngram2")

    if train_model:
        # Train on the union of all three shifted n-gram columns
        ngram0 = data.select("ngram0").withColumnRenamed("ngram0", "ngram")
        ngram1 = data.select("ngram1").withColumnRenamed("ngram1", "ngram")
        ngram2 = data.select("ngram2").withColumnRenamed("ngram2", "ngram")
        ngrams = ngram0.union(ngram1).union(ngram2)

        # Convert n-grams to W2V feature vector
        word2Vec = Word2Vec()
        word2Vec.setInputCol("ngram") \
                .setOutputCol("feature") \
                .setMinCount(10) \
                .setNumPartitions(8) \
                .setWindowSize(windowSize) \
                .setVectorSize(vectorSize)

        self.model = word2Vec.fit(ngrams)
    else:
        reader = Word2VecModel()
        self.model = reader.load(sc, fileName)

        print(f"model file : {fileName} \n "
              f"inputCol : {self.model.getInputCol()} \n "
              f"windowSize : {self.model.getWindowSize()} \n "
              f"vectorSize : {self.model.getVectorSize()}")
        # NOTE(review): the code below expects the model to read "ngram" and
        # write "feature"; a loaded model keeps its own column names -
        # confirm the saved model was created with these names.

    for i in reversed(range(3)):
        ngram_col = "ngram" + str(i)
        # Transform only the DISTINCT n-gram lists: this keeps the join key
        # unique on the right-hand side, so input rows that share an n-gram
        # list (e.g. identical sequences) are not multiplied by the join.
        distinct_ngrams = data.select(ngram_col) \
                              .distinct() \
                              .withColumnRenamed(ngram_col, "ngram")
        feature = self.model.transform(distinct_ngrams)
        data = data.join(feature.withColumnRenamed("ngram", ngram_col),
                         ngram_col)
        data = data.withColumnRenamed("feature", "feature" + str(i))

    data = self.averageFeatureVectors(data, self.outputCol)
    data.printSchema()

    # NOTE(review): the selected columns are hard-coded, including
    # 'features' for the averaged vector; presumably this only matches when
    # self.outputCol == 'features' - confirm against the averaging helper.
    cols = ['structureChainId', 'sequence', 'labelQ8', 'labelQ3',
            'ngram0', 'ngram1', 'ngram2',
            'feature0', 'feature1', 'feature2', 'features']
    data = data.select(cols)

    return data