def __init__(self, method, parameters):
    """
    Build the training dataset, classifier and (optionally) CV folds.

    :param method: optimisation method name; 'ga' additionally creates a
        KFold cross-validation generator (any other value leaves
        ``self.kfold`` as None)
    :param parameters: genetic algorithm parameters plus model training
        parameters; reads 'MODEL', 'CROSS_VALIDATION_FOLDS' and
        'OPTIMISATION'->'RANDOM_SEED'
    """
    self.features = None
    self.X = None
    self.y = None
    self.classifier = None
    self.kfold = None

    # Load the clean data and derive the full feature table from it.
    loader = DataLoader.DataLoader()
    loader.load_all()
    feature_creator = FeatureBuilder.FeatureBuilder(loader)
    feature_creator.create_all_features()
    dataset = feature_creator.MasterTable

    # Keep only the training partition; DATA_PART is bookkeeping, not a feature.
    master_table = dataset[dataset.DATA_PART == 'train'].drop(
        ['DATA_PART'], axis=1)

    # The row identifier and the label are excluded from the feature set.
    non_feature_columns = ['SK_ID_CURR', 'TARGET']
    self.features = [x for x in master_table.keys()
                     if x not in non_feature_columns]
    self.X = master_table[self.features]
    self.y = master_table.TARGET

    self.classifier = ModelCreator(parameters=parameters['MODEL']).model

    if method == 'ga':
        # BUG FIX: KFold only honours random_state when shuffle=True
        # (scikit-learn >= 0.24 raises a ValueError otherwise).  Shuffled,
        # seeded folds match the documented intent of a reproducible seed.
        self.kfold = model_selection.KFold(
            n_splits=parameters['CROSS_VALIDATION_FOLDS'],
            shuffle=True,
            random_state=parameters['OPTIMISATION']['RANDOM_SEED'])
### Set up root directory
print(os.getcwd())
os.chdir('homepath')
print(os.getcwd())

# pathFeatures = 'models/token+sequence'
# fileFeatures = os.path.join(pathFeatures,'df.csv')

# In[3]:

import src.data.DataLoader as CustomDataLoader
from src.features.CustomTokenizer import CustomTokenizer

# In[4]:

# Load the raw sentence pairs.
pairs = CustomDataLoader.DataLoader()

# In[5]:

len(pairs)

# In[6]:

def round_half_up(n, decimals=0):
    """Round *n* half-up to *decimals* decimal places.

    BUG FIX: the original applied int() to the final quotient, which
    truncated away the fractional part whenever decimals > 0
    (e.g. round_half_up(2.345, 2) returned 2 instead of 2.35).  The int
    conversion is kept only for decimals == 0 so existing callers that
    expect an integer are unaffected.
    """
    multiplier = 10**decimals
    rounded = np.floor(n * multiplier + 0.5) / multiplier
    return int(rounded) if decimals == 0 else rounded

# In[7]:
print(os.getcwd()) # In[3]: tstdataFile = 'srcdata' # ### 1. Source data processing # In[4]: import src.data.DataLoader as CustomDataLoader from src.features.CustomTokenizer import CustomTokenizer # In[5]: pairs = CustomDataLoader.DataLoader(tstdataFile) # In[6]: numOriginal = len(pairs) # In[1]: numOriginal # In[7]: pairs[0] # ### Generated Data Loader
from src.data import DataLoader
from src import FeatureBuilder
from src.models.Model import ModelCreator
from src.config import Consts as cs
import json
import warnings

warnings.filterwarnings('ignore')

if __name__ == '__main__':
    # Loading Clean Data
    loader = DataLoader.DataLoader()
    loader.load_all()

    # Feature Construction
    feature_creator = FeatureBuilder.FeatureBuilder(loader)
    feature_creator.create_all_features()
    master_table = feature_creator.MasterTable

    # Split Data to Train and Test: the DATA_PART column tags each row's
    # partition and is dropped once the split has been made.
    master_table_train = master_table[master_table.DATA_PART == 'train'].drop(
        ['DATA_PART'], axis=1)
    master_table_test = master_table[master_table.DATA_PART == 'test'].drop(
        ['DATA_PART'], axis=1)

    # features_to_eliminate = ['SK_ID_CURR', 'TARGET']
    # features = [x for x in MasterTable_Train.keys() if x not in features_to_eliminate]

    # load optimal features according to feature selector
    # NOTE(review): the file is opened in binary mode but parsed with
    # json.load, which accepts bytes — works, though 'r' is conventional.
    with open(cs.PATH_SAVE_BEST_FEATURES, 'rb') as file:
        best_features = json.load(file)
def __init__(self, typeData):
    """
    Build fixed-length word2vec embeddings for sentence pairs and split
    them into train/validation/test tensors, exposing one partition.

    :param typeData: which partition this instance serves: 'train',
        'test', or anything else for the validation split.

    Relies on module-level names defined elsewhere in the file:
    ``embedding`` (path to the binary word2vec model),
    ``maxSentenceLen``, ``validation_split``, ``test_split``,
    ``CustomDataLoader``, ``CustomTokenizer``, ``word2vec`` and ``tqdm``.
    """
    # Zero vector used for padding; assumes 100-dim embeddings — TODO confirm
    # this matches the dimensionality of the pretrained model.
    zeroTensor = torch.zeros(100)
    # TODO: hard-coded subset size; provide the full data set.
    self.dataSize = 1654

    ''' Load Pretrained Word2Vec Model '''
    preTrainedWord2Vec = word2vec.KeyedVectors.load_word2vec_format(
        embedding, binary=True)

    ''' Load the data from the file '''
    # Pairs will have the whole data in list of tupples {a, b, score}
    pairs = CustomDataLoader.DataLoader()
    print(len(pairs))
    print(pairs[3][0], pairs[3][1], pairs[3][2])
    print(CustomTokenizer(pairs[3][0]))
    print(CustomTokenizer(pairs[3][1]))
    print(pairs[3][2])

    textData = pairs[:self.dataSize]

    def embed(tokens):
        # Map in-vocabulary tokens to their vectors, then pad with zero
        # vectors (or truncate) to exactly maxSentenceLen entries.
        # Factored out: the original duplicated this logic verbatim for
        # both sentences of each pair.
        vectors = [preTrainedWord2Vec[tok]
                   for tok in tokens if tok in preTrainedWord2Vec]
        if len(vectors) < maxSentenceLen:
            vectors += [zeroTensor] * (maxSentenceLen - len(vectors))
        return vectors[:maxSentenceLen]

    ''' Get the Tokenization and embeddings '''
    aData = []
    bData = []
    labels = []
    for eachData in tqdm(textData):
        aData.append(embed(CustomTokenizer(eachData[0])))
        bData.append(embed(CustomTokenizer(eachData[1])))
        labels.append(float(eachData[2]))

    # Check shapes and sizes
    print(len(aData), len(bData), len(labels))
    print(len(aData[3]), len(bData[3]), len(aData[3][0]), type(aData[3][0]))

    ''' Convert to Tensors'''
    # NOTE(review): each row mixes word2vec vectors with torch zero
    # tensors; torch.FloatTensor copies both here, but newer torch
    # versions may reject mixed lists — verify against the torch in use.
    aDataTensor = torch.FloatTensor(aData)
    bDataTensor = torch.FloatTensor(bData)
    labels = torch.FloatTensor(labels)
    print(aDataTensor.shape, bDataTensor.shape, labels.shape)

    # Concatenate the two sentences along the sequence dimension.
    xData = torch.cat((aDataTensor, bDataTensor), 1)
    print(xData.shape)

    # Sequential (unshuffled) split: validation first, then test,
    # the remainder is training data.
    splitVal = int(np.floor(validation_split * self.dataSize))
    splitTest = int(np.floor(test_split * self.dataSize))
    self.xVal, self.yVal = xData[:splitVal], labels[:splitVal]
    self.xTest = xData[splitVal: splitVal + splitTest]
    self.yTest = labels[splitVal: splitVal + splitTest]
    self.xTrain = xData[splitVal + splitTest:]
    self.yTrain = labels[splitVal + splitTest:]
    print(len(self.xTrain), len(self.xVal), len(self.xTest))
    print(splitVal, splitTest)

    # Expose the requested partition through self.xData / self.labels.
    if typeData == "train":
        self.xData, self.labels = self.xTrain, self.yTrain
    elif typeData == "test":
        self.xData, self.labels = self.xTest, self.yTest
    else:
        self.xData, self.labels = self.xVal, self.yVal

    self.len = len(self.xData)
    print(len(self.xData), len(self.labels))