    def __init__(self, method, parameters):
        """
        :param method: optimisation method; 'ga' sets up k-fold cross-validation
            for the genetic algorithm
        :param parameters: genetic algorithm parameters plus model training
            parameters, including the random seed used for reproducible results
        """
        self.features = None
        self.X = None
        self.y = None
        self.classifier = None
        self.kfold = None

        loader = DataLoader.DataLoader()
        loader.load_all()

        feature_creator = FeatureBuilder.FeatureBuilder(loader)
        feature_creator.create_all_features()
        dataset = feature_creator.MasterTable
        master_table = dataset[dataset.DATA_PART == 'train'].drop(
            ['DATA_PART'], axis=1)

        features_to_eliminate = ['SK_ID_CURR', 'TARGET']
        self.features = [x for x in master_table.keys()
                         if x not in features_to_eliminate]

        self.X = master_table[self.features]
        self.y = master_table.TARGET

        self.classifier = ModelCreator(parameters=parameters['MODEL']).model

        if method == 'ga':
            # KFold only honours random_state when shuffle=True; recent
            # scikit-learn versions raise a ValueError otherwise.
            self.kfold = model_selection.KFold(
                n_splits=parameters['CROSS_VALIDATION_FOLDS'],
                shuffle=True,
                random_state=parameters['OPTIMISATION']['RANDOM_SEED'])
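
# Hedged usage sketch: the enclosing class name is omitted above, so
# FeatureSelector here is a placeholder, and the parameter layout is
# inferred from the keys __init__ reads.
#
#   params = {
#       'MODEL': {...},
#       'CROSS_VALIDATION_FOLDS': 5,
#       'OPTIMISATION': {'RANDOM_SEED': 42},
#   }
#   selector = FeatureSelector(method='ga', parameters=params)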
Example #2
# ### Set up root directory

import os           # used throughout for path handling
import numpy as np  # used by round_half_up below

print(os.getcwd())
os.chdir('homepath')  # 'homepath' stands in for the project root directory
print(os.getcwd())
# pathFeatures = 'models/token+sequence'
# fileFeatures = os.path.join(pathFeatures,'df.csv')

# In[3]:

import src.data.DataLoader as CustomDataLoader
from src.features.CustomTokenizer import CustomTokenizer

# In[4]:

pairs = CustomDataLoader.DataLoader()

# In[5]:

len(pairs)

# In[6]:


def round_half_up(n, decimals=0):
    """Round n to `decimals` places, with exact halves rounding up."""
    multiplier = 10**decimals
    rounded = np.floor(n * multiplier + 0.5) / multiplier
    # int() would strip the decimals we just kept, so only cast when decimals <= 0.
    return int(rounded) if decimals <= 0 else rounded


# In[7]:
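
# A quick check of round_half_up against the built-in round(), which applies
# banker's rounding; illustrative values only.
print(round_half_up(0.5), round_half_up(2.5), round(2.5))  # -> 1 3 2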
Example #3
import os

print(os.getcwd())

# In[3]:

tstdataFile = 'srcdata'  # placeholder path to the source data file

# ### 1. Source data processing

# In[4]:

import src.data.DataLoader as CustomDataLoader
from src.features.CustomTokenizer import CustomTokenizer

# In[5]:

pairs = CustomDataLoader.DataLoader(tstdataFile)

# In[6]:

numOriginal = len(pairs)

# In[1]:

numOriginal

# In[7]:

pairs[0]
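
# In[8]:

# Illustrative peek, assuming each pair is an (a, b, score) tuple as in
# Example #5 below: tokenize both sides of the first pair.
a, b, score = pairs[0]
print(CustomTokenizer(a), CustomTokenizer(b), score)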

# ### Generated Data Loader
Example #4
from src.data import DataLoader
from src import FeatureBuilder
from src.models.Model import ModelCreator
from src.config import Consts as cs
import json
import warnings
warnings.filterwarnings('ignore')

if __name__ == '__main__':

    # Loading Clean Data
    loader = DataLoader.DataLoader()
    loader.load_all()

    # Feature Construction
    feature_creator = FeatureBuilder.FeatureBuilder(loader)
    feature_creator.create_all_features()
    master_table = feature_creator.MasterTable

    # Split Data to Train and Test
    master_table_train = master_table[master_table.DATA_PART == 'train'].drop(
        ['DATA_PART'], axis=1)
    master_table_test = master_table[master_table.DATA_PART == 'test'].drop(
        ['DATA_PART'], axis=1)

    # features_to_eliminate = ['SK_ID_CURR', 'TARGET']
    # features = [x for x in master_table_train.keys() if x not in features_to_eliminate]

    # load the optimal features chosen by the feature selector
    with open(cs.PATH_SAVE_BEST_FEATURES, 'r') as file:
        best_features = json.load(file)
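
    # Hedged continuation sketch: restrict the train table to the selected
    # columns before fitting; the TARGET column and the ModelCreator call are
    # assumed from the related snippets above.
    #
    #   X_train = master_table_train[best_features]
    #   y_train = master_table_train.TARGET
    #   model = ModelCreator(parameters=...).model
    #   model.fit(X_train, y_train)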
Example #5
    def __init__(self, typeData):
        # Module-level names (torch, np, tqdm, word2vec, CustomDataLoader,
        # CustomTokenizer, embedding, maxSentenceLen, validation_split,
        # test_split) are assumed to be defined in the surrounding file.

        zeroTensor = torch.zeros(100)  # zero padding vector; must match the embedding dimension
        self.dataSize = 1654

        # Load the pretrained word2vec model
        preTrainedWord2Vec = word2vec.KeyedVectors.load_word2vec_format(embedding, binary=True)

        # Load the data from the file (earlier CSV-based loading kept for reference):
        # data = pd.read_csv(file, delimiter="\t", header=None, names=['a', 'b', 'score'])
        # fullData = np.loadtxt(data, delimiter='\t')
        pairs = CustomDataLoader.DataLoader()
        # pairs holds the whole dataset as a list of (a, b, score) tuples
        print(len(pairs))
        print(pairs[3][0], pairs[3][1], pairs[3][2])
        print(CustomTokenizer(pairs[3][0]))
        print(CustomTokenizer(pairs[3][1]))
        print(pairs[3][2])

        # TODO: use the full dataset rather than the first self.dataSize pairs
        textData = pairs[:self.dataSize]
        
        
        # Get the tokenization and embeddings
        aData = []
        bData = []
        labels = []
        for eachData in tqdm(textData):
            a = CustomTokenizer(eachData[0])
            b = CustomTokenizer(eachData[1])
            label = float(eachData[2])
            aEmbed = []
            bEmbed = []
            # Embed side a, skipping tokens missing from the word2vec vocabulary
            for eachToken in a:
                if eachToken in preTrainedWord2Vec:
                    aEmbed.append(preTrainedWord2Vec[eachToken])

            # Pad with zero vectors, or truncate, to a fixed sentence length
            if len(aEmbed) < maxSentenceLen:
                aEmbed += [zeroTensor] * (maxSentenceLen - len(aEmbed))
            elif len(aEmbed) > maxSentenceLen:
                aEmbed = aEmbed[:maxSentenceLen]

            # Same for side b
            for eachToken in b:
                if eachToken in preTrainedWord2Vec:
                    bEmbed.append(preTrainedWord2Vec[eachToken])

            if len(bEmbed) < maxSentenceLen:
                bEmbed += [zeroTensor] * (maxSentenceLen - len(bEmbed))
            elif len(bEmbed) > maxSentenceLen:
                bEmbed = bEmbed[:maxSentenceLen]
                
            aData.append(aEmbed)
            bData.append(bEmbed)
            labels.append(label)

        # Check shapes and sizes
        print(len(aData), len(bData), len(labels))
        print(len(aData[3]), len(bData[3]), len(aData[3][0]), type(aData[3][0]))
        
        # Convert to tensors
        aDataTensor = torch.FloatTensor(aData)
        bDataTensor = torch.FloatTensor(bData)
        labels = torch.FloatTensor(labels)
        print(aDataTensor.shape, bDataTensor.shape, labels.shape)

        # Concatenate the two sides along the sequence dimension
        xData = torch.cat((aDataTensor, bDataTensor), 1)
        print(xData.shape)
        
        # Positional split: validation first, then test, then train
        splitVal = int(np.floor(validation_split * self.dataSize))
        splitTest = int(np.floor(test_split * self.dataSize))
        self.xVal, self.yVal = xData[:splitVal], labels[:splitVal]
        self.xTest, self.yTest = xData[splitVal:splitVal + splitTest], labels[splitVal:splitVal + splitTest]
        self.xTrain, self.yTrain = xData[splitVal + splitTest:], labels[splitVal + splitTest:]
        print(len(self.xTrain), len(self.xVal), len(self.xTest))
        print(splitVal, splitTest)
        
        if typeData == "train":
            self.xData = self.xTrain
            self.labels = self.yTrain
        elif typeData == "test":
            self.xData = self.xTest
            self.labels = self.yTest
        else:
            self.xData = self.xVal
            self.labels = self.yVal
        self.len = len(self.xData)
        print (len(self.xData),len(self.labels))
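
# Hedged usage sketch: the enclosing Dataset class name is not shown above, so
# PairDataset is a placeholder; it is assumed to wrap this __init__ in a
# standard torch.utils.data.Dataset (__len__/__getitem__).
#
#   train_set = PairDataset("train")
#   train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)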