def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
    """Initialise empty state, thresholds and the similarity helpers.

    Args:
        queryFile: Not used anywhere in this constructor; ``self.query``
            simply starts out as an empty dict.
        candidatePath: Stored unchanged as ``self.candidate``.
        mu: Stored as ``self.mu`` and also forwarded to ``Distance``.
        corpusFile: Forwarded to ``Distance`` together with ``mu``.
        sigma: Similarity threshold.
        lamda: Cluster threshold.
    """
    self.query = {}
    self.candidate = candidatePath
    self.tweet = {}
    self.mu = mu
    self.sigma = sigma  # similarity threshold
    self.lamda = lamda  # cluster threshold
    self.jaccInstance = Jaccard()
    self.klInstance = Distance(mu, corpusFile)
    # Call form instead of the Python 2 print statement: with a single
    # argument it prints the same text on Python 2 and is valid on Python 3.
    print("corpus read done!")
def __init__(self, train_path, test_path):
    """Record the data paths and create empty containers and helper objects.

    Args:
        train_path: Stored unchanged as ``self.train_path``.
        test_path: Stored unchanged as ``self.test_path``.

    Nothing is read from either path here; every frame and list starts
    out empty.
    """
    self.train_path, self.test_path = train_path, test_path
    self.preprocessor = Preprocessor()

    # Empty frames for the read data, one per split.
    self.trn = pd.DataFrame(columns=Classifier._COLS)
    self.tst = pd.DataFrame(columns=Classifier._COLS)

    # Empty frames for the known labels, one per split.
    self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)
    self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)

    # Separate list objects so the two splits never share state.
    self.tok_trn, self.tok_tst = [], []

    self.feature_extractor = FeatureExtractor()
    self.jaccard = Jaccard()
    self.rfr = RFR()

    # Hyper-parameters for the neural-network regressor, gathered in one
    # place and passed through as keyword arguments.
    nn_settings = {
        'hidden_layer_sizes': (100, 30, 30),
        'validation_fraction': 0.3,
        'alpha': 0.3,
        'warm_start': False,
        'max_iter': 1000,
        'activation': 'logistic',
    }
    self.nn = MLPRegressor(**nn_settings)