if (LANGUAGE_MODEL):
    # Configurations file xml of the language model
    configFileLanguageModel = ".\\LanguageModel\\Configurations\\Configurations.xml"
    langModelLogFile = ".\\LanguageModel\\Output\\language_model.txt"
    stopWordsFileName = ".\\LanguageModel\\Input\\stop_words.txt"
    # The serialization file to save the model
    languageModelSerializationFile = ".\\LanguageModel\\Output\\language_model.bin"

    # Start the LanguageModel:
    #-------------------------
    if not LOAD_LANGUAGE_MODEL:
        # Initialize the LanguageModel
        languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                      languageModelSerializationFile,
                                      datasetBuilder.dataSet)
        languageModel.BuildLanguageModel()
        if MERGE_BI_GRAM:
            languageModel.NGram = 2
            languageModel.BuildLanguageModel()
        if MERGE_TRI_GRAM:
            languageModel.NGram = 3
            languageModel.BuildLanguageModel()
        languageModel.DumpLanguageModel(langModelLogFile)
        languageModel.SaveModel()
    else:
        # Load the LanguageModel
        languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                      languageModelSerializationFile,
                                      datasetBuilder.dataSet)
        languageModel.LoadModel()
#datasetBuilder.trainSet.extend(datasetBuilder.GetDatasetFromXLSXFile(xlsxTestFileName))

# Configurations file xml of the language model
configFileLanguageModel = ".\\LanguageModel\\Configurations\\Configurations.xml"
langModelLogFile = ".\\LanguageModel\\Output\\language_model.txt"
langModelTxtLoadFile = ".\\LanguageModel\\Output\\language_model_stocks_mix.txt"
stopWordsFileName = ".\\LanguageModel\\Input\\stop_words.txt"
linksDBFile = ".\\LanguageModel\\Output\\links_database.txt"
# The serialization file to save the model
languageModelSerializationFile = ".\\LanguageModel\\Output\\language_model.bin"

# Start the LanguageModel:
# Initialize the LanguageModel
languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                              languageModelSerializationFile, linksDBFile,
                              datasetBuilder.trainSet)
languageModel.BuildLanguageModel()
languageModel.SaveModel()

'''
# Extract relevant tweets only
relevantDataSet = []
irrelevantDataSet = []
for case in datasetBuilder.trainSet:
    if case['label'] == 'relevant':
        relevantDataSet.append(case)
    elif case['label'] == 'irrelevant':
        irrelevantDataSet.append(case)

# Initialize the LanguageModel
relLanguageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                 languageModelSerializationFile, relevantDataSet)
'''
class Filter(object):
    '''
    classdocs
    '''

    def __init__(self):
        '''
        Constructor
        :type self:
        '''
        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'train')

        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)

        # Configurations file xml of the language model
        configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-lexicon.xml')
        configFileLanguageModel_Tasi = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        # The serialization file to save the model
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')

        # Start the LanguageModel:
        # Initialize the LanguageModel_Lexicon
        self.languageModel_lexicon = LanguageModel(configFileLanguageModel_lexicon, stopWordsFileName,
                                                   languageModelSerializationFile, linksDBFile,
                                                   datasetBuilder.trainSet)
        self.languageModel_lexicon.BuildLanguageModel()

        # Initialize the LanguageModel_Tasi
        self.languageModel_Tasi = LanguageModel(configFileLanguageModel_Tasi, stopWordsFileName,
                                                languageModelSerializationFile, linksDBFile,
                                                datasetBuilder.trainSet)
        self.languageModel_Tasi.BuildLanguageModel()

        # Configurations file xml of the features extractor
        configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
        configFileFeaturesExtractor_Tasi = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        # The serialization files to save the features and labels
        trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor _ Lexicon
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(configFileFeaturesExtractor_Lexicon,
                                                           trainFeaturesSerializationFile,
                                                           trainLabelsSerializationFile,
                                                           self.languageModel_lexicon,
                                                           datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures()

        # Initialize the FeaturesExtractor _ Tasi
        trainFeaturesExtractor_Tasi = FeaturesExtractor(configFileFeaturesExtractor_Tasi,
                                                        trainFeaturesSerializationFile,
                                                        trainLabelsSerializationFile,
                                                        self.languageModel_Tasi,
                                                        datasetBuilder.trainSet)
        trainFeaturesExtractor_Tasi.ExtractNumTfFeatures()

        # Configurations files of the classifiers and the model serialization file
        configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-lexicon.xml')
        configFileClassifier_Tasi = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')

        # Start the Classifier:
        #---------------------
        print(trainFeaturesExtractor_Tasi.labels[:4])
        print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])
        self.classifier_Lexicon = Classifier(configFileClassifier_Lexicon, modelSerializationFile,
                                             trainFeaturesExtractor_Lexicon.features,
                                             trainFeaturesExtractor_Lexicon.labels, [], [])
        self.classifier_Tasi = Classifier(configFileClassifier_Tasi, modelSerializationFile,
                                          trainFeaturesExtractor_Tasi.features,
                                          trainFeaturesExtractor_Tasi.labels, [], [])

        # Train
        self.classifier_Lexicon.Train()
        self.classifier_Tasi.Train()

    def Classify(self, text, stockName):
        testSet = []
        for t in text:
            testSet.append({'label': '', 'text': t})
        if stockName == 'Tasi':
            # Configurations file xml of the features extractor
            configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
            testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None,
                                                      self.languageModel_Tasi, testSet)
            testFeaturesExtractor.ExtractNumTfFeatures()
            self.classifier_Tasi.testFeatures = testFeaturesExtractor.features
            self.classifier_Tasi.testTargets = []
            for i in range(len(self.classifier_Tasi.testFeatures)):
                #self.classifier_Tasi.testTargets[i] = 1
                self.classifier_Tasi.testTargets.append(1)
            label, acc, val = self.classifier_Tasi.Test()
        else:
            configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-lexicon.xml')
            testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None,
                                                      self.languageModel_lexicon, testSet)
            # Extract the test features before reading them, as in the Tasi branch
            testFeaturesExtractor.ExtractNumTfFeatures()
            self.classifier_Lexicon.testFeatures = testFeaturesExtractor.features
            self.classifier_Lexicon.testTargets = []
            for i in range(len(self.classifier_Lexicon.testFeatures)):
                # append rather than index into the still-empty list
                self.classifier_Lexicon.testTargets.append(1)
            label, acc, val = self.classifier_Lexicon.Test()
        return label
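# --- Usage sketch (not from the original source): exercising the Filter above.
# Assumes the DatasetBuilder/LanguageModel/FeaturesExtractor/Classifier packages
# and their Configurations/Input files exist relative to the working directory;
# the tweet strings are placeholders.
if __name__ == '__main__':
    f = Filter()  # trains both the Lexicon and Tasi classifiers on construction
    labels = f.Classify(['some tweet text', 'another tweet'], 'Tasi')
    print(labels)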
class Filter(object):
    '''
    classdocs
    '''

    def __init__(self, basePath, stockName, Retrain):
        '''
        Constructor
        :type self:
        '''
        if basePath is None:
            # assumption: fall back to the current directory when no base path is given
            self.basePath = '.'
        else:
            self.basePath = basePath
        self.stockName = stockName
        with open(os.path.join(self.basePath, 'StockToClassifier.bin'), 'rb') as serializationFile:
            self.StockToClassifier = pickle.load(serializationFile)
        #import pdb; pdb.set_trace()
        self.usedClassifier = self.StockToClassifier[self.stockName]

        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join(self.basePath, "DatasetBuilder",
                                                "Configurations", "Configurations.xml")
        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join(self.basePath, "DatasetBuilder",
                                                "Output", "dataset.bin")
        if not Retrain:
            # The XLSX file name for train set
            xlsxTrainFileName = os.path.join(self.basePath, "DatasetBuilder", "Input", "train")
            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName)
            self.RunLanguageModel(self.usedClassifier, datasetBuilder.trainSet)
            trainFeaturesExtractor = self.RunFeatureExtractor(self.usedClassifier,
                                                              datasetBuilder.trainSet)
            self.Train(self.usedClassifier, trainFeaturesExtractor, True)
        else:
            # Initialize the DatasetBuilder from serialization file
            datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)

    def Classify(self, text):
        testSet = []
        for t in text:
            testSet.append({'label': '', 'text': t})
        # Configurations file xml of the features extractor
        configFileFeaturesExtractor = os.path.join(
            self.basePath, "FeaturesExtractor", "Configurations",
            "Configurations-" + self.usedClassifier + ".xml")
        testFeaturesExtractor = FeaturesExtractor(configFileFeaturesExtractor, None, None,
                                                  self.languageModel, testSet)
        if self.usedClassifier == "Lexicon":
            testFeaturesExtractor.ExtractLexiconFeatures()
        else:
            testFeaturesExtractor.ExtractNumTfFeatures()
        self.classifier.testFeatures = testFeaturesExtractor.features
        self.classifier.testTargets = []
        for i in range(len(self.classifier.testFeatures)):
            self.classifier.testTargets.append(1)
        label, acc, val = self.classifier.Test()
        return label

    def RunLanguageModel(self, _Classifier, trainSet):
        # Configurations file xml of the language model
        configFileLanguageModel = os.path.join(
            self.basePath, "LanguageModel", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        stopWordsFileName = os.path.join(self.basePath, "LanguageModel", "Input", "stop_words.txt")
        linksDBFile = os.path.join(self.basePath, "LanguageModel", "Output", "links_database.txt")
        # The serialization file to save the model
        languageModelSerializationFile = os.path.join(self.basePath, "LanguageModel",
                                                      "Output", "language_model.bin")
        if _Classifier == "Lexicon":
            langModelTxtLoadFile = os.path.join(
                self.basePath, "LanguageModel", "Input",
                "language_model_lexicon_synonyms.txt")

        # Start the LanguageModel:
        # Initialize the LanguageModel
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile,
                                           trainSet)
        self.languageModel.BuildLanguageModel()
        if _Classifier == "Lexicon":
            self.languageModel.LoadModelFromTxtFile(langModelTxtLoadFile)

    def RunFeatureExtractor(self, _Classifier, trainSet):
        # Configurations file xml of the features extractor
        configFileFeaturesExtractor = os.path.join(
            self.basePath, "FeaturesExtractor", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        # The serialization files to save the features and labels
        trainFeaturesSerializationFile = os.path.join(self.basePath, "FeaturesExtractor",
                                                      "Output", "train_features.bin")
        trainLabelsSerializationFile = os.path.join(self.basePath, "FeaturesExtractor",
                                                    "Output", "train_labels.bin")

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor
        trainFeaturesExtractor = FeaturesExtractor(
            configFileFeaturesExtractor, trainFeaturesSerializationFile,
            trainLabelsSerializationFile, self.languageModel, trainSet)
        if _Classifier == "Lexicon":
            trainFeaturesExtractor.ExtractLexiconFeatures()
        else:
            trainFeaturesExtractor.ExtractNumTfFeatures()
        return trainFeaturesExtractor

    def Train(self, _Classifier, trainFeaturesExtractor, Full):
        # Configurations file xml of the classifier and the model serialization file
        configFileClassifier = os.path.join(
            self.basePath, "Classifier", "Configurations",
            "Configurations-" + _Classifier + ".xml")
        modelSerializationFile = os.path.join(self.basePath, "Classifier",
                                              "Output", "classifier_model.bin")

        # Start the Classifier:
        #---------------------
        self.classifier = Classifier(configFileClassifier, modelSerializationFile,
                                     trainFeaturesExtractor.features,
                                     trainFeaturesExtractor.labels, [], [])
        if Full:
            self.classifier.Train()

    def GetBestClassifier(self, trainSet):
        #import pdb; pdb.set_trace()
        self.RunLanguageModel("Lexicon", trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor("Lexicon", trainSet)
        self.Train("Lexicon", trainFeaturesExtractor, False)
        LexiconAcc = self.classifier.getCrossValidationAccuarcy()

        self.RunLanguageModel("DT", trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor("DT", trainSet)
        self.Train("DT", trainFeaturesExtractor, False)
        DTAcc = self.classifier.getCrossValidationAccuarcy()

        self.RunLanguageModel("SVM", trainSet)
        trainFeaturesExtractor = self.RunFeatureExtractor("SVM", trainSet)
        self.Train("SVM", trainFeaturesExtractor, False)
        SVMAcc = self.classifier.getCrossValidationAccuarcy()

        bestClassifier = max(LexiconAcc, DTAcc, SVMAcc)
        if bestClassifier == LexiconAcc:
            self.StockToClassifier[self.stockName] = "Lexicon"
        elif bestClassifier == DTAcc:
            self.StockToClassifier[self.stockName] = "DT"
        else:
            self.StockToClassifier[self.stockName] = "SVM"
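# --- Usage sketch (assumption, not in the original source): the per-stock
# Filter variant. 'models' stands in for a base path that already contains
# StockToClassifier.bin plus the module folders; stock and tweet values are
# placeholders.
if __name__ == '__main__':
    f = Filter('models', 'Tasi', Retrain=False)
    print(f.Classify(['some tweet about the market']))
    # To re-select the best of Lexicon/DT/SVM by cross-validation accuracy:
    # f.GetBestClassifier(trainSet)  # updates f.StockToClassifier[f.stockName]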
def init(cls, save_path, use_backend=True, pre_stocks=None):
    '''
    Constructor
    :type self:
    '''
    global stocks
    if pre_stocks:
        stocks = pre_stocks
    for stock in stocks:
        print('Building model for %s' % stock)
        stock_model = {}

        # Start the DatasetBuilder
        #-------------------------
        # Configurations file xml of the dataset builder
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        # The serialization file to save the dataset
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        # The XLSX file name for train set
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'train')

        # Initialize the DatasetBuilder from serialization file
        datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        if use_backend:
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromBackend(stock)
        else:
            datasetBuilder.trainSet = datasetBuilder.GetDatasetFromXLSXFile(xlsxTrainFileName, stock)
        if len(datasetBuilder.trainSet) < NMIN_SET:
            print("Not enough data: ", len(datasetBuilder.trainSet))
            continue
        datasetBuilder.trainSet = datasetBuilder.trainSet[NVALID:]

        # Configurations file xml of the language model
        configFileLanguageModel_lexicon = os.path.join('LanguageModel', 'Configurations', 'Configurations-Tasi.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        # The serialization file to save the model
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')

        # Start the LanguageModel:
        # Initialize the LanguageModel_Lexicon
        stock_model['languageModel_lexicon'] = LanguageModel(
            configFileLanguageModel_lexicon, stopWordsFileName,
            languageModelSerializationFile, linksDBFile, datasetBuilder.trainSet)
        stock_model['languageModel_lexicon'].BuildLanguageModel()

        # Configurations file xml of the features extractor
        configFileFeaturesExtractor_Lexicon = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations-Tasi.xml')
        # The serialization files to save the features and labels
        trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

        # Start the FeaturesExtractor:
        #-----------------------------
        # Initialize the FeaturesExtractor _ Lexicon
        trainFeaturesExtractor_Lexicon = FeaturesExtractor(
            configFileFeaturesExtractor_Lexicon, trainFeaturesSerializationFile,
            trainLabelsSerializationFile, stock_model['languageModel_lexicon'],
            datasetBuilder.trainSet)
        trainFeaturesExtractor_Lexicon.ExtractNumTfFeatures(sparse=True)
        #print(trainFeaturesExtractor_Lexicon.features[0])

        # Configurations file xml of the classifier and the model serialization file
        configFileClassifier_Lexicon = os.path.join('Classifier', 'Configurations', 'Configurations-Tasi.xml')
        modelSerializationFile = os.path.join('Classifier', 'Output', 'classifier_model.bin')

        # Start the Classifier:
        #---------------------
        stock_model['classifier_Lexicon'] = Classifier(
            configFileClassifier_Lexicon, modelSerializationFile,
            trainFeaturesExtractor_Lexicon.sparse_features,
            trainFeaturesExtractor_Lexicon.labels, [], [])
        #stock_model['classifier_Lexicon'] = Classifier(configFileClassifier_Lexicon, modelSerializationFile, trainFeaturesExtractor_Lexicon.features, trainFeaturesExtractor_Lexicon.labels, [], [])
        #print(trainFeaturesExtractor_Lexicon.labels[:4])
        #print([i['label'] for i in trainFeaturesExtractor_Lexicon.dataSet[:4]])

        # Train
        stock_model['classifier_Lexicon'].Train()
        stock_model['training_samples'] = len(datasetBuilder.trainSet)
        cls.save(save_path, stock, stock_model)
        print("----------------------------------------------------")
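# --- Usage sketch (assumption): `init` takes `cls` and calls cls.save, so the
# enclosing class presumably decorates it with @classmethod and defines
# save(save_path, stock, stock_model). `StocksModel` is a hypothetical name for
# that class; NMIN_SET, NVALID, and stocks are module-level globals here.
# StocksModel.init('saved_models', use_backend=False, pre_stocks=['Tasi'])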
class SentimentModel(object):

    def __init__(self, modeln=1):
        self.modeln = modeln
        configFileLanguageModel = os.path.join('LanguageModel', 'Configurations', 'Configurations_sentiment.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile, [])
        self.configFileFeaturesExtractor = os.path.join(
            'FeaturesExtractor', 'Configurations', 'Configurations_sentiment.xml')
        self.trainFeaturesSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self, backend=True):
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        dataset = None
        #if not backend:
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'sentiment')
        dataset = self.datasetBuilder.GetSentimentDatasetFromXLSXFile(xlsxTrainFileName)
        if backend:
            dataset2 = self.datasetBuilder.GetSentimentDatasetFromBackend()
            for item in dataset2:
                dataset[item] = dataset2[item]
        dataset = list(dataset.values())
        if len(dataset) < MIN_DATA:
            return
        print("Data length: ", len(dataset))
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []
        return dataset

    def prepare_data(self, dataset):
        trainFeaturesExtractor = FeaturesExtractor(
            self.configFileFeaturesExtractor,
            self.trainFeaturesSerializationFile,
            self.trainLabelsSerializationFile, self.languageModel, dataset,
            sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(init_dicts())
        maxid = max([max(i.keys()) for i in trainFeaturesExtractor.features])
        X = []
        Y = []
        for i, item in enumerate(trainFeaturesExtractor.features):
            # densify the sparse {feature_id: value} dict into a fixed-length row
            itemx = [0 for _ in range(maxid)]
            # one-hot label vector (built but not used by this dense pipeline)
            l = [0, 0, 0]
            l[trainFeaturesExtractor.labels[i] - 1] = 1
            for j in trainFeaturesExtractor.features[i]:
                v = trainFeaturesExtractor.features[i][j]
                itemx[j - 1] = v
            X.append(itemx)
            Y.append(trainFeaturesExtractor.labels[i])
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y

    def transform_data(self, X, Y):
        X = np.array(X)
        Y = np.array(Y)
        ri = range(X.shape[0])
        rl = range(X.shape[1])
        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, d):
        training_indices, testing_indices = train_test_split(
            d.index, stratify=d['class'].values, train_size=0.75, test_size=0.25)
        return training_indices, testing_indices

    def train(self, backend=True):
        rawdata = self.get_data(backend)
        dxy = self.prepare_data(rawdata)
        d = self.transform_data(dxy[0], dxy[1])
        self.training_indices, self.testing_indices = self.split_data(d)
        X = d.loc[self.training_indices].drop('class', axis=1).values
        Y = d.loc[self.training_indices, 'class'].values
        Xtest = d.loc[self.testing_indices].drop('class', axis=1).values
        Ytest = d.loc[self.testing_indices, 'class'].values
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            print(self.evaluate_model1(Xtest, Ytest))
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            print(self.evaluate_model2(Xtest, Ytest))
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            print(self.evaluate_model3(Xtest, Ytest))
        if self.modeln == 4:
            print(self.fit_model4(X, Y))
            print(self.evaluate_model4(Xtest, Ytest))

    def fit_model1(self, X, Y):
        self.model1 = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model1(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model2(self, X, Y):
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model2(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model3(self, X, Y):
        pre_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > pre_recall:
                pre_recall = recall
                self.model1 = model
        return recall

    def evaluate_model3(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model4(self, X, Y):
        model = SVC(C=0.18, gamma=0.1, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model4(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def classify(self, tweets):
        dataset = []
        for tw in tweets:
            dataset.append({'text': tw, 'label': 'neutral'})
        X, Y = self.prepare_data(dataset)
        return self.model1.predict(X)

    @classmethod
    def load(cls, path):
        return pickle.load(open(path, 'rb'))

    def save(self, path):
        return pickle.dump(self, open(path, 'wb'))
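# --- Usage sketch (assumption, not in the original source): training the
# dense-feature SentimentModel end to end and persisting it with its own
# save/load helpers. The path and tweet text are placeholders.
if __name__ == '__main__':
    sm = SentimentModel(modeln=1)   # modeln selects one of the four fits above
    sm.train(backend=False)         # XLSX data only; skips the backend fetch
    sm.save('sentiment_model.bin')
    sm2 = SentimentModel.load('sentiment_model.bin')
    print(sm2.classify(['a tweet to score']))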
class SentimentModel(object):

    def __init__(self, modeln=1):
        self.modeln = modeln
        configFileLanguageModel = os.path.join('LanguageModel', 'Configurations', 'Configurations_sentiment.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile, [])
        self.configFileFeaturesExtractor = os.path.join('FeaturesExtractor', 'Configurations', 'Configurations_sentiment.xml')
        self.trainFeaturesSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join('FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self, backend=True):
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        dataset = None
        #if not backend:
        xlsxTrainFileName = os.path.join('DatasetBuilder', 'Input', 'sentiment')
        dataset = self.datasetBuilder.GetSentimentDatasetFromXLSXFile(xlsxTrainFileName)
        if backend:
            dataset2 = self.datasetBuilder.GetSentimentDatasetFromBackend()
            for item in dataset2:
                dataset[item] = dataset2[item]
        dataset = list(dataset.values())
        if len(dataset) < MIN_DATA:
            return
        print("Data length: ", len(dataset))
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []
        return dataset

    def prepare_data(self, dataset):
        trainFeaturesExtractor = FeaturesExtractor(self.configFileFeaturesExtractor,
                                                   self.trainFeaturesSerializationFile,
                                                   self.trainLabelsSerializationFile,
                                                   self.languageModel, dataset,
                                                   sentiment_features=True)
        trainFeaturesExtractor.ExtractNumTfFeatures(sentiment_dict=init_dicts(), sparse=True)
        X = trainFeaturesExtractor.sparse_features
        Y = np.array(trainFeaturesExtractor.labels)
        trainFeaturesExtractor.dataset = []
        trainFeaturesExtractor.features = []
        trainFeaturesExtractor.labels = []
        return X, Y

    def transform_data(self, X, Y):
        X = np.array(X)
        Y = np.array(Y)
        ri = range(X.shape[0])
        rl = range(X.shape[1])
        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, X, Y):
        training_indices, testing_indices = train_test_split(
            range(X.shape[0]), stratify=Y, train_size=0.75, test_size=0.25)
        self.ntraining_samples = len(training_indices)
        return training_indices, testing_indices

    def train(self, backend=True):
        rawdata = self.get_data(backend)
        Xall, Yall = self.prepare_data(rawdata)
        self.training_indices, self.testing_indices = self.split_data(Xall, Yall)
        X = Xall[self.training_indices]
        Y = Yall[self.training_indices]
        Xtest = Xall[self.testing_indices]
        Ytest = Yall[self.testing_indices]
        acc = 0.0
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            acc = self.evaluate_model1(Xtest, Ytest)
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            acc = self.evaluate_model2(Xtest, Ytest)
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            acc = self.evaluate_model3(Xtest, Ytest)
        if self.modeln == 4:
            print(self.fit_model4(X, Y))
            acc = self.evaluate_model4(Xtest, Ytest)
        if self.modeln == 5:
            print(self.fit_model5(X, Y))
            acc = self.evaluate_model5(Xtest, Ytest)
        result = {'accuracy': acc, 'training_samples': self.ntraining_samples}
        return result

    def fit_model1(self, X, Y):
        self.model1 = LinearSVC(C=0.018, dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model1(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model2(self, X, Y):
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model2(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model3(self, X, Y):
        pre_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > pre_recall:
                pre_recall = recall
                self.model1 = model
        return recall

    def evaluate_model3(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model4(self, X, Y):
        model = SVC(C=0.18, gamma=0.1, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model4(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model5(self, X, Y):
        model = LogisticRegression(C=0.18, random_state=42)
        model.fit(X, Y)
        recall = model.score(X, Y)
        self.model1 = model
        return recall

    def evaluate_model5(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def classify(self, tweets):
        dataset = []
        for tw in tweets:
            dataset.append({'text': tw, 'label': 'neutral'})
        X, Y = self.prepare_data(dataset)
        return self.model1.predict(X)

    @classmethod
    def load(cls, path):
        return pickle.load(open(path, 'rb'))

    def save(self, path):
        return pickle.dump(self, open(path, 'wb'))
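# --- Usage sketch (assumption): the sparse variant returns an accuracy/sample
# dict from train(), which the caller can log or store; modeln=5 selects the
# LogisticRegression fit. Placeholder path as above.
if __name__ == '__main__':
    sm = SentimentModel(modeln=5)
    result = sm.train(backend=False)  # {'accuracy': ..., 'training_samples': ...}
    print(result)
    sm.save('sentiment_model_sparse.bin')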
#datasetBuilder.trainSet.extend(datasetBuilder.GetDatasetFromXLSXFile(xlsxTestFileName))

# Configurations file xml of the language model
configFileLanguageModel = ".\\LanguageModel\\Configurations\\Configurations.xml"
langModelLogFile = ".\\LanguageModel\\Output\\language_model.txt"
langModelTxtLoadFile = ".\\LanguageModel\\Input\\language_model_lexicon_synonyms.txt"
stopWordsFileName = ".\\LanguageModel\\Input\\stop_words.txt"
linksDBFile = ".\\LanguageModel\\Output\\links_database.txt"
# The serialization file to save the model
languageModelSerializationFile = ".\\LanguageModel\\Output\\language_model.bin"

# Start the LanguageModel:
# Initialize the LanguageModel
languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                              languageModelSerializationFile, linksDBFile,
                              datasetBuilder.trainSet)
languageModel.BuildLanguageModel()
languageModel.LoadModelFromTxtFile(langModelTxtLoadFile)

# Configurations file xml of the features extractor
configFileFeaturesExtractor = ".\\FeaturesExtractor\\Configurations\\Configurations.xml"
# The serialization files to save the features and labels
trainFeaturesSerializationFile = ".\\FeaturesExtractor\\Output\\train_features.bin"
trainLabelsSerializationFile = ".\\FeaturesExtractor\\Output\\train_labels.bin"
testFeaturesSerializationFile = ".\\FeaturesExtractor\\Output\\test_features.bin"
testLabelsSerializationFile = ".\\FeaturesExtractor\\Output\\test_labels.bin"
testExportFileName = ".\\FeaturesExtractor\\Output\\test_data.txt"
trainExportFileName = ".\\FeaturesExtractor\\Output\\train_data.txt"

# Start the FeaturesExtractor:
if (LANGUAGE_MODEL):
    # Configurations file xml of the language model
    configFileLanguageModel = ".\\LanguageModel\\Configurations\\Configurations.xml"
    langModelLogFile = ".\\LanguageModel\\Output\\language_model.txt"
    langModelTxtLoadFile = ".\\LanguageModel\\Output\\custom_lang_model.txt"
    stopWordsFileName = ".\\LanguageModel\\Input\\stop_words.txt"
    # The serialization file to save the model
    languageModelSerializationFile = ".\\LanguageModel\\Output\\language_model.bin"

    # Start the LanguageModel:
    #-------------------------
    if LOAD_LANGUAGE_MODEL:
        # Load the LanguageModel
        languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                      languageModelSerializationFile,
                                      datasetBuilder.dataSet)
        languageModel.LoadModel()
    elif LOAD_LANGUAGE_MODEL_FROM_TXT:
        # Load the LanguageModel from a plain-text dump
        languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                      languageModelSerializationFile,
                                      datasetBuilder.dataSet)
        languageModel.LoadModelFromTxtFile(langModelTxtLoadFile)
    elif LANGUAGE_MODEL_ON_RELEVANT:
        # Extract relevant tweets only
        relevantDataSet = []
        for case in datasetBuilder.dataSet:
            if case['label'] == 'relevant':
                relevantDataSet.append(case)
class QuestionsModel(object):

    def __init__(self, words_dict_path=None, dataset_path=None, modeln=1):
        if not words_dict_path:
            words_dict_path = os.path.join('data', 'questions_dict.bin')
        if not dataset_path:
            dataset_path = os.path.join('data', 'questions_dataset.bin')
        self.modeln = modeln
        self.words_dict_path = words_dict_path
        self.dataset_path = dataset_path
        configFileLanguageModel = os.path.join('LanguageModel', 'Configurations', 'Configurations_questions.xml')
        stopWordsFileName = os.path.join('LanguageModel', 'Input', 'stop_words.txt')
        linksDBFile = os.path.join('LanguageModel', 'Output', 'links_database.txt')
        languageModelSerializationFile = os.path.join('LanguageModel', 'Output', 'language_model.bin')
        self.languageModel = LanguageModel(configFileLanguageModel, stopWordsFileName,
                                           languageModelSerializationFile, linksDBFile, [])
        self.configFileFeaturesExtractor = os.path.join(
            'FeaturesExtractor', 'Configurations', 'Configurations_questions.xml')
        self.trainFeaturesSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_features.bin')
        self.trainLabelsSerializationFile = os.path.join(
            'FeaturesExtractor', 'Output', 'train_labels.bin')

    def get_data(self):
        configFileDatasetBuilder = os.path.join('DatasetBuilder', 'Configurations', 'Configurations.xml')
        datasetSerializationFile = os.path.join('DatasetBuilder', 'Output', 'dataset.bin')
        self.datasetBuilder = DatasetBuilder(configFileDatasetBuilder, [], datasetSerializationFile)
        dataset = self.datasetBuilder.getQuestionsDataset(self.dataset_path)
        self.languageModel.dataset = dataset
        self.languageModel.totalNumberOfDocs = len(dataset)
        self.languageModel.BuildLanguageModel()
        self.languageModel.dataset = []
        return dataset

    def prepare_data(self, dataset):
        trainFeaturesExtractor = FeaturesExtractor(
            self.configFileFeaturesExtractor,
            self.trainFeaturesSerializationFile,
            self.trainLabelsSerializationFile, self.languageModel, dataset,
            questions_features=True)
        print("Data length: ", len(dataset))
        words_dict = self.datasetBuilder.getQuestionsDatasetDictionary(self.words_dict_path)
        trainFeaturesExtractor.ExtractNumTfFeatures(questions_dict=words_dict)
        maxid = max([max(i.keys()) for i in trainFeaturesExtractor.features])
        X = []
        Y = []
        L = len(dataset)
        for i, item in enumerate(trainFeaturesExtractor.features):
            # densify the sparse {feature_id: value} dict into a fixed-length row
            itemx = [0 for _ in range(maxid)]
            # one-hot label vector (built but not used by this pipeline)
            l = [0, 0, 0]
            l[trainFeaturesExtractor.labels[i] - 1] = 1
            for j in trainFeaturesExtractor.features[i]:
                v = trainFeaturesExtractor.features[i][j]
                itemx[j - 1] = v
            X.append(itemx)
            Y.append(trainFeaturesExtractor.labels[i])
        return X, Y, L

    def transform_data(self, X, Y):
        X = np.array(X)
        Y = np.array(Y)
        ri = range(X.shape[0])
        rl = range(X.shape[1])
        d = pd.DataFrame(X, index=ri, columns=rl)
        d['class'] = Y
        return d

    def split_data(self, d):
        training_indices, testing_indices = train_test_split(
            d.index, stratify=d['class'].values, train_size=0.75, test_size=0.25)
        return training_indices, testing_indices

    def train(self):
        rawdata = self.get_data()
        X, Y, L = self.prepare_data(rawdata)
        ret = [0, 0]
        ret[0] = L
        d = self.transform_data(X, Y)
        self.training_indices, self.testing_indices = self.split_data(d)
        X = d.loc[self.training_indices].drop('class', axis=1).values
        Y = d.loc[self.training_indices, 'class'].values
        Xtest = d.loc[self.testing_indices].drop('class', axis=1).values
        Ytest = d.loc[self.testing_indices, 'class'].values
        if self.modeln == 1:
            print(self.fit_model1(X, Y))
            ret[1] = self.evaluate_model1(Xtest, Ytest)
            print(ret[1])
        if self.modeln == 2:
            print(self.fit_model2(X, Y))
            ret[1] = self.evaluate_model2(Xtest, Ytest)
            print(ret[1])
        if self.modeln == 3:
            print(self.fit_model3(X, Y))
            ret[1] = self.evaluate_model3(Xtest, Ytest)
            print(ret[1])
        return ret

    @classmethod
    def load(cls, path):
        return pickle.load(open(path, 'rb'))

    def save(self, path):
        return pickle.dump(self, open(path, 'wb'))

    def fit_model1(self, X, Y):
        self.model1 = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model1(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model2(self, X, Y):
        self.model1 = LinearSVC(C=0.18, penalty="l1", dual=False, random_state=42)
        self.model1.fit(X, Y)
        recall = self.model1.score(X, Y)
        return recall

    def evaluate_model2(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def fit_model3(self, X, Y):
        pre_recall = 0.0
        for g in [0.01, 0.05, 0.1, 0.3, 0.5]:
            model = SVC(C=0.18, gamma=g, random_state=42)
            model.fit(X, Y)
            recall = model.score(X, Y)
            print(recall)
            if recall > pre_recall:
                pre_recall = recall
                self.model1 = model
        return recall

    def evaluate_model3(self, X, Y):
        evaluation = self.model1.score(X, Y)
        return evaluation

    def isQuestion(self, opinion):
        dataset = [{'text': opinion.text, 'label': 'negativeq'}]
        X, Y, L = self.prepare_data(dataset)
        return self.model1.predict(X)[0]

    def addQuestion(self, opinion):
        q = models.QuestionOpinion()
        q.tweet = opinion
        q.since_id = opinion.twitter_id
        q.save()
        return q

    def checkQuestion(self, twitter, q):
        s = twitter.search(q='@' + q.tweet.tweeter.tweeter_name, count='500',
                           result_type='mixed', since_id=q.since_id)
        replies = []
        for tw in s['statuses']:
            if tw['in_reply_to_status_id_str'] == str(q.tweet.twitter_id):
                replies.append(tw)
        diffdate = datetime.now() - q.date_created.replace(tzinfo=None)
        if diffdate.days > MAX_DAYS:
            q.delete()
        elif s['statuses']:  # guard against an empty result set before indexing
            q.since_id = s['statuses'][0]['in_reply_to_status_id_str']
            q.save()
        return {"replies": replies, 'found': s['statuses']}

    def checkQuestions(self, twitter):
        for q in models.QuestionOpinion.objects.filter():
            self.checkQuestion(twitter, q)
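# --- Usage sketch (assumption, not in the original source): QuestionsModel
# trains on the pickled questions dataset, then flags whether a new opinion is
# a question. `opinion` stands for an object with a .text attribute (and, for
# the Twitter helpers, .twitter_id and .tweeter), as the methods above assume.
if __name__ == '__main__':
    qm = QuestionsModel(modeln=2)
    n_samples, accuracy = qm.train()  # train() returns [dataset size, test accuracy]
    print(n_samples, accuracy)
    qm.save('questions_model.bin')
    # qm.isQuestion(opinion)  # returns the predicted label for opinion.text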