import math


def readInfo(trainName, testName):
    dataframe = readData(trainName + '.csv')
    dataframe2 = readData(testName + '.csv')
    unethical_variables = [
        'estu_tieneetnia', 'estu_tipodocumento.1', 'fami_trabajolaborpadre',
        'fami_trabajolabormadre', 'estu_genero.1', 'estu_pais_reside.1',
        'estu_depto_reside.1', 'estu_cod_reside_depto.1',
        'estu_mcpio_reside.1', 'estu_cod_reside_mcpio.1', 'estu_areareside',
        'fami_nivelsisben', 'fami_pisoshogar', 'fami_tienemicroondas',
        'fami_tienehorno', 'fami_tieneautomovil.1', 'fami_tienedvd',
        'fami_tiene_nevera.1', 'estu_nacionalidad.1', 'fami_telefono.1',
        'estu_trabajaactualmente', 'estu_antecedentes', 'estu_expectativas',
        'cole_cod_dane_establecimiento', 'cole_cod_dane_sede',
        'cole_area_ubicacion', 'cole_jornada', 'cole_cod_mcpio_ubicacion',
        'cole_mcpio_ubicacion', 'cole_cod_depto_ubicacion',
        'cole_depto_ubicacion'
    ]
    NaN_variables = [
        'estu_tomo_cursopreparacion', 'estu_cursodocentesies', 'desemp_prof',
        'estu_cursoiesapoyoexterno', 'estu_cursoiesexterna',
        'estu_simulacrotipoicfes', 'estu_actividadrefuerzoareas',
        'estu_actividadrefuerzogeneric'
    ]
    no_aportan_variables = [
        'estu_estudiante.1', 'cole_sede_principal', 'cole_nombre_sede',
        'cole_codigo_icfes', 'profundiza', 'cole_nombre_establecimiento',
        'cole_genero', 'cole_naturaleza', 'periodo.1',
        'estu_fechanacimiento.1', 'estu_inst_cod_departamento', 'periodo',
        'estu_consecutivo.1'
    ]
    # Drop each group of columns from both the training and test frames.
    for variables in [NaN_variables, no_aportan_variables, unethical_variables]:
        dataframe.drop(variables, axis=1, inplace=True)
        dataframe2.drop(variables, axis=1, inplace=True)
    # Convert to lists of rows and replace every NaN cell with '-'.
    training_data = dataframe.to_numpy().tolist()
    for row in training_data:
        for x, value in enumerate(row):
            if not isinstance(value, str) and math.isnan(value):
                row[x] = '-'
    testing_data = dataframe2.to_numpy().tolist()
    for row in testing_data:
        for x, value in enumerate(row):
            if not isinstance(value, str) and math.isnan(value):
                row[x] = '-'
    return training_data, testing_data
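# A note on the NaN handling above: pandas can do the replacement directly,
# which avoids the nested loops. A minimal sketch of an equivalent cleanup,
# assuming the same dataframes as in readInfo (illustrative, untested against
# the original CSVs):
def rows_with_dashes(df):
    """Replace every NaN with '-' and return the rows as plain lists."""
    return df.fillna('-').to_numpy().tolist()

# e.g. training_data = rows_with_dashes(dataframe)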
def test_raise_exceptions():
    import pytest
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    with pytest.raises(ValueError):
        myDataset = readData("test_data31.csv")
        hrmObject = hrmData(myDataset, 1, 4)
    with pytest.raises(ValueError):
        myDataset = readData("test_data31.csv")
        hrmObject = hrmData(myDataset, 2, 5)
def test_faulty_data_load():
    import pytest
    from readData import readData
    myDataset1 = readData("test_data28.csv")
    nanValueTime1 = 0.9
    nanValueVoltage1 = -0.345
    assert myDataset1.time[324] == nanValueTime1
    assert myDataset1.voltage[338] == nanValueVoltage1
    myDataset2 = readData("test_data30.csv")
    badDataTime = 3.86
    badDataVoltage = -0.025
    assert pytest.approx(myDataset2.time[965]) == badDataTime
    assert pytest.approx(myDataset2.voltage[972]) == badDataVoltage
def test_regular_data_read():
    import pytest
    from readData import readData
    myDataset1 = readData("test_data3shortTime.csv")
    timeFromCSV = [0, 0.003, 0.006, 0.008, 0.011, 0.014, 0.017, 0.019,
                   0.022, 0.025, 0.028, 0.031, 0.033, 0.036, 0.039, 0.042,
                   0.044, 0.047, 0.05, 0.053]
    assert pytest.approx(myDataset1.time[0:20]) == timeFromCSV
def test_voltage_extremes():
    import pytest
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    maxMinValue = (0.7875, -0.19375)
    assert pytest.approx(hrmObject.voltage_extremes) == maxMinValue
def test_num_beats():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    numBeatsin31 = 19
    assert numBeatsin31 * 0.8 < hrmObject.num_beats < numBeatsin31 * 1.2
def main(fileName, k):
    sourceData = readData.readData(fileName)
    result1 = kmeans(sourceData, k)
    result2 = kmeansPlusPlus(sourceData, k)
    return result1, result2
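# kmeans and kmeansPlusPlus are project modules not shown here. For context,
# a minimal sketch of the k-means++ seeding step that distinguishes the two
# (illustrative, not the project's code; points are tuples of numbers):
import random

def kmeanspp_seeds(points, k):
    """Pick k seeds, each new seed chosen with probability proportional to
    its squared distance from the nearest seed picked so far."""
    seeds = [random.choice(points)]
    while len(seeds) < k:
        d2 = [min(sum((a - b) ** 2 for a, b in zip(p, s)) for s in seeds)
              for p in points]
        r, acc = random.uniform(0, sum(d2)), 0.0
        for p, w in zip(points, d2):
            acc += w
            if acc >= r:
                seeds.append(p)
                break
    return seeds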
def main(args):
    ticks = time.time()
    trainingData = readData.readData(TRAINING_LABELS_PATH, TRAINING_IMAGES_PATH)
    # Total number of feature values across all training instances.
    totalInstances = sum(len(featureVector) for featureVector in trainingData)
    nb = NaiveBayes(trainingData, totalInstances)
    nb.naiveBayes()
    testingData = readData.readData(TEST_LABELS_PATH, TEST_IMAGES_PATH)
    predictedVals = nb.predictLabels(testingData)
    ticks = time.time() - ticks
    print("Total Accuracy: " + str(nb.accuracy(predictedVals)))
    print("Execution Time: " + str(ticks))
    print("Confusion Matrix:\n ")
    nb.confusionmatrix(predictedVals)
def main():
    from readData import readData
    from hrmData import hrmData
    import numpy as np
    import json
    csvFileName = "test_data31.csv"
    myDataset = readData(csvFileName)
    hrmObject = hrmData(myDataset)
    write_to_json(csvFileName, hrmObject)
def test_subtractDCOffset():
    import pytest
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    subtractedOffsetValues = [-0.027071875, -0.002071875, 0.029178125,
                              0.054178125, 0.079178125]
    assert hrmObject.meanSubtractedVoltage[0:5] == \
        pytest.approx(subtractedOffsetValues)
def test_default_value_for_time_segment():
    from readData import readData
    from hrmData import hrmData
    from timeSegment import timeSegment
    myDataset = readData('easytestfile.csv')
    myTimePoints = timeSegment(myDataset)
    expectedVoltageValues = [[100, 101], [102, 103], [104, 105], [106, 107],
                             [108, 109], [110, 111], [112, 113], [114, 115],
                             [116, 117], [118, 119]]
    assert myTimePoints.segmentList == expectedVoltageValues
def test_time_of_beats():
    import pytest
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    locationOfBeats = [0., 0.7715, 1.543, 2.3145, 3.086, 3.8575, 4.629,
                       5.4005, 6.172, 6.9435, 7.715, 8.4865, 9.258, 10.0295,
                       10.801, 11.5725, 12.344, 13.1155, 13.887]
    assert locationOfBeats == pytest.approx(hrmObject.beats)
def test_voltage_list():
    from readData import readData
    from hrmData import hrmData
    from timeSegment import timeSegment
    myDataset = readData('easytestfile.csv')
    myTimePoints = timeSegment(myDataset, 4)
    expectedVoltageValues = [[100, 101, 102, 103], [104, 105, 106, 107],
                             [108, 109, 110, 111], [112, 113, 114, 115],
                             [116, 117, 118, 119]]
    assert myTimePoints.segmentList == expectedVoltageValues
def test_time_index():
    timesAt2SecondsFile2 = [720, 1440, 2160, 2880, 3600, 4320, 5040, 5760,
                            6480, 7200, 7920, 8640, 9360]
    from readData import readData
    from hrmData import hrmData
    from timeSegment import timeSegment
    myDataset = readData("test_data2.csv")
    myTimePoints = timeSegment(myDataset, 2)
    assert myTimePoints.listOfSegmentsIdx == timesAt2SecondsFile2
def test_intervalHR():
    import pytest
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    actualHR = 84.38818565400844
    assert pytest.approx(hrmObject.global_mean_hr_bpm) == actualHR
    # The second object restricts the calculation to the 2-8 s interval;
    # the global mean heart rate is expected to match.
    hrmObject2 = hrmData(myDataset, 2, 8)
    assert pytest.approx(hrmObject2.global_mean_hr_bpm) == actualHR
from flask import jsonify


def QB():
    items = readData()
    message = {
        'status': 200,
        'message': 'OK',
        'data': items
    }
    resp = jsonify(message)
    resp.status_code = 200
    print(resp)
    return resp
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import preprocessing

import readData


def main(argv=None):
    """Does not converge."""
    num_repeat = 1
    MSEs = []
    abbrTrain = r'E:\python_project\happinessPredict\DataSet\happiness_train_abbr.csv'
    for _ in range(num_repeat):
        x_train, x_test, y_train, y_test = readData.readData(abbrTrain, True)
        # One-hot encode the 1-5 happiness labels.
        y_train_array = np.zeros((len(y_train), 5))
        for i in range(len(y_train)):
            if y_train[i] == 1:
                y_train_array[i] = np.array([1, 0, 0, 0, 0])
            elif y_train[i] == 2:
                y_train_array[i] = np.array([0, 1, 0, 0, 0])
            elif y_train[i] == 3:
                y_train_array[i] = np.array([0, 0, 1, 0, 0])
            elif y_train[i] == 4:
                y_train_array[i] = np.array([0, 0, 0, 1, 0])
            elif y_train[i] == 5:
                y_train_array[i] = np.array([0, 0, 0, 0, 1])
        y_test_array = np.zeros((len(y_test), 5))
        for i in range(len(y_test)):
            if y_test[i] == 1:
                y_test_array[i] = np.array([1, 0, 0, 0, 0])
            elif y_test[i] == 2:
                y_test_array[i] = np.array([0, 1, 0, 0, 0])
            elif y_test[i] == 3:
                y_test_array[i] = np.array([0, 0, 1, 0, 0])
            elif y_test[i] == 4:
                y_test_array[i] = np.array([0, 0, 0, 1, 0])
            elif y_test[i] == 5:
                y_test_array[i] = np.array([0, 0, 0, 0, 1])
        # z-score standardize each feature column (fit on the training set).
        scaler = preprocessing.StandardScaler().fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)
        mse = train(x_train, x_test, y_train_array, y_test_array)
        MSEs.append(mse)
        tf.reset_default_graph()
    plt.figure()
    plt.grid()
    plt.xlabel('iteration$(\\times10^2)$')
    plt.ylabel('MSE')
    # plt.axis([0, len(mse), 0, 1.1])
    for i in range(num_repeat):
        plt.plot(MSEs[i])
    # Save the figure:
    # plt.savefig("diff_s1423 with PCA+ANN.svg", transparent=True, format='svg')
    plt.show()
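# The two label-encoding loops above build each one-hot row by hand. A
# minimal vectorized sketch of the same mapping (assuming integer labels
# 1..5, as the branches above imply):
import numpy as np

def one_hot(labels, num_classes=5):
    """Map integer labels 1..num_classes to one-hot rows."""
    labels = np.asarray(labels, dtype=int)
    return np.eye(num_classes)[labels - 1]

# e.g. one_hot([1, 3, 5]) -> [[1,0,0,0,0], [0,0,1,0,0], [0,0,0,0,1]]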
def test_determineLagTime():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    actualHR = 80
    # Each of the first six segments should be within 10% of the true rate.
    for rate in hrmObject.heartRateList[0:6]:
        assert 0.9 * actualHR < rate < 1.1 * actualHR
import sys


def depthSearch(idx, depth=1000):
    if dbase.phdExists(idx):
        return
    if depth < 0:
        # Stop recursing once the depth budget is spent.
        print("[Search] Exceeded search depth on", idx, file=sys.stderr)
        return
    phd, tpls = readData(idx)
    if dbase.writePhD(idx, phd[1]) is None:
        print("[Search] Failed on writing Ph. D.", idx, file=sys.stderr)
        return
    for aID in writeDegreeTuples(tpls):
        try:
            depthSearch(aID, depth - 1)
        except Exception as e:
            print("[Search] Exception occurred processing %d:" % aID, e,
                  file=sys.stderr)
def test_convertTimeToIdx():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    startIdx1 = 0
    endIdx1 = 6
    assert startIdx1 == hrmObject.startIdx
    assert endIdx1 == hrmObject.endIdx
    hrmObject2 = hrmData(myDataset, 2, 8)
    startIdx2 = 1
    endIdx2 = 4
    assert startIdx2 == hrmObject2.startIdx
    assert endIdx2 == hrmObject2.endIdx
import os
from os.path import join


def split_data(input_path):
    for split in SPLITS:
        output_path = join(input_path, 'split_files', split)
        os.makedirs(join(output_path, 'features'), exist_ok=True)
        os.makedirs(join(output_path, 'labels'), exist_ok=True)
        i = 0
        for code in CPC_CODES:
            for patent in readData(input_path, split, code):
                abstract = patent['abstract'].encode().decode()
                desc = patent['description'].encode().decode()
                with open(join(output_path, 'features', f'{i}.desc'), 'w') as file:
                    file.write(desc)
                with open(join(output_path, 'labels', f'{i}.label'), 'w') as file:
                    file.write(abstract)
                i += 1
def test_write_json():
    from main import main
    from hrmData import hrmData
    from readData import readData
    import json
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    main()
    data = {'File Name': hrmObject.rawData.csvFileName,
            'mean_hr_bpm': hrmObject.mean_hr_bpm,
            'voltage_extremes': hrmObject.voltage_extremes,
            'duration': hrmObject.duration,
            'num_beats': hrmObject.num_beats,
            'beats': hrmObject.beats}
    with open('test_data31.json') as data_file:
        data_loaded = json.load(data_file)
    assert data_loaded.items() == data.items()
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import readData
import Cal_Acc_Gyro
import motion

file_dir = os.getcwd()
print(file_dir)
data_file_number = '2'
f = 100
T = 1 / f

# 1. read data
measure_data = readData.readData(file_dir, data_file_number)
print(measure_data.frames)

# 2. calculate the norm of acc and gyro and detect the contact
contact_flag = Cal_Acc_Gyro.contac_detection(measure_data.acc_data,
                                             measure_data.gyro_data,
                                             measure_data.frames)

# 3. calculate the attitude and position
calculate_data = motion.motion(measure_data.acc_data,
                               measure_data.gyro_data, contact_flag)

# 4. plot the signal of vel and position
n = measure_data.frames
t = np.linspace(0, n * T, n + 1)
plt.figure(1)
from readData import readData
from Clustering import kmeans_al
from Vi import showCluster
from dataNomalization import normalization
from dimReduction import DimReduction
from oneHot import oneHotData
from topicModelling import topic
from metricLearning import metricLearning
# import pandas as pd

if __name__ == "__main__":
    fileName = '../../demographic+Data+From+mimic.csv'
    toRows = 100
    data = readData(fileName, toRows)
    onehotdata = oneHotData(data)
    scaledData = normalization(onehotdata)
    # df_scaledData = pd.DataFrame(scaledData)
    df_OriData = topic(scaledData, 10)
    df_NewData = metricLearning(df_OriData)
    # print(df_OriData)
    OriKmeansresult = kmeans_al(df_OriData)
    NewKmeansresult = kmeans_al(df_NewData)
    # print(kmeansresult.labels_)
    TwoDOriData = DimReduction(df_OriData)
    TwoDNewData = DimReduction(df_NewData)
import readData
from frequentCount import *
from aprioriGen import *

min_sup = 2
D = readData.readData("shoppingList.csv")


def miningFrequentItemSet(D, min_sup):
    # Start from the frequent 1-itemsets.
    frequentItemSets = []
    L1 = find_frequent_1_itemsets(D, min_sup)
    frequentItemSets.extend(L1)
    # Grow frequent itemsets Lk until none remain.
    Lk = L1
    while len(Lk) != 0:
        # Ck is a candidate Lk, produced by the join and prune steps.
        Ck = apriori_gen(Lk)
        # Scan the database to keep only the truly frequent candidates.
        Lk = scanDataBase(D, min_sup, Ck)
        frequentItemSets.extend(Lk)
    return frequentItemSets


if __name__ == "__main__":
    print(miningFrequentItemSet(D, min_sup))
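# find_frequent_1_itemsets and scanDataBase live in frequentCount and are not
# shown here. For context, a minimal stand-in for the 1-itemset step, assuming
# D is an iterable of transactions (illustrative only, not that module's code):
from collections import Counter

def find_frequent_1_itemsets_sketch(D, min_sup):
    """Count single items over all transactions; keep those meeting min_sup."""
    counts = Counter(item for transaction in D for item in transaction)
    return [frozenset([item]) for item, c in counts.items() if c >= min_sup]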
import numpy as np
import h5py

from featureEngineer import featureEngineer
from readData import readData

# ### Set global variables
#
# Note: the first N entries in the dataset should be labeled; the rest are
# used for testing. The very last column should contain the labels.

FILE_PATH = 'dataset.csv'
OUT_FILE = 'run1.h5'

# ### Clean and split data into arrays

X, y = readData(FILE_PATH)
X, k = featureEngineer(X)

# Ordinary least squares via the normal equations: W = (X^T X)^+ X^T y.
A = np.matmul(X.T, X)
B = np.matmul(X.T, y)
A = np.linalg.pinv(A)
W = np.matmul(A, B)  # these are the learned weights

with h5py.File(OUT_FILE, 'w') as file:
    file.create_dataset('weights', W.shape)
    file['weights'][...] = W
    file.create_dataset('k', (1,), data=k)
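# Forming A = X^T X and pseudo-inverting it, as above, squares the condition
# number of X. A minimal sketch of the same least-squares fit done directly
# via SVD, assuming the same X and y (an alternative, not the script's method):
import numpy as np

def fit_ols(X, y):
    """Solve min ||X W - y||_2 without forming the normal equations."""
    W, _residuals, _rank, _svals = np.linalg.lstsq(X, y, rcond=None)
    return W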
from readData import readData
from gradientDescent import gradientDescent

X, Y = readData()
m = len(X)
t0 = 0
t1 = 0
alpha = .01
iterations = 9  # renamed from `iter`, which shadows the builtin
t0, t1 = gradientDescent(X, Y, t0, t1, alpha, iterations, m)
print(t0, t1)
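# gradientDescent is imported from a local module and not shown. For
# reference, a minimal sketch of the update rule it presumably applies
# (simultaneous updates for the hypothesis t0 + t1*x; illustrative only):
def gradient_descent_sketch(X, Y, t0, t1, alpha, iterations, m):
    for _ in range(iterations):
        err = [t0 + t1 * x - y for x, y in zip(X, Y)]
        grad0 = sum(err) / m
        grad1 = sum(e * x for e, x in zip(err, X)) / m
        t0, t1 = t0 - alpha * grad0, t1 - alpha * grad1
    return t0, t1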
import sys
import xml.etree.ElementTree as ET

import numpy as np

if (inputFileTr.lower() == "" or inputClassesTr == ""
        or (mdlName.lower() != "test" and mdlName.lower() != "train")):
    # or inputFileTs.lower() == "" or inputClassesTs.lower() == "":
    print("You have NOT entered one of the required inputs!")
    sys.exit()
# inputFileTr = "X_train.txt"
# inputClassesTr = "y_train.txt"
# inputFileTs = "X_test.txt"
# inputClassesTs = "y_test.txt"

print("\nLoading saved tree ...")
tree = ET.ElementTree(file="trained_Tree.xml")
xmlRoot = tree.getroot()
root = makeTree(xmlRoot)

print("\nReading first data set ...")
classes = readData(inputClassesTr)
data1 = readData(inputFileTr)
data1 = np.append(data1, classes, axis=1)
if inputClassesTs != "":
    print("\nReading second data set ...")
    classes = readData(inputClassesTs)
    data2 = readData(inputFileTs)
    data2 = np.append(data2, classes, axis=1)
data = np.append(data1, data2, axis=0) if inputClassesTs != "" else data1
randIndx = (np.load('invtRandData.npy') if mdlName.lower() == "test"
            else np.load('randData.npy'))

print("Estimating classes and calculating accuracy ...")
result = TreeResult(data[randIndx, :], root)
import re
import string

from nltk import bigrams
from nltk.corpus import stopwords


def reduce_tweets_words():
    [leave_tweets, stay_tweets, other_tweets] = readData()
    leave_tweets = categorizy_tweets(leave_tweets, "neg")
    new_leave = getTokenizedTweetsFile("leaveTweets/ExtraLeaveTweets.txt", "neg")
    leave_Farias = getTokenizedTweetsFile("leaveTweets/FariasLeave.txt", "neg")
    stay_tweets = categorizy_tweets(stay_tweets, "pos")
    new_stay = getTokenizedTweetsFile("stayTweets/ExtraStayTweets.txt", "pos")
    stay_Farias = getTokenizedTweetsFile("stayTweets/FariasStay.txt", "pos")
    other_tweets = categorizy_tweets(other_tweets, "neutral")
    # The files from Ada that come through categorizy_tweets are all 'str';
    # the intent was to convert everything to Unicode:
    # leave_tweets = unicode_them(leave_tweets)
    # stay_tweets = unicode_them(stay_tweets)
    # other_tweets = unicode_them(other_tweets)
    # Note: other_tweets is concatenated twice here, as in the original.
    tokenized_tweets = (leave_tweets + new_leave + leave_Farias + stay_tweets
                        + new_stay + stay_Farias + other_tweets + other_tweets)
    all_words = []
    print(len(leave_tweets))
    print(len(new_leave))
    print(len(leave_Farias))
    print(len(stay_tweets))
    print(len(new_stay))
    print(len(stay_Farias))
    print(len(other_tweets))
    print(len(other_tweets))
    print(len(tokenized_tweets))

    #########################################################################
    # Now that the tweets are tuples (tokenized_tweet, category), shrink the
    # tokenized tweets by removing stopwords and anything else that adds
    # nothing to feature extraction.
    #########################################################################
    # Stopword removal strips a lot of useless tokens.
    stop_words = set(stopwords.words("english"))
    # Extend the stopword list with punctuation that carries no signal
    # (kept as unicode strings, since that is how the stopwords are stored).
    punctuation = [u'.', u'-', u',', u'"', u'(', u')', u':', u"'", u'--',
                   u';', u'!', u'$', u'*', u'&', u'...', u':/', u'/', u'..']
    punctuation = set(punctuation)
    punct = list(string.punctuation)
    # stop = stopwords.words('english') + punctuation + ['rt', 'via']
    global new_stop_words
    new_stop_words = stop_words.union(punct)
    twitter_symbols = [u'rt', u'#voteleave', u'#voteremain', u'#leaveeu',
                       u'h', u'#rt', u'=', u'@', u'https', u'+', u'\'', u'|',
                       u'…', u'‘', u'’', u'..', u'...']
    twitter_symbols = set(twitter_symbols)
    new_stop_words = new_stop_words.union(twitter_symbols)
    # The @user of an RT is not actually being removed, but that should not
    # matter: the same @user rarely appears often enough to count.
    # user_rt_pattern = "@\w+?"
    # url_pattern = 'http[s]:/'
    emotions_pattern = r'\\u\d+'  # leftover unicode escapes such as \u2026
    url_pattern = (r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]'
                   r'|(?:%[0-9a-f][0-9a-f]))+')
    user_rt_pattern = r'(?:@[\w_]+)'

    filtered_tweets = []
    tokens_to_be_removed = []

    #########################################################################
    # Build the bigrams of each tweet and add them to one list with all the
    # bigrams, the same way as with all the tweet words. Building the global
    # bigram list by joining each tweet's own bigrams is better than deriving
    # it from the list of all words: the latter can produce bigrams that do
    # not actually exist because they span two different tweets.
    #########################################################################
    # For each (tokenized_tweet, category) tuple:
    for tweet_cat in tokenized_tweets:
        # Flag every token that is a stopword or matches the URL, RT, or
        # escape patterns.
        for token in tweet_cat[0]:
            if (token in new_stop_words or re.match(url_pattern, token)
                    or re.search(user_rt_pattern, token)
                    or re.match(emotions_pattern, token)):
                tokens_to_be_removed.append(token)
        # Remove the flagged tokens from this tweet.
        for token in tokens_to_be_removed:
            tweet_cat[0].remove(token)
        # Reset the list so flags do not accumulate across tweets.
        tokens_to_be_removed = []
        # Build this tweet's bigrams, then extend the global bigram list.
        tweet_bigrams = list(bigrams(tweet_cat[0]))
        # tweet_bigrams is a list, so a plain .append() onto all_bigrams
        # would create a list of lists; append element by element instead.
        for i in range(len(tweet_bigrams)):
            all_bigrams.append(tweet_bigrams[i])
        #####################################################################
        # Each tweet now carries all of its bigrams, so the bigrams also get
        # a category and can feed the classifier: the tuple becomes a triple
        # (tokens, bigrams, category).
        #####################################################################
        tweet_bigrams_cat = (tweet_cat[0], tweet_bigrams, tweet_cat[1])
        filtered_tweets.append(tweet_bigrams_cat)
    # Example of a stopword-filtered tweet:
    # ([u'@mpvine', u'If', u'fifty', u'million', u'people', u'say',
    #   u'foolish', u'thing', u"it's", u'still', u'foolish', u'thing'], 'pos')

    #########################################################################
    # Dump all the reduced tweets to a file.
    #########################################################################
    # File with the new filtered-tweet tuples:
    with open('FilteredTweets2.txt', 'w') as outfile:
        for item in filtered_tweets:
            outfile.write(str(item) + '\n')
    # File with all the bigrams:
    with open('Bigrams.txt', 'w') as outfile:
        for item in all_bigrams:
            outfile.write(str(item) + '\n')
    return filtered_tweets
def main(args):
    decisionTree = DecisionTree()
    cIndex, attributesList, data = readData.readData(args.input)
    # Train on the first 500 rows, evaluate on the next 100.
    decisionTree.makeTree(decisionTree.root, cIndex, attributesList, data[0:500])
    print(accuracy(decisionTree.root, data[500:600], cIndex))
import numpy as np

# Batch normalisation after layers -> Gaussian
# Data augmentation
# Confusion matrix

batch_size = 100
epochs = 1
validation_size = 100
num_classes = 10
result_file = "test_run_results.txt"

# input image dimensions
img_x, img_y = 32, 32

# load data sets
arr, labels, images = readData(
    'C:\\Users\\nystr\\GTSRB\\Final_Training\\Images',
    num_classes, (img_x, img_y))
v_arr, v_labels, v_images = readValidationData(
    'C:\\Users\\nystr\\GTSRB\\Final_Test\\Images',
    (img_x, img_y), validation_size)
# arr, labels, images = readData('C:/Users/Filip/Documents/Kandidat/GTSRB/Final_Training/Images', num_classes, (img_x, img_y))
# v_arr, v_labels, v_images = readValidationData('C:/Users/Filip/Documents/Kandidat/GTSRB/Final_Test/Images',
#                                                (img_x, img_y), validation_size)

x_train = np.asarray(arr)
y_train = oneHotEncode(labels, num_classes)
x_test = np.asarray(v_arr)
'''
Created on: 2015-10-23, 2:01:41 PM
Function:
'''
import numpy as np
import scipy as sp
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split  # was sklearn.cross_validation
from readData import readData

# Read in the data.
path = "E:/Desktop/Image/SVMData/loc_train.txt"
x, y = readData(path)
# x_train: training features, x_test: test features,
# y_train: training labels, y_test: test labels.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

# Train a decision tree using information entropy as the split criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
print(clf)
clf.fit(x_train, y_train)
from numpy import *

import SVM
from readData import readData

################## test svm #####################
## step 1: load data
print("step 1: load data...")
# fileIn = open('D:/Desktop/python study/Image/src/testSet.txt')
# for line in fileIn.readlines():
#     lineArr = line.strip().split()
#     print(lineArr)
#     dataSet.append([float(tk) for tk in lineArr[:-1]])
#     labels.append(float(lineArr[-1]))
dataSet, labels = readData("E:/Desktop/Image/SVMData/gender_wechat.txt")
t = int(len(labels) / 5)  # first 20% of the rows become the training set
dataSet = mat(dataSet)
labels = mat(labels).T
train_x = dataSet[0:t, :]
train_y = labels[0:t, :]
test_x = dataSet[t:len(labels), :]
test_y = labels[t:len(labels), :]

## step 2: training...
print("step 2: training...")
C = 0.6
toler = 0.001
maxIter = 50
svmClassifier = SVM.trainSVM(train_x, train_y, C, toler, maxIter,
                             kernelOption=('rbf', 0))
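# The slicing above takes the first 20% of rows as the training set with no
# shuffling, so any ordering in the file leaks into the split. A minimal
# sketch of a shuffled alternative using scikit-learn (an assumption about
# intent, not this script's method):
from sklearn.model_selection import train_test_split

train_x2, test_x2, train_y2, test_y2 = train_test_split(
    dataSet, labels, train_size=0.2, shuffle=True, random_state=0)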
f_in_trn = 'Data/images_train'
f_in_tst = 'Data/images_test'
f_in_sol = 'Data/train_solutions.csv'
f_in_flat_trn = 'Data/train_.csv'
f_in_flat_tst = 'Data/test_.csv'
f_out_trn = 'Data/train_32_deskew.csv'
f_out_tst = 'Data/test_32_deskew.csv'
f_out_subm = 'Submissions/ls_32_deskew.csv'

# Process images
from readData import readData
(Xtrn, Ytrn, Xtst) = readData(f_in_trn, f_in_tst, f_in_sol, augmenting=False)
from saveData import saveData
saveData((Xtrn, Xtst), (f_out_trn, f_out_tst), colfmt='%.18e')

# Load processed images from flat file, on disk
'''
from loadData import loadData
Xtrn = loadData(f_in_flat_trn, rowskip=0)
Xtst = loadData(f_in_flat_tst, rowskip=0)
tst = loadData(f_in_flat_tst, rowskip=0)
Ytrn = loadData(f_in_sol, rowskip=1)
'''

# Fit OLS
'''
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(Xtrn, Ytrn[::, 1:])
def main():
    [x_train, y_train, x_test, y_test] = readData.readData()
    backword(x_train, y_train, x_test, y_test)
import re
import string

from nltk.corpus import stopwords


def reduce_tweets_words():
    # Reads the tweets from the txt files, tokenizes them, and stores them
    # in this array as (tokenized_tweet, tweet_category) tuples, e.g.
    # ([u'RT', u'@mpvine', u':', u'If', u'fifty', u'million', u'people',
    #   u'say', u'a', u'foolish', u'thing', u',', u"it's", u'still', u'a',
    #   u'foolish', u'thing', u'.'], 'pos')
    # The FeatureSet holds 2853 tweets: 1286 Stay and 1567 Leave.
    # openFile_getTokenizedTweets("StayTweets1.txt", "pos")
    # openFile_getTokenizedTweets("StayTweetsDate.txt", "pos")
    # openFile_getTokenizedTweets("StayTweetsDate2.txt", "pos")
    # openFile_getTokenizedTweets("StayJune14.txt", "pos")
    # openFile_getTokenizedTweets("StayJune15.txt", "pos")
    # openFile_getTokenizedTweets("StayJune16.txt", "pos")
    # openFile_getTokenizedTweets("StayJune17.txt", "pos")
    # openFile_getTokenizedTweets("StayJune18.txt", "pos")
    # openFile_getTokenizedTweets("StayJune19.txt", "pos")
    # openFile_getTokenizedTweets("StayJune20.txt", "pos")
    # openFile_getTokenizedTweets("StayTweetsNow.txt", "pos")
    #######################################################################
    # This block took only the first 1286 tweets of a file that has 1537:
    # with open("LeaveTweets1.txt") as doc:
    #     lines = doc.readlines()
    #     lines = lines[:1286]
    #     for l in lines:
    #         # Strip escaped emoticons such as \u2026.
    #         l = l.decode('unicode_escape').encode('ascii', 'ignore')
    #         tokens = tknzr.tokenize(l)
    #         global tokenized_tweets
    #         # Lowercase every token.
    #         lw_tokens = [w.lower() for w in tokens]
    #         tokenized_tweets.append((lw_tokens, "neg", l))
    # openFile_getTokenizedTweets("LeaveTweetsDate.txt", "neg")
    # openFile_getTokenizedTweets("LeaveTweetsDate2.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune14.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune15.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune16.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune17.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune18.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune19.txt", "neg")
    # openFile_getTokenizedTweets("LeaveJune20.txt", "neg")
    # openFile_getTokenizedTweets("LeaveTweetsNow.txt", "neg")
    [leave_tweets, stay_tweets, other_tweets] = readData()
    leave_tweets = categorizy_tweets(leave_tweets, "neg")
    new_leave = getTokenizedTweetsFile("ExtraLeaveTweets.txt", "neg")
    stay_tweets = categorizy_tweets(stay_tweets, "pos")
    new_stay = getTokenizedTweetsFile("ExtraStayTweets.txt", "pos")
    other_tweets = categorizy_tweets(other_tweets, "neutral")
    tokenized_tweets = (leave_tweets + new_leave + stay_tweets + new_stay
                        + other_tweets)
    all_words = []
    print(len(leave_tweets))
    print(len(new_leave))
    print(len(stay_tweets))
    print(len(new_stay))
    print(len(other_tweets))
    print(len(tokenized_tweets))

    #######################################################################
    # Now that the tweets are tuples (tokenized_tweet, category), shrink
    # the tokenized tweets by removing stopwords and anything else that
    # adds nothing to feature extraction.
    #######################################################################
    # Stopword removal strips a lot of useless tokens.
    stop_words = set(stopwords.words("english"))
    # Extend the stopword list with punctuation that carries no signal
    # (kept as unicode strings, since that is how the stopwords are stored).
    punctuation = [u'.', u'-', u',', u'"', u'(', u')', u':', u'?', u"'",
                   u'--', u';', u'!', u'$', u'*', u'&', u'...', u':/', u'/',
                   u'%', u'..']
    punctuation = set(punctuation)
    global new_stop_words
    new_stop_words = stop_words.union(punctuation)
    twitter_symbols = [u'rt', u'#voteleave', u'#voteremain', u'#leaveeu',
                       u'h', u'#rt', u'=', u'@', u'https', u'+', u"'", u'|',
                       u'...']
    twitter_symbols = set(twitter_symbols)
    new_stop_words = new_stop_words.union(twitter_symbols)
    # The @user of an RT is not actually being removed, but that should not
    # matter: the same @user rarely appears often enough to count.
    # user_rt_pattern = "@\w+?"
    # url_pattern = 'http[s]:/'
    emotions_pattern = r'\\u\d+'  # leftover unicode escapes such as \u2026
    url_pattern = (r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]'
                   r'|(?:%[0-9a-f][0-9a-f]))+')
    user_rt_pattern = r'(?:@[\w_]+)'

    filtered_tweets = []
    tokens_to_be_removed = []
    # For each (tokenized_tweet, category) tuple:
    for tweet_cat in tokenized_tweets:
        # Flag every token that is a stopword or matches the URL, RT, or
        # escape patterns.
        for token in tweet_cat[0]:
            if (token in new_stop_words or re.match(url_pattern, token)
                    or re.search(user_rt_pattern, token)
                    or re.match(emotions_pattern, token)):
                tokens_to_be_removed.append(token)
        # Remove the flagged tokens from this tweet.
        for token in tokens_to_be_removed:
            tweet_cat[0].remove(token)
        # Reset the list so flags do not accumulate across tweets.
        tokens_to_be_removed = []
        # Add the stopword-free tweet to the new list.
        filtered_tweets.append(tweet_cat)
    # Example of a stopword-filtered tweet:
    # ([u'@mpvine', u'If', u'fifty', u'million', u'people', u'say',
    #   u'foolish', u'thing', u"it's", u'still', u'foolish', u'thing'], 'pos')

    #######################################################################
    # Dump all the reduced tweets to a file.
    #######################################################################
    # File with the new filtered-tweet tuples:
    with open('FilteredTweets2.txt', 'w') as outfile:
        for item in filtered_tweets:
            outfile.write(str(item) + '\n')
    return filtered_tweets
from matplotlib import pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.model_selection import KFold  # was sklearn.cross_validation
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from readData import readData
from gridsearch_helper import grid_search_helper_basic, grid_search_helper

# Read the sets split into training, testing, and validation sets by
# dataSplit.py.
X_train, Y_train, X_val, Y_val, X_test, Y_test = readData()

# Number of dimensions of the feature vector, and the maximum degree of the
# expanded polynomial feature space.
noDim = 8
noPoly = 2

# Number of repetitions and folds of cross-validation used for training.
noRep = 5
n_folds = 5

# define scoring functions
def __init__(self):
    self.data = readData.readData(readData.datafilePath)
# The def line for this function is missing from the source; the body and the
# later calls splitData2(sf) suggest a single-argument function, with `step`
# defined at module level.
def splitData2(data):
    start = 0
    end = 0
    d = []
    for i in range(len(data) - 1):
        if isZero(data[i], data[i + 1]):
            print(start, end)
            if end - start >= step:
                d.append(data[start:end])
            start = end
        else:
            end = i
    if end - start >= step:
        d.append(data[start:end])
    return d


data = readData("data/turn/Turn1.csv")
sf = lowPassFilter(data["AX"], 0.02)
# d, vars, means = splitData(sf.tolist())
# newD = merge(d, vars, means)
# print(len(newD))
# for line in newD:
#     print(len(line))
newD = splitData2(sf)
for line in newD:
    print(len(line))
data = readData("data/turn/Turn3.csv")
sf = lowPassFilter(data["AX"], 0.02)
for line in splitData2(sf):
    newD.append(line)
for line in newD:
    sim = []
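# lowPassFilter is defined elsewhere in the project. For context, a minimal
# sketch of a first-order low-pass (exponential moving average) filter that
# is consistent with the call lowPassFilter(data["AX"], 0.02) above
# (illustrative only; the real implementation may differ):
import numpy as np

def low_pass_sketch(signal, alpha):
    """out[i] = alpha * signal[i] + (1 - alpha) * out[i - 1]."""
    out = np.empty(len(signal))
    out[0] = signal[0]
    for i in range(1, len(signal)):
        out[i] = alpha * signal[i] + (1 - alpha) * out[i - 1]
    return out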
import readData
import costFunction
from sigmoid import sigmoidGradient
from randomInit import randomInit
from backPropagation import backPropagation
from predict import predict
from gradientChecking import gradientChecking

showcost = 0

if __name__ == "__main__":
    input_layer = 400
    hidden_layer = 25
    num_labels = 10
    (X, y) = readData.readData()
    lam = 1
    if showcost:
        import numpy as np
        (ogTheta1, ogTheta2) = readData.readWeights()
        Thetas = np.reshape(ogTheta1, ogTheta1.size)
        Thetas = np.append(Thetas, ogTheta2)
        print(costFunction.computeRegularizedCost(Thetas, X, y, input_layer,
                                                  hidden_layer, num_labels,
                                                  lam))
    Theta1 = randomInit(input_layer, hidden_layer)
    Theta2 = randomInit(hidden_layer, num_labels)
def main(args):
    decisionTree = DecisionTree()
    cIndex, attributesList, data = readData.readData(args.input)
    decisionTree.makeTree(decisionTree.root, cIndex, attributesList, data)
    decisionTree.printTree(decisionTree.root)