def snd_url():
    '''
    Build the second-level rental page URLs
    :return:
    '''
    # details = detail()
    index_list = regx_select(detail(url))
    finall_list = []
    inputname = raw_input('enter a file name:')
    try:
        for links in index_list:
            for sec in link2nd_tool.linkstr(detail(links)):
                print links + sec + '>>>>>> is found'
                finall_list.append(links + sec)
            print finall_list
            print "\r\n" * 10 + "-" * 30
            IOutils.rtfile_input(str(finall_list), (inputname + '.txt'))
            time.sleep(numpy.random.randint(1, 4))
    except Exception:
        print("Some of the official sub-sites have no content yet, skipped...")
    finally:
        return finall_list
def file_rt(page_list):
    '''
    Save the results
    :param page_list:
    :return:
    '''
    IOutils.rtfile_time(str(page_list), "txt")
    print "List finished"
def preprocess_all_mk2(mode='train', disp=True):
    """Preprocesses all the data.
    Mean cancellation by subtracting SENSOR_MEAN and scaling with SENSOR_STD.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK2 preprocessing for ' + mode + ' data\n')
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        # Remove the mean of each sensor
        data -= utils.SENSOR_MEAN
        # Scale the data with the standard deviation from the training data
        data /= utils.SENSOR_STD
        final_fname = fullpath[:-4] + '_mk2'
        np.save(final_fname, data)
        pif("%.3f" % (time() - t0) + " s\n")
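# A minimal reload sketch (assumption: preprocess_all_mk2 writes
# '<original csv name>_mk2.npy' next to each CSV returned by io.get_file_list,
# and the training files follow the subjN_seriesM_data.csv naming used elsewhere
# in this repo).
import numpy as np

data_mk2 = np.load('subj1_series1_data_mk2.npy')  # hypothetical file name
print(data_mk2.shape)  # (n_samples, n_sensors)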
def preprocess_all_mk1(norm_wind=None, div_factor=300, mode='train', disp=True):
    """Preprocesses all the data.
    Simply scales the data with div_factor, then applies running zeromean.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK1 preprocessing')
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        # Scale the data
        data /= float(div_factor)
        # Execute the running mean
        if norm_wind is not None:
            wind = norm_wind
        else:
            wind = data.shape[0]
        final_data = utils.running_zeromean(data, wind, axis=0)
        pif("\b" + "%.3f" % (time() - t0) + " s\n")
        str_wind = 'FULL' if wind == data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk1_norm' + str_wind
        np.save(final_fname, final_data)
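# utils.running_zeromean is not shown in this excerpt; a minimal sketch of what
# it is assumed to do (subtract a moving average of width `wind` along the given
# axis), built on SciPy's uniform filter:
import numpy as np
from scipy.ndimage import uniform_filter1d

def running_zeromean(data, wind, axis=0):
    # Assumed behaviour: remove the local baseline by subtracting a centred
    # moving average of width `wind` along `axis`.
    data = np.asarray(data, dtype=float)
    return data - uniform_filter1d(data, size=int(wind), axis=axis)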
def train_mean_std(disp=True):
    """Outputs the mean and standard deviation of each sensor, for the training data ONLY"""
    csvlist = io.get_file_list(mode='train', fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    # Sum all the means and variances together
    mean = np.zeros(32, dtype=DTYPE)
    var = np.zeros(32, dtype=DTYPE)
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        mean += np.mean(data, axis=0, dtype=DTYPE)
        var += np.var(data, axis=0, dtype=DTYPE)
        pif("\b" + "%.3f" % (time() - t0) + " s\n")
    # Divide by the number of datasets. Each file is weighted equally regardless
    # of its length, so this is an approximation of the pooled statistics.
    dataset_count = len(csvlist)
    mean /= dataset_count
    var /= dataset_count
    # Sqrt the variance
    std = np.sqrt(var)
    # Print the representation (presumably the source of utils.SENSOR_MEAN / SENSOR_STD)
    print(repr(mean))
    print(repr(std))
def preprocess_all_mk0(norm_wind=None, nperseg=256, mode='train', max_freq_count=10, disp=True):
    """Preprocesses all the data.
    Appends the 10 highest frequency components of each previous nperseg window.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK0 preprocessing')
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        # Get the spectrogram
        f, t, sxx = utils.spectrogram(data, window='boxcar', nperseg=nperseg)
        # spectro_fname = fullpath[:-4] + '_spectro'
        # np.save(spectro_fname, sxx)
        # N principal frequencies (a normalized index)
        max_freqs = principal_frequencies(sxx, max_freq_count)
        # Blow up the max frequencies to match the data array
        repeated_max_freqs = np.zeros((data.shape[0], max_freq_count), dtype=max_freqs.dtype)
        tmp = np.zeros((1, max_freqs.shape[1]))
        max_freqs = np.insert(max_freqs, 0, tmp, axis=0)
        for k in range(0, max_freqs.shape[0] - 1):
            repeated_max_freqs[k * nperseg:(k + 1) * nperseg, :] = np.tile(max_freqs[k, :], (nperseg, 1))
        final_index = k
        # Execute the running mean
        if norm_wind is not None:
            wind = norm_wind
        else:
            wind = data.shape[0]
        norm_data = utils.running_zeromean(data, wind, axis=0)
        pif("\b" + "%.3f" % (time() - t0) + " s\n")
        # Concatenate
        # del data
        final_data = np.append(norm_data, repeated_max_freqs, axis=1)
        # del norm_data
        str_wind = 'FULL' if wind == data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk0' + '_W' + str(nperseg) + '_norm' + str_wind
        np.save(final_fname, final_data)
def load_csv_data(subid, serid, mode='train', dir_name=None):
    """Loads the data from the appropriate folder, unless dir_name is specified"""
    if dir_name is None:
        file_name = io.get_datadir(mode=mode)
    else:
        file_name = dir_name
    file_name += '/subj' + str(subid) + '_series' + str(serid)
    data = pd.read_csv(file_name + '_data.csv')
    events = pd.read_csv(file_name + '_events.csv')
    return data.values[:, 1:], events.values[:, 1:]
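# A short usage sketch (the subject/series ids below are illustrative):
data, events = load_csv_data(subid=1, serid=1, mode='train')
print(data.shape, events.shape)  # (n_samples, n_sensors), (n_samples, n_events)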
def preprocess_all_mk3(mode='train', wind=3, butter_order=4, disp=True):
    """Preprocesses all the data.
    Mean cancellation by subtracting SENSOR_MEAN and scaling with SENSOR_STD.
    An MA filter is used to reduce the impact of high-frequency noise.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None
    pif('MK3 preprocessing for ' + mode + ' data\n')
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:, 1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        # Remove the mean of each sensor
        data -= utils.SENSOR_MEAN
        # Scale the data with the standard deviation from the training data
        data /= utils.SENSOR_STD
        # Moving average, to remove outliers
        data = utils.mov_avg(data, wind, axis=0)
        # TODO
        # Filter the data into the frequency bands of interest
        brain_list = []
        freq_mask = 0
        for flo, fhi in utils.FREQUENCY_BANDS.itervalues():
            brain_list.append(utils.butter_apply(data, low=flo, high=fhi))
            freq_mask = int(round(flo + fhi * 10))
        del data  # Free some memory!
        final_data = np.concatenate(brain_list, axis=1)
        # Save preprocessed data and print stuff to console
        str_wind = 'FULL' if wind == final_data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk3_wind' + str_wind + '_fmask' + str(freq_mask)
        np.save(final_fname, final_data)
        del brain_list, final_data  # Free some memory for the next datafile
        pif("%.3f" % (time() - t0) + " s\n")
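# utils.butter_apply is referenced but not shown; a minimal sketch of a
# zero-phase Butterworth band-pass matching how it is called above.
# The 500 Hz sampling rate and the default order are assumptions.
from scipy.signal import butter, filtfilt

def butter_apply(data, low, high, fs=500.0, order=4):
    # Band-pass each sensor column (axis 0 is time) between `low` and `high` Hz,
    # applied forward and backward so no phase shift is introduced.
    nyq = fs / 2.0
    b, a = butter(order, [low / nyq, high / nyq], btype='band')
    return filtfilt(b, a, data, axis=0)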
def __init__(self, subjects=SUBJECTS, series=SERIES):
    training_ds = IOutils.data_streamer(patients_list=subjects, series_list=series)
    all_data = list(training_ds)
    X, Y = zip(*all_data)
    # Note: only the first streamed (data, events) pair is kept
    self.data = X[0]
    self.events = Y[0]
    self.mean = self.data.mean(axis=0)
    self.std = self.data.std(axis=0)
    self.normalize()
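# normalize() is not shown in this excerpt; a sketch of what it is assumed to do
# with the statistics computed in __init__ (standardise each channel):
import numpy as np

def normalize(self):
    # Assumed behaviour: per-channel z-scoring, guarding against zero variance.
    std = np.where(self.std == 0, 1.0, self.std)
    self.data = (self.data - self.mean) / std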
import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import sys

import IOutils
from sklearn.linear_model import LogisticRegression

if not len(sys.argv) > 1:
    print """arguments:
    SUBJECT_ID: Id of the subject you want to train
    """
    raise Exception('NEED MORE ARGUMENTS')

# NUM_ZERO_METRIC = sys.argv[3]

# obtain series 1-6 for the chosen subject
subject_id = int(sys.argv[1])
training_ds = IOutils.data_streamer2(keeplist=[(subject_id, i) for i in range(1, 7)])
# nn = BasicNN(input_shape=(None,42), output_num_units=12, max_epochs=int(sys.argv[5]), hidden_num_units=int(sys.argv[4]))
vt = IOutils.VectorTransformer()
linear = LogisticRegression(class_weight='auto')
# n_repeat_sampling = int(sys.argv[2])

dataset_count = 0
for X, Y in training_ds:
    dataset_count += 1
    # transform the Ys
    Y = vt.transform(Y)
    # print('total size before sampling:', len(Y))
    X = X.astype(np.float)
    # normalization for regression
    X[np.isnan(X)] = 0
def step2():
    '''
    Fetch the data
    :return:
    '''
    # Read the list saved in the previous step
    fd = open('chuzu_list.txt', 'r')
    all_text = fd.read()
    fd.close()
    # Clean up the list text
    L = all_text.replace('[', '')
    L = L.replace(']', '')
    L = L.replace('\'', '')
    chuzu_list = L.split(",")
    # Rental page list (deduplicated)
    chuzu_list = list(set(chuzu_list))
    # List of already processed URLs
    done_list = list_check.done_lis()
    print(done_list)
    # List of failed URLs
    error_list = []
    # Detail page handling
    item_pages = []
    random.shuffle(chuzu_list)
    for i in chuzu_list:
        # Show the current page address
        print i
        it_urls = get_itempage_url.get_url(i)
        time.sleep(numpy.random.randint(3, 6))
        # Fetch and save the details
        for url in it_urls:
            if url not in done_list:
                if 'e.58.com' in url:
                    print('Invalid address... next one')
                elif 'jxjump' in url:
                    print('Invalid address... next one')
                else:
                    if 'short.58.com' in url:
                        url = url.replace('&end=end', '')
                    print('############################ current address is not in the done list ############################')
                    try:
                        # Get the page data
                        city, district, title, rental_type, phone_num, contacts, url_now, rent, lease, area, heading, community, address, detail, facility, advantage, pic = haoitem.get_items(url)
                        # Get the normalised city name
                        c_name = haoitem.get_cname()
                        # Region and province
                        region, province = get_city_info.get_areas(c_name)
                        # Content to save as JSON
                        detel = {
                            "region": region,
                            "province": province,
                            "city": city,
                            "district": district,
                            "title": title,
                            "rental_type": rental_type,
                            "url_now": url_now,
                            "rent": rent,
                            "lease": lease,
                            "area": area.replace(' ', ''),
                            "heading": heading,
                            "community": community,
                            'address': address,
                            "contacts": contacts,
                            "phone": phone_num,
                            "detail": detail,
                            "facility": facility,
                            "advantage": advantage,
                            "pics": pic
                        }
                        jStr = json.dumps(detel, ensure_ascii=False, indent=1)
                        IOutils.rtfile_time_with_path(jStr, 'json')
                        write_db.data_in(detel)
                        time.sleep(numpy.random.randint(3, 6))
                    except Exception:
                        print('######################## some pages seem broken or anti-scraping was triggered, taking a short break ########################')
                        print('#' * 20 + url + '\t' + '#' * 20)
                        error_list.append(url)
                        time.sleep(15)
            else:
                print('############################ page already processed, next one ############################')
                time.sleep(numpy.random.randint(3, 5))
        # finally:
        #     return item_pages
        if it_urls is not None:
            # list.append returns None, so keep the list itself instead of its return value
            item_pages.append(it_urls)
            print item_pages
    # Save the detail-page list
    file_zf = open('zf_item_list.txt', 'w')
    file_zf.write(repr(item_pages))
    file_zf.close()
    file_zf = open('error_list.txt', 'w')
    file_zf.write(repr(error_list))
    file_zf.close()
    return error_list
#parser.add_argument("--combineStages" , help= "e.g. <superMat_name1>:<Name1>,<Name2>;<sumperMat_name2>:<Name3>") parser.add_argument( "--fo", help= "name output figure. Extension indiciates figure type. Data files saved with same name but different extension" ) #parser.add_argument("--stagesPlot") parser.add_argument("--vmin", type=float) parser.add_argument("--vmax", type=float) parser.add_argument("--linkage_regTargets", default="complete") parser.add_argument("--linkage_TFs", default="complete") args = parser.parse_args() ########################################################################################################################### ## LOAD DATA corrDF = IOutils.loadDF(args.geneCorr) genes = list( set().union(*[IOutils.readListFromFile(x) for x in args.genes.split(",")])) genes_missing = [x for x in genes if not (x in corrDF.index)] if len(genes_missing) > 0: print("WARNING the following genes are not in correlation data {}".format( genes_missing)) genes = [x for x in genes if x in corrDF.index] TFs = list( set().union(*[IOutils.readListFromFile(x) for x in args.TFs.split(",")])) TFs_missing = [x for x in TFs if not (x in corrDF.index)] if len(TFs_missing) > 0: print("WARNING the following TFs are not in correlation data {}".format( TFs_missing)) TFs = [x for x in TFs if x in corrDF.index]
import IOutils
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn import svm
import random
import sys
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

ds = IOutils.data_streamer2()
vt = IOutils.VectorTransformer()
X_valid, Y_valid = ds.next()
# use as follows:
Y_valid = vt.transform(Y_valid)
Y_valid[Y_valid != 0] = 1
# Y_valid[Y_valid == 0] = 0

# NaivB = GaussianNB()
# linear = LogisticRegression(class_weight='auto')  # accounts for class imbalance
# support = svm.SVC(kernel='rbf', C=10)
import sys
sys.path.append("src")

import Argument
import IOutils
import ImageProcessing

userCommand = Argument.CommandParser()
userCommand.parseArguments()
inputVideo = userCommand.getInputVideoInfo()
outputVideo = userCommand.getOutputVideoInfo()
outputText = userCommand.getOutputTextInfo()

port = IOutils.IOport(inputVideo, outputVideo, outputText)
port.createFileInstancesUponRequirement()

preprocessor = ImageProcessing.VideoPreprocessor(inputVideo)
preprocessor.findSideToCrop()
preprocessor.findCropPoints()
scaleRatio = preprocessor.getDisplayTargetRatio()

algo = ImageProcessing.Algorithm()
arrowMaker = ImageProcessing.VideoArtist(scaleRatio)
arrowMaker.findBestFrameMapping((8, 8))
monitor = IOutils.Display()

frame = port.getInputVideoFrame()
croppedFrame = preprocessor.cropFrameIntoSquare(frame)
previousFrame = preprocessor.convertFrameIntoSpecifiedFormat(croppedFrame)
# In[1]:

import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import IOutils
import random
from neuralnetworks.templates import BasicNN2
from sklearn.metrics import confusion_matrix, classification_report
from itertools import product


# In[2]:

# obtain data (series 0) for the first 11 patients
training_ds = IOutils.data_streamer2(keeplist=[(i, 0) for i in range(1, 12)])
nn = BasicNN2(input_shape=(None, 42), output_num_units=11, max_epochs=100, hidden=[200, 120, 30])
vt = IOutils.VectorTransformer()


# In[3]:

n_repeat_sampling = 1
dataset_count = 0

for X, Y in training_ds:
    dataset_count += 1
    # transform the Ys
    Y = vt.transform(Y)
    # print('total size before sampling:', len(Y))
# -*- coding:utf-8 -*-
'''
Batch-fetch all page addresses.
Needs to be paired with anti-scraping countermeasures.
'''
import IOutils
import get_pages
import numpy
import time
import re

index_list = IOutils.readfile()

# Clean up the list text
L = index_list.replace('[', '')
L = L.replace(']', '')
L = L.replace('\'', '')
list_u = L.split(",")
list_u = list(set(list_u))
print list_u

for url in list_u:
    '''
    Process the URL list again
    '''
    key = str(url)
    regx = r'http\:\/\/[a-z]+\.58\.com\/(chuzu|ershoufang|pinpaigongyu)\/'
    pattern1 = re.compile(regx)
def save_dicts():
    dic = map_dict()
    dicts = repr(dic)
    IOutils.rtfile(dicts)
]


# In[4]:

nn = NeuralNet(layers_list,
               max_epochs=30,
               update=nesterov_momentum,
               update_learning_rate=0.02,
               verbose=1000,
               **LF.kwargs)


# In[5]:

training_ds = IOutils.data_streamer(patients_list=[2], series_list=range(1, 7))
# nn = BasicCNN(input_shape=(None,42), output_num_units=12, max_epochs=50, hidden=[256, 120], add_drops=[1,1])
vt = IOutils.VectorTransformer()


# In[ ]:

n_repeat_sampling = 1
dataset_count = 0

for X, Y in training_ds:
    X = X.astype(np.float)
    X[np.isnan(X)] = 0
    X = X / X.max()
    wg = window_generator_ND(X, window_size=WINDOW_SIZE)
    dataset_count += 1
    # transform the Ys
# coding: utf-8

# In[1]:

import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import IOutils
import random
from neuralnetworks.templates import BasicNN2
from sklearn.metrics import confusion_matrix, classification_report


# In[ ]:

# obtain series 1-6 for the 1st subject
training_ds = IOutils.data_streamer2(keeplist=[(1, i) for i in range(1, 7)])
nn = BasicNN2(input_shape=(None, 42), output_num_units=11, max_epochs=100, hidden=[200, 120, 30])
vt = IOutils.VectorTransformer()


# In[ ]:

n_repeat_sampling = 1
dataset_count = 0

for X, Y in training_ds:
    dataset_count += 1
    # transform the Ys
    Y = vt.transform(Y)
    # print('total size before sampling:', len(Y))
    X = X.astype(np.float)
    # normalization for regression
# IOutils.LABEL_NAMES has all the classes we wish to predict

# initialize logistic regressors
LRs = {}
LRsprocessed = {}
for label_name in IOutils.LABEL_NAMES:
    # each label will have its own logistic regressor
    LRs[label_name] = LogisticRegression()
    LRsprocessed[label_name] = LogisticRegression()

print('Initialized logistic regressors')

# Load training data: 8 series from 1 patient
train_data = IOutils.data_streamer(mode='train', num_patients=1, num_series=8)
filters = ['alpha', 'beta']

# obtain a validation set
X_valid, Y_valid = train_data.next()
# X_valid = X_valid[]
selected_channels = range(X_valid.shape[1])
# selected_channels = [3,4]
X_valid = X_valid[:, selected_channels]
# print Y_valid
# X_valid = np.array(X_valid)
Y_valid = np.array(Y_valid)
def url_list():
    list_u = IOutils.readfile()
    return list_u
import numpy as np
import os
import pandas as pd
import IOutils as io  # assumed alias; the streamer below is accessed as io.data_streamer2

# Grab train data
data = []
label = []
n_sub = 1
n_series = 8

train_streamer = io.data_streamer2(mode='train')

for k in range(n_sub):
    sub_data = []
    sub_label = []
    for series in range(n_series):
        d, e = train_streamer.next()
        sub_data.append(d)
        sub_label.append(e)
    data.append(sub_data)
    label.append(sub_label)

np.save('eeg_train.npy', [data, label])
del data, label
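# A small sketch of reading the array back. allow_pickle=True is needed on
# newer NumPy versions because the saved object is a nested Python list of
# variable-length arrays, which is stored as an object array.
import numpy as np

data, label = np.load('eeg_train.npy', allow_pickle=True)
print(len(data), len(data[0]))  # n_sub, n_series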
this code runs LR and NN at the same time
"""
import numpy as np
import IOutils
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import random
import sys
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from neuralnetworks.templates import BasicNN2

nn = BasicNN2(max_epochs=100, hidden=[200, 50, 30], input_shape=(None, 42), output_num_units=2)
ds = IOutils.data_streamer2(keeplist=[(i, 0) for i in xrange(1, 12)])
vt = IOutils.VectorTransformer()

# NaivB = GaussianNB()
lr = LogisticRegression(class_weight="auto")
# svc = svm.SVC(kernel='rbf', C=10, class_weight="auto")

for X_next, Y_next in ds:
    X_next = X_next.astype(np.float) / X_next.max()
    X_next[np.isnan(X_next)] = 0
    zipped = zip(X_next, Y_next)
    random.shuffle(zipped)
    X, Y = zip(*zipped)
#exampleArgs="--fi_expr ../exprData/impute-t10_libNorm10K_cells.foreskin_genesGeq1Pct.all.kcyte.pkl \ #--fi_stageIDs ../clusterCells/kasp.ka10.k30_impute.t10_gene.geq5UMIgeq100Cell.all.csv \ #--cells ./cellNames.passFilter.tmp \ #--superstages progenitor:stage1,stage2,stage3;\ #progenitor.stage4:stage1,stage2,stage3,stage4;\ #differentiated:stage5,stage6,stage7;\ #differentiated.stage4:stage4,stage5,stage6,stage7;\ #all.noStage8:stage1,stage2,stage3,stage4,stage5,stage6,stage7 \ #--corrMethod logTpm \ #--fo corr_tmpA" #args = parser.parse_args(exampleArgs.split()) if args.superstages is not None: superstage_dict = IOutils.parseStrToDict(args.superstages, valueType="str", pairSep=";") try: superstage_dict = OrderedDict([(k, [int(x) for x in v.split(",")]) for k, v in superstage_dict.items()]) except: superstage_dict = OrderedDict([ (k, [int(re.search(r'(\d+)$', x).group(1)) for x in v.split(",")]) for k, v in superstage_dict.items() ]) ############################################################################################################ ## Load data cellData = IOutils.loadCellData( OrderedDict([("expr", args.fi_expr), ("pcComps", args.fi_stageIDs)])) cellData = pd.concat([