Example #1
0
def snd_url():
    '''
    Build the second-level rental page URLs

    :return:
    '''
    # details = detail()
    index_list = regx_select(detail(url))
    finall_list = []

    inputname = raw_input('enter a name:')

    try:
        for links in index_list:
            for sec in link2nd_tool.linkstr(detail(links)):
                print(links + sec + '>>>>>>  is  found')
                finall_list.append(links + sec)
                print(finall_list)
                print("\r\n" * 10 + "-" * 30)
                IOutils.rtfile_input(str(finall_list), (inputname + '.txt'))
                time.sleep(numpy.random.randint(1, 4))
    except:
        print("Some regional sub-sites have no content; skipped...")

    finally:
        return finall_list
Example #2
0
def file_rt(page_list):
    '''
    Save the results
    :param page_list:
    :return:
    '''
    IOutils.rtfile_time(str(page_list), "txt")
    print("List saved")
Example #3
0
def preprocess_all_mk2(mode='train',
                       disp=True):
    """Preprocesses all the data.
    Mean cancellation by subtracting SENSOR_MEAN and scaling with SENSOR_STD"""
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None

    pif('MK2 preprocessing for ' + mode + ' data\n')
    
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:,1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')

        # Removes the mean of each sensor
        data -= utils.SENSOR_MEAN
        
        # Scale the data with the standard deviation from the training data
        data /= utils.SENSOR_STD

        final_fname = fullpath[:-4] + '_mk2'
        np.save(final_fname, data)

        pif("%.3f"%(time()-t0) + " s\n")
Example #4
0
def preprocess_all_mk1(norm_wind=None,
                       div_factor=300,
                       mode='train',
                       disp=True):
    """Preprocesses all the data.
    Simply scales the data with div_factor, then applies running zeromean"""
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None

    pif('MK1 preprocessing')

    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:,1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')

        # Scale the data
        data /= float(div_factor)

        # Execute the running mean
        if norm_wind is not None:
            wind = norm_wind
        else:
            wind = data.shape[0]
        final_data = utils.running_zeromean(data, wind, axis=0)
        pif("\b" + "%.3f"%(time()-t0) + " s\n")

        str_wind = 'FULL' if wind==data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk1_norm' + str_wind
        np.save(final_fname, final_data)
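utils.running_zeromean is not shown in these examples; a minimal sketch of what such a step could look like, under the assumption that it subtracts a boxcar moving average (the real helper's edge handling may differ):

import numpy as np

def running_zeromean_sketch(data, wind, axis=0):
    # Subtract a moving average of width `wind` along `axis`
    # (hypothetical stand-in for utils.running_zeromean).
    kernel = np.ones(wind) / float(wind)
    local_mean = np.apply_along_axis(
        lambda col: np.convolve(col, kernel, mode='same'), axis, data)
    return data - local_mean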
Example #5
0
def train_mean_std(disp=True):
    """Outputs the mean and standard deviation of each sensor, for the training data ONLY"""
    csvlist = io.get_file_list(mode='train', fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None

    # Accumulate the per-file means and variances
    mean = np.zeros(32, dtype=DTYPE)
    var = np.zeros(32, dtype=DTYPE)
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:,1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')

        mean += np.mean(data, axis=0, dtype=DTYPE)
        var += np.var(data, axis=0, dtype=DTYPE)
        pif("\b" + "%.3f"%(time()-t0) + " s\n")

    # Divide by # of datasets
    dataset_count = len(csvlist)
    mean /= dataset_count
    var /= dataset_count

    # Sqrt the variance
    std = np.sqrt(var)

    # print representation 
    print(repr(mean))
    print(repr(std))
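Note that averaging the per-file means and variances, as above, is exact only when every CSV holds the same number of samples. A sample-count-weighted variant, sketched here with hypothetical names:

import numpy as np
import pandas as pd

def pooled_mean_std(csvlist, dtype=np.float64):
    # Accumulate sums and squared sums so files of different lengths are weighted correctly.
    n = 0
    sum_x = 0.0
    sum_x2 = 0.0
    for fullpath in csvlist:
        data = pd.read_csv(fullpath).values[:, 1:].astype(dtype)
        n += data.shape[0]
        sum_x += data.sum(axis=0)
        sum_x2 += (data ** 2).sum(axis=0)
    mean = sum_x / n
    std = np.sqrt(sum_x2 / n - mean ** 2)
    return mean, std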
Example #6
0
def preprocess_all_mk0(norm_wind=None,
                       nperseg=256,
                       mode='train',
                       max_freq_count=10,
                       disp=True):
    """Preprocesses all the data.
    Appends the max_freq_count strongest frequency components of each preceding nperseg window"""
    csvlist = io.get_file_list(mode=mode, fullpath=True)

    pif = lambda msg: printflush(msg) if disp else None

    pif('MK0 preprocessing')

    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:,1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')
        
        # Get the spectrogram
        f,t,sxx = utils.spectrogram(data, window='boxcar', nperseg=nperseg)
        #spectro_fname = fullpath[:-4] + '_spectro'
        #np.save(spectro_fname, sxx)

        # N Principal frequencies (a normalized index)
        max_freqs = principal_frequencies(sxx, max_freq_count)

        # Blow up the max frequencies to match the data array
        repeated_max_freqs = np.zeros((data.shape[0], max_freq_count), dtype=max_freqs.dtype)
        tmp = np.zeros((1, max_freqs.shape[1]))
        max_freqs = np.insert(max_freqs, 0, tmp, axis=0)
        for k in range(0,max_freqs.shape[0]-1):
            repeated_max_freqs[k*nperseg:(k+1)*nperseg,:] = np.tile(max_freqs[k,:], (nperseg,1))
        final_index = k

        # Execute the running mean
        if norm_wind is not None:
            wind = norm_wind
        else:
            wind = data.shape[0]
        norm_data = utils.running_zeromean(data, wind, axis=0)
        pif("\b" + "%.3f"%(time()-t0) + " s\n")

        # Concatenate
        #del data
        final_data = np.append(norm_data, repeated_max_freqs, axis=1)
        #del norm_data

        str_wind = 'FULL' if wind==data.shape[0] else wind
        final_fname = fullpath[:-4] + '_mk0' + '_W' + str(nperseg) + '_norm' + str(str_wind)
        np.save(final_fname, final_data)
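principal_frequencies is referenced but not defined in this listing. A minimal sketch of one plausible implementation, assuming sxx is a (segments, frequency_bins) power array and the "normalized index" is the bin index divided by the bin count:

import numpy as np

def principal_frequencies_sketch(sxx, max_freq_count):
    # Indices of the strongest frequency bins in each segment, largest first.
    top_bins = np.argsort(sxx, axis=1)[:, ::-1][:, :max_freq_count]
    # Normalize the bin index to [0, 1).
    return top_bins.astype(np.float64) / sxx.shape[1]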
Example #7
0
def load_csv_data(subid, serid, mode='train', dir_name=None):
    """Loads the data from the appropriate folder, unless dir_name is specified"""
    if dir_name is None:
        file_name = io.get_datadir(mode=mode)
    else:
        file_name = dir_name

    file_name += '/subj'+str(subid)+'_series'+str(serid)
    
    data = pd.read_csv(file_name+'_data.csv')
    events = pd.read_csv(file_name+'_events.csv')
    return data.values[:,1:], events.values[:,1:]
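A hypothetical call, assuming the subjN_seriesM_data.csv / subjN_seriesM_events.csv layout implied by the path construction above:

data, events = load_csv_data(subid=1, serid=3, mode='train')
print(data.shape, events.shape)  # (n_samples, n_channels) and (n_samples, n_event_columns)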
Example #8
0
def preprocess_all_mk3(mode='train',
                       wind=3,
                       butter_order=4,
                       disp=True):
    """Preprocesses all the data.
    Mean cancellation by subtracting SENSOR_MEAN and scaling with SENSOR_STD;
    a moving-average filter is then used to reduce the impact of high-frequency noise.
    """
    csvlist = io.get_file_list(mode=mode, fullpath=True)
    pif = lambda msg: printflush(msg) if disp else None

    pif('MK3 preprocessing for ' + mode + ' data\n')
    
    for fullpath in csvlist:
        t0 = time()
        fpath, fname = os.path.split(fullpath)
        data = pd.read_csv(fullpath).values[:,1:]
        pif('Processing ' + fname + ' -- ' + str(data.shape[0]) + ' samples...')

        # Removes the mean of each sensor
        data -= utils.SENSOR_MEAN
        
        # Scale the data with the standard deviation from the training data
        data /= utils.SENSOR_STD

        # Moving average, to remove outliers
        data = utils.mov_avg(data, wind, axis=0)

        # TODO
        # Filter the data 
        brain_list = []
        freq_mask = 0
        for flo, fhi in utils.FREQUENCY_BANDS.itervalues():
            brain_list.append(utils.butter_apply(data, low=flo, high=fhi))
            freq_mask = int(round(flo+fhi*10))


        del data #Free some memory!
        final_data = np.concatenate(brain_list, axis=1)



        # Save preprocessed data and print stuff to console
        str_wind = 'FULL' if wind==final_data.shape[0] else str(wind)
        final_fname = fullpath[:-4] + '_mk3_wind' + str_wind + '_fmask' + str(freq_mask)
        np.save(final_fname, final_data)
        del brain_list, final_data # Free some memory for the next datafile

        pif("%.3f"%(time()-t0) + " s\n")
Example #9
0
	def __init__(self, subjects=SUBJECTS, series=SERIES):
		training_ds = IOutils.data_streamer(patients_list=subjects, series_list=series)

		all_data = list(training_ds)

		X,Y = zip(*all_data)


		self.data = X[0]
		self.events = Y[0]

		self.mean = self.data.mean(axis=0)
		self.std = self.data.std(axis=0)

		self.normalize()
Example #10
import matplotlib.pylab as plt
from collections import Counter
import sys


if not len(sys.argv)>1:
    print """arguments:
        SUBJECT_ID: Id of the subject you want to train
    """
    raise Exception('NEED MORE ARGUMENTS')
    
    
# NUM_ZERO_METRIC = sys.argv[3]
# obtain series 1-6 for the selected subject
subject_id = int(sys.argv[1])
training_ds = IOutils.data_streamer2(keeplist=[ (subject_id, i) for i in range(1,7) ]) 
#nn = BasicNN(input_shape=(None,42), output_num_units=12, max_epochs=int(sys.argv[5]), hidden_num_units=int(sys.argv[4]))
vt = IOutils.VectorTransformer()

linear = LogisticRegression(class_weight = 'auto')
           
# n_repeat_sampling = int(sys.argv[2])
dataset_count = 0
for X,Y in training_ds:
    dataset_count += 1
    # transform the Ys
    Y = vt.transform(Y)
#     print('total size before sampling:', len(Y))
    X = X.astype(np.float)
    # normalization for regression
    X[np.isnan(X)] = 0
Example #11
0
def step2():
    '''
    Fetch the data
    :return:
    '''

    # Read the list saved in the previous stage
    fd = open('chuzu_list.txt', 'r')
    all_text = fd.read()
    fd.close()
    # Clean up the list text
    L = all_text.replace('[', '')
    L = L.replace(']', '')
    L = L.replace('\'', '')
    chuzu_list = L.split(",")
    # List of rental listing pages
    chuzu_list = list(set(chuzu_list))
    # Pages that already have data
    done_list = list_check.done_lis()
    print(done_list)
    # Failed URLs
    error_list = []

    # Process the detail pages
    item_pages = []
    random.shuffle(chuzu_list)
    for i in chuzu_list:
        # Show the current page URL
        print(i)
        it_urls = get_itempage_url.get_url(i)
        time.sleep(numpy.random.randint(3, 6))
        # Fetch and save the details
        for url in it_urls:
            if url not in done_list:
                if 'e.58.com' in url:
                    print('Invalid URL, moving on...')

                elif 'jxjump' in url:
                    print('Invalid URL, moving on...')
                else:
                    if 'short.58.com' in url:
                        url = url.replace('&end=end', '')
                    print(
                        '############################ URL is not in the done list ############################'
                    )
                    try:
                        # Get the page data
                        city, district, title, rental_type, phone_num, contacts, url_now, rent, lease, area, heading, community, address, detail, facility, advantage, pic = haoitem.get_items(
                            url)
                        # Get the normalized city name
                        c_name = haoitem.get_cname()
                        # Region and province
                        region, province = get_city_info.get_areas(c_name)
                        # Content to save as JSON
                        detel = {
                            "region": region,
                            "province": province,
                            "city": city,
                            "district": district,
                            "title": title,
                            "rental_type": rental_type,
                            "url_now": url_now,
                            "rent": rent,
                            "lease": lease,
                            "area": area.replace(' ', ''),
                            "heading": heading,
                            "community": community,
                            'address': address,
                            "contacts": contacts,
                            "phone": phone_num,
                            "detail": detail,
                            "facility": facility,
                            "advantage": advantage,
                            "pics": pic
                        }

                        jStr = json.dumps(detel, ensure_ascii=False, indent=1)
                        IOutils.rtfile_time_with_path(jStr, 'json')
                        write_db.data_in(detel)
                        time.sleep(numpy.random.randint(3, 6))

                    except:
                        print(
                            '######################## Some pages are broken or triggered anti-scraping; pausing for a bit ########################'
                        )
                        print('#' * 20 + url + '\t' + '#' * 20)
                        error_list.append(url)
                        time.sleep(15)
            else:
                print(
                    '############################ Page already done, next one ############################'
                )
                time.sleep(numpy.random.randint(3, 5))
            # finally:
            #     return item_pages

        # Collect the detail URLs fetched for this listing page
        # (list.append returns None, so the old it_pages was always None)
        if it_urls is not None:
            item_pages.append(it_urls)

    print(item_pages)

    # Save the detail-page list
    file_zf = open('zf_item_list.txt', 'w')
    file_zf.write(repr(item_pages))
    file_zf.close()

    file_zf = open('error_list.txt', 'w')
    file_zf.write(repr(error_list))
    file_zf.close()

    return error_list
Example #12
0
#parser.add_argument("--combineStages" , help= "e.g. <superMat_name1>:<Name1>,<Name2>;<sumperMat_name2>:<Name3>")
parser.add_argument(
    "--fo",
    help="name of the output figure. The extension indicates the figure type. Data files are saved with the same name but a different extension"
)
#parser.add_argument("--stagesPlot")
parser.add_argument("--vmin", type=float)
parser.add_argument("--vmax", type=float)
parser.add_argument("--linkage_regTargets", default="complete")
parser.add_argument("--linkage_TFs", default="complete")
args = parser.parse_args()

###########################################################################################################################
## LOAD DATA
corrDF = IOutils.loadDF(args.geneCorr)
genes = list(
    set().union(*[IOutils.readListFromFile(x) for x in args.genes.split(",")]))
genes_missing = [x for x in genes if not (x in corrDF.index)]
if len(genes_missing) > 0:
    print("WARNING the following genes are not in correlation data {}".format(
        genes_missing))
    genes = [x for x in genes if x in corrDF.index]
TFs = list(
    set().union(*[IOutils.readListFromFile(x) for x in args.TFs.split(",")]))
TFs_missing = [x for x in TFs if not (x in corrDF.index)]
if len(TFs_missing) > 0:
    print("WARNING the following TFs are not in correlation data {}".format(
        TFs_missing))
    TFs = [x for x in TFs if x in corrDF.index]
Example #13
0
import IOutils
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from sklearn import svm
import random
import sys
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score




ds = IOutils.data_streamer2() 

vt = IOutils.VectorTransformer()

X_valid, Y_valid = ds.next()

# use as follows: 
Y_valid = vt.transform(Y_valid)


Y_valid[Y_valid != 0] = 1
# Y_valid [Y_valid == 0] = 0 

# NaivB = GaussianNB()
# linear = LogisticRegression(class_weight = 'auto')  #accounts for class imbalance
# support = svm.SVC(kernel='rbf',C=10)
Example #14
0
import sys
sys.path.append("src")
import Argument
import IOutils
import ImageProcessing

userCommand = Argument.CommandParser()
userCommand.parseArguments()
inputVideo = userCommand.getInputVideoInfo()
outputVideo = userCommand.getOutputVideoInfo()
outputText = userCommand.getOutputTextInfo()

port = IOutils.IOport(inputVideo, outputVideo, outputText)
port.createFileInstancesUponRequirement()

preprocessor = ImageProcessing.VideoPreprocessor(inputVideo)
preprocessor.findSideToCrop()
preprocessor.findCropPoints()
scaleRatio = preprocessor.getDisplayTargetRatio()

algo = ImageProcessing.Algorithm()

arrowMaker = ImageProcessing.VideoArtist(scaleRatio)
arrowMaker.findBestFrameMapping((8, 8))

monitor = IOutils.Display()

frame = port.getInputVideoFrame()
croppedFrame = preprocessor.cropFrameIntoSquare(frame)
previousFrame = preprocessor.convertFrameIntoSpecifiedFormat(croppedFrame)
Example #15
# In[1]:

import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import IOutils
import random
from neuralnetworks.templates import BasicNN2
from sklearn.metrics import confusion_matrix, classification_report
from itertools import product


# In[2]:

training_ds = IOutils.data_streamer2(keeplist=[ (i, 0) for i in range(1,12) ]) # obtain data for first 11 patients

nn = BasicNN2(input_shape=(None,42), output_num_units=11, max_epochs=100, hidden=[200,120,30])

vt = IOutils.VectorTransformer()


# In[3]:

n_repeat_sampling = 1
dataset_count = 0
for X,Y in training_ds:
    dataset_count += 1
    # transform the Ys
    Y = vt.transform(Y)
#     print('total size before sampling:', len(Y))
Example #16
0
# -*- coding:utf-8 -*-
'''
Fetch all page URLs in batch.
Needs to be paired with anti-scraping countermeasures.
'''

import IOutils
import get_pages
import numpy
import time
import re

index_list = IOutils.readfile()

# Clean up the list text
L = index_list.replace('[', '')
L = L.replace(']', '')
L = L.replace('\'', '')
list_u = L.split(",")

list_u = list(set(list_u))

print(list_u)

for url in list_u:
    '''
    Process the URL list again
    '''
    key = str(url)
    regx = r'http\:\/\/[a-z]+\.58\.com\/(chuzu|ershoufang|pinpaigongyu)\/'
    pattern1 = re.compile(regx)
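The loop body above is truncated in this listing; a minimal sketch of how the compiled pattern might be used to keep only matching listing URLs (the matched name is hypothetical):

matched = [u for u in list_u if pattern1.match(str(u))]
print(matched)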
Example #17
0
def save_dicts():
    dic = map_dict()
    dicts = repr(dic)
    IOutils.rtfile(dicts)
Example #18
0
]


# In[4]:

nn = NeuralNet(layers_list, 
               max_epochs=30, 
               update=nesterov_momentum, 
               update_learning_rate=0.02, 
               verbose=1000, 
               **LF.kwargs)


# In[5]:

training_ds = IOutils.data_streamer(patients_list=[2], series_list=range(1,7))
# nn = BasicCNN(input_shape=(None,42), output_num_units=12, max_epochs=50, hidden=[256, 120], add_drops=[1,1])
vt = IOutils.VectorTransformer()


# In[ ]:

n_repeat_sampling = 1
dataset_count = 0
for X,Y in training_ds:
    X = X.astype(np.float)
    X[np.isnan(X)] = 0
    X = X/X.max()
    wg = window_generator_ND(X, window_size=WINDOW_SIZE)
    dataset_count += 1
    # transform the Ys
Example #19
# coding: utf-8

# In[1]:

import numpy as np
import matplotlib.pylab as plt
from collections import Counter
import IOutils
import random
from neuralnetworks.templates import BasicNN2
from sklearn.metrics import confusion_matrix, classification_report


# In[ ]:

training_ds = IOutils.data_streamer2(keeplist=[ (1, i) for i in range(1,7) ]) # obtain series 1-6 for the 1st subject
nn = BasicNN2(input_shape=(None,42), output_num_units=11, max_epochs=100, hidden=[200,120,30])
vt = IOutils.VectorTransformer()


# In[ ]:

n_repeat_sampling = 1
dataset_count = 0
for X,Y in training_ds:
    dataset_count += 1
    # transform the Ys
    Y = vt.transform(Y)
#     print('total size before sampling:', len(Y))
    X = X.astype(np.float)
    # normalization for regression
Example #20
0
#IOutils.LABEL_NAMES has all the classes we wish to predict
# initialize logistic regressors
LRs = {}
LRsprocessed = {}
for label_name in IOutils.LABEL_NAMES:
    # each label will have its own logistic regressor
    LRs[label_name] = LogisticRegression()
    LRsprocessed[label_name] = LogisticRegression()

print('Initialized logistic regressors')


# Load training data
# load 8 series from a single patient
train_data = IOutils.data_streamer(mode='train', num_patients=1, num_series=8)

filters = ['alpha', 'beta']

# obtain a validation set
X_valid, Y_valid = train_data.next()

# X_valid = X_valid[]

selected_channels = range(X_valid.shape[1])
# selected_channels = [3,4]

X_valid = X_valid[:,selected_channels]
# print Y_valid
# X_valid = np.array(X_valid)
Y_valid = np.array(Y_valid)
Example #21
0
def url_list():
    list_u = IOutils.readfile()

    return list_u
Example #22
0
import numpy as np
import os
import pandas as pd

# Grab train data
data = []
label = []
n_sub = 1
n_series = 8

train_streamer = io.data_streamer2(mode='train')

for k in range(n_sub):
    sub_data = []
    sub_label = []
    for series in range(n_series):
        d, e = train_streamer.next()
        sub_data.append(d)
        sub_label.append(e)

    data.append(sub_data)
    label.append(sub_label)

np.save('eeg_train.npy', [data, label])

del data, label
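For completeness, a hedged sketch of reading the saved bundle back; allow_pickle is an assumption needed on newer NumPy versions because the stored object is a nested Python list rather than a plain numeric array:

data, label = np.load('eeg_train.npy', allow_pickle=True)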
Example #23
0
	this code runs LR and NN at the same time
"""

import numpy as np
import IOutils
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import random
import sys
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report

from neuralnetworks.templates import BasicNN2

nn = BasicNN2(max_epochs=100, hidden=[200,50,30], input_shape=(None, 42), output_num_units=2)
ds = IOutils.data_streamer2(keeplist=[ (i,0) for i in xrange(1,12) ]) 

vt = IOutils.VectorTransformer()

# NaivB = GaussianNB()
lr = LogisticRegression(class_weight="auto")

# svc = svm.SVC(kernel='rbf',C=10,class_weight="auto")
for X_next, Y_next in ds:
	X_next = X_next.astype(np.float)/X_next.max()
	X_next[np.isnan(X_next)]=0
	zipped = zip(X_next,Y_next)
	random.shuffle(zipped)
	X,Y = zip(*zipped)
Example #24
0
#exampleArgs="--fi_expr ../exprData/impute-t10_libNorm10K_cells.foreskin_genesGeq1Pct.all.kcyte.pkl \
#--fi_stageIDs ../clusterCells/kasp.ka10.k30_impute.t10_gene.geq5UMIgeq100Cell.all.csv \
#--cells ./cellNames.passFilter.tmp \
#--superstages progenitor:stage1,stage2,stage3;\
#progenitor.stage4:stage1,stage2,stage3,stage4;\
#differentiated:stage5,stage6,stage7;\
#differentiated.stage4:stage4,stage5,stage6,stage7;\
#all.noStage8:stage1,stage2,stage3,stage4,stage5,stage6,stage7 \
#--corrMethod  logTpm \
#--fo corr_tmpA"
#args = parser.parse_args(exampleArgs.split())

if args.superstages is not None:
    superstage_dict = IOutils.parseStrToDict(args.superstages,
                                             valueType="str",
                                             pairSep=";")
    try:
        superstage_dict = OrderedDict([(k, [int(x) for x in v.split(",")])
                                       for k, v in superstage_dict.items()])
    except:
        superstage_dict = OrderedDict([
            (k, [int(re.search(r'(\d+)$', x).group(1)) for x in v.split(",")])
            for k, v in superstage_dict.items()
        ])

############################################################################################################
## Load data
cellData = IOutils.loadCellData(
    OrderedDict([("expr", args.fi_expr), ("pcComps", args.fi_stageIDs)]))
cellData = pd.concat([