Example #1
def train_modelA(target_dirs, non_target_dirs, tgauss=4, ngauss=11):
    target_coef = []
    target_frequency = []
    print("Target data:")
    for dir_name in target_dirs:
        for f in glob(dir_name + '/*.wav'):
            print('Processing file: ', f)
            fs, f = wavfile.read(f)
            mfcc, freq = ft.features(f, fs)
            target_coef.append(mfcc)
            target_frequency.append(freq)
    target_coef = np.vstack(target_coef)
    target_frequency = np.array(target_frequency)

    non_target_coef = []
    non_target_frequency = []
    print("Non target data:")
    for dir_name in non_target_dirs:
        for f in glob(dir_name + '/*.wav'):
            print('Processing file: ', f)
            fs, f = wavfile.read(f)
            mfcc, freq = ft.features(f, fs)
            non_target_coef.append(mfcc)
            non_target_frequency.append(freq)
    non_target_coef = np.vstack(non_target_coef)
    non_target_frequency = np.array(non_target_frequency)

    print("Training gaussian distribution for frequency")
    mu_freq1 = target_frequency.mean()
    mu_freq2 = non_target_frequency.mean()
    cov_freq1 = target_frequency.var()
    cov_freq2 = non_target_frequency.var()

    # Initialize mean vectors to randomly selected data points from corresponding class
    # Initialize all covariance matrices to the same covariance matrices computed using
    # all the data from the given class
    print("Training GMM for mfcc")
    m1 = tgauss
    mus1 = target_coef[np.random.randint(0, target_coef.shape[0], m1)]
    cov1 = np.cov(target_coef.T, bias=True)
    covs1 = [cov1] * m1
    ws1 = np.ones(m1) / m1

    m2 = ngauss
    mus2 = non_target_coef[np.random.randint(0, non_target_coef.shape[0], m2)]
    cov2 = np.cov(non_target_coef.T, bias=True)
    covs2 = [cov2] * m2
    ws2 = np.ones(m2) / m2

    for i in range(30):
        ws1, mus1, covs1, ttl1 = ft.train_gmm(target_coef, ws1, mus1, covs1)
        ws2, mus2, covs2, ttl2 = ft.train_gmm(non_target_coef, ws2, mus2,
                                              covs2)
        print("target error:", ttl1, "non target error: ", ttl2)

    return (mu_freq1, mu_freq2), (cov_freq1,
                                  cov_freq2), (mus1, mus2), (covs1,
                                                             covs2), (ws1, ws2)
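A minimal usage sketch (an assumption, not part of the scraped source): the hypothetical directory names below show how the tuples returned here feed the test_modelA function from Example #12.

# Hypothetical train/dev directories of WAV files; train_modelA and test_modelA are defined in Examples #1 and #12.
muf, covf, mug, covg, ws = train_modelA(['data/target_train'], ['data/non_target_train'])
results = test_modelA(['data/target_dev', 'data/non_target_dev'], muf, covf, mug, covg, ws)
for wav_name, (score, decision) in results.items():
    print(wav_name, score, decision)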
Example #2
File: if_c.py  Project: sys113/CGT
def if_c():
    if language == 'EN':
        print(c.BLUE+'clear '+c.RED+'screen'+c.BLUE+' .'+c.GREEN+'.'+c.RED+'.')
        sleep(2)
        clear()
        features()
    elif language == 'FA':
        print(c.RED+'tamiz'+c.BLUE+' kardan'+c.RED+' safhe '+c.BLUE+' .'+c.GREEN+'.'+c.RED+'.')
        sleep(2)
        clear()
        features()
Example #3
def featpyramid(pic,model):
    pyra = {}
    padx = math.ceil(model["maxsize"][0][0][0][1])
    pady = math.ceil(model["maxsize"][0][0][0][0])
    sbin = model["sbin"][0][0][0][0]
    interval = model["interval"][0][0][0][0]
    sc = 2.0 **(1.0/interval)
    imsize = [pic.shape[1],pic.shape[2]]
    max_scale = int(1 + np.floor(math.log(min(imsize)/(5.0*sbin))/math.log(sc)))
    pyra["feat"] = list(range(int(max_scale + interval)))
    pyra["scales"] = np.zeros((max_scale + interval, 1))
    pyra["imsize"] = imsize
    time = 0
    for i in range(interval):
        starttime = datetime.datetime.now()
        scaled = resize.resize(pic,1.0/sc**i)
        endtime = datetime.datetime.now()
        tmp = features.features(scaled,sbin/2.0)
        time += (endtime - starttime).seconds
        size =[tmp.shape[0],tmp.shape[1]+2*pady+2,tmp.shape[2]+2*padx+2]
        pyra["feat"][i]=np.zeros(size)
        pyra["feat"][i][:,pady+1:size[1]-pady-1,padx+1:size[2]-padx-1] = tmp
        pyra["scales"][i] = 2.0/sc**(i)
        #starttime = datetime.datetime.now()
        tmp = features.features(scaled,sbin)
        #endtime = datetime.datetime.now()
        #time += (endtime - starttime).seconds
        size =[tmp.shape[0],tmp.shape[1]+2*pady+2,tmp.shape[2]+2*padx+2]
        pyra["feat"][i+interval]=np.zeros(size)
        pyra["feat"][i+interval][:,pady+1:size[1]-pady-1,padx+1:size[2]-padx-1] = tmp
        pyra["scales"][i+interval] = 1.0/sc**(i-1)
        for j in range(i+interval,max_scale,interval):
            starttime = datetime.datetime.now()
            scaled = resize.resize(scaled, 0.5)
            endtime = datetime.datetime.now()
            tmp = features.features(scaled,sbin)
            time += (endtime - starttime).seconds
            size =[tmp.shape[0],tmp.shape[1]+2*pady+2,tmp.shape[2]+2*padx+2]
            pyra["feat"][j+interval]=np.zeros(size)
            pyra["feat"][j+interval][:,pady+1:size[1]-pady-1,padx+1:size[2]-padx-1] = tmp
            pyra["scales"][j+interval] = 0.5/sc**(i-1)
    for i in range(len(pyra["feat"])):
        pyra["feat"][i][31,0:pady+1,:]=1
        end=pyra["feat"][i].shape
        pyra["feat"][i][31,end[1]-padx-1:end[1],:]=1
        pyra["feat"][i][31,:,0:padx+1]=1
        pyra["feat"][i][31,:,end[2]-pady-1:end[2]]=1

    print time
    pyra["padx"] = padx
    pyra["pady"] = pady
    return pyra
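A short, hedged sketch of how the returned pyramid is typically inspected (assuming pic and model have already been loaded in the form featpyramid above expects):

# Walk the pyramid: pyra["scales"][l] maps feature level l back to image resolution.
pyra = featpyramid(pic, model)
for level, (feat, scale) in enumerate(zip(pyra["feat"], pyra["scales"])):
    print(level, float(scale[0]), feat.shape)
print(pyra["padx"], pyra["pady"], pyra["imsize"])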
Example #4
def dx(dx_features_fname, dx_features_split_fname, split_fname, feature_diseases, db, training_data_fname, time_scale_days, verbose=True):

	feature_loincs = []
	feature_drugs = []

	training_data = pd.read_csv(training_data_fname, sep='\t', dtype=str)

	# we want to relate the presence or absence of diagnoses in the outcome window to the presence or absence of the label, which is calculated based on codes in the outcome window
	training_data = training_data[['person','y','outcome_start_date','outcome_end_date','age','gender']]
	training_data.columns = ['person','y','training_start_date','training_end_date','age','gender']

	features.features(db, training_data, feature_loincs, feature_diseases, feature_drugs, time_scale_days, dx_features_fname, calc_gfr=False, verbose=verbose, add_age_sex=False)

	features.split(dx_features_fname, dx_features_split_fname, split_fname, verbose)
def processdata(urllists, word_count_threshold, depth):
    content = []
    nums = []
    nums.append(0)
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        nums.append(len(crawler.data))
        content.extend(crawler.data)

    instance = features(word_count_threshold)    
    word_counts, wordtoix = instance.extractwords(content)
    N = len(word_counts)
    for i in range(1, len(nums)):
        nums[i] = nums[i-1] + nums[i]
     
    cid = 0   
    output = np.zeros((nums[len(nums)-1], N+1))    
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        currlen = len(crawler.data)
        feats = instance.bagofwords(crawler.data, word_counts, wordtoix)
        print feats.shape
        b = np.zeros((currlen,N+1))
        print b[:, :-1].shape
        b[:,0:N] = feats
        b[:,N] = cid +1 
        output[nums[cid]:nums[cid+1],:] = b
        cid = cid + 1
    np.savetxt('test.out', output, delimiter=',')   # X is an array   
Example #6
def eval(rules, hand):
  score = 0

  for feat in features.features(hand):
    score += rules[feat]

  return score
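A tiny, hedged illustration of calling the scorer above (the weights and the hand value are made up; features.features(hand) is assumed to yield hashable feature keys):

from collections import defaultdict

# defaultdict(float) returns 0.0 for features with no learned weight, so rules[feat] never raises KeyError.
rules = defaultdict(float, {"pair": 1.0, "flush_draw": 0.5})
hand = ["Ah", "Ad", "7c", "2s", "9d"]  # placeholder hand representation
print(eval(rules, hand))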
def featureExtraction():

    '''
    Extract features and save
    :param recordings:
    :param varin:
    :return:
    '''
    recordings              = getRecordings(wav_path)

    for recording in recordings:
        wav_file   = os.path.join(wav_path,recording+'.wav')
        energy_filename         = os.path.join(feature_path,'energy'+'_'+recording+'_'
                                                   +str(varin['framesize'])+'_'+str(varin['hopsize'])+'.npy')
        spec_filename           = os.path.join(feature_path,'spec'+'_'+recording+'_'
                                                   +str(varin['framesize'])+'_'+str(varin['hopsize'])+'.npy')
        for featurename in feature_set:
            print 'saving feature for ', recording, ', feature ', featurename
            feature_filename        = os.path.join(feature_path,featurename+'_'+recording+'_'
                                                   +str(varin['framesize'])+'_'+str(varin['hopsize'])+'.npy')
            varin['feature_select'] = featurename
            feature, energy, spec = features.features(wav_file,varin)

            np.save(feature_filename,feature)

            if featurename == feature_set[-1]:
                np.save(energy_filename,energy)
                np.save(spec_filename,spec)
Example #8
def eval(rules, hand):
    score = 0

    for feat in features.features(hand):
        score += rules[feat]

    return score
Example #9
def read_dataset():
    
    df = pd.read_csv("yahoostock.csv")
    X,y = features.features("yahoostock.csv")
    df = df.iloc[::-1]
    date = df[df.columns[0]]
    close = df[df.columns[5]].values
    close = close[1:]
    
    y=close
    date=date.transpose()
    date = date[27:5010]
    X = X[27:5010]
    print(X.shape)
    X = pd.DataFrame(X)
    
    # normalize data
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled)
    X = X.values
    y = y[27:5010]
    print(y.shape)

    return (X,y,date)
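A hedged follow-up (not in the original): because the rows are ordered in time, a chronological split with shuffle=False keeps the most recent quotes for testing.

from sklearn.model_selection import train_test_split

# Keep the time order intact; the last 20% of rows become the test set.
X, y, date = read_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print(X_train.shape, X_test.shape)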
Example #10
 def __init__(self, path):
     obj = features()
     STEPS = 47
     self.data = []  # (total, 2, 47, 60)
     self.batch_id = 0
     self.data_label = []
     self.data_length = []
     file = open(path, 'r')
     num = 0
     for line in file.readlines():
         li = line.strip().split('\t')
         if not len(li) > 0:
             continue
         if li[0] == '1':
             self.data_label.append(1)
         else:
             self.data_label.append(0)
         _t, _seq_len = obj.word2vec_list(li[1], li[2])
         t1 = _t[0]
         t2 = _t[1]
         while len(t1) < STEPS:
             t1.append(np.zeros((60), dtype=np.float32))
         while len(t2) < STEPS:
             t2.append(np.zeros((60), dtype=np.float32))
         self.data.append([t1, t2])
         self.data_length.append(_seq_len)
         num += 1
Example #11
	def get_featureset( self, promo=None ):
		"""
			Grabs an appropriate featureset.
		"""
		# Short-circuit if a featureset has already been established
		if hasattr( self, "fs" ): return getattr( self, "fs" )
		allfeatures = features()
		
		# Parse out the first path segment and use it if it's a legit promo
		m = re.compile("^/?([^/\\?]*)").findall( self.request.path )
		path = m[0] or "default" if m and len(m) else None
		promo = path if path in allfeatures else None
		
		# If the path doesn't indicate a promotion, check params and cookies
		promo = promo or self.request.params.get("promo") or self.request.cookies.get("promo")
		
		# Retrieve featureset
		key = self.request.params.get("key") or self.request.cookies.get("key")
		fs = featureset.featureset( key, promo )
		
		# Set HTTP cookies to remember the key and promotion
		self.set_cookie( "key", fs.key().id_or_name() )
		if hasattr( fs, "promo" ): self.set_cookie( "promo", fs.promo )
		
		# Set local reference to featureset and return
		setattr( self, "fs", fs )
		return fs
Example #12
def test_modelA(test_dirs, muf, covf, mug, covg, ws, fs=16000):
    freq_posterior = []
    mfcc_posterior = []
    result = {}
    for dir_name in test_dirs:
        for file in glob(dir_name + '/*.wav'):
            print('Processing file: ', file)
            fs, f = wavfile.read(file)
            mfcc, freq = ft.features(f, fs)
            freq_posterior = scipy.stats.norm.logpdf(
                freq, muf[0], covf[0]) + np.log(0.5) - (
                    scipy.stats.norm.logpdf(freq, muf[1], covf[1]) -
                    np.log(0.5))
            mfcc = np.vstack(mfcc)
            tmp = []
            for coef in mfcc:
                tmp.append(
                    ft.logpdf_gmm(coef, ws[0], mug[0], covg[0]) + np.log(0.5) -
                    ft.logpdf_gmm(coef, ws[1], mug[1], covg[1]) - np.log(0.5))
            hard = np.mean(tmp) + freq_posterior
            soft = hard > 8.25
            result[file] = (hard, soft)
            print(file, hard, soft)

    return result
Example #13
def featureExtraction():
    '''
    Extract features and save
    :param recordings:
    :param varin:
    :return:
    '''
    recordings = getRecordings(wav_path)

    for recording in recordings:
        wav_file = os.path.join(wav_path, recording + '.wav')
        energy_filename = os.path.join(
            feature_path, 'energy' + '_' + recording + '_' +
            str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
        spec_filename = os.path.join(
            feature_path, 'spec' + '_' + recording + '_' +
            str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
        for featurename in feature_set:
            print 'saving feature for ', recording, ', feature ', featurename
            feature_filename = os.path.join(
                feature_path, featurename + '_' + recording + '_' +
                str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
            varin['feature_select'] = featurename
            feature, energy, spec = features.features(wav_file, varin)

            np.save(feature_filename, feature)

            if featurename == feature_set[-1]:
                np.save(energy_filename, energy)
                np.save(spec_filename, spec)
def train(mymodel,
          myoptimizer,
          output_dir,
          epoch,
          train_dataloader,
          eval_dataloader,
          UsingGPU=True,
          min_f1score=0.8,
          maxtokeep=3,
          CVAfterEpoch=2,
          classnum=3):
    featuremodel = features.features()
    if UsingGPU:
        mymodel = mymodel.cuda()
        featuremodel = featuremodel.cuda()
    num_train_steps = int(epoch * len(train_dataloader.dataset) /
                          train_dataloader.batch_size)
    logger.info("***** Do train *****")
    logger.info("  Num examples = %d", len(train_dataloader.dataset))
    logger.info("  Batch size = %d", train_dataloader.batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    global_step = 0  # total number of training steps so far
    maxf1score = min_f1score
    for i in range(1, epoch + 1):
        logger.info("********epoch:{}********".format(i))
        for p in myoptimizer.param_groups:
            p['lr'] = p['lr'] * 0.8
        for batch in train_dataloader:
            global_step += 1
            _features, labels = batch
            if UsingGPU:
                _features = _features.cuda()
                labels = labels.cuda()
            logist, loss = mymodel(featuremodel(_features), labels)
            loss.backward()
            #             fgm.attack()  # add adversarial perturbation to the embedding
            #             logist,loss_adv = mymodel(input_ids, segment_ids, input_mask, label_ids)
            #             loss_adv.backward()  # backprop, accumulating the adversarial gradient on top of the normal grad
            #             fgm.restore()  # restore the embedding parameters
            myoptimizer.step()
            myoptimizer.zero_grad()
            if global_step % 100 == 0:
                logger.info("step:{}, loss:{:.5f}".format(
                    global_step, loss.data))
            if global_step % 500 == 0 and i >= CVAfterEpoch:
                mymodel.eval()
                precision, recall, f1 = eval(mymodel, eval_dataloader,
                                             classnum, UsingGPU)
                mymodel.train()
                logger.info(
                    "step:{}, precision:{:.5f}, recall:{:.5f}, f1:{:.5f}".
                    format(global_step, precision, recall, f1))
                if f1 > maxf1score:
                    maxf1score = f1
                    model.save(mymodel,
                               global_step,
                               output_dir,
                               MaxModelCount=maxtokeep)
Example #15
def train(data, targets, filenames):
    targets = [val == "INFEC" for val in targets] # Set INFEC as positive val
   
    # Choose training mode
    options = ["Cross validation", "Build and test model"]
    res = ui.prompt(options=options)
    mode = options[int(res)]

    # Choose ML algorithm
    options = ["Support Vector Machine", "Random Forest",
            "Decision Tree Classifier", "KNN"]
    res = ui.prompt("Choose a ML algorithm:", options)
    switch = {
        0: svm.SVC(C=100., random_state=0),
        1: RandomForestClassifier(n_estimators=50, max_depth=None, random_state=0),
        2: DecisionTreeClassifier(random_state=0),
        3: KNeighborsClassifier()
    }
    clf = switch.get(int(res))

    if mode == "Cross validation":
        model_evaluation(data, targets, clf)
    elif mode == "Build and test model":
        # Train model
        clf.fit(data, targets)

        # Get test dir
        while True:
            dirname = ui.prompt("Which directory are the test files in?")
            if os.path.isdir(dirname):
                break
            print("ERROR: Directory not found.")

        # Set up data/targets for test model
        print("\n************************************")
        print("*  PREPARING MODEL FOR EVALUATION  *")
        print("************************************")

        pageNames, y_true, filenames = pproc.process(dirname)    
        y_true = [val == "INFEC" for val in y_true] # Set INFEC as positive val
        test_data = ft.features(pageNames)
   
        y_pred = clf.predict(test_data)

        save_filenames(y_true, y_pred, filenames)
    
        conf_matrix = skm.confusion_matrix(y_true, y_pred)
        accuracy = skm.accuracy_score(y_true, y_pred)
        precision = skm.precision_score(y_true, y_pred, average=None)
        recall = skm.recall_score(y_true, y_pred, average=None)
        f1 = skm.f1_score(y_true, y_pred, average=None)
        print("\n{}".format(conf_matrix))
        print("Accuracy:  {}".format(accuracy))
        print("Precision: {}".format(precision[1]))
        print("Recall:    {}".format(recall[1]))
        print("F1:        {}".format(f1[1]))
Example #16
def p_feature(f, trainingsteksten):
    """ Calculates the propability of a feature, based on a list of traintexts
    Args: String (feature), List of Tuples (String,String) (traintexts (text,category))
    Returns: Float
    """
    voorkomens = 0
    for (tekst,cat) in trainingsteksten:
        if(features(f,tekst)):
            voorkomens += 1
    return float(voorkomens)/len(trainingsteksten)
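A toy call (the training texts are invented; features(f, tekst) is assumed to return True when feature f occurs in tekst):

# Hypothetical (text, category) pairs; the result is the fraction of texts containing the feature.
trainingsteksten = [("gratis geld winnen", "spam"),
                    ("vergadering om tien uur", "ham"),
                    ("win gratis prijzen", "spam")]
print(p_feature("gratis", trainingsteksten))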
Example #17
def scores(samples1, freq1, samples2, freq2, samples3, freq3, q1=None, q2=None, return_score=False):

  sentence = features(samples1, freq1).transpose()
  query1 = features(samples2, freq2).transpose()
  query2 = features(samples3, freq3).transpose()

  score_list1 = []
  for pp in range(0, sentence.shape[0]-query1.shape[0], 5):
    score = 0
    for n in range(query1.shape[0]):
      score += pearsonr(query1[n], sentence[pp+n])[0]
    score_list1.append(score/query1.shape[0])

  score_list2 = []
  for pp in range(0, sentence.shape[0]-query2.shape[0], 5):
    score = 0
    for n in range(query2.shape[0]):
      score += pearsonr(query2[n], sentence[pp+n])[0]
    score_list2.append(score/query2.shape[0])

  #print(len(score_list1), '\t', len(score_list2))

  t1 = np.arange(len(score_list1))/100*5
  t2 = np.arange(len(score_list2))/100*5
  
  fig = plt.figure(figsize=(8,2))
  plt.plot(t1, score_list1, t2, score_list2)
  if q1 != None and q2 != None:
    plt.legend([q1, q2])
  else:
    plt.legend(['query1', 'query2'])
  plt.gca().set_xlabel('t')
  plt.gca().set_ylabel('scores')
  plt.gca().set_xlim(left=0)
  plt.gca().set_ylim(bottom=0)
  plt.tight_layout()
  if __name__ == '__main__':
    plt.savefig(path1.stem + '_score.pdf')
  elif return_score == True:
    return score_list1, score_list2
  else:
    return fig
Example #18
def run_secretory(filename):
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
    
    parameter_file=open(filename+"_parameters.txt", 'w')
    seqID_list=[]
    result_file=open(filename+"_result.txt", 'w')
    result_file.write("Sequence_ID\tPrediction\n")

    records=SeqIO.parse(filename, "fasta")
    for record in records:
        i=0
        hash_sequence=hashlib.md5(str(record.seq)).hexdigest()
        c.execute("SELECT * FROM secretory WHERE sequence='"+hash_sequence+"'")
        data=c.fetchone()
        if data is None:
            parameter_file.write(features(record.id, str(record.seq))+"\n")
            seqID_list.append(record.id)
                 
        else:
            c.execute("UPDATE secretory SET access=access+1, time=CURRENT_TIMESTAMP WHERE sequence='"+hash_sequence+"'")
            conn.commit()
            c.execute("SELECT prediction FROM secretory WHERE sequence='"+hash_sequence+"'")
            data1=c.fetchone()
            result_file.write(str(record.id)+"\t"+data1[0]+"\n")

    
    parameter_file.close()
    paraFile=filename+"_parameters.txt"
    libsvm_secretory(paraFile)

    predicted = open(paraFile+".predict", "r")

    fasta_rec=SeqIO.index(filename, "fasta")
    print predicted
    i=0

    for pred in predicted:
        print pred
        if int(pred)==1:
            pred='Secretory Protein'
        if int(pred)==0:
            pred='Non-Secretory Protein'
            
        result_file.write(seqID_list[i]+"\t"+pred+"\n")
        c.execute("INSERT INTO secretory VALUES ('"+hashlib.md5(str(fasta_rec[seqID_list[i]].seq)).hexdigest()+"', '"+pred+"', 0, CURRENT_TIMESTAMP)")
        i=i+1
    conn.commit()
    predicted.close()
    result_file.close()
    if secretory_email!="":
        command = "echo 'Your SchistoProt Prediction Result is attached for job ID: '"+filename+"'\n\n\nKind regards,\n\nLutz Krause & Shihab Hasan\nBioinformatics Lab, QIMR Berghofer Medical Research Institute'"+" | EMAIL='Shihab Hasan <*****@*****.**>' mutt -a "+filename+"'_result.txt' -s 'SchistoProt Prediction Result' -- "+secretory_email
        subprocess.call(command, shell=(sys.platform!="Linux"))
Example #19
def eval(path):
    result = getData(path)
    result, src, dst = features(result)
    result = result.reshape(1, -1)
    model, scaler, df = readData()
    result = scaler.transform(result)
    preds = model.predict(result)

    if preds[0] == 1:
        write(src, dst)
    else:
        with open('result.txt', 'w') as f:
            f.write('benign')
Example #20
    def locateChild(self, ctx, segments):
        if segments[0] == "" or segments[0] == "index.html":
            return self, []
        if len(segments[0]) < 4 or len(segments[0]) > 20:
            return rend.NotFound
        elif segments[0] == "site":
            return dyn_site_root(), []
        elif segments[0] == "signup":
            return signup(), []
        elif segments[0] == "upgrade":
            return user_upgrade(), []
        elif segments[0] == "free_account":
            return free_account(), []
        elif segments[0] == "downloads":
            return downloads(), []
        elif segments[0] == "qoop":
            return qoop(), segments[1:]
        if "reset_password" in segments[0]:
            request = inevow.IRequest(ctx)
            if request.args.has_key('username') and request.args.has_key(
                    'hash'):
                return reset_password(), []
            else:
                return rend.NotFound
        if "quick_start_guide" in segments[0]:
            return quick_start_guide(), []
        if "features" in segments[0]:
            return features(), []
        if "developers" in segments[0]:
            return developers(), []
        if "publish" in segments[0]:
            return user_publish("unknown"), segments[1:]
        if "community" in segments[0] and "feeds" in segments[1]:
            obj = user_homepage("")
            obj.username = "******"  # have to hack this because the user_homepage ctor
            # lowercases it
            return obj, segments[1:]

        def act_check(count):
            if count:
                return user_homepage(segments[0]), segments[1:]
            else:
                if segments[1] == "img":
                    return dyn_image_handler("noserve", self.app,
                                             self.log), segments[2:]
                else:
                    return rend.NotFound

        d = self.app.api.users.check_exists('username', segments[0])
        d.addCallback(act_check)
        return d
	def locateChild(self, ctx, segments):
		if segments[0] == "" or segments[0] == "index.html":
			return self, []
		if len(segments[0]) < 4 or len(segments[0]) > 20:
			return rend.NotFound
		elif segments[0] == "site":
			return dyn_site_root(), []
		elif segments[0] == "signup":
			return signup(), []
		elif segments[0] == "upgrade":
			return user_upgrade(), []
		elif segments[0] == "free_account":
			return free_account(), []
		elif segments[0] == "downloads":
			return downloads(), []
		elif segments[0] == "qoop":
			return qoop(), segments[1:]
		if "reset_password" in segments[0]:
			request = inevow.IRequest(ctx)
			if request.args.has_key('username') and request.args.has_key('hash'):
				return reset_password(), []
			else:
				return rend.NotFound
		if "quick_start_guide" in segments[0]:
			return quick_start_guide(), []
		if "features" in segments[0]:
			return features(), []
		if "developers" in segments[0]:
			return developers(), []
		if "publish" in segments[0]:
			return user_publish("unknown"), segments[1:]
		if "community" in segments[0] and "feeds" in segments[1]:
			obj = user_homepage("")
			obj.username = "******" # have to hack this because the user_homepage ctor
			                       # lowercases it
			return obj, segments[1:]

		def act_check(count):
			if count:
				return user_homepage(segments[0]), segments[1:]
			else:
				if segments[1] == "img":
					return dyn_image_handler("noserve", self.app, self.log), segments[2:]
				else:
					return rend.NotFound

		d = self.app.api.users.check_exists('username', segments[0])
		d.addCallback(act_check)
		return d
 def showTree(self, tree, s='', depth=0):
     for _ in range(depth - 1):
         print('|    ', end="")
     if depth > 0:
         print('-', end="")
         print(s, end="")
         print('-> ', end="")
     if type(tree) == int:  # feuille
         print("class ", self.labelClass[tree])
     else:  # noeud interne
         feature, test = tree[0]
         feature = list(ft.features().keys())[feature]
         print(feature, " <= ", test, " ?")
         self.showTree(tree[1], s='Y', depth=depth + 1)
         self.showTree(tree[2], s='N', depth=depth + 1)
Example #23
def vectorize(hands):
  featurelist = [features.features(h) for h in hands]

  (names, revnames) = feature_names(featurelist)
  ret = []

  for feats in featurelist:
    row = [0] * len(names)

    for f in feats:
      idx = names[f]
      row[idx] += 1

    ret.append(row)

  return (ret, names, revnames)
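A short, hedged continuation (assuming hands is whatever iterable the project's features.features understands): the count rows are usually turned into a NumPy matrix before training a classifier.

import numpy as np

# rows is a list of per-hand count vectors; names/revnames map feature <-> column index.
rows, names, revnames = vectorize(hands)
X = np.asarray(rows)
print(X.shape, len(names))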
def eval(mymodel, eval_dataloader, classnum, UsingGPU):
    pre_labels = []
    ground_true = []
    for batch in eval_dataloader:
        _features, labels = batch
        featuremodel = features.features()
        if UsingGPU:
            _features = _features.cuda()
            labels = labels.cuda()
            featuremodel = featuremodel.cuda()
        pre_label, _ = mymodel.inference(featuremodel(_features))
        pre_labels = pre_labels + pre_label
        ground_true = ground_true + list(labels.cpu().numpy())
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
        ground_true, pre_labels, labels=range(classnum))
    return sum(precision) / 3, sum(recall) / 3, sum(f1) / 3
Example #25
def vectorize(hands):
    featurelist = [features.features(h) for h in hands]

    (names, revnames) = feature_names(featurelist)
    ret = []

    for feats in featurelist:
        row = [0] * len(names)

        for f in feats:
            idx = names[f]
            row[idx] += 1

        ret.append(row)

    return (ret, names, revnames)
Example #26
def eval(mymodel, eval_dataloader, classnum, UsingGPU):
    pre_labels = []
    ground_true = []
    featuremodel = features.features()
    if UsingGPU:
        featuremodel = featuremodel.cuda()
    for batch in eval_dataloader:
        _features, labels = batch
        if UsingGPU:
            _features = _features.cuda()
            labels = labels.cuda()
        pre_label, _ = mymodel.inference(featuremodel(_features))
        pre_labels = pre_labels + pre_label
        ground_true = ground_true + list(np.squeeze(labels.cpu().numpy(), 1))
    precision, recall, f1, _ = sklearn.metrics.precision_recall_fscore_support(
        ground_true, pre_labels, labels=range(classnum), average='micro')
    return precision, recall, f1
def read_dataset():
    #X=[]
    #y=[]
    #simavg, weiavg, ROC, stoK, stoD, rsi, MACD, WR, ado, CCI, label = features.features("yahoostock.csv")
    X, y = features.features("yahoostock.csv")
    """X.append(simavg)
    X.append(weiavg)
    X.append(ROC)
    X.append(stoK)
    X.append(stoD)
    X.append(rsi)
    X.append(MACD)
    X.append(WR)
    X.append(ado)
    X.append(CCI)
    y.append(label)
 
    #print(CCI)
    X = np.array(X)
    X = X.transpose()"""
    #print(X[:,9])
    X = X[27:5010]
    print(X.shape)

    X = pd.DataFrame(X)
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled)
    X = X.values

    #y = np.array(y)

    #y = y.transpose()
    y = y[27:5010]
    y = pd.DataFrame(y)

    #print(y)

    # Encode the dependent variable
    encoder = LabelEncoder()
    encoder.fit(y.values.ravel())
    y = encoder.transform(y.values.ravel())
    Y = one_hot_encode(y)
    #print(X.shape)
    return (X, Y)
Example #28
def main():
    # Create images & extract features (until user quits)
    doneExtracting = False
    while not doneExtracting:
        pageNames, targets, filenames = pproc.process(sys.argv[1])
        data = ft.features(pageNames)

        # Create and evaluate model (until user quits)
        doneTraining = False
        while not doneTraining:
            tr.train(data, targets, filenames)

            options = ["Try another model", "Extract new features", "Quit"]
            res = options[int(ui.prompt(options=options))]
            if res == "Quit":
                doneTraining = True
                doneExtracting = True
            elif res == "Extract new features":
                doneTraining = True
Example #29
def read_dataset():

    X, y = features.features("yahoostock.csv")
    X = X[27:5010]
    X = pd.DataFrame(X)

    # normalize data
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled)
    X = X.values
    y = y[27:5010]
    y = pd.DataFrame(y)

    # Encode the dependent variable
    encoder = LabelEncoder()
    encoder.fit(y.values.ravel())
    Y = encoder.transform(y.values.ravel())
    return (X, Y)
Example #30
def cluster2():
    f = features.features()
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(f)
    distances, indices = nbrs.kneighbors(f)

    print "Distances and indices with kneighbors algorithm"
    print distances
    print indices

    K_MEANS = cluster.KMeans(n_clusters=5)
    K_MEANS.fit(f)

    X = np.zeros(len(f))
    Y = np.zeros(len(f))
    i = 0
    for d in f:
        X[i] = d[0]
        Y[i] = d[1]
        i += 1

    CENTERS = K_MEANS.cluster_centers_
    pca = PCA(n_components=2)
    X_r = pca.fit(f).transform(f)

    X1 = []
    Y1 = []
    for i in range(len(X_r)):
        X1.append(X_r[i, 0])
        Y1.append(X_r[i, 1])

    K_MEANS.fit(X_r)
    NCENTERS = K_MEANS.cluster_centers_
    FIG = plt.figure()
    AX = FIG.add_subplot(111)
    SCATTER = AX.scatter(X1, Y1, c=K_MEANS.labels_, s=50)
    for i, j in NCENTERS:
        AX.scatter(i, j, s=50, c='red', marker='+')
    AX.set_xlabel('x')
    AX.set_ylabel('y')
    plt.colorbar(SCATTER)
    plt.show()
Example #31
def classify(text, trained_model, features1, categories,tr_texts):
    """ Classifies a text based on (not per se) a trained model, categories, features and the trainingstexts.
    Args: String (a text), List of Tuples (String,String), List of Strings, List of Strings
    Returns: String (prints propabilities per category)
    """
    if (trained_model == []):
        trained_model = train(tr_texts, categories, features1)
    score_cat = {}
    for c in categories:
        noemer = 1 * p_cat(c,tr_texts)
        teller = 1
        for f in features1:
            if(features(f,text)):
                noemer = noemer * trained_model[1][c][f]
                teller = teller * trained_model[0][f] + 0.0000001
        score_cat[c] = float(noemer)/teller
    result = "false" #random.choice(categories)
    max_score = 0
    for c in categories:
        if (score_cat[c]>max_score):
            result = c
            max_score = score_cat[c]
    return result
Example #32
def read_dataset():

    X, y = features.features("yahoostock.csv")

    X = X[27:5010]
    print(X.shape)

    X = pd.DataFrame(X)
    # print(X)
    #X.to_csv('data.csv')
    #min_max_scaler = preprocessing.MinMaxScaler()
    #np_scaled = min_max_scaler.fit_transform(X)
    #X = pd.DataFrame(np_scaled)
    X = X.values
    y = y[27:5010]
    y = pd.DataFrame(y)
    print(y.shape)

    # Encode the dependent variable
    encoder = LabelEncoder()
    encoder.fit(y.values.ravel())
    Y = encoder.transform(y.values.ravel())
    return (X, Y)
Example #33
def featureset( key=None, promo=None, **kwargs ):
	"""
		Static factory method for creating or retrieving FeatureSet instances.
	"""
	# Local reference to features
	allfeatures = features()
	
	# Check to see if the specified key indicates a promotion
	if promo not in allfeatures: promo = "default"
	promofeatures = allfeatures[promo] if promo != "default" else None
	
	# Check request and lookup existing feature set, or create new empty set
	key = key or "uuid:%s" % str( uuid.uuid4() )
	fs = db.get( db.Key.from_path( "FeatureSet", key ) ) or FeatureSet( key_name=key )
	changed = not fs.is_saved()
	
	# Fill in promo on featureset
	if not hasattr( fs, "promo" ) or fs.promo != promo:
		changed = True
		setattr( fs, "promo", promo )
	
	# Fill in any keyword arguments
	for k, v in kwargs.iteritems():
		if hasattr( fs, k ) and getattr( fs, k ) == v: continue
		changed = True
		setattr( fs, k, v )

	# Fill in FeatureSet instance from allfeatures
	for feature, groups in allfeatures["default"].iteritems():
	
		# Override groups if in a promo which contains that feature
		if promofeatures and feature in promofeatures: groups = promofeatures[feature]
	
		# If groups is really just one value, the only choice is to set it
		if type(groups) != dict:
			if hasattr( fs, feature ) and getattr( fs, feature ) == groups: continue
			changed = True
			setattr( fs, feature, groups )
			continue
	
		# If the FeatureSet already has this feature, make sure it's a legal value
		if hasattr( fs, feature ):
			oldval = getattr( fs, feature )
			found = False
			for value, frequency in groups.iteritems():
				if oldval == value: 
					found = True
					break
			if found: continue
			
		# Randomly pick from the feature's groups based on defined frequencies
		stops = []
		last = 0
		for value, frequency in groups.iteritems():
			last += frequency
			stops.append( ( last, value ) )
		r = random.uniform( 0.0, last )
		for i in range( len( stops ) ):
			if r < stops[i][0]:
				break
		
		# Set the feature on the feature set
		changed = True
		setattr( fs, feature, stops[i][1] )
		
	# Save the FeatureSet
	if changed: fs.put()
	return fs
    def lookup(self, instance):
        ret = defaultdict(int)
        for feature in instance:
            if feature in self.vocab:
                ret[self.vocab[feature]] += 1
            else:
                ret[self.vocab["UNKNOWN"]] += 1
        return ret


if __name__ == "__main__":
    racism = DataSet("racism")
    racism_features = []
    for tweet in tqdm(racism.data):
        racism_features.append(features(preprocess(tweet)))

    sexism = DataSet("sexism")
    sexism_features = []
    for tweet in tqdm(sexism.data):
        sexism_features.append(features(preprocess(tweet)))

    neither = DataSet("neither")
    neither_features = []
    for tweet in tqdm(neither.data):
        neither_features.append(features(preprocess(tweet)))

    vocab = Vocab()
    vocab.add(racism_features)
    vocab.add(sexism_features)
    vocab.add(neither_features)
def run(out_dir,
        config_fname,
        data_paths_fname,
        stats_list_fname,
        split_fname=None,
        check_if_file_exists=False,
        verbose=True):

    data_paths = util.read_yaml(data_paths_fname)
    config = util.read_yaml(config_fname)

    stats_key = config['stats_key']
    outcome_stat_name = config['outcome_stat_name']
    cohort_stat_name = config.get('cohort_stat_name', None)
    lab_lower_bound = config.get('lab_lower_bound', None)
    lab_upper_bound = config.get('lab_upper_bound', None)
    gap_days = config.get('gap_days', None)
    training_window_days = config['training_window_days']
    buffer_window_days = config['buffer_window_days']
    outcome_window_days = config['outcome_window_days']
    time_period_days = config['time_period_days']
    time_scale_days = config['time_scale_days']
    use_just_labs = config['use_just_labs']
    feature_loincs_fname = config['feature_loincs_fname']
    add_age_sex = config['add_age_sex']
    calc_gfr = config['calc_gfr']
    regularizations = config.get('regularizations', [1])
    lin_n_cv_iters = config.get('lin_n_cv_iters', -1)
    n_cv_iters = config.get('n_cv_iters', -1)
    progression = config['progression']
    progression_lab_lower_bound = config.get('progression_lab_lower_bound',
                                             None)
    progression_lab_upper_bound = config.get('progression_lab_upper_bound',
                                             None)
    progression_gap_days = config.get('progression_gap_days', None)
    progression_stages = config.get('progression_stages', None)
    progression_init_stages = config.get('progression_init_stages', None)
    evaluate_nn = config.get('evaluate_nn', True)

    outcome_fname = out_dir + stats_key + '_' + outcome_stat_name + '.txt'
    if cohort_stat_name is None:
        cohort_fname = data_paths['demographics_fname']
    else:
        cohort_fname = out_dir + stats_key + '_' + cohort_stat_name + '.txt'
    gfr_loincs = util.read_list_files('data/gfr_loincs.txt')
    training_data_fname = out_dir + stats_key + '_training_data.txt'

    feature_loincs = util.read_list_files(feature_loincs_fname)
    if use_just_labs == False:
        feature_diseases = [[
            icd9
        ] for icd9 in util.read_list_files('data/kidney_disease_mi_icd9s.txt')]
        feature_drugs = [
            util.read_list_files('data/drug_class_' + dc.lower().replace(
                '-', '_').replace(',', '_').replace(' ', '_') + '_ndcs.txt')
            for dc in util.read_list_files(
                'data/kidney_disease_drug_classes.txt')
        ]
    else:
        feature_diseases = []
        feature_drugs = []

    n_labs = len(feature_loincs)

    if add_age_sex:
        age_index = len(feature_loincs) + len(feature_diseases) + len(
            feature_drugs)
        gender_index = len(feature_loincs) + len(feature_diseases) + len(
            feature_drugs) + 1
    else:
        age_index = None
        gender_index = None

    features_fname = out_dir + stats_key + '_features.h5'
    features_split_fname = out_dir + stats_key + '_features_split.h5'
    predict_fname = out_dir + stats_key + '_prediction_results.yaml'
    if evaluate_nn:
        nn_predict_fname = out_dir + stats_key + '_nn_prediction_results.yaml'
    else:
        nn_predict_fname = None

    if verbose:
        print "Loading data"

    db = util.Database(data_paths_fname)
    db.load_people()
    db.load_db(['loinc', 'loinc_vals', 'cpt', 'icd9_proc', 'icd9', 'ndc'])

    stats = util.read_yaml(stats_list_fname)[stats_key]

    if verbose:
        print "Calculating patient stats"

    data = ps.patient_stats(db,
                            stats,
                            stats_key,
                            out_dir,
                            stat_indices=None,
                            verbose=verbose,
                            check_if_file_exists=check_if_file_exists,
                            save_files=True)

    if verbose:
        print "Building training data"

    outcome_data = btd.build_outcome_data(out_dir, outcome_fname)
    cohort_data = btd.setup(data_paths['demographics_fname'], outcome_fname,
                            cohort_fname)
    # calc_gfr = True here because it's required to define the condition
    training_data = btd.build_training_data(db, cohort_data, gfr_loincs, lab_lower_bound, lab_upper_bound, \
     training_window_days, buffer_window_days, outcome_window_days, time_period_days, time_scale_days, gap_days, calc_gfr=True, verbose=verbose, \
     progression=progression, progression_lab_lower_bound=progression_lab_lower_bound, progression_lab_upper_bound=progression_lab_upper_bound, \
     progression_gap_days=progression_gap_days, progression_init_stages=progression_init_stages, progression_stages=progression_stages)
    training_data.to_csv(training_data_fname, index=False, sep='\t')

    if verbose:
        print "Building features"

    features.features(db, training_data, feature_loincs, feature_diseases,
                      feature_drugs, time_scale_days, features_fname, calc_gfr,
                      verbose, add_age_sex)

    if split_fname is None:
        split_fname = out_dir + stats_key + '_split.txt'
        features.train_validation_test_split(training_data['person'].unique(),
                                             split_fname,
                                             verbose=verbose)

    features.split(features_fname, features_split_fname, split_fname, verbose)

    if verbose:
        print "Training, validating and testing models"

    predict.predict(features_split_fname, lin_n_cv_iters, n_cv_iters,
                    regularizations, n_labs, age_index, gender_index,
                    predict_fname, nn_predict_fname)
#Importing modules
import pandas as pd
import features as f

#Creating feature sets
f2 = f.features()
Dataframe_Easy = f2.create_dataframe()
Dataframe_Medium = f2.create_dataframe()
Dataframe_Tough = f2.create_dataframe()

#Easy
value1 = []
for i in range(0, 33):
    value1.append(1)
df1 = pd.DataFrame({'Class': value1})
Dataframe_Easy = Dataframe_Easy.join(df1)

#Medium
value2 = []
for i in range(0, 33):
    value2.append(2)
df2 = pd.DataFrame({'Class': value2})
Dataframe_Medium = Dataframe_Medium.join(df2)

#Hard
value3 = []
for i in range(0, 33):
    value3.append(3)
df3 = pd.DataFrame({'Class': value3})
Dataframe_Tough = Dataframe_Tough.join(df3)
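A plausible next step (an assumption, not part of the original script) is stacking the three labelled frames into one table before training:

# Combine the easy/medium/tough sets; ignore_index renumbers the rows.
Dataframe_All = pd.concat([Dataframe_Easy, Dataframe_Medium, Dataframe_Tough],
                          ignore_index=True)
print(Dataframe_All['Class'].value_counts())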
Example #37
def extractFeatures(img):
    img = ImageOps.fit(Image.fromarray(img), (32, 32))
    hMats, wFilters = filterBuilder.buildFilters()
    return features.features(np.asarray(img),hMats,wFilters)
import pickle
from nltk import NaiveBayesClassifier
from features import features

f1 = open("male.txt")
f2 = open("female.txt")

trainer = NaiveBayesClassifier.train
namelist = ([(name.strip(), 'male') for name in f1] + [(name.strip(), 'female')
                                                       for name in f2])

train = namelist[:5000]

classifier = trainer([(features(n), g) for (n, g) in train])

with open('classifier.pickle', 'wb') as outfile:
    pickle.dump(classifier, outfile)
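A hedged counterpart for inference (assuming the same features function used for training): load the pickle and classify a new name.

# Reload the saved NLTK classifier and label an example name.
with open('classifier.pickle', 'rb') as infile:
    classifier = pickle.load(infile)
print(classifier.classify(features('Alex')))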
test_loader = Data.DataLoader(
    # draw batch_size samples from the dataset each time
    dataset=test_dataset,
    batch_size=batchsize,
    shuffle=False,
    num_workers=2,
)

logging.basicConfig(level=logging.INFO,
                    # filename=save_dir + '/log.txt',
                    filemode='w',
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

mymodel =model.model(seqlenth,featuresize=4,seqembedding=3,dropout=0.5)
featuremodel=features.features()
if UsingGPU:
    mymodel=mymodel.cuda()
    featuremodel=featuremodel.cuda()

estimation = np.zeros((testingImagNumber, n_class))
for i in range(K):
    # logger.info('*****{}***** fold test start.'.format(i))
    model_list = os.listdir(save_dir+'/Fold_{}'.format(i))
    model_list.sort()
    # print(save_dir+'/baseline_{}/'.format(i)+model_list[-1])
    temp_model = torch.load(save_dir+'/Fold_{}/'.format(i)+model_list[-1])
    mymodel.load_state_dict(temp_model['model_state_dict'])
    mymodel.eval()
    out = torch.FloatTensor()
    pre_labels=[]
Example #40
    def __init__(self):
        self.obj = features()
        step = 1
        learning_rate = 0.001
        self.STEPS = 47
        n_input = 60  # data input
        n_hidden = 64  # hidden layer num of features
        n_classes = 32  # total classes output lstm
        self.X_in = tf.placeholder(tf.float32, [None, 2, self.STEPS, n_input])
        self.y = tf.placeholder(tf.float32, [None])
        self.z = tf.placeholder(tf.int32, [None, 2])
        with tf.variable_scope('fc') as scope:
            weights = {
                # (60 * 300)
                'in': tf.Variable(tf.random_normal([n_input, n_hidden]), ),
            }
            biases = {
                'in': tf.Variable(tf.constant(0.1, shape=[
                    n_hidden,
                ])),
                'mini': tf.constant(0.00001, shape=[
                    1,
                ])
            }
            scope.reuse_variables()
        X1 = self.X_in[:, 0]
        X2 = self.X_in[:, 1]
        seq1_len = self.z[:, 0]
        seq2_len = self.z[:, 1]
        X1 = tf.reshape(X1, [-1, n_input])
        X1 = tf.matmul(X1, weights['in']) + biases['in']
        X1 = tf.reshape(X1, [-1, self.STEPS, n_hidden])
        X2 = tf.reshape(X2, [-1, n_input])
        X2 = tf.matmul(X2, weights['in']) + biases['in']
        X2 = tf.reshape(X2, [-1, self.STEPS, n_hidden])
        X1 = tf.nn.relu(X1)
        X2 = tf.nn.relu(X2)
        size = tf.shape(seq1_len)[0]
        with tf.name_scope("layer1"):
            with tf.variable_scope("rnn_1"):
                lstm_cell = tf.contrib.rnn.BasicLSTMCell(32)
                init_state = lstm_cell.zero_state(size, dtype=tf.float32)
                outputs1, _1 = tf.nn.dynamic_rnn(lstm_cell,
                                                 X1,
                                                 sequence_length=seq1_len,
                                                 dtype=tf.float32,
                                                 initial_state=init_state)
                outputs2, _2 = tf.nn.dynamic_rnn(lstm_cell,
                                                 X2,
                                                 sequence_length=seq2_len,
                                                 dtype=tf.float32,
                                                 initial_state=init_state)
                outputs1 = tf.nn.relu(outputs1)
                outputs2 = tf.nn.relu(outputs2)

        with tf.name_scope("layer2"):
            with tf.variable_scope("rnn_2"):
                lstm_cell_b = tf.contrib.rnn.BasicLSTMCell(32)
                init_state_b = lstm_cell_b.zero_state(size, dtype=tf.float32)
                __, states1 = tf.nn.dynamic_rnn(lstm_cell_b,
                                                outputs1,
                                                sequence_length=seq1_len,
                                                dtype=tf.float32,
                                                initial_state=None)
                __, states2 = tf.nn.dynamic_rnn(lstm_cell_b,
                                                outputs2,
                                                sequence_length=seq2_len,
                                                dtype=tf.float32,
                                                initial_state=None)

        norm1 = tf.sqrt(tf.reduce_sum(tf.square(states1[1]), axis=1))
        norm2 = tf.sqrt(tf.reduce_sum(tf.square(states2[1]), axis=1))
        dot = tf.reduce_sum(tf.multiply(states1[1], states2[1]), axis=1)
        final = dot / tf.add(tf.multiply(norm1, norm2), biases['mini'])
        with tf.name_scope("prediction"):
            final = final * 0.5 + 0.5
            self.pred = tf.reshape(final, [-1])
        with tf.name_scope("cost"):
            tv = tf.trainable_variables()
            # l2_cost = 0.00001 * tf.reduce_sum([tf.nn.l2_loss(v) for v in tv])
            self.cost = -tf.reduce_mean(self.y * tf.log(self.pred + 0.00001) +
                                        (1 - self.y) *
                                        tf.log(1 - self.pred + 0.00001))
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost)
        init_op = tf.global_variables_initializer()
        # merged_summary = tf.summary.merge_all()
        self.saver = tf.train.Saver(tf.global_variables())
        # config = tf.ConfigProto()
        # config.gpu_options.per_process_gpu_memory_fraction = 0.6
        self.sess = tf.Session()
        self.sess.run(init_op)
        ckpt = tf.train.get_checkpoint_state(
            '/Users/ivanfzh/Desktop/graduation_proj/fzh/save_w2v/')
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
def buildbow(word_count_threshold, content):
    instance = features(word_count_threshold)    
    word_counts, wordtoix = instance.extractwords(content)
    return word_counts, wordtoix
        json.dump(word_counts, file(os.path.join(filepath,dictname), 'w'))
        json.dump(wordtoix, file(os.path.join(filepath,dict2idx), 'w'))
                  
        N = len(word_counts)
    
        # process the length of each class    
        for i in range(1, len(nums)):
            nums[i] = nums[i-1] + nums[i]
     
        cid = 0 # class
        output = np.zeros((nums[len(nums)-1], N+1))    
        for url in urllists:
            urls = geturls(url)
            print urls
            content = getdata(urls, depth)
            instance = features(word_count_threshold) 
            feats = instance.bagofwords(content, word_counts, wordtoix)
            print feats.shape
            currlen = len(content)
            b = np.zeros((currlen,N+1))
            print b[:, :-1].shape
            b[:,0:N] = feats
            b[:,-1] = cid
            output[nums[cid]:nums[cid+1],:] = b
            cid = cid + 1
        np.savetxt(os.path.join(filepath,filename), output, delimiter=',')   # X is an array   

 
    #output = np.loadtxt(os.path.join(filepath,filename), delimiter=',', unpack=True)
    #print output.shape
    #output = output.T
Example #43
    sql = "SELECT `ID` FROM `{0}`.`{1}` WHERE ".format(
        parameter_info["database_source"], parameter_info["table_captured"])
    where_info = ""
    types = str(parameter_info["type"]).split(',')
    for index in range(0, len(types)):
        where_info += "`TYPE`='{0}'".format(types[index])
        if index != len(types) - 1:
            where_info += " OR "
    sql += where_info + ";"
    print(sql)

    cursor_source.execute(sql)
    IDs = list()
    temp_result = cursor_source.fetchall()

    cursor_source.close()
    db_source.close()

    for item in temp_result:
        IDs.append(item[0])
        print("Get ID : {0}".format(IDs[len(IDs) - 1]))

    # with ProcessPoolExecutor(int(parameter_info["threads"])) as process_executor:
    #     for source_id in IDs:
    #         process_executor.submit(features.features, parameter_info, int(source_id))

    for source_id in IDs:
        features.features(parameter_info, int(source_id))

    print("All finished.")
Example #44
# -*- coding:utf-8 -*-
from sklearn import svm
from sklearn import linear_model
import numpy as np
import pickle
import jieba
import random
from features import features
from sklearn import metrics
import json

LENGTH = 60
DATA_PATH='data.txt'
obj = features()
STATISTICAL_LENGTH = 8



def split(sent):
    return list(jieba.cut(sent))


def load_w2v(path):
    f1 = open(path, 'r')
    y_label = []
    x_set = []
    line_num = 0
    for line in f1.readlines():
        li = line.strip().split('\t')
        if li[0] == '0':
            y_label.append(0)