def train_bert_model(self, use_generator=False):
    data_path = self._para.data_path
    _, train_y, _, _, _, val_y, _, tags = pickle.load(open(data_path + '/nlp_ner.pk', 'rb'))
    self._para.tag_num = len(tags)
    model = ModelLib.BERT_MODEL(self._para)
    checkpoint = ModelCheckpoint(data_path + '/bert-bilstm', monitor='val_viterbi_acc',
                                 verbose=1, save_best_only=True, mode='max')
    pre_process = preProcess(self._para)
    if use_generator:
        val_x = pre_process.load_bert_test(data_path + '/dev.txt', self._para.sep)
        model.fit_generator(
            Generator(self._para).bert_generator(self._para.batch_size,
                                                 data_path + '/train.txt',
                                                 self._para.sep, train_y, shuffle=True),
            steps_per_epoch=train_y.shape[0] // self._para.batch_size + 1,
            callbacks=[checkpoint],
            validation_data=(val_x, val_y),
            epochs=self._para.EPOCHES,
            verbose=1)
    else:
        train_x, val_x = pre_process.load_bert_train_dev()
        logger.info('%s, %s' % (train_x.shape, train_y.shape))
        logger.info('%s, %s' % (val_x.shape, val_y.shape))
        model.fit(train_x, train_y,
                  batch_size=self._para.batch_size,
                  epochs=self._para.EPOCHES,
                  callbacks=[checkpoint],
                  validation_data=(val_x, val_y),
                  shuffle=True,
                  verbose=1)
    model.save(data_path + '/bert-bilstm')

def trainingModel():
    # load data
    data = pd.read_excel("../Ica_Labelled Tweets (selesai).xlsx", index_col=None,
                         sheet_name='tweets_text', skiprows=[0, 1, 2], na_values=['-', ' '])
    cleaned_data = preProcess(data)

    # train test split
    X = cleaned_data[['tweet']]
    y = cleaned_data[['label']]
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42)

    # train model
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # test model & evaluate predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # save model
    model_filename = 'final_model.sav'
    pickle.dump(model, open(model_filename, 'wb'))

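# A minimal sketch (not part of the original code) of reloading the model
# saved by trainingModel() and scoring new data. `new_tweets` is a
# hypothetical DataFrame assumed to be preprocessed and feature-extracted
# the same way as the training data.
import pickle

with open('final_model.sav', 'rb') as f:
    loaded_model = pickle.load(f)
# y_pred = loaded_model.predict(new_tweets)
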
def test(para):
    logger.info('=======Start testing process=======')
    data_path = para.data_path
    _, _, _, test_y, _, _, _, tags = pickle.load(open(data_path + '/nlp_ner.pk', 'rb'))
    para.tag_num = len(tags)
    model = ModelLib.BERT_MODEL(para)
    model.load_weights(filepath=data_path + '/bert-bilstm')
    pre_process = preProcess(para)
    test_x = pre_process.load_bert_test()
    lengths = pre_process.get_lengths(test_y)
    pred_y = model.predict(test_x)
    tag_pred_y = []
    tag_test_y = []
    for i, y in enumerate(pred_y):
        # Convert per-token probability distributions to tag indices,
        # then cut both prediction and ground truth at the true sentence length.
        y = [np.argmax(dim) for dim in y]
        p_y = y[:lengths[i]]
        t_y = test_y[i][:lengths[i]].flatten()
        p_y = [tags[dim] for dim in p_y]
        t_y = [tags[dim] for dim in t_y]
        tag_pred_y.append(p_y)
        tag_test_y.append(t_y)
    accuracy = ner_accuracy(tag_pred_y, tag_test_y)
    logger.info('Test accuracy %f' % accuracy)
    precision, recall, f1_score = F1(tag_pred_y, tag_test_y)
    logger.info('Precision=%f, Recall=%f, f1-score=%f' % (precision, recall, f1_score))
    return tag_pred_y, tag_test_y

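# `ner_accuracy` is called above but not defined in this snippet. A minimal
# sketch of a token-level accuracy over tag sequences, assuming the two
# arguments are parallel lists of equal-length tag lists (illustrative only;
# the original may score entities rather than tokens):
def ner_accuracy_sketch(tag_pred_y, tag_test_y):
    correct = sum(p == t
                  for pred, test in zip(tag_pred_y, tag_test_y)
                  for p, t in zip(pred, test))
    total = sum(len(test) for test in tag_test_y)
    return correct / total if total else 0.0
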
def submit():
    inputData = {'tweet': [txtInput.get()]}
    data = pd.DataFrame(data=inputData)
    cleaned_data = preProcess(data)
    cleaned_data = featureExtraction(cleaned_data)
    result = predictInput(cleaned_data)
    result.insert(loc=0, column='tweet', value=[txtInput.get()])
    lblOutput.configure(text=str(result))

def make_predictions_wraper(file_path='test.csv', df_=None, return_predictions=False,
                            save_name=None):
    # Select the ones who missed the test
    if df_ is None:
        df = pd.read_csv(file_path, index_col='NU_INSCRICAO')
    else:
        df = df_.copy()
    # Rows with a null NU_NOTA_LC (absent candidates) get a score of zero;
    # `[[]]` keeps only the index.
    zeros = df.isnull().query('NU_NOTA_LC')[[]]
    zeros['NU_NOTA_MT'] = 0

    # Get ID of columns to be predicted
    predictions = df[['NU_NOTA_LC']].dropna()[[]]  # .index

    # Load data
    if df_ is None:
        features, pipe = preProcess(path=file_path, train=False)
    else:
        features, pipe = preProcess(df_=df_, train=False)

    # Load models
    try:
        with open('models.pkl', 'rb') as f:
            models, metrics = pickle.load(f)
    except Exception as exc:
        raise ValueError('Model File Missing') from exc

    # Make predictions (`ensamble_method` is expected to be defined at module scope)
    predictions['NU_NOTA_MT'] = predictions_ensemble(
        features, models, ensamble_method=ensamble_method).values
    predictions_full = pd.concat([zeros, predictions])
    if save_name is not None:
        predictions_full.to_csv(save_name)
    if return_predictions:
        return predictions_full

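# `predictions_ensemble` is called above but not defined here. A minimal
# sketch under the assumption that `models` is a list of fitted regressors,
# `features` is a DataFrame, and "mean" averaging is one supported method:
import numpy as np
import pandas as pd

def predictions_ensemble_sketch(features, models, ensamble_method='mean'):
    # Stack each model's predictions as columns, then combine.
    preds = np.column_stack([m.predict(features) for m in models])
    if ensamble_method == 'mean':
        return pd.Series(preds.mean(axis=1), index=features.index)
    raise ValueError('Unsupported ensemble method: %s' % ensamble_method)
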
def train_wraper(file_path='train.csv', return_arg=False, plot=False, verbose=True,
                 pipe_path='pipe.pkl'):
    features, target = preProcess(path=file_path)
    models, metrics = create_models(features, target, plot=plot, verbose=verbose,
                                    pipe_path=pipe_path)
    if return_arg:
        return models, metrics

def prepare_traindata():
    # Populates feature_list and featuresAndSentiment
    print("building the train dataset....")
    # Open in text mode ('r'), not binary: csv.reader expects str rows in Python 3.
    with open("/home/uday/DjangoProjects/stocks/prediction/Sentiment/train_data.csv", 'r') as csvfile:
        csv_reader = csv.reader(csvfile)
        for row in csv_reader:
            tweet = row[0]
            sentiment = row[1]
            featureVector = preProcess(tweet)
            for feature in featureVector:
                feature_list.append(feature)
            featuresAndSentiment.append((featureVector, sentiment))
    # No explicit close() needed: the `with` block closes the file.

def make_predictions_wraper_stacking(file_path='test.csv', df_=None, return_predictions=False,
                                     save_name=None, models_path='models-stack.pkl'):
    # Select the ones who missed the test
    if df_ is None:
        df = pd.read_csv(file_path, index_col='NU_INSCRICAO')
    else:
        df = df_.copy()
    zeros = df.isnull().query('NU_NOTA_LC')[[]]
    zeros['NU_NOTA_MT'] = 0

    # Get ID of columns to be predicted
    predictions = df[['NU_NOTA_LC']].dropna()[[]]  # .index

    # Load data
    if df_ is None:
        features, pipe = preProcess(path=file_path, train=False)
    else:
        features, pipe = preProcess(df_=df_, train=False)

    # Make predictions
    predictions['NU_NOTA_MT'] = make_predictions_stacking(
        features, models_path=models_path)
    predictions_full = pd.concat([zeros, predictions])
    if save_name is not None:
        predictions_full.to_csv(save_name)
    if return_predictions:
        return predictions_full

def bert_generator(self, batch_size, data_path, sep, y, shuffle=True):
    index_array = np.arange(y.shape[0])
    if shuffle:
        np.random.shuffle(index_array)
    _parse_data = preProcess(self._para)._parse_data
    data = _parse_data(codecs.open(data_path, 'r'), sep=sep)
    data = [[item[0] for item in sent] for sent in data]
    batches = self.make_batches(y.shape[0] - 1, batch_size)
    le = LEmbedding(self._para)
    while True:
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            batch_idx = index_array[batch_start:batch_end]
            batch_data = [data[idx] for idx in batch_idx]
            batch_x = le.embedding(batch_data)
            # Drop the first token position (likely [CLS]) and keep max_len tokens.
            batch_x = batch_x[:, 1:self._para.max_len + 1]
            batch_y = y[batch_idx]
            yield batch_x, batch_y

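# `make_batches` is not defined in this snippet; it is assumed to behave like
# the Keras training utility of the same name, splitting `size` samples into
# (start, end) index pairs. A sketch under that assumption:
def make_batches_sketch(size, batch_size):
    num_batches = (size + batch_size - 1) // batch_size  # ceiling division
    return [(i * batch_size, min(size, (i + 1) * batch_size))
            for i in range(num_batches)]
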
def uploadAction(event=None):
    # python 3 import file using tkinter
    filename = filedialog.askopenfilename()
    # python 2 import file using tkinter
    # filename = tkFileDialog.askopenfilename()
    if not filename.endswith(".csv") and not filename.endswith(".xlsx"):
        lblOutput.configure(text="Wrong input file")
    else:
        # Read input file and process
        if filename.endswith(".xlsx"):
            data = pd.read_excel(filename)
        else:
            data = pd.read_csv(filename, encoding="ISO-8859-1")
        cleaned_data = preProcess(data)
        cleaned_data = featureExtraction(cleaned_data)
        result = predictInput(cleaned_data)
        result.insert(loc=0, column='tweet', value=data['tweet'])
        lblOutput.configure(text=str(result))

def Poly():
    X, y = preprocess.preProcess()
    y = np.reshape(y, (X.shape[0], 1))
    alpha = 0  # alpha=0 disables the Ridge penalty (plain least squares)
    deg = X.shape[1]  # polynomial degree tied to the feature count
    model = Ridge(alpha=alpha, solver='auto', random_state=42)
    model = Pipeline([
        ("poly_features", PolynomialFeatures(degree=deg, include_bias=True)),
        ("std_scaler", StandardScaler()),
        ("regul_reg", model),
    ])
    model.fit(X, y)
    # y_pred = model.predict(x_test)
    cross_valid = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    print('Cross Validation Errors:', -np.mean(cross_valid))
    print('Theta: \n', model.named_steps["regul_reg"].coef_)

def get_class():
    data = request.get_json()
    data = preProcess(data['data'])
    svm, nb, knn, lr, rf = predict_label(data)
    k, l, m, n, f = predictD2Vclass(data)
    a, b, c, d, e = predictclass(data)
    # Sum the binary votes from all classifiers.
    positive = (int(svm) + int(nb) + int(knn) + int(lr) + int(rf)
                + int(k) + int(l) + int(m) + int(n)
                + int(a) + int(b) + int(c) + int(d) + int(e))
    predicted = model_fasttext.predict(data)
    if predicted[0][0] == '__label__1':
        fasttextlabel = '1'
    else:
        fasttextlabel = '0'
    fasttextconfidence = predicted[1][0] * 100
    return jsonify({
        "case": data,
        "SVM": svm,
        "Naive Bayes": nb,
        "k Nearest Neighbour": knn,
        "Logistic Regression": lr,
        "Random Forest": rf,
        "D2VSVM": k,
        "D2VLR": l,
        "D2VRf": m,
        "D2VKNN": n,
        "BOWRF": e,
        "BOWSVM": a,
        "BOWNB": b,
        "BOWKNN": c,
        "BOWLR": d,
        "filenames": f,
        "cases": positive,
        "fasttextlabel": fasttextlabel,
        "fasttextconfidence": fasttextconfidence
    })

from preprocess import preProcess
import os
import sys
from ann_lib import *

inputFile = sys.argv[1]
preProcess(inputFile)

imglist = []
for img in os.listdir("./tmp"):
    imglist.append(img)
imglist.sort()

# Call the function to train a model and validate;
# this call returns the plate number as a string.
ann_call_fn(imglist)

for fileName in os.listdir("./tmp"):
    os.remove("./tmp/" + fileName)

# Randomly choose an item from a list
def randomChoice(self, l):
    return l[random.randint(0, len(l) - 1)]

# Randomly choose an instance of (category, word)
def randomTrainPair(self):
    category = self.randomChoice(self.all_categories)
    word = self.randomChoice(self.category_words[category])
    return category, word

def randomTrainExample(self):
    category, word = self.randomTrainPair()
    # print(category)
    # print(word)
    category_tensor = self.categoryTensor(category)
    word_tensor = self.inputTensor(word)
    target_tensor = self.targetTensor(word)
    return category_tensor, word_tensor, target_tensor


if __name__ == '__main__':
    from preprocess import preProcess
    path = r'D:\Pycharm\workspcae\NLP-playing\data_2\names\*.txt'
    testing = preProcess(path)
    category_words, all_categories = testing.process()
    # print(category_words['Chinese'])
    utils = Utils(category_words, all_categories)
    category_tensor, word_tensor, target_tensor = utils.randomTrainExample()
    print(category_tensor)
    print(word_tensor)
    print(target_tensor)

book = open_workbook('hinglish.xlsx')
sheet = book.sheet_by_index(0)
ctr = 0
for row in sheet.col(1):
    if ctr != 0:
        posts.append(row.value.encode('utf-8'))
    ctr += 1
print posts[1]

#### Write posts with polarity
writeDoc = open('OUTPUT.csv', 'w+')

#### STEP 1 - Apply preprocessing
for i in range(len(posts)):
    posts[i] = preprocess.preProcess(posts[i])

#### STEP 2 - Actual work
sno = 1
for post in posts:
    ## Get multipliers
    totalPol = 0.0
    tagdata = tagger.getTag(post)
    ## List of multiplying factors
    MFlist = MF.getMF(tagdata)
    for word in post.split(' '):
        ## Get tag info

import numpy as np
import preprocess
import cv2
import os

"""
POS_IMG_SITE = './img/pos/'
NEG_IMG_SITE = './img/neg/'
"""
TEST_IMG_SITE = './img/test/'
POS_IMG_SITE = './work/pos/'
NEG_IMG_SITE = './work/neg/'

# Run preprocessing once if the working directory has not been created yet.
if not os.path.exists('./work'):
    preprocess.preProcess()


def load():
    imgs = []
    tags = []
    for name in os.listdir(POS_IMG_SITE):
        imgs.append(cv2.imread(POS_IMG_SITE + name, 0))
        tags.append(1)
    for name in os.listdir(NEG_IMG_SITE):
        imgs.append(cv2.imread(NEG_IMG_SITE + name, 0))
        tags.append(0)
    return np.asarray(imgs, dtype=float), tags


def load_test():
    imgs = []

def test_preprocess():
    out = preprocess.preProcess(train=False)
    assert len(out) == 2
    assert isinstance(out[0], np.ndarray)

# Takes a tweet and returns a feature vector (combination of words)
def buildTestVector(tweet):
    return preProcess(tweet)

def mainFunction(fileName):
    preProcess(fileName)
    return getMarksString()

import numpy as np
import preprocess
import NeuralNetworks
import NeuralNetwork2

X, Y = preprocess.preProcess('Teleplay.csv')
Y = np.reshape(Y, (Y.shape[0], 1))
X_pred = preprocess.preProcess('New_Teleplay.csv')
X_pred.resize(len(X_pred), 89)

# nn = NeuralNetworks.NeuralNetwork(X, Y)
# nn.train(epochs=1000)
prediction = np.array(NeuralNetwork2.neural(X, Y, X_pred))
# round() returns a new array, so the result must be reassigned.
prediction = prediction.round(decimals=2)
# nn.pred(X_pred)
np.savetxt("18086809D_Task1.csv", prediction, delimiter=",")

book = xlrd.open_workbook(sys.argv[1])
sheet = book.sheet_by_index(0)
ctr = 0
for row in sheet.col(1):
    if ctr != 0:
        posts.append(row.value.encode('utf-8'))
    ctr += 1

#### Write posts with polarity
writeDoc = open('OUTPUT.csv', 'w+')

#### STEP 1 - Apply preprocessing
for i in range(len(posts)):
    posts[i] = preprocess.preProcess(posts[i])

#### STEP 2 - Actual work
sno = 1
for post in posts:
    ## Get multipliers
    totalPol = 0.0
    tagdata = tagger.getTag(post)
    ## List of multiplying factors
    MFlist = MF.getMF(tagdata)
    for word in post.split(' '):
        ## Get tag info

def get_cleaned_data():
    data = request.get_json()
    final = preProcess(data['data'])
    return jsonify({'data': final})

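# A hypothetical client-side call for the endpoint above, assuming the Flask
# app serves it at /get_cleaned_data on localhost:5000 (route and port are
# assumptions, not shown in the snippet). It illustrates the JSON contract:
# send {'data': <raw text>}, receive {'data': <cleaned text>}.
import requests

resp = requests.post('http://localhost:5000/get_cleaned_data',
                     json={'data': 'Some raw text to clean'})
print(resp.json()['data'])
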
    loss = 0
    for i in range(word_tensor.size(0)):
        output, hidden = self.model(category_tensor, word_tensor[i], hidden)
        loss += self.criterion(output, target_tensor[i])
    loss.backward()
    # Manual SGD step: move each parameter by -lr * gradient.
    for p in self.model.parameters():
        p.data.add_(p.grad.data, alpha=-self.lr)
    return output, loss.item() / word_tensor.size(0)


if __name__ == '__main__':
    path = r'D:\Pycharm\workspcae\NLP-playing\data_2\names\*.txt'
    pp = preProcess(path)
    category_words, all_categories = pp.process()
    # print(category_words['Chinese'])
    utils = Utils(category_words, all_categories)
    # category_tensor, word_tensor, target_tensor = utils.randomTrainExample()
    all_letters = string.ascii_letters + " .,;'-"
    input_size = len(all_letters) + 1
    hidden_size = 128
    model = RNN(len(all_categories), input_size, hidden_size, input_size)
    train = Train(model)
    n_iters = 100000
    print_every = 5000