Python data_preprocessing示例，preprocessing.data_preprocessing Python示例

示例#1

0

显示文件

文件： inference.py 项目： khuloodkh/SANC

def classify(user_input):
    """Predict the topic of a piece of user-supplied text.

    Loads the pickled TF-IDF object and the pickled model from the
    ``static`` directory, runs ``user_input`` through ``data_preprocessing``,
    transforms the result with the TF-IDF object and asks the model for a
    prediction.

    Args:
        user_input: Raw input to classify; passed unchanged to
            ``data_preprocessing``.

    Returns:
        Whatever ``loaded_model.predict`` returns for the single transformed
        input.
    """
    pickle_tfidf = "static/pickle_tfidf.pkl"
    pickle_model = "static/pickle_model.pkl"

    # NOTE: unpickling can execute arbitrary code, so these two files must
    # only ever come from a trusted source.
    # The context managers close the file handles even if unpickling fails.
    with open(pickle_tfidf, 'rb') as tfidf_file:
        loaded_tfidf = pickle.load(tfidf_file)
    with open(pickle_model, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    pre_user_input = data_preprocessing(user_input)
    # transform() is given a one-element list: a single document.
    user_input_tfidf = loaded_tfidf.transform([pre_user_input])
    return loaded_model.predict(user_input_tfidf)

示例#2

0

显示文件

文件： interface.py 项目： cycnc35/AMZ-Cellphone-Review

def main():
    """
    Build and run the Dash application for the cell phone review data.

    Reads ``Data/items.csv`` and ``Data/reviews.csv``, passes both frames
    through ``data_preprocessing`` and builds the page layout: one outer div
    with two children, one for the charts and one for searching reviews.
    Four callbacks are then registered (brand rating pie chart, item dropdown
    options, selected review text, per-brand sales pie chart) and the server
    is started in debug mode.
    """
    items = pd.read_csv("Data/items.csv")
    reviews = pd.read_csv("Data/reviews.csv")
    review_item, helpful_vote, brands, helpful_vote_dict = data_preprocessing(
        items, reviews)

    # Brands offered in the review search, in the order they appear in the
    # dropdown. Only the "title_item" entry of each brand is needed.
    search_brands = ('ASUS', 'Apple', 'Google', 'HUAWEI', 'Motorola', 'Nokia',
                     'OnePlus', 'Samsung', 'Sony', 'Xiaomi')
    titles_by_brand = {
        brand: helpful_vote_dict[brand]["title_item"]
        for brand in search_brands
    }

    names = list(titles_by_brand)

    app = dash.Dash(__name__)
    app.layout = html.Div([
        # --- Chart section ---
        html.Div([
            html.H1(id="project_title",
                    style={"textAlign": "center"},
                    children="Visualization of cell phone reviews data"),
            html.Div([
                html.P(
                    'This website provides user with a detail reviews from Amazon. Included '
                    '"Sales percentage" from Amazon website, satisfaction histogram in '
                    'different brands.'),
                html.P(
                    'User can select a certain brand to see the total satisfaction. Also, '
                    'selecting a certain type of cell phone, the website will provides the '
                    'highest vote review from Amazon.')
            ],
                     style={
                         'width': '60%',
                         'margin': "auto",
                         'text-align': 'center'
                     }),
            html.Br(),
            dcc.Graph(id="sales_volume", figure=brand_counts(review_item)),
            html.Br(),
            dcc.Dropdown(id='brand_dropdown0',
                         options=[{
                             'label': name,
                             'value': name
                         } for name in brands],
                         value='ASUS',
                         clearable=False),
            html.Br(),
            dcc.Graph(id="sales_volume_of_type"),
            html.Br(),
            dcc.Graph(id="overall_rating",
                      figure=plot_stacked_rating_hist_allbrands(review_item)),
            html.Br(),
            html.Div([
                dcc.Dropdown(id='brand_dropdown',
                             options=[{
                                 'label': name,
                                 'value': name
                             } for name in brands],
                             value='ASUS',
                             clearable=False),
            ],
                     style={
                         'width': '90%',
                         'display': 'inline-block'
                     }),
            dcc.Graph(id="brand_rating")
        ]),
        # --- Review search section ---
        html.H2(id="Vote_title",
                children="Highest vote reviews  selecting cell phone",
                style={"textAlign": "center"}),
        html.Div([
            html.Div([
                dcc.Dropdown(id='name-dropdown',
                             options=[{
                                 'label': name,
                                 'value': name
                             } for name in names],
                             placeholder="Select a brand",
                             value=names[0],
                             clearable=False),
            ],
                     style={
                         'width': '20%',
                         'display': 'inline-block'
                     }),
            html.Div([
                dcc.Dropdown(id='item-dropdown',
                             placeholder="Select a type",
                             clearable=False),
            ],
                     style={
                         'width': '100%',
                         'display': 'inline-block'
                     }),
            html.Hr(),
            html.P('The selecting review will be displayed below: ' + "\n"),
            html.Div(id='display-selected-values')
        ]),
    ])

    @app.callback(Output('brand_rating', 'figure'),
                  [Input('brand_dropdown', 'value')])
    def update_brand_rating(brand_name):
        """Return a pie chart of review counts per rating for one brand."""
        filtered_df = review_item[review_item.brand == brand_name]
        ratings = filtered_df.groupby('rating_review').size().reset_index(
            name='counts')

        layout = go.Layout(title={
            "text": "Pie chart of " + brand_name + "'s ratings",
            'x': 0.5,
            'y': 0.9,
            'xanchor': 'center',
            'yanchor': 'top'
        },
                           font={"size": 16})
        # A plain list of traces replaces the deprecated go.Data wrapper.
        figure = go.Figure(data=[
            go.Pie(labels=list(ratings['rating_review']),
                   values=list(ratings['counts']))
        ],
                           layout=layout)

        colors = [
            'gold', 'mediumturquoise', 'darkorange', 'lightgreen', 'lightblue'
        ]
        figure.update_traces(hoverinfo='label+percent',
                             textinfo='value',
                             textfont_size=16,
                             marker=dict(colors=colors,
                                         line=dict(color='#000000', width=2)))
        return figure

    @app.callback(Output('item-dropdown', 'options'),
                  [Input('name-dropdown', 'value')])
    def update_date_dropdown(name):
        """Return the item dropdown options for the selected brand."""
        return [{'label': i, 'value': i} for i in titles_by_brand[name]]

    @app.callback(Output('display-selected-values', 'children'),
                  [Input('item-dropdown', 'value')])
    def set_display_children(selected_value):
        """Return the review bodies whose item title matches the selection."""
        matches = helpful_vote.loc[helpful_vote['title_item'] ==
                                   selected_value]['body']
        # Hand back a plain list instead of a pandas Series so the callback
        # result is an ordinary list of children; no match gives [].
        return matches.tolist()

    @app.callback(Output('sales_volume_of_type', 'figure'),
                  [Input('brand_dropdown0', 'value')])
    def type_counts_of_brand(brand):
        """Return a pie chart of review counts per ``asin`` for one brand."""
        brand_type = review_item.groupby(["brand", "asin"]).size()
        # Select the brand's rows once and reuse them for labels and values.
        brand_rows = brand_type.loc[brand, :]
        labels = brand_rows.index.get_level_values(1)
        values = brand_rows.values
        layout = go.Layout(title={
            "text": "Sales Volume for each type of " + brand,
            "xanchor": "left",
            'yanchor': 'top',
            'x': 0.35,
            'y': 0.9
        },
                           font={"size": 20})
        figure = go.Figure(
            data=[go.Pie(labels=labels, values=values, textinfo='none')],
            layout=layout)

        return figure

    app.run_server(debug=True)

示例#3

0

显示文件

文件： main.py 项目： gmanoj4/Test_CICD

# -*- coding: utf-8 -*-
"""
Created on Mon Oct 14 16:49:47 2019

@author: GR5048890
"""
import modelling
import config
import preprocessing
import unit_testcase
import visualization
if __name__ == '__main__':
    # Data pipeline: read the CSV, preprocess it, then run the manipulation
    # step, which is also given the configured output path.
    raw_df = preprocessing.read_csv(config.input_path)
    cleaned_df = preprocessing.data_preprocessing(raw_df)
    df = preprocessing.data_manipulation(cleaned_df, config.output_path)

    # Plot the prepared data.
    visualization.visualization_raw_data(df, config.image_url)

    # Split the data, train the model and visualize its predictions.
    (X_test, y_pred_asthama,
     clf) = modelling.spliting_data_and_training_model(config.output_path, df)
    modelling.visualization(X_test, y_pred_asthama)

    # Ask for two readings and hand them (as entered, i.e. as strings) to the
    # unit test case together with the trained classifier.
    so2 = input("please enter So2 value= ")
    no2 = input("please enter No2 value= ")
    unit_testcase.unit_test_case(clf, so2, no2)

示例#4

0

显示文件

文件： main.py 项目： cuimaolin/3dContainerPacking

                    type=str,
                    help='what model you want to choose')
args = parser.parse_args()

if __name__ == "__main__":
    # --- Feature extraction ---
    # Original data
    data_original = getFeaure(ln_root='./data/BR', ln_ult='./data/BR.xlsx')
    # Synthesized data
    data_afterdispose = getFeaure(ln_root='./data/BR_afterdispose',
                                  ln_ult='./data/BR_afterdispose.xlsx')
    # Stack the two data sets row-wise
    data = pd.concat([data_original, data_afterdispose], axis=0)

    # --- Preprocessing ---
    train_x, test_x, train_y, test_y = data_preprocessing(
        data, test_size=args.test_size)

    # --- Train the selected model and predict on the test split ---
    # The model names are mutually exclusive, so at most one branch runs.
    if args.model == 'linear':
        print('using linear model')
        pred_y = linear_model(train_x, train_y, test_x)
    elif args.model == 'randomForest':
        print('using random forest model')
        pred_y = randomForest_model(train_x, train_y, test_x)
    elif args.model == 'MLP':
        print('using neural network model')
        pred_y = network_model(train_x, train_y, test_x)
    elif args.model == 'lightgbm':
        print('using lightgbm model')
        pred_y = lightgbm_model(train_x, train_y, test_x)

示例#5

0

显示文件

文件： main.py 项目： DUT-Tjy/keras-practice

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# --- Load the training and development CSV files ---
print("Data loading...")
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/development.csv')
print("Data loading is done!")

# --- Preprocess the 'joke' column of both frames (training frame first) ---
print("Sentence cut...")
train_sentence, test_sentence = (
    preprocessing.data_preprocessing(frame['joke'])
    for frame in (train_df, test_df)
)
print("Sentence cut is done!")

# Alternative kept for reference: vectorize the sentences with TF-IDF.
# X = TF_IDF(train_sentence)
# test = TF_IDF(test_sentence)

# Word2Vec: returns the word-vector matrix and pads the training and test sets.
X, X_test, embedding_matrix, vocab_size = word2vec_weight(
    train_sentence, test_sentence)

# Split the data into training and validation sets (traditional machine
# learning path); 20% is held out.
y = train_df['label']
X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
    X, y, test_size=0.2)
print("划分训练集测试集完成！")

示例#6

0

显示文件

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Import the preprocessing function
from preprocessing import data_preprocessing
# Import pickle to save the models
import pickle

# -------------------------- prepare data for modeling --------------------#

# Load the data set from the CSV file
df = pd.read_csv('../datasets/dataset.csv')

# Clean the article column by running every article through data_preprocessing
df['article'] = df['article'].apply(data_preprocessing)

# Target (cat_topic) and predictor (article), each wrapped in its own
# single-column DataFrame
y = pd.DataFrame(df['cat_topic'])
X = pd.DataFrame(df['article'])

# Split X and y to train and test data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    shuffle=True,
                                                    stratify=y,

示例#7

0

显示文件

import preprocessing
import recommender

## dev
# Preprocess the 'dev' split; only target_users_list is passed on to the
# recommender below.
(
    target_users_list,
    users,
    metadata,
    magazine,
    read,
    read_each_article,
) = preprocessing.data_preprocessing('dev')
recommed = recommender.recommender(target_users_list, './recommend.txt')

## test
#target_users_list, users, metadata, magazine, read, read_each_article = data_preprocessing('test')
#recommed = recommender(target_users_list, './recommend.txt')

示例#8

0

显示文件

from textblob import TextBlob
import gensim
from nltk.stem.snowball import SnowballStemmer
import sys
sys.path.insert(0, 'D:\\bot\\botapi\\botapi')
from functions import token_stems
from functions import token_stems_stop
from functions import tok
from functions import tokenizing
#from leave import leave_func
from main_model import mainmodel_func
from identification import identification
from functions import tok_behavior
from preprocessing import data_preprocessing

# Run the preprocessing step once, when this module is imported; bot_API
# reads elements 0-3 of the result on every request.
pre_process_data = data_preprocessing()

#API WILL RUN FROM HERE


def load_libraries(request):
    """Return a fixed placeholder heading; ``request`` is not used."""
    placeholder_html = "<h1>Not Much Going On Here 5</h1>"
    return HttpResponse(placeholder_html)


def bot_API(request, name):
    """Run ``identification`` for ``name`` and return its result as a response.

    ``identification`` receives ``name`` followed by the first four elements
    of the module-level ``pre_process_data``. ``request`` is not used.
    """
    # Index each element explicitly so a too-short result still raises
    # IndexError here.
    preprocessed_parts = [pre_process_data[i] for i in range(4)]
    result = identification(name, *preprocessed_parts)
    return HttpResponse(result)