Пример #1
0
def test_step3_check_output():
    # Check that all RTS models have been created
    RTS_odbc = RxOdbcData(connection_string, table="RTS")
    forest_serialized = rx_read_object(RTS_odbc,
                                       key="RF",
                                       deserialize=False,
                                       decompress=None)
    boosted_serialized = rx_read_object(RTS_odbc,
                                        key="GBT",
                                        deserialize=False,
                                        decompress=None)
    fast_serialized = rx_read_object(RTS_odbc,
                                     key="FT",
                                     deserialize=False,
                                     decompress=None)
    NN_serialized = rx_read_object(RTS_odbc,
                                   key="NN",
                                   deserialize=False,
                                   decompress=None)

    assert forest_serialized.__str__()[:6] == "b'blob"
    assert boosted_serialized.__str__()[:6] == "b'blob"
    assert fast_serialized.__str__()[:6] == "b'blob"
    assert NN_serialized.__str__()[:6] == "b'blob"

    # Check that predictions have been made for for all models
    forest_prediction_sql = RxSqlServerData(
        sql_query=
        "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Forest_Prediction'",
        connection_string=connection_string)
    forest_prediction = rx_import(input_data=forest_prediction_sql)
    boosted_prediction_sql = RxSqlServerData(
        sql_query=
        "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Boosted_Prediction'",
        connection_string=connection_string)
    boosted_prediction = rx_import(input_data=boosted_prediction_sql)
    fast_prediction_sql = RxSqlServerData(
        sql_query=
        "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'Fast_Prediction'",
        connection_string=connection_string)
    fast_prediction = rx_import(input_data=fast_prediction_sql)
    NN_prediction_sql = RxSqlServerData(
        sql_query=
        "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = N'NN_Prediction'",
        connection_string=connection_string)
    NN_prediction = rx_import(input_data=NN_prediction_sql)

    assert isinstance(forest_prediction, DataFrame)
    assert isinstance(boosted_prediction, DataFrame)
    assert isinstance(fast_prediction, DataFrame)
    assert isinstance(NN_prediction, DataFrame)
Пример #2
0
def get_num_rows(table, connection_string):
    count_sql = RxSqlServerData(
        sql_query="SELECT COUNT(*) FROM {};".format(table),
        connection_string=connection_string)
    count = rx_import(count_sql)
    count = count.iloc[0, 0]
    return count
Пример #3
0
def gather_image_paths(images_folder,
                       label_to_number_dict,
                       connection_string,
                       mode="train"):
    table = None
    if mode == "train":
        table = "TrainData"
    elif mode == "val":
        table = "ValData"
    elif mode == "test":
        table = "TestData"
    else:
        raise ValueError("'mode' not recognized")

    query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [{}] WHERE [is_directory] = 0 AND file_type = \'png\''.format(
        table)
    filetable_sql = revo.RxSqlServerData(sql_query=query,
                                         connection_string=connection_string)
    data = revo.rx_import(filetable_sql)

    data["image"] = data["image"].apply(lambda x: os.path.join(
        images_folder, x[1:]))  # TODO: assert to confirm paths exist
    data["class"] = data["image"].map(
        lambda x: os.path.basename(os.path.dirname(x)))
    data["label"] = data["class"].map(lambda x: label_to_number_dict[x])
    print(data.iloc[0, 0])
    return data
Пример #4
0
def select_top_value_of_column(table_name, connection_string, column_name):
    query = "SELECT TOP(1) " + column_name + " FROM " + table_name
    print(query)
    query_sql = RxSqlServerData(sql_query=query, connection_string=connection_string)
    data = rx_import(query_sql)
    print(data)
    return data.iloc[0, 0]
Пример #5
0
 def detect_table(table_name, connection_string):
     from revoscalepy import RxSqlServerData, rx_import
     detect_sql = RxSqlServerData(sql_query="IF EXISTS (select 1 from information_schema.tables where table_name = '{}') SELECT 1 AS ret ELSE SELECT 0 AS ret".format(table_name),
                              connection_string=connection_string)
     does_exist = rx_import(detect_sql)
     if does_exist.iloc[0,0] == 1: return True
     else: return False
Пример #6
0
def gather_image_paths(images_folder, connection_string):
    root = os.path.dirname(images_folder)
    query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [MriData] WHERE [is_directory] = 0'
    filetable_sql = RxSqlServerData(sql_query=query, connection_string=connection_string)
    data = rx_import(filetable_sql)
    data["image"] = data["image"].apply(lambda x: os.path.join(root, x[1:]))    # TODO: assert to confirm paths exist
    data["patient_id"] = data["image"].map(lambda x: os.path.basename(os.path.dirname(x)))
    return data
Пример #7
0
def DoInMemoryDivision_SQLPlayground():
    start_time = time.time()
    #define the strings for connection
    conn_str = 'Driver=SQL Server;Server=SANDY;Database=S19SQLPlayground;Trusted_Connection=True;'
    customer_query = '''SELECT * FROM dbo.Customer;'''
    product_query = '''SELECT * FROM dbo.Product;'''
    purchase_query = '''SELECT * FROM dbo.Purchase;'''

    input_query = [customer_query, product_query, purchase_query]
    table_name = ['customer_data', 'product_data', 'purchase_data']

    #try connection and write query to build the dataframe
    connected = True
    try:
        for i in range(len(input_query)):
            data_source = revoscale.RxSqlServerData(sql_query=input_query[i],
                                                    connection_string=conn_str)
            revoscale.RxInSqlServer(connection_string=conn_str,
                                    num_tasks=1,
                                    auto_cleanup=False)
            #globals()[name] create new variable
            globals()[table_name[i]] = pd.DataFrame(
                revoscale.rx_import(data_source))
    except:
        print(
            "Cannot connect to DataBase and create the data table, existing..."
        )
        connected = False
    if connected == False:
        exit(0)

    #iterate 3 tables, either add a new entry to the custExistence dictionary or update the count
    custExistence = {}
    for cu_index, cu_row in customer_data.iterrows():
        for pr_index, pr_row in product_data.iterrows():
            for pu_index, pu_row in purchase_data.iterrows():
                if pu_row["CustomerId"] == cu_row["CustomerId"] and pu_row[
                        "ProductId"] == pr_row["ProductId"]:
                    currentcustomer = pu_row["CustomerId"]
                    if currentcustomer in custExistence:
                        custExistence[currentcustomer] += 1
                    else:
                        custExistence[currentcustomer] = 1
    #print(custExistence)

    #Count all the product (count the row)
    nbProducts = product_data.shape[0]
    #print(nbProducts)

    #print out the customer who has bought all the products
    for customerid in custExistence:
        if custExistence[customerid] == nbProducts:
            print("CustomerId : " + str(customerid))

    end_time = time.time()
    print("This took " + str(end_time - start_time) + " s")
def evaluate_model(conn_str, candidate_table, image_pairs_table):
    distMethods = ["L2", "cosine"]
    image_scores = load_predicted_scores(conn_str, TABLE_SCORE)
    imagePairQuery = "SELECT * FROM " + image_pairs_table
    imagePairData_sql = RxSqlServerData(connection_string=conn_str,
                                        sql_query=imagePairQuery,
                                        strings_as_factors=False)
    imagePairData = rx_import(imagePairData_sql)
    print("Calculating ranking metrics...")
    calculate_ranking_metrics(image_scores, imagePairData, distMethods)
def prepare_data(conn_str, feature_table, train_table, test_table,
                 evaluation_table, numQueryImagePerCat, numNegImgsPerQueryImg,
                 ratioTrainTest):
    image_data_sql = RxSqlServerData(connection_string=conn_str,
                                     sql_query="SELECT image, Label FROM " +
                                     feature_table,
                                     string_as_factors=False)
    image_data_df = rx_import(image_data_sql)
    print("Split image data into training and testing set...")
    get_training_testing_images(image_data_df, ratioTrainTest, train_table,
                                test_table, conn_str)
    print("Getting ranking set from the testing image set...")
    prepare_evaluation_set(conn_str, feature_table, test_table,
                           evaluation_table, numQueryImagePerCat,
                           numNegImgsPerQueryImg)
Пример #10
0
def test_step2_check_output():
    LoS_sql = RxSqlServerData(
        sql_query="SELECT TOP (5) * FROM [Hospital].[dbo].[LoS] ORDER BY eid",
        connection_string=connection_string)
    LoS = rx_import(input_data=LoS_sql)
    LoS[["number_of_issues"]] = LoS[["number_of_issues"]].apply(to_numeric)

    # bmi = [0.312740, -0.671356, -0.48006, -0.921639, 0.226158]
    # lengthofstay = [3.0, 7.0, 3.0, 1.0, 4.0]
    # number_of_issues = [0, 0, 0, 0, 2]
    # d = {"bmi": bmi, "lengthofstay": lengthofstay, "number_of_issues": number_of_issues}
    # df = DataFrame(d, index = [0,1,2,3,4])

    assert (LoS.loc[4, 'number_of_issues'] == 2)
    assert (round(LoS.loc[1, 'bmi'], 6) == round(-0.67135640398687824, 6))
    assert (LoS.loc[3, 'lengthofstay'] == 1.0)
Пример #11
0
def perform_clustering():
    conn_str = 'Driver=SQL Server;Server=DESKTOP-NLR1KT5;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        COUNT(ss_item_sk) AS orders_items,
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        count(distinct(sr_ticket_number)) as returns_count,
        COUNT(sr_item_sk) as returns_items,
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    column_info = {
        "customer": {
            "type": "integer"
        },
        "orderRatio": {
            "type": "integer"
        },
        "itemsRatio": {
            "type": "integer"
        },
        "frequency": {
            "type": "integer"
        }
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_Info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=1,
                            auto_cleanup=False)
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=5))
    cdata = customer_data
    K = range(1, 20)
    KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
    centroids = (k.cluster_centers_ for k in KM)

    D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
    dist = (np.min(D, axis=1) for D in D_k)
    avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
    plt.plot(K, avgWithinSS, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    n_clusters = 4

    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    print(customer_data.groupby(['cluster']).mean())
Пример #12
0
    "WeekDay": {
        "type": "factor",
        "levels": ["1", "2", "3", "4", "5", "6", "7"]
    },
    "Holiday": {
        "type": "factor",
        "levels": ["1", "0"]
    },
    "Snow": {
        "type": "factor",
        "levels": ["1", "0"]
    }
}

#Get the data from SQL Server Table
data_source = RxSqlServerData(table="dbo.rental_data",
                              connection_string=conn_str,
                              column_info=column_info)
computeContext = RxInSqlServer(connection_string=conn_str,
                               num_tasks=1,
                               auto_cleanup=False)

RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)

# import data source and convert to pandas dataframe
df = pd.DataFrame(rx_import(input_data=data_source))
print("Data frame:", df)
# Get all the columns from the dataframe.
columns = df.columns.tolist()
# Filter the columns to remove ones we don't want to use in the training
columns = [c for c in columns if c not in ["Year"]]
def get_rental_predictions():
#Connection string to connect to SQL Server named instance
 conn_str = 'Driver=SQL Server;Server=MYSQLSERVER;Database=TutorialDB;Trusted_Connection=True;'

#Define the columns we wish to import
 column_info = {
         "Year" : { "type" : "integer" },
         "Month" : { "type" : "integer" },
         "Day" : { "type" : "integer" },
         "RentalCount" : { "type" : "integer" },
         "WeekDay" : {
             "type" : "factor",
             "levels" : ["1", "2", "3", "4", "5", "6", "7"]
         },
         "Holiday" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         },
         "Snow" : {
             "type" : "factor",
             "levels" : ["1", "0"]
         }
     }

 #Get the data from SQL Server Table
 data_source = RxSqlServerData(table="dbo.rental_data",
                              connection_string=conn_str, column_info=column_info)
 computeContext = RxInSqlServer(
     connection_string = conn_str,
     num_tasks = 1,
     auto_cleanup = False
)


 RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)

 # import data source and convert to pandas dataframe
 df = pd.DataFrame(rx_import(input_data = data_source))
 print("Data frame:", df)
 # Get all the columns from the dataframe.
 columns = df.columns.tolist()
 # Filter the columns to remove ones we don't want to use in the training
 columns = [c for c in columns if c not in ["Year"]]
  # Store the variable we'll be predicting on.
 target = "RentalCount"
 # Generate the training set.  Set random_state to be able to replicate results.
 train = df.sample(frac=0.8, random_state=1)
 # Select anything not in the training set and put it in the testing set.
 test = df.loc[~df.index.isin(train.index)]
 # Print the shapes of both sets.
 print("Training set shape:", train.shape)
 print("Testing set shape:", test.shape)
 # Initialize the model class.
 lin_model = LinearRegression()
 # Fit the model to the training data.
 lin_model.fit(train[columns], train[target])

 # Generate our predictions for the test set.
 lin_predictions = lin_model.predict(test[columns])
 print("Predictions:", lin_predictions)
 # Compute error between our test predictions and the actual values.
 lin_mse = mean_squared_error(lin_predictions, test[target])
 print("Computed error:", lin_mse)
Пример #14
0
# Connection string to connect to SQL Server named instance.
conn_str = 'Driver=SQL Server;Server=MS;Database=ML;Trusted_Connection=True;'
input_query = '''SELECT * FROM [OnlineSales].[RFM]'''

# Define the columns we wish to import.
column_info = {
    "customer": {"type": "integer"},
    "orderRatio": {"type": "integer"},
    "itemsRatio": {"type": "integer"},
    "frequency": {"type": "integer"}
    }

data_source = revoscalepy.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str)
revoscalepy.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)
# import data source and convert to pandas dataframe.
customer_data = pd.DataFrame(revoscalepy.rx_import(data_source))
print("Data frame:", customer_data.head(n=5))
print(customer_data.describe())

cdata = customer_data
K = range(1, 20)
KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
centroids = (k.cluster_centers_ for k in KM)

D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
dist = (np.min(D, axis=1) for D in D_k)
avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
plt.plot(K, avgWithinSS, 'b*-')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
Пример #15
0
def DoInMemoryDivision():
    # Connection string to connect to local DB
    conn_str = 'Driver=SQL Server;Server=DESKTOP-M98I69D;Database=S19SQLPlayground_Seb;Trusted_Connection=True;'
    # Queries to retrieve the Customers, Products, and purchases data respectively
    query_customers = '''SELECT * FROM dbo.Customer'''
    query_products = '''SELECT * FROM dbo.Product'''
    query_purchases = '''SELECT * FROM dbo.Purchase'''

    # Define the columns we wish to import for Customers, Products, and purchases data respectively
    col_cust_info = {
        "Customer": {
            "type": "integer"
        },
        "FirstName": {
            "type": "str"
        },
        "LastName": {
            "type": "str"
        }
    }
    col_prod_info = {
        "Product": {
            "type": "integer"
        },
        "Description": {
            "type": "str"
        },
        "UnitPrice": {
            "type": "integer"
        }
    }
    col_pur_info = {
        "Customer": {
            "type": "integer"
        },
        "Product": {
            "type": "integer"
        },
        "Quantity": {
            "type": "integer"
        }
    }

    # Start Timer to measure execution time
    start = timer()

    data_source_cust = revoscale.RxSqlServerData(sql_query=query_customers,
                                                 column_Info=col_cust_info,
                                                 connection_string=conn_str)
    data_source_prod = revoscale.RxSqlServerData(sql_query=query_products,
                                                 column_Info=col_prod_info,
                                                 connection_string=conn_str)
    data_source_pur = revoscale.RxSqlServerData(sql_query=query_purchases,
                                                column_Info=col_pur_info,
                                                connection_string=conn_str)
    # retrieve the data sources
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=3,
                            auto_cleanup=False)
    # import data sources and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source_cust))
    product_data = pd.DataFrame(revoscale.rx_import(data_source_prod))
    purchase_data = pd.DataFrame(revoscale.rx_import(data_source_pur))

    # Create a DataFrame to store customer purchase count
    cust_dict = {'CustId': [], 'PurchaseCount': []}
    cust_Purc_Count = pd.DataFrame(cust_dict)

    # Nb of products
    nb_products = len(product_data.index)
    # Division process
    # Iterate over all customers by df rows
    for cindex, crow in customer_data.iterrows():
        # Iterate over all products by df rows
        for pindex, prow in product_data.iterrows():
            # Iterate over all purchases by df rows
            for hindex, hrow in purchase_data.iterrows():
                # Check whether the currentCustomer has bought the current product
                # print (crow['CustomerId'], hrow['CustomerId'], prow['ProductId'] )
                if (crow['CustomerId']
                        == hrow['CustomerId']) and (prow['ProductId']
                                                    == hrow['ProductId']):
                    # Either add a new DataRow in custPurchasesCount
                    # if no existing for the current customer or update the count
                    custExistence = 0
                    for index, cust_count in cust_Purc_Count.iterrows():
                        if cust_count['CustId'] == crow['CustomerId']:
                            custExistence += 1

                    if (custExistence == 0):
                        # First product for this customer
                        td = {'CustId': crow['CustomerId'], 'PurchaseCount': 1}
                        # Make a dataframe to be able to append to cust_Purc_Count
                        tdf = pd.DataFrame(td, index=[0])
                        # Append to cust_Purc_Count
                        # We must use the syntax below because for df append doesn't occur in place but in the object returned by the method
                        cust_Purc_Count = cust_Purc_Count.append(tdf)
                    else:
                        # The product is already referenced cust_Purc_Count => Just increase the PurchaseCount
                        cust_count['PurchaseCount'] += 1

    for dindex, drow in cust_Purc_Count.iterrows():
        # Check the customers that bought all the products
        if drow['PurchaseCount'] == nb_products:
            print('\nDivision CustomerId: ' + str(drow['CustId']))

    end = timer()
    print("The In Memory Division in Python took (S) : ", end - start)
Пример #16
0
             output_file=News_Test_sql,
             overwrite=True)
rx_data_step(input_data=Label_Names_text,
             output_file=Label_Names_sql,
             overwrite=True)

##########################################################################################################################################
##	Get the factor levels of the label
##########################################################################################################################################

# Get the factor levels of the label.
Factors_sql = RxSqlServerData(
    sql_query="SELECT DISTINCT Label FROM News_Train",
    connection_string=connection_string)

levels_list = list(rx_import(Factors_sql)['Label'])

# Write the factor name and levels into a dictionary.
factor_info = {
    'Label': {
        'type': 'factor',
        'levels': [str(s) for s in levels_list]
    }
}

##########################################################################################################################################
##	Create features on the fly for the training set and train the model
##########################################################################################################################################

# Set the compute context to SQL for training.
rx_set_compute_context(sql)
Пример #17
0
# 2.2.1 Specifying a missed value for strings
import os
import settings as st
from revoscalepy import rx_import, rx_get_info, RxXdfData

# Importing Delimited Text Data
infile = os.path.join(st.SAMPLE_DATA_DIR, 'AirlineDemoSmall.csv')

## REM: Our RxOptions.set_option("OutDataPath", [RESULTS_LOCATION])
## is not taken into account (bug?). Hence, we specify the full location
outfile = os.path.join(st.RESULTS_LOCATION, 'airExample.xdf')

# The 'missingValueString' option is not available in RevoscalePy
# See: https://docs.microsoft.com/en-us/machine-learning-server/python-reference/revoscalepy/rxmissingvalues

# A question has been asked on StackOverflow for missing string values:
# https://stackoverflow.com/questions/51155965/default-value-for-strings-when-missing-in-revoscalepy-import

airData = rx_import(
    input_data=infile,
    output_file=outfile,
    strings_as_factors=True,
    # missingValueString = "M"
    rows_per_read=200000,
    overwrite=True)
Пример #18
0
def get_patients_id(table_name, connection_string):
    query = "SELECT DISTINCT patient_id FROM {}".format(table_name)
    sqldata = RxSqlServerData(sql_query=query, connection_string=connection_string)
    data = rx_import(sqldata)
    return data["patient_id"].tolist()
Пример #19
0
def test_step1_check_output():
    col_info = {
        "eid": {
            'type': 'integer'
        },
        "vdate": {
            'type': 'character'
        },
        "rcount": {
            'type': 'character'
        },
        "gender": {
            'type': 'factor'
        },
        "dialysisrenalendstage": {
            'type': 'factor'
        },
        "asthma": {
            'type': 'factor'
        },
        "irondef": {
            'type': 'factor'
        },
        "pneum": {
            'type': 'factor'
        },
        "substancedependence": {
            'type': 'factor'
        },
        "psychologicaldisordermajor": {
            'type': 'factor'
        },
        "depress": {
            'type': 'factor'
        },
        "psychother": {
            'type': 'factor'
        },
        "fibrosisandother": {
            'type': 'factor'
        },
        "malnutrition": {
            'type': 'factor'
        },
        "hemo": {
            'type': 'factor'
        },
        "hematocrit": {
            'type': 'numeric'
        },
        "neutrophils": {
            'type': 'numeric'
        },
        "sodium": {
            'type': 'numeric'
        },
        "glucose": {
            'type': 'numeric'
        },
        "bloodureanitro": {
            'type': 'numeric'
        },
        "creatinine": {
            'type': 'numeric'
        },
        "bmi": {
            'type': 'numeric'
        },
        "pulse": {
            'type': 'numeric'
        },
        "respiration": {
            'type': 'numeric'
        },
        "secondarydiagnosisnonicd9": {
            'type': 'factor'
        },
        "discharged": {
            'type': 'character'
        },
        "facid": {
            'type': 'factor'
        },
        "lengthofstay": {
            'type': 'integer'
        }
    }

    # Point to the input data set while specifying the classes.
    file_path = "..\\Data"
    LoS_text = RxTextData(file=os.path.join(file_path, "LengthOfStay.csv"),
                          column_info=col_info)
    table_text = rx_import(LoS_text)

    LengthOfStay_sql = RxSqlServerData(
        sql_query="SELECT * FROM [Hospital].[dbo].[LengthOfStay] ORDER BY eid",
        connection_string=connection_string,
        column_info=col_info)
    table_sql = rx_import(LengthOfStay_sql)

    assert table_text.equals(table_sql)
# 2.2 Importing Delimited Text Data
import os
import settings as st
from revoscalepy import rx_import, rx_get_info

# Importing Delimited Text Data
infile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt')

claims_data_frame = rx_import(infile)

# Displaying info
print(rx_get_info(claims_data_frame, get_var_info=True))

# Saving text data into XDF file

## REM: Our RxOptions.set_option("OutDataPath", [RESULTS_LOCATION])
## is not taken into account (bug?). Hence, we specify the full location
outfile = os.path.join(st.RESULTS_LOCATION, 'claims.xdf')

rx_import(input_data=infile, output_file=outfile, overwrite=True)
Пример #21
0
##########################################################################################################################################

# Make Predictions, then import them into Python.
forest_prediction_sql = RxSqlServerData(table="Forest_Prediction",
                                        string_as_factors=True,
                                        connection_string=connection_string)
rx_predict(forest_model,
           data=LoS_Test,
           output_data=forest_prediction_sql,
           type="response",
           extra_vars_to_write=["lengthofstay", "eid"],
           overwrite=True)

# Compute the performance metrics of the model.
forest_prediction = rx_import(input_data=forest_prediction_sql)
forest_metrics = evaluate_model(
    observed=forest_prediction['lengthofstay'],
    predicted=forest_prediction['lengthofstay_Pred'],
    model="RF")

##########################################################################################################################################

##	Boosted Trees Scoring

##########################################################################################################################################

# Make Predictions, then import them into Python.
boosted_prediction_sql = RxSqlServerData(table="Boosted_Prediction",
                                         string_as_factors=True,
                                         connection_string=connection_string)
# 2.8 Append to an existing file
import os
import settings as st
from revoscalepy import rx_import, rx_get_info

# Input file
firstFile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt')

# Output file
outfile = os.path.join(st.RESULTS_LOCATION, 'claimsclaims.xdf')

# Creating 1st file
rx_import(input_data=firstFile,
          output_file=outfile,
          strings_as_factors=True,
          overwrite=True)

# Displaying info
claims_data_frame = rx_import(outfile)
print(rx_get_info(claims_data_frame, get_var_info=True))

# Appending the same file to the output
rx_import(input_data=firstFile,
          output_file=outfile,
          strings_as_factors=True,
          append="rows")

# Re-displaying info
claims_data_frame = rx_import(outfile)
print(rx_get_info(claims_data_frame, get_var_info=True))
Пример #23
0
import pandas as pd
from revoscalepy import rx_import,  rx_get_info, RxOptions, RxTextData, RxXdfData, rx_create_col_info, rx_data_step


# For this example, we assume the airOT201201.csv file was downloaded into the current working directory
dirpath = os.getcwd()
# Other samples in RxOptions.get_option("sampleDataDir")

# CSV File (Raw Data for one month)
airlineDemoPathCSV = os.path.join(dirpath, "airOT201201.csv")

# Create the CSV Data Source using RxTextData
ds = RxTextData(airlineDemoPathCSV)

# Import first 10 rows of the CSV file
airlineDemo = rx_import(input_data=ds, number_rows=10)

# Inspect data types using rx_get_info
print(rx_get_info(airlineDemo, get_var_info=True))

# Inspect Unique Carrier (factor)
print(rx_get_info(airlineDemo, vars_to_keep="UNIQUE_CARRIER", get_var_info=True))

# Import the first 10 rows again, and this time use strings_as_factors = True
airlineDemo = rx_import(input_data=ds, strings_as_factors = True, number_rows=10)

# Inspect data types
print(rx_get_info(airlineDemo, get_var_info=True))

# Inspect Unique Carrier (factor) to see the difference using strings_as_factors = True
print(rx_get_info(airlineDemo, vars_to_keep="UNIQUE_CARRIER", get_var_info=True))
Пример #24
0
def select_entry_where_column_equals_value(table_name, connection_string, column_name, value):
    query = "SELECT TOP (1) * FROM {} WHERE {} = '{}'".format(table_name, column_name, value)
    print(query)
    query_sql = RxSqlServerData(sql_query=query, connection_string=connection_string)
    data = rx_import(query_sql)
    return data
Пример #25
0
# Defining transformation
colInfoList = {
    'age': {
        'type':
        'factor',
        'levels':
        ['17-20', '21-24', '25-29', '30-34', '35-39', '40-49', '50-59', '60+']
    },
    'car.age': {
        'type': 'factor',
        'levels': ['0-3', '4-7', '8_9', '10+']
    },
    'type': {
        'type': 'factor',
        'levels': ['A', 'B', 'C', 'D']
    },
}

# Applying transformation
rx_import(input_data=infile,
          output_file=outfile,
          column_info=colInfoList,
          overwrite=True)

# Displaying info
claims_data_frame = rx_import(outfile)

print(rx_get_info(claims_data_frame, get_var_info=True))
print(claims_data_frame.head())
outfileTypeRelabeled = os.path.join(st.RESULTS_LOCATION,
                                    'claimsTypeRelabeled.xdf')

# Defining transformation
colInfoList = {
    'type': {
        'type': 'factor',
        'levels': ['A', 'B', 'C', 'D'],
        'newLevels': ['Subcompact', 'Compact', 'Mid-size', 'Full-size'],
        'description': 'Body Type'
    }
}

# Applying transformation
rx_import(input_data=inFileAddVars,
          output_file=outfileTypeRelabeled,
          column_info=colInfoList,
          overwrite=True)

## REM: Contrary to what is mentioned in the documentation, rx_import
## does not return a RxXdfData object when an output file is specified
## but a <class 'bool'>.
## See: https://docs.microsoft.com/en-us/machine-learning-server/python-reference/revoscalepy/rx-import#returns

# Displaying info
claims_data_frame = rx_import(outfileTypeRelabeled)

info = rx_get_info(claims_data_frame, get_var_info=True)
print(info)

print(claims_data_frame.head())
from revoscalepy import rx_import
from revoscalepy import rx_dforest
from revoscalepy import rx_dtree
from revoscalepy import RxSqlServerData
from revoscalepy import rx_serialize_model
import pyodbc

# Connection string to connect to SQL Server named instance
conn_str = 'Driver=SQL Server;Server=.;Database=SQLML;Trusted_Connection=True;'
sql_query ='SELECT "Sepal.Length","Sepal.Width","Petal.Length","Petal.Width",Species ' \
    'FROM dbo.vw_iris_training'

data_source = RxSqlServerData(sql_query=sql_query, connection_string=conn_str)
iris_train_data = rx_import(input_data=data_source)
iris_train_data["Species"] = iris_train_data["Species"].astype("category")

model = rx_dforest(
    "Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width",
    data=iris_train_data)
trained_rx_dforest_model = rx_serialize_model(model,
                                              realtime_scoring_only=True)
#Save model
conn = pyodbc.connect(conn_str)
cursor = conn.cursor()
cursor.execute(
    "DELETE FROM dbo.iris_models WHERE model_name ='rxDForest' AND model_language ='Python'"
)
cursor.execute(
    "INSERT INTO dbo.iris_models(model,model_name,model_language) values (?,?, ?)",
    trained_rx_dforest_model, 'rxDForest', 'Python')
conn.commit()
Пример #28
0
def display_head(table_name, n_rows):
    table_sql = RxSqlServerData(sql_query="SELECT TOP({}}) * FROM {}}".format(
        n_rows, table_name),
                                connection_string=connection_string)
    table = rx_import(table_sql)
    print(table)
def perform_clustering():
    ################################################################################################

    ##	Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=DESKTOP-INU1SHI;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''


    # Define the columns we wish to import.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "integer"},
        "itemsRatio": {"type": "integer"},
        "frequency": {"type": "integer"}
    }

   
    data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info,
                                              connection_string=conn_str)
    ComputeContext = revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=5))




    ################################################################################################

        ##	Determine number of clusters using the Elbow method

    ################################################################################################
 
     cdata = customer_data
     K = range(1,20)
     KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
     centroids = (k.cluster_centers_ for k in KM)

     D_K = (sci_distance.cdist(cdata,cent,'euclidean')for cent in centroids)
     dist = (np.min(D, axis=1)for D in D_K)
     avgWithinSS = [sum(d)/cdata.shape[0] for d in dist]
     plt.plot(K, avgWithinSS, 'b*-')
     plt.grid(True)
     plt.xlabel('Number of Clusters')
     plt.ylabel('Average Within Clusters Sum of Squares')
     plt.title('Kmeans Clustering')
     plt.show()
import microsoftml as ml
from microsoftml import FastForest, FastLinear, FastTrees, LogisticRegression, NeuralNetwork
from microsoftml import rx_fast_forest, rx_fast_linear, rx_fast_trees, rx_logistic_regression, rx_neural_network
from microsoftml import rx_ensemble, Ensemble, EnsembleControl
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Read Cleaned Ipo2609 XDF file
IPO2609FE = rx_import("IPO2609FeatureEngineering.xdf")
ipo_train, ipo_test = train_test_split(IPO2609FE, random_state=42)

# Columns for training (X), remove label (y)
features = IPO2609FE.columns.drop(["underpriced"])

# SciKit-Learn logistic regression model
X = ipo_train[features.values.tolist()]
y = np.ravel(ipo_train["underpriced"])
sk_log_reg = LogisticRegression()
sk_log_model = sk_log_reg.fit(X, y)
probList = []
probList = sk_log_model.predict_proba(ipo_test[features.values.tolist()])[:, 1]
probArray = np.asarray(probList)
fpr, tpr, thresholds = roc_curve(ipo_test["underpriced"], probArray)
aucResult = auc(fpr, tpr)
Пример #31
0
def perform_clustering():
    ################################################################################################

    ##	Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=DESKTOP-CF2TNBV;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import.
    column_info = {
        "customer": {
            "type": "integer"
        },
        "orderRatio": {
            "type": "integer"
        },
        "itemsRatio": {
            "type": "integer"
        },
        "frequency": {
            "type": "integer"
        }
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_Info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=1,
                            auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=5))

    ################################################################################################

    ##	Determine number of clusters using the Elbow method

    ################################################################################################

    cdata = customer_data
    K = range(1, 20)
    KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
    centroids = (k.cluster_centers_ for k in KM)

    D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
    dist = (np.min(D, axis=1) for D in D_k)
    avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
    plt.plot(K, avgWithinSS, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    ################################################################################################
    ##	Perform clustering using Kmeans
    ################################################################################################

    # It looks like k=4 is a good number to use based on the elbow graph.
    n_clusters = 4

    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())