# Imports assumed by these examples (m0, m4, m5, and m8 are project modules;
# mydb, output_dir, target_output_dir, target_dir, min_year_value, and
# max_year_value are module-level globals defined elsewhere in the repo).
import re
from datetime import datetime
from urllib.request import urlopen

import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def train_RandomForecast_classifier(X, Y, random_state_value, result):

    # Split dataset, preserving class balance and honoring the caller's seed
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, random_state=random_state_value)

    # Fit Random Forest & Generate Predictions
    clf_RF = RandomForestClassifier(n_estimators=100)
    clf_RF.fit(x_train, y_train)
    y_predict = clf_RF.predict(x_test)

    # Generate Results
    if result == 'Classification_report':
        RF_class_report = sklearn.metrics.classification_report(y_test, y_predict)
        return RF_class_report
    elif result == 'f1_score':
        RF_f1_score = sklearn.metrics.f1_score(y_test, y_predict)
        return RF_f1_score
    elif result == 'precision_score':
        RF_precision_score = sklearn.metrics.precision_score(y_test, y_predict)
        return RF_precision_score
    elif result == 'recall_score':
        RF_recall_score = sklearn.metrics.recall_score(y_test, y_predict)
        return RF_recall_score
    elif result == 'feature_importance':
        # Per-feature importances written to Excel; returns test-set accuracy
        df = pd.DataFrame({'Feature Importance': clf_RF.feature_importances_},
                          index=X.columns)
        m0.write_to_excel(df, 'Feature_Importance', output_dir)
        return clf_RF.score(x_test, y_test)
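
# --- Usage sketch (not part of the original repo): toy data for
# train_RandomForecast_classifier.  The feature names and labels below are
# hypothetical; any feature DataFrame with a binary target works.
def _demo_random_forest_classifier():
    import numpy as np
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.rand(100, 3), columns=['f1', 'f2', 'f3'])
    Y_demo = pd.Series(rng.randint(0, 2, size=100))
    # 'f1_score' selects sklearn.metrics.f1_score on the held-out test split
    print(train_RandomForecast_classifier(X_demo, Y_demo, 42, 'f1_score'))
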

def gen_feature_groups():
    target_file = r'Feature_Importance_2018-12-03 17:29:11.854574.xlsx'
    target_dir = r'/home/ccirelli2/Desktop/Programming/SCA_Web_scaper/ML_Algorithm_Results'

    # Note: `df` and `generate_sum_importance_ungrouped_features` are not
    # defined in this snippet; both are assumed to come from the original
    # repo's module scope.
    Feature_importance_results = m8.record_feature_importance_ungrouped_categories(
        df, generate_sum_importance_ungrouped_features, target_dir,
        target_file)
    m0.write_to_excel(Feature_importance_results,
                      'Feature_importance_results_manual_generate', target_dir)

    return None
Example #3
def SCA_data_scraper(Url, add_pages, table, Run_type, report_output_type,
                     password):
    '''
    INPUTS
    Url:                 Stanford Law web page targeted by the scraper; each
                         page address is built as Url + page number.
    add_pages:           Used with the 'Start_from_last_page' run type; the
                         number of pages to add to the 'Beginning_page' object.
                         The webpage manager sometimes inserts blank pages into
                         the numerical sequence of pages, which trips up the
                         scraper.
    table:               Database table the scraped results are written to.
    Run_type:            Two options: 'Reset' or 'Start_from_last_page'.
    report_output_type:  Type of output the user wants to generate.  Used only
                         for the 'Start_from_last_page' selection (needs to be
                         synced with the driver function that generates reports).
    password:            Email account password (omit from the base script).
    '''

    # SCRAPER______________________________________________________________________________________

    # RUN-TYPE - RESET
    if Run_type == 'Reset':
        # Reset count values
        Count = 0
        Beginning_page = 100610
        End_page = 107052

        # Clear Database
        print('Clearing data from table {}'.format(table))
        mycursor = mydb.cursor()
        # Author's note: this may no longer work as intended since the
        # page_number values were changed to varchar()
        sql = "DELETE FROM {} WHERE page_number IS NOT NULL".format(table)
        mycursor.execute(sql)
        mydb.commit()

        # Create Range for Loop
        upper_bound = End_page - Beginning_page
        range_value = range(0, upper_bound)

        # Enter For Loop & Scrape All Pages in Range.
        for x in range_value:

            # Increment Count
            Count += 1

            # Progress Recorder - % Total Pages Scraped
            m0.progress_recorder(Count, upper_bound)

            # Create Beautiful Soup Object per article
            web_page_address = Url + str(Beginning_page + Count)
            html = urlopen(web_page_address)
            bsObj = BeautifulSoup(html.read(), 'lxml')

            # Check to See if Page is Blank
            '''The site sometimes inserts blank pages into the numerical
               sequence of pages, which creates issues for the scraper.  If we
               hit a blank page, increment the count but skip scraping the page.
            '''
            Tags = bsObj.find('section', {'id': 'company'})
            if Tags is None:
                # No company section at all - treat as a blank page
                continue
            Defendant = Tags.find('h4').get_text().split(':')[1]
            regex_exp = re.compile(' *[A-Z]+')
            search = re.search(regex_exp, Defendant)

            # If Page Is Not Blank - Scrape
            if search:
                # Load Main Scraper Function
                m4.main_scraper_function(mydb, table, bsObj, web_page_address)
            # Elif blank - just increment the count & move to the next page

    # RUN-TYPE - START FROM LAST PAGE
    elif Run_type == 'Start_from_last_page':
        # Set Count Objects - Add One Page
        Beginning_page = int(m5.get_last_page_scraped(mydb, table))
        End_page = Beginning_page + add_pages
        Count = 0

        # Status
        print('Scraper starting from page {}'.format(Beginning_page))

        # Create Range for Loop
        upper_bound = End_page - Beginning_page
        range_value = range(0, upper_bound)

        # Enter For Loop & Scrape All Pages in Range.
        for x in range_value:

            # Count
            Count += 1

            # Progress Recorder - % Total Pages Scraped
            m0.progress_recorder(Count, upper_bound)

            # Create Beautiful Soup Object per article
            web_page_address = Url + str(Beginning_page + Count)
            print(web_page_address)
            html = urlopen(web_page_address)
            bsObj = BeautifulSoup(html.read(), 'lxml')

            # Check to See if Page is Blank
            Tags = bsObj.find('section', {'id': 'company'})
            if Tags is None:
                # No company section at all - treat as a blank page
                continue
            Defendant = Tags.find('h4').get_text().split(':')[1]
            regex_exp = re.compile(' *[A-Z]+')
            search = re.search(regex_exp, Defendant)

            # If Page Is Not Blank - Scrape
            if search:
                # Load Main Scraper Function
                m4.main_scraper_function(mydb, table, bsObj, web_page_address)
            # Otherwise, go to the next page

        # Generate Report (Email or Print)-----------------------------------------

        # Print results
        if report_output_type == 'print_results':
            m0.driver_function_post_run_scraper_report(mydb, Beginning_page,
                                                       End_page,
                                                       'print_results')

        # Otherwise, email a full report
        elif report_output_type == 'generate_email':
            # Time Report Generated
            report_gen_time = str(datetime.now())

            # Create DataFrame & Write to File
            df_results = m0.driver_function_post_run_scraper_report(
                mydb, Beginning_page, End_page, 'dataframe_w_results',
                target_output_dir, table)

            # Filename + Path for DataFrame as Excel File
            Excel_file = m0.driver_function_post_run_scraper_report(
                mydb, Beginning_page, End_page, 'dataframe_filename_plus_path',
                target_output_dir, table)

            # Number Of Companies Added to Table
            num_companies_added = len(df_results['defendant_name'])

            # Generate Text File - Body of Email
            '''function returns str of filename + path'''
            email_body_filename = m0.driver_function_post_run_scraper_report(
                mydb,
                Beginning_page,
                End_page,
                'email_text_body',
                table=table,
                target_output_dir=target_output_dir)

            # Generate Email
            with open(email_body_filename) as f:
                email_body = f.read()
            m0.email_with_attachments(
                password=password,  # input for the top-level scraper function
                toaddr='*****@*****.**',
                subject='Intellisurance Securities Class Action Scraper Update',
                body=email_body,
                attachment_filename=Excel_file)

    # Function Returns Nothing
    return None
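
# --- Usage sketch (hypothetical values; the URL prefix, table name, and page
# count below are assumptions, not confirmed by this snippet).  The scraper
# builds each address as Url + str(page_number), so Url must end right before
# the numeric page id.
def _demo_sca_scraper(email_password):
    SCA_data_scraper(Url='http://securities.stanford.edu/filings-case.html?id=',
                     add_pages=25,
                     table='sca_cases',
                     Run_type='Start_from_last_page',
                     report_output_type='print_results',
                     password=email_password)
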
Example #4
def keywordsearch(mydb, table, password):

    # Sys Datetime
    sys_datetime = str(datetime.now()).replace(' ', '')[0:14]

    # Key Word
    print('Enter your key word')
    keyword = input()

    # Query (the table name is interpolated; the keyword is passed as a bound
    # parameter so quotes in user input cannot break the statement)
    sql_query = '''
    SELECT  filling_date,
            close_date,
            case_status,
            defendant_name,
            Symbol,
            Headquarters,
            Industry,
            Sector,
            Docket,
            court,
            Judge,
            case_summary
    FROM {}
    WHERE case_summary LIKE %s
    '''.format(table)

    # Get Data
    df = pd.read_sql(sql_query, mydb, params=['%' + keyword + '%'])

    # User Interaction
    print('Your search has returned {} results\n'.format(len(df.index)))
    print('Do you want to further refine your search (Yes/No)?')
    resp = input()

    if resp == 'Yes':
        print('Search refinement is not yet implemented.  Proceeding to write data')
    elif resp == 'No':
        print('\nOk.  Proceeding to write data for the original query\n')
    else:
        print('Sorry, you did not enter Yes or No.  Proceeding to write data')

    # Generate Excel File & Write 2 File
    m0.write_to_excel(df, 'keywordsearchresults_' + keyword, output_dir, sys_datetime)

    # Email Attachment
    print('\nReady to send you the data via email.  Please input your email address')
    toaddr = input()
    subject = 'SCA Database Query Results for {}'.format(keyword)
    body = 'The results for your query can be found in the attached excel file'
    attachment_filename = 'keywordsearchresults_' + keyword + '.xlsx'
    m0.email_with_attachments(password, toaddr, subject, body, attachment_filename)
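
# --- Usage sketch (assumes a live MySQL database; the connector arguments and
# table name below are hypothetical, not confirmed by this snippet):
def _demo_keywordsearch(db_password, email_password):
    import mysql.connector
    db = mysql.connector.connect(host='localhost', user='root',
                                 passwd=db_password, database='SCA')
    keywordsearch(db, 'sca_cases', email_password)
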
def train_KNN_predictor(X, Y, random_state_value, min_year, max_year, write_2_excel = False, 
                        plot = False, results = 'DataFrame'):
    '''Documentation:
    random_state:       seed used by the random number generator.
    stratify:           separation of data into homogeneous groups before sampling.
    range:              range of neighbor counts over which to iterate.
    lists:              capture the train/test scores per neighbor count.
    min_year/max_year:  used only to label the plot title.
    '''
    # Split dataset
    x_train, x_test, y_train, y_test = train_test_split(
                                        X, Y,
                                        stratify = Y,
                                        random_state = random_state_value, 
                                        test_size = .15)

    # Ratio of Dismissed to Total Cases
    Dismissal_percentage = round(sum(Y) / len(Y), 2)

    # Case Count
    Case_count = len(Y)

    # Lists to Capture Predictions
    accuracy_training_list = []
    accuracy_test_list = []
    dismissal_percentage_list = [Dismissal_percentage for x in range(1, 10)]

    # Range of Nearest Neighbors
    num_range_neighbors = range(1, 10)
    # Run Loop
    for num in num_range_neighbors:
        # Instantiate KNN Algorithm
        knn = KNeighborsClassifier(n_neighbors=num)
        # Fit algorithm to training data & record train/test accuracy
        knn.fit(x_train, y_train)
        accuracy_training_list.append(knn.score(x_train, y_train))
        accuracy_test_list.append(knn.score(x_test, y_test))

    # Create DataFrame for scores, indexed by the number of neighbors
    df = pd.DataFrame({}, index=list(num_range_neighbors))
    df['Accuracy_Training'] = accuracy_training_list
    df['Accuracy_Test'] = accuracy_test_list
    df['Dismissal_Percentage'] = dismissal_percentage_list


    # Write Results To Excel
    if write_2_excel:
        m0.write_to_excel(df, 'KNN_output', output_dir)

    # Plotting
    if plot:
        plt.plot(num_range_neighbors, accuracy_training_list, label = 'Accuracy of training')
        plt.plot(num_range_neighbors, accuracy_test_list, label = 'Accuracy of test')
        plt.plot(num_range_neighbors, dismissal_percentage_list, label = 'Dismissal Percentage')
        plt.ylabel('Accuracy', fontsize = 20)
        plt.xlabel('Number of Neighbors' , fontsize = 20)
        plt.title('''Performance KNN Algorithm SCA Dataset
                 For Years: {} to {}
                 Case Count => {}
                 Ratio Dismissed to Total Cases => {}'''.format(min_year, max_year, Case_count, 
                 Dismissal_percentage), fontsize = 25)
        plt.legend(fontsize = 15)
        plt.xticks(fontsize = 15)
        plt.yticks(fontsize = 15)
        plt.grid(True, which='major')
        plt.show()

    # Confusion Matrix (uses the last-fitted model, i.e. n_neighbors = 9)
    if results == 'Confusion_matrix':
        clf_predict_y_test = knn.predict(x_test)
        clf_confusion_matrix = confusion_matrix(y_test, clf_predict_y_test)
        return clf_confusion_matrix
    
    # Results in Dataframe
    if results == 'DataFrame':
        return df
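
# --- Usage sketch (toy data, not from the original repo; min_year/max_year
# only label the plot title, so any values work):
def _demo_knn_predictor():
    import numpy as np
    rng = np.random.RandomState(1)
    X_demo = pd.DataFrame(rng.rand(200, 4), columns=['f1', 'f2', 'f3', 'f4'])
    Y_demo = pd.Series(rng.randint(0, 2, size=200))
    df_scores = train_KNN_predictor(X_demo, Y_demo, random_state_value=42,
                                    min_year=2001, max_year=2018)
    print(df_scores)
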
def generate_ml_pipeline(X, Y, random_state_input, result_input, output):

    # Capture Classifier Results
    KNN_score_list = []
    Log_reg_score_list = []
    NB_score_list = []
    RandomForest_score_list = []

    # Generate Predictions
    KNN_result = m8.train_KNN_single_neighbor_classifier(
        X, Y, 5, random_state_input, result_input)
    Log_reg_result = m8.train_log_regressor_classifier(X, Y,
                                                       random_state_input,
                                                       result_input)
    Naive_bayes_result = m8.train_NaiveBayes_classifier(
        X, Y, random_state_input, 'Multinomial', result_input)
    RandomForest_result = m8.train_RandomForecast_classifier(
        X, Y, random_state_input, result_input)
    # Append Predictions to List
    KNN_score_list.append(round(KNN_result, 2))
    Log_reg_score_list.append(round(Log_reg_result, 2))
    NB_score_list.append(round(Naive_bayes_result, 2))
    RandomForest_score_list.append(round(RandomForest_result, 2))

    # Generate DataFrame
    df_results = pd.DataFrame({}, index=[result_input])
    df_results['KNN'] = KNN_score_list
    df_results['Logistic_Reg'] = Log_reg_score_list
    df_results['Naive_Bayes'] = NB_score_list
    df_results['Random_Forest'] = RandomForest_score_list

    # Output
    if output == 'write2excel':
        # Write to Excel
        m0.write_to_excel(
            df_results,
            'MachineLearning_Pipeline_Output: {}'.format(result_input),
            target_dir)

    elif output == 'plot':
        # Plot Results
        x_labels = [
            'KNN', 'Logistic_Regression', 'Naive_Bayes', 'Random_Forest'
        ]

        y_values = [
            df_results['KNN'][0], df_results['Logistic_Reg'][0],
            df_results['Naive_Bayes'][0], df_results['Random_Forest'][0]
        ]
        plt.bar(x_labels, y_values, align='center', alpha=0.5)
        plt.xlabel('Algorithms', fontsize=15)
        plt.ylabel(result_input, fontsize=15)
        plt.xticks(fontsize=15)
        plt.yticks(fontsize=15)
        plt.title('''Machine Learning Algorithm Comparison
                     Years:  From {} To {}
                     Score:  {}'''.format(min_year_value, max_year_value,
                                          result_input),
                  fontsize=20)
        plt.show()

    elif output == 'print':
        print(df_results.transpose())
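
# --- Usage sketch (assumes the m8 training functions are importable and that
# X/Y are a feature DataFrame and a binary label Series):
def _demo_ml_pipeline(X, Y):
    # 'f1_score' is forwarded to each classifier's result selector; 'print'
    # dumps the transposed comparison DataFrame to stdout
    generate_ml_pipeline(X, Y, random_state_input=42,
                         result_input='f1_score', output='print')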