"""Send commands to the namenode to run the Spark analytics scripts.

Reads the namenode IP address and key-pair name from
``./analytics_generated_items/namenode_ip_and_key`` (IP on the first line,
key-pair name on the second), connects to the namenode and runs
``pearson.py`` there. The ``tfidf.py`` step is currently commented out.
"""
import analytics_functions
import boto3
from fabric import Connection
import os
import time

# Load the namenode address and key-pair name. A context manager guarantees
# the handle is closed even if the read fails.
with open('./analytics_generated_items/namenode_ip_and_key', 'r') as fil:
    namenode_ip_and_key = fil.read()

# strip() + splitlines() tolerates a trailing newline and CRLF line endings,
# either of which broke the previous two-way unpack of split('\n').
# A file that does not hold exactly two lines still raises ValueError.
namenode_ip, key_pair = namenode_ip_and_key.strip().splitlines()

### Get connected to namenode and start running commands ###
# Block until the operator confirms; the return value of input() is unused.
input(
    'Press Enterget Pearson Correlation output score(wait around a minute or so): '
)
c = analytics_functions.theconnector(namenode_ip, key_pair)
c.run('cd spark_scripts && python3 pearson.py')

# Disabled TFIDF step, kept for reference:
# input('Press Enter to run the TFIDF script, results will arrive shortly in a file named tfidf_results')
# c.run('cd spark_scripts && python3 tfidf.py')
# Extend the import path so the helper module one directory up and the
# `scaling` module under ../hadoop can be imported from this script.
sys.path.append('../')
import analytics_functions
sys.path.append('../hadoop')
import scaling

# Copy the key pair's .pem file next to this script; the generated scp
# command below refers to it by its bare file name.
shutil.copy('../hadoop/{}.pem'.format(scaling.key_pair), './')

# Write a one-line shell script that copies the tfidf_output directory from
# the namenode. The context manager closes the file even if the write fails.
# NOTE(review): the script is only written here; nothing in this fragment
# executes it.
# change tfidf_output to testcopy
with open("get_tfidf.sh", 'w') as bash_file:
    bash_file.write('scp -i {}.pem -r ubuntu@{}:tfidf_output ./'.format(
        scaling.key_pair, scaling.namenode_ip))

c = analytics_functions.theconnector(scaling.namenode_ip, scaling.key_pair)
# c.run('cd tfidf_output && ls')

print('now getting the TFIDF (this will take 8-10 mins)')
c.run('python3 tfidf.py')
print('downloading the tfidf, will take awhile')

print(
    'now getting the Pearson Correlation (this will take ~1min and printed to console)'
)
# PYSPARK_PYTHON is exported in the same remote shell command as the script run.
c.run('export PYSPARK_PYTHON=/usr/bin/python3 && python3 pearson.py')

# #test copying
# c.run('mkdir ./testcopy')
# c.run('cp ./tfidf_output/part-00099-5200a268-b05a-403d-b56d-c9d1b2558fd6-c000.csv ./testcopy/')
print("Waiting for instances to start up") time.sleep(60) # ---------------------------------- update the packages ------------------------------------------- > for instance_ip in all_node_ips: success = False while(not success): try: c = analytics_functions.theconnector(instance_ip, key_pair) c.sudo('apt-get update') success = True except: # in case fail print('something went wrong, retrying i a moment') time.sleep(10) # ------------------------------------------- reboot ---------------------------------------------------- > try: ec2.reboot_instances(InstanceIds=all_node_ids, DryRun=True) except ClientError as e: if 'DryRunOperation' not in str(e):
print("Waiting for instances to start up") time.sleep(120) # ---------------------------------- update the packages on the new data nodes------------------------------------------- > print( "------------------------- Updating the packages on the new data nodes --------------------------------------" ) # update the packages only on the new nodes for instance_ip in new_node_ips: success = False tryfactor = 0 while (not success): try: c = analytics_functions.theconnector(instance_ip, key_pair) c.sudo('apt-get -y update') success = True except: # in case fail print('something went wrong, retrying in a moment') tryfactor += 1 if tryfactor == 10: print( 'It has been {} times, something went horribly wrong. Ctrl C to exit and try again' .format(tryfactor)) time.sleep(10) # ------------------------------------------- reboot ---------------------------------------------------- print(