def get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    """Return the total number of 'fall' accident records OSHA reports for
    one SIC code over a date range.

    Builds the AccidentSearch URL, fetches it, and reads the "x of N"
    counter cell out of the results page.

    Args:
        SICNo: SIC industry code to filter on.
        s_d, s_m, s_y: start day / month / year of the search window.
        e_d, e_m, e_y: end day / month / year of the search window.

    Returns:
        int: the record count parsed from the results page.
    """
    # Assemble the query string in the exact order the original request used.
    search_url = (
        'https://www.osha.gov/pls/imis/AccidentSearch.search?'
        + 'p_logger=1&acc_description=fall&acc_Abstract=&acc_keyword=&'
        + 'sic={0}&'.format(SICNo)
        + 'naics=&Office=All&officetype=All&'
        + 'endmonth={0}&'.format(e_m)
        + 'endday={0}&'.format(e_d)
        + 'endyear={0}&'.format(e_y)
        + 'startmonth={0}&'.format(s_m)
        + 'startday={0}&'.format(s_d)
        + 'startyear={0}&'.format(s_y)
        + 'InspNr='
    )

    browser = BrowserShadow()
    response = browser.open_url(search_url)
    page_soup = BeautifulSoup(response.read(), "html.parser")

    # Drill down to the right-aligned "x of N" cell in the results header.
    counter_cells = (page_soup.select('#maincontain')[0]
                              .select('.row-fluid')[0]
                              .select('.row-fluid')[0]
                              .select('.span3')[0]
                              .select('.text-right'))
    counter_text = counter_cells[0].get_text()

    # Everything after the literal 'of' is the total record count.
    return int(counter_text[counter_text.find('of') + 2:])
def parse_page_obtian_event(page_soup):
    """Walk one page of OSHA search results, fetch each accident's detail
    page, and hand the parsed details to ``parse_accident_details``.

    Args:
        page_soup: BeautifulSoup tree of a search-results page.
    """
    # Result rows live in the second .table-responsive table; row 0 is the header.
    result_rows = (page_soup.select('#maincontain')[0]
                            .select('.row-fluid')[0]
                            .select('.table-responsive')[1]
                            .select('table')[0]
                            .find_all('tr'))

    for row in result_rows[1:]:
        cells = row.find_all('td')

        SummaryNr = cells[2].find('a').get_text()
        EventDate = cells[3].get_text()
        ReportID = cells[4].get_text()
        # An 'X' in column 5 marks a fatality.
        Fat = 1 if cells[5].get_text().find('X') > -1 else 0
        SIC = cells[6].find('a').get_text()
        EventDesc = cells[7].get_text()

        # Follow the summary link to the per-accident detail page.
        detail_url = 'https://www.osha.gov/pls/imis/' + cells[2].find('a')['href']
        detail_browser = BrowserShadow()
        detail_response = detail_browser.open_url(detail_url)
        detail_soup = BeautifulSoup(detail_response.read(), "html.parser")

        abstract_info = {'SummaryNr': SummaryNr,
                         'EventDate': EventDate,
                         'ReportID': ReportID,
                         'Fat': str(Fat),
                         'SIC': SIC,
                         'EventDesc': EventDesc}
        parse_accident_details(detail_soup, abstract_info)
def get_similar_matrix(data_dic):
    """Build a pairwise semantic-relatedness matrix over the keys of
    *data_dic* using the metamind relatedness endpoint.

    Writes every pairwise score to 'ocup_comp_result.txt' (human-readable
    log) and each row of scores to 'ocup_comp_records.csv'.

    Args:
        data_dic: dict whose keys are the texts to compare pairwise.

    Returns:
        dict: key -> list of scores against every key (row of the matrix).
    """
    import json  # local import: top-of-file import block is outside this view

    comp_element = list(data_dic.keys())
    url = "https://www.metamind.io/language/relatedness/test?{test_p_str}"
    brw = BrowserShadow()
    similar_matrix_dic = {}

    # FIX: both files were previously left open (the .txt via a bare open())
    # and leaked if any HTTP request raised; 'with' guarantees closure.
    with open('ocup_comp_result.txt', 'w+') as output_f, \
         open('ocup_comp_records.csv', 'w', newline='') as comp_r_csv:
        spamwriter = csv.writer(comp_r_csv, dialect='excel')
        spamwriter.writerow(comp_element)

        for each_item_r in comp_element:
            r_list = []
            for each_item_c in comp_element:
                test_data = {"text_2": each_item_r, "text_1": each_item_c}
                comp_url = url.format(test_p_str=urlencode(test_data))
                page_content = brw.open_url(comp_url).read()
                # SECURITY FIX: was eval() on the raw HTTP response, which
                # executes arbitrary code from the network. The endpoint
                # returns a JSON object with a "score" field, so parse it
                # safely with json.loads.
                result = json.loads(page_content.decode())
                r_list.append(result["score"])
                r_str = "row, col, comp_r, %s, %s, %s" % (each_item_r, each_item_c, result["score"])
                output_f.write(r_str + "\n")
                print(r_str)
            similar_matrix_dic[each_item_r] = r_list
            spamwriter.writerow(r_list)

    return similar_matrix_dic
def get_record_list(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    """Page through all OSHA 'fall' accident records for a SIC code and
    date range, parsing each results page via parse_page_obtian_event.

    Args:
        SICNo: SIC industry code to filter on.
        s_d, s_m, s_y: start day / month / year of the search window.
        e_d, e_m, e_y: end day / month / year of the search window.

    Returns:
        None. Side effect: every record on every page is handed to the
        parser (which stores it).
    """
    record_num = get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y)
    if record_num <= 0:
        print("No Eligible Record has been retrieved!")
        return

    p_finish = 0            # records already consumed (paging offset)
    p_show = 100            # page size requested from the server
    if record_num < p_show:
        p_show = record_num
    checked_num = 0

    while 1:
        # Shrink the final page to whatever remains.
        if (p_finish + p_show) > record_num:
            p_show = record_num - p_finish
            if p_show == 0:
                break

        search_url = (
            'https://www.osha.gov/pls/imis/accidentsearch.search?'
            + 'sic={0}&'.format(SICNo)
            + 'sicgroup=&naics=&acc_description=fall&acc_abstract=&acc_keyword=&inspnr=&fatal=&officetype=All&office=All&'
            # BUG FIX: the date range was hard-coded as
            # 'startmonth=07&startday=24&startyear=2015&endmonth=07&endday=23&endyear=1984'
            # (end year before start year!), silently ignoring the seven
            # date parameters this function accepts. Use the parameters,
            # consistent with get_record_num.
            + 'startmonth={0}&startday={1}&startyear={2}&'.format(s_m, s_d, s_y)
            + 'endmonth={0}&endday={1}&endyear={2}&keyword_list=&p_start=&'.format(e_m, e_d, e_y)
            + 'p_finish={0}&'.format(p_finish)
            + 'p_sort=&p_desc=DESC&p_direction=Next&'
            + 'p_show={0}'.format(p_show)
        )

        brw = BrowserShadow()
        res = brw.open_url(search_url)
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")

        # Parse this page of records (stored into MySQL by the parser).
        parse_page_obtian_event(page_soup)

        checked_num = checked_num + p_show
        if checked_num == record_num:
            break
        else:
            p_finish = p_finish + p_show
def compare_2_sentences(text_1, text_2):
    """Return the semantic-relatedness score between two sentences from
    the metamind relatedness endpoint.

    Args:
        text_1: first sentence.
        text_2: second sentence.

    Returns:
        The "score" field of the API's JSON response.
    """
    import json  # local import: top-of-file import block is outside this view

    url = "https://www.metamind.io/language/relatedness/test?{text}"
    data_dic = {"text_1": text_1, "text_2": text_2}
    url = url.format(text=urlencode(data_dic))

    brw = BrowserShadow()
    res = brw.open_url(url)
    compar_result = res.read().decode()
    # SECURITY FIX: was eval() on the raw HTTP response, which executes
    # arbitrary code from the network. The endpoint returns a JSON object
    # with a "score" field, so parse it safely with json.loads.
    result = json.loads(compar_result)
    return result["score"]
def update_employee_detail_info_from_OSHA():
    """Backfill injury details for every employee row in case_employees by
    re-scraping each case's OSHA accident-detail page.

    For each SummaryNr in the table, fetches the accident page, follows
    each employee's detail link, and updates TaskAssigned / HumanFactor /
    EnvironmentFactor / EventType / SourceInjury / PartBody in MySQL.

    Returns:
        1 on completion.
    """
    # NOTE(review): credentials are hard-coded; move to config/env vars.
    user = '******'
    pwd = '123456'
    host = '127.0.0.1'
    db = 'reported_fall_event'
    cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db)
    cursor = cnx.cursor()

    query_sql = "select SummaryNr from case_employees order by ID"
    cursor.execute(query_sql)
    employee_list = cursor.fetchall()

    brw = BrowserShadow()
    need_update = 0
    for each_employee in employee_list:
        SummaryNr_value = each_employee[0].strip()
        # Resume hack: skip rows until this SummaryNr is reached, then
        # process everything after it. TODO: parameterize the resume point.
        if SummaryNr_value != "202315776" and need_update == 0:
            continue
        else:
            need_update = 1
        print("SummaryNr :", SummaryNr_value)

        employee_detail_info_url = "https://www.osha.gov/pls/imis/accidentsearch.accident_detail?id={SummaryNr}"
        employee_detail_info_url = employee_detail_info_url.format(SummaryNr=SummaryNr_value)
        res = brw.open_url(employee_detail_info_url)
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")
        event_content_details = page_soup.find_all('tr')

        # Locate the 'Keywords' row; employee rows follow it.
        keyword_position = -1
        for index in range(0, len(event_content_details)):
            if event_content_details[index].get_text().find('Keywords') > -1:
                keyword_position = index
        # Do not process if there is no keyword row.
        if keyword_position == -1:
            continue

        # An 'End Use' header row shifts the employee rows down by two.
        proj_type_has = 1
        if event_content_details[keyword_position + 1].find_all('th')[0].get_text().find('End Use') < 0:
            proj_type_has = 0
        if proj_type_has == 0:
            index_employee_start = keyword_position + 2
        else:
            index_employee_start = keyword_position + 4

        for index in range(index_employee_start, len(event_content_details)):
            current_employee_eid = event_content_details[index].find_all('td')[0].get_text().strip()
            current_employee_href = event_content_details[index].find_all('td')[0].find('a')['href']
            current_employee_url_str = 'https://www.osha.gov/pls/imis/' + current_employee_href
            print(current_employee_url_str)

            res_employee_info = brw.open_url(current_employee_url_str)
            page_content_employee = res_employee_info.read()
            page_soup_employee = BeautifulSoup(page_content_employee, "html.parser")

            employee_content_details = page_soup_employee.find_all('tr')
            # Anchor on the 'Inspection' row; detail fields sit at fixed
            # offsets below it (layout observed from the OSHA page).
            inspection_Nr_position = -1
            for idx in range(0, len(employee_content_details)):
                if employee_content_details[idx].get_text().find('Inspection') > -1:
                    inspection_Nr_position = idx
                    break
            if inspection_Nr_position == -1:
                continue

            TaskAssigned = employee_content_details[inspection_Nr_position + 13].find_all('td')[1].get_text()
            HumanFactor = employee_content_details[inspection_Nr_position + 10].find_all('td')[1].get_text()
            EnvironmentFactor = employee_content_details[inspection_Nr_position + 9].find_all('td')[1].get_text()
            EventType = employee_content_details[inspection_Nr_position + 8].find_all('td')[1].get_text()
            SourceInjury = employee_content_details[inspection_Nr_position + 7].find_all('td')[1].get_text()
            PartBody = employee_content_details[inspection_Nr_position + 6].find_all('td')[1].get_text()

            # BUG FIX: adjacent string literals concatenated with no space
            # before 'where', producing '... PartBody = %swhere EID = %s'
            # — invalid SQL that fails on every execute.
            update_employee_injury_info_sql = ("update case_employees set "
                "TaskAssigned = %s, HumanFactor = %s, EnvironmentFactor = %s, EventType = %s, SourceInjury = %s, PartBody = %s"
                " where EID = %s and SummaryNr = %s")
            update_values = (TaskAssigned, HumanFactor, EnvironmentFactor, EventType, SourceInjury, PartBody, current_employee_eid, SummaryNr_value)
            cursor.execute(update_employee_injury_info_sql, update_values)
            cnx.commit()
            print(TaskAssigned, '--', HumanFactor, '--', EnvironmentFactor, '--', EventType, '--', SourceInjury, '--', PartBody)

    print("Update has done!")
    return 1