def parse_state(browser, state, missing_persons=None):
    """Scrape NamUs (findthemissing.org) missing-person cases for one state.

    Runs the site's search for ``state``, pages through the AJAX search
    results 100 rows at a time, opens every case page and builds one record
    per case from the "Case Information", "Circumstances",
    "Physical / Medical" and "Investigating Agency" tabs.

    A case whose page carries an "NCMEC number" that is already present in
    ``missing_persons`` (under the key ``"NCMEC_<number>"``) is not added
    again; the existing record only receives the case's ``namus_number``.

    Args:
        browser: Driver object providing ``get``, the ``find_element_by_*``
            lookups and ``wait_until_visible`` (presumably a project wrapper
            around a Selenium WebDriver -- confirm against the caller).
        state: Value handed to ``select_state`` to pick the state in the
            search form.
        missing_persons: Optional dict of records collected so far. It is
            updated in place; a new dict is created when ``None`` is given.

    Returns:
        The ``missing_persons`` dict, with one ``"NAMUS_<namus_number>"``
        entry added per scraped case.

    Raises:
        NoSuchElementException: Presumably propagated from the element
            lookups when a page lacks an expected element (only the
            narrative-text lookup is guarded) -- confirm against the driver.
        ValueError: If the age, height or weight text is not numeric.
    """
    # Only create a dict when none was supplied, so that an empty dict passed
    # in by the caller is filled in place instead of being silently replaced.
    if missing_persons is None:
        missing_persons = {}

    case_info = "//div[@id='case_information']/div/table/tbody/"
    demographics = "//div[@id='case_information']/div[2]/table/tbody/"
    circumstances = "//div[@id='circumstances']/div/table/tbody/"
    physical = "//div[@id='physical_characteristics']/div/table/tbody/"
    police = "//div[@id='police_information']/div/table/tbody/"
    police_agency = "//div[@id='police_information']/div[2]/table/tbody/"

    def text_at(xpath):
        # Text of the first element matching ``xpath`` on the current page.
        return browser.find_element_by_xpath(xpath).text

    def load_results(page):
        # Fetch one page of the JSON search results (100 rows per page).
        browser.get(
            "https://www.findthemissing.org/en/ajax/search_results?page=" +
            str(page) + "&rows=100&sidx=DateLKA&sord=desc&_search=false")
        return json.loads(browser.find_element_by_css_selector("body").text)

    def upper_bound(measurement):
        # Values may be given as a range ("<low> to <high>"); keep the part
        # after the first "to", exactly as the page states it.
        if "to" in measurement:
            measurement = measurement.split("to")[1].strip()
        return float(measurement)

    def open_tab(link_text):
        # Switch to another tab of the case page and give it time to render.
        browser.find_element_by_link_text(link_text).click()
        time.sleep(3)

    # Search by state.
    browser.get("https://www.findthemissing.org/en")
    select_state(browser, state)
    browser.find_element_by_name("commit").click()

    # Wait for the new entries to show up before querying the AJAX endpoint.
    browser.wait_until_visible("list", timeout=30)

    page_count = int(load_results(1)["total"])
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print("found {} pages".format(page_count))

    for page in range(1, page_count + 1):
        rows = load_results(page)["rows"]
        print("page {} of {}".format(page, page_count))

        for position, person in enumerate(rows, 1):
            print("person {} of {}".format(position, len(rows)))
            new_person = common.create_new_record()

            # Organization.
            new_person["namus_number"] = person["cell"][0]
            new_person["org_name"] = "National Missing and Unidentified Persons System"
            new_person["org"] = "NAMUS"
            new_person["org_contact"] = "1-855-626-7600"

            # Personal characteristics available in the result row itself.
            new_person["sex"] = common.capitalize(person["cell"][4])
            new_person["race"] = person["cell"][5]
            new_person["age"] = float(person["cell"][6])

            id_parts = person["id"].split("_")
            browser.get("https://www.findthemissing.org/en/cases/" +
                        id_parts[0] + "/" + id_parts[1])
            time.sleep(10)

            # When the case has an NCMEC number it occupies row 6 and pushes
            # the date row down by one.
            has_ncmec_label = text_at(case_info + "tr[6]/td/label") == "NCMEC number"
            if has_ncmec_label:
                ncmec_key = "NCMEC_" + text_at(case_info + "tr[6]/td[2]").strip()
                # Direct dict membership: no key list is built per case.
                if ncmec_key in missing_persons:
                    print("found " + ncmec_key + " so merging...")
                    missing_persons[ncmec_key]["namus_number"] = new_person["namus_number"]
                    continue

            # Case info.
            photo = browser.find_element_by_css_selector(
                "dt.photo > img").get_attribute("src")
            # get_attribute yields None when the attribute is absent; treat
            # that like the site's "no_photo" placeholder.
            if photo and "no_photo" not in photo:
                new_person["photo"] = photo

            new_person["first_name"] = common.capitalize(
                text_at(case_info + "tr[2]/td[2]"))
            new_person["middle_name"] = common.capitalize(
                text_at(case_info + "tr[3]/td[2]").replace("\"", ""))
            new_person["last_name"] = common.capitalize(
                text_at(case_info + "tr[4]/td[2]"))

            date_row = "tr[7]/td[2]" if has_ncmec_label else "tr[6]/td[2]"
            new_person["date"] = common.clean_date(text_at(case_info + date_row))

            # Determine white or non-white hispanic.
            if new_person["race"] in ("White", "Other"):
                ethnicity = text_at(demographics + "tr[4]/td[2]")
                if ethnicity == "Hispanic/Latino":
                    if new_person["race"] == "White":
                        new_person["race"] = "White Hispanic/Latino"
                    else:
                        new_person["race"] = "Non-White Hispanic/Latino"
            new_person["race"] = common.clean_race(new_person["race"])

            new_person["height"] = upper_bound(text_at(demographics + "tr[6]/td[2]"))
            new_person["weight"] = upper_bound(text_at(demographics + "tr[7]/td[2]"))

            # Circumstance.
            open_tab("Circumstances")
            new_person["city"] = common.capitalize(
                browser.find_element_by_css_selector(
                    "div.column1-unit > table > tbody > tr > td.view_field").text)
            new_person["state"] = common.capitalize(
                text_at(circumstances + "tr[2]/td[2]"))
            new_person["county"] = common.capitalize(
                text_at(circumstances + "tr[4]/td[2]"))
            new_person["country"] = "US"
            try:
                new_person["circumstance"] = browser.find_element_by_id(
                    "case_Circumstances").text
            except NoSuchElementException:
                # Not every case has a narrative; store an empty one.
                new_person["circumstance"] = ""

            # Physical.
            open_tab("Physical / Medical")
            new_person["hair_color"] = common.clean_hair_color(
                text_at(physical + "tr/td[3]"))
            left_eye_color = text_at(physical + "tr[5]/td[3]")
            right_eye_color = text_at(physical + "tr[6]/td[3]")
            if left_eye_color == right_eye_color:
                new_person["eye_color"] = common.clean_eye_color(left_eye_color)
            else:
                new_person["eye_color"] = "Multicolor"

            # Investigating agency; a distinct local name keeps the ``state``
            # parameter from being shadowed.
            open_tab("Investigating Agency")
            agency_state = text_at(police_agency + "tr[6]/td[2]")
            state_suffix = " (" + agency_state + ")" if agency_state else ""
            new_person["agency_name"] = (
                text_at(police_agency + "tr[2]/td[2]") + state_suffix)
            new_person["agency_contact"] = text_at(police + "tr[4]/td[2]")

            missing_persons["NAMUS_" + new_person["namus_number"]] = new_person

    return missing_persons
new_person["org_name"] = person["orgName"] new_person["org"] = person["orgPrefix"] new_person["org_contact"] = detailed_person["orgContactInfo"] # skip unidentified remains cases if new_person["org_name"] == "NCMEC-Unidentified": continue if detailed_person["altContact"]: (agency_name, agency_phone) = common.extract_agency_info(detailed_person["altContact"]) new_person["agency_name"] = agency_name.replace(" ", " ") new_person["agency_contact"] = agency_phone # circumstance if "missingDate" in person.keys(): new_person["date"] = common.clean_date(person["missingDate"]) new_person["circumstance"] = detailed_person["circumstance"] new_person["city"] = common.capitalize(person["missingCity"]) new_person["county"] = common.capitalize(person["missingCounty"]) # skip US terrorities try: new_person["state"] = common.convert_state_abbrev(person["missingState"]) except KeyError: continue new_person["country"] = person["missingCountry"] # personal characteristics new_person["first_name"] = common.capitalize(person["firstName"]) middle_name = common.capitalize(person["middleName"]) if len(middle_name) == 1:
new_person["org_name"] = person["orgName"] new_person["org"] = person["orgPrefix"] new_person["org_contact"] = detailed_person["orgContactInfo"] #skip unidentified remains cases if new_person["org_name"] == "NCMEC-Unidentified": continue if detailed_person["altContact"]: (agency_name, agency_phone) = common.extract_agency_info(detailed_person["altContact"]) new_person["agency_name"] = agency_name.replace(" ", " ") new_person["agency_contact"] = agency_phone #circumstance if "missingDate" in person.keys(): new_person["date"] = common.clean_date(person["missingDate"]) new_person["circumstance"] = detailed_person["circumstance"] new_person["city"] = common.capitalize(person["missingCity"]) new_person["county"] = common.capitalize(person["missingCounty"]) #skip US terrorities try: new_person["state"] = common.convert_state_abbrev(person["missingState"]) except KeyError: continue new_person["country"] = person["missingCountry"] #personal characteristics new_person["first_name"] = common.capitalize(person["firstName"]) middle_name = common.capitalize(person["middleName"]) if len(middle_name) == 1:
def parse_state(browser, state, missing_persons=None):
    """Scrape NamUs (findthemissing.org) missing-person cases for one state.

    Runs the site's search for ``state``, pages through the AJAX search
    results 100 rows at a time, opens every case page and builds one record
    per case from the "Case Information", "Circumstances",
    "Physical / Medical" and "Investigating Agency" tabs.

    A case whose page carries an "NCMEC number" that is already present in
    ``missing_persons`` (under the key ``"NCMEC_<number>"``) is not added
    again; the existing record only receives the case's ``namus_number``.

    Args:
        browser: Driver object providing ``get``, the ``find_element_by_*``
            lookups and ``wait_until_visible`` (presumably a project wrapper
            around a Selenium WebDriver -- confirm against the caller).
        state: Value handed to ``select_state`` to pick the state in the
            search form.
        missing_persons: Optional dict of records collected so far. It is
            updated in place; a new dict is created when ``None`` is given.

    Returns:
        The ``missing_persons`` dict, with one ``"NAMUS_<namus_number>"``
        entry added per scraped case.

    Raises:
        NoSuchElementException: Presumably propagated from the element
            lookups when a page lacks an expected element (only the
            narrative-text lookup is guarded) -- confirm against the driver.
        ValueError: If the age, height or weight text is not numeric.
    """
    # Only create a dict when none was supplied, so that an empty dict passed
    # in by the caller is filled in place instead of being silently replaced.
    if missing_persons is None:
        missing_persons = {}

    case_info = "//div[@id='case_information']/div/table/tbody/"
    demographics = "//div[@id='case_information']/div[2]/table/tbody/"
    circumstances = "//div[@id='circumstances']/div/table/tbody/"
    physical = "//div[@id='physical_characteristics']/div/table/tbody/"
    police = "//div[@id='police_information']/div/table/tbody/"
    police_agency = "//div[@id='police_information']/div[2]/table/tbody/"

    def text_at(xpath):
        # Text of the first element matching ``xpath`` on the current page.
        return browser.find_element_by_xpath(xpath).text

    def load_results(page):
        # Fetch one page of the JSON search results (100 rows per page).
        browser.get(
            "https://www.findthemissing.org/en/ajax/search_results?page=" +
            str(page) + "&rows=100&sidx=DateLKA&sord=desc&_search=false")
        return json.loads(browser.find_element_by_css_selector("body").text)

    def upper_bound(measurement):
        # Values may be given as a range ("<low> to <high>"); keep the part
        # after the first "to", exactly as the page states it.
        if "to" in measurement:
            measurement = measurement.split("to")[1].strip()
        return float(measurement)

    def open_tab(link_text):
        # Switch to another tab of the case page and give it time to render.
        browser.find_element_by_link_text(link_text).click()
        time.sleep(3)

    # Search by state.
    browser.get("https://www.findthemissing.org/en")
    select_state(browser, state)
    browser.find_element_by_name("commit").click()

    # Wait for the new entries to show up before querying the AJAX endpoint.
    browser.wait_until_visible("list", timeout=30)

    page_count = int(load_results(1)["total"])
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print("found {} pages".format(page_count))

    for page in range(1, page_count + 1):
        rows = load_results(page)["rows"]
        print("page {} of {}".format(page, page_count))

        for position, person in enumerate(rows, 1):
            print("person {} of {}".format(position, len(rows)))
            new_person = common.create_new_record()

            # Organization.
            new_person["namus_number"] = person["cell"][0]
            new_person["org_name"] = "National Missing and Unidentified Persons System"
            new_person["org"] = "NAMUS"
            new_person["org_contact"] = "1-855-626-7600"

            # Personal characteristics available in the result row itself.
            new_person["sex"] = common.capitalize(person["cell"][4])
            new_person["race"] = person["cell"][5]
            new_person["age"] = float(person["cell"][6])

            id_parts = person["id"].split("_")
            browser.get("https://www.findthemissing.org/en/cases/" +
                        id_parts[0] + "/" + id_parts[1])
            time.sleep(10)

            # When the case has an NCMEC number it occupies row 6 and pushes
            # the date row down by one.
            has_ncmec_label = text_at(case_info + "tr[6]/td/label") == "NCMEC number"
            if has_ncmec_label:
                ncmec_key = "NCMEC_" + text_at(case_info + "tr[6]/td[2]").strip()
                # Direct dict membership: no key list is built per case.
                if ncmec_key in missing_persons:
                    print("found " + ncmec_key + " so merging...")
                    missing_persons[ncmec_key]["namus_number"] = new_person["namus_number"]
                    continue

            # Case info.
            photo = browser.find_element_by_css_selector(
                "dt.photo > img").get_attribute("src")
            # get_attribute yields None when the attribute is absent; treat
            # that like the site's "no_photo" placeholder.
            if photo and "no_photo" not in photo:
                new_person["photo"] = photo

            new_person["first_name"] = common.capitalize(
                text_at(case_info + "tr[2]/td[2]"))
            new_person["middle_name"] = common.capitalize(
                text_at(case_info + "tr[3]/td[2]").replace("\"", ""))
            new_person["last_name"] = common.capitalize(
                text_at(case_info + "tr[4]/td[2]"))

            date_row = "tr[7]/td[2]" if has_ncmec_label else "tr[6]/td[2]"
            new_person["date"] = common.clean_date(text_at(case_info + date_row))

            # Determine white or non-white hispanic.
            if new_person["race"] in ("White", "Other"):
                ethnicity = text_at(demographics + "tr[4]/td[2]")
                if ethnicity == "Hispanic/Latino":
                    if new_person["race"] == "White":
                        new_person["race"] = "White Hispanic/Latino"
                    else:
                        new_person["race"] = "Non-White Hispanic/Latino"
            new_person["race"] = common.clean_race(new_person["race"])

            new_person["height"] = upper_bound(text_at(demographics + "tr[6]/td[2]"))
            new_person["weight"] = upper_bound(text_at(demographics + "tr[7]/td[2]"))

            # Circumstance.
            open_tab("Circumstances")
            new_person["city"] = common.capitalize(
                browser.find_element_by_css_selector(
                    "div.column1-unit > table > tbody > tr > td.view_field").text)
            new_person["state"] = common.capitalize(
                text_at(circumstances + "tr[2]/td[2]"))
            new_person["county"] = common.capitalize(
                text_at(circumstances + "tr[4]/td[2]"))
            new_person["country"] = "US"
            try:
                new_person["circumstance"] = browser.find_element_by_id(
                    "case_Circumstances").text
            except NoSuchElementException:
                # Not every case has a narrative; store an empty one.
                new_person["circumstance"] = ""

            # Physical.
            open_tab("Physical / Medical")
            new_person["hair_color"] = common.clean_hair_color(
                text_at(physical + "tr/td[3]"))
            left_eye_color = text_at(physical + "tr[5]/td[3]")
            right_eye_color = text_at(physical + "tr[6]/td[3]")
            if left_eye_color == right_eye_color:
                new_person["eye_color"] = common.clean_eye_color(left_eye_color)
            else:
                new_person["eye_color"] = "Multicolor"

            # Investigating agency; a distinct local name keeps the ``state``
            # parameter from being shadowed.
            open_tab("Investigating Agency")
            agency_state = text_at(police_agency + "tr[6]/td[2]")
            state_suffix = " (" + agency_state + ")" if agency_state else ""
            new_person["agency_name"] = (
                text_at(police_agency + "tr[2]/td[2]") + state_suffix)
            new_person["agency_contact"] = text_at(police + "tr[4]/td[2]")

            missing_persons["NAMUS_" + new_person["namus_number"]] = new_person

    return missing_persons