def fase2(url):  # phase 2.1
    page_user = requests.get(url)
    soup_user = BeautifulSoup(page_user.content, 'html.parser')
    # Seller name from the store header
    name_vendedor = soup_user.find_all(
        "h3", {"class": "store-info__name"})[0].contents[0]
    # Buyer feedback counts
    Califications = soup_user.find_all(
        "span", {"class": "buyers-feedback-qualification"})
    calification_points = []
    for C in Califications:
        calification_points.append(C.contents[4])
    # Reputation figures: recommendations, completed sales, years selling
    reputation = soup_user.find_all("div", {"class": "data-level__wrapper"})[0]
    reputation = reputation.find_all("span", {"class": "data-level__number"})
    recomendado = reputation[0].contents[0]
    ventas_completadas = reputation[1].contents[0]
    años_vendiendo = reputation[-1].contents[0]
    # Time unit taken from the last description paragraph
    time = soup_user.find_all("p", {"class": "data-level__description"})[-1]
    time = time.find_all("span")[0].contents[-1].split(' ')[1]
    if time == 'años':
        time = 'anios'
    return [
        time, calification_points, recomendado, ventas_completadas,
        años_vendiendo, name_vendedor
    ]
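# Usage sketch for fase2 -- not part of the original code. The URL below is a
# hypothetical placeholder for a seller-profile page; requests and
# BeautifulSoup are assumed to be imported at module level as fase2 requires.
if __name__ == "__main__":
    datos_vendedor = fase2("https://example.com/seller-profile")  # hypothetical URL
    print(datos_vendedor)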
def get_projects(page):
    ret_projects = []
    curr = page.find("div",
                     {"class": "editable-item section-item current-position"})
    t_curr = {}
    try:
        t_curr["title"] = str(curr.find("a", {"name": "title"}).text)
    except:
        t_curr["title"] = "?"
    aux_str = ""
    time = curr.find("span", {"class": "experience-date-locale"})
    try:
        t_curr["duration"] = str(time.find("time").text + "Present")
    except:
        t_curr["duration"] = "?"
    try:
        t_curr["place"] = str(curr.find("span", {"class": "locality"}).text)
    except:
        t_curr["place"] = "?"
    ret_projects.append(t_curr)
    for link in page.find_all(
            "div", {"class": "editable-item section-item past-position"}):
        temp = {}
        try:
            temp["title"] = str(link.find("a", {"name": "title"}).text)
        except:
            temp["title"] = "?"
        try:
            time = link.find("span", {"class": "experience-date-locale"})
            aux_str = ""
            for tim in time.find_all("time"):
                aux_str += tim.text
            # print aux_str
            temp["duration"] = aux_str
        except:
            temp["duration"] = "?"
        try:
            temp["place"] = str(link.find("span", {"class": "locality"}).text)
        except:
            temp["place"] = "?"
        ret_projects.append(temp)
    return ret_projects
def crawl(keyword, sdates, edates):
    titletest = list()
    hosttest = list()
    urltest = list()
    timetest = list()
    to_append = list()
    dic = ["id", "title", "category", "host", "url", "time", "segment"]
    # News sources to skip
    bad_host = ['中工网', '中国奥林匹克委员会', '星洲网', 'China Press', '手机网易网', '新浪网',
                '东方财富网', '千龙网', '搜狐', '中国新闻网', '汉丰网', '京报网', '人民网',
                '中国侨网', '杭州网', '中华人民共和国外交部', '华体网', 'NTDTV', '新京报',
                '联合国新闻', '自由亚洲电台', '法国国际广播电台', '多维新闻网', 'BBC 中文网',
                '青年日報', '联合早报', '新浪网']
    df1 = pd.DataFrame(columns=dic)
    TestIdCount = 0
    IgnoreDateFlag = 0

    # Parse the start date (expected form: YYYY/MM/DD)
    StartNums = sdates
    if StartNums == '':
        IgnoreDateFlag = 1
        print('no limit')
    else:
        nums_tmp = StartNums.split('/')
        snums = []
        for n in nums_tmp:
            if n.isdigit():
                snums += [int(n)]
            if len(snums) == 3:
                break
        if len(snums) != 3:
            IgnoreDateFlag = 2
            print('wrong date form')
        a = 1

    # Parse the end date (expected form: YYYY/MM/DD)
    EndNums = edates
    if EndNums == '':
        IgnoreDateFlag = 1
        print('no limit')
    else:
        nums_tmp = EndNums.split('/')
        enums = []
        for n in nums_tmp:
            if n.isdigit():
                enums += [int(n)]
            if len(enums) == 3:
                break
        if len(enums) != 3:
            IgnoreDateFlag = 2
            print('wrong date form')

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    }
    if IgnoreDateFlag == 1:
        payload = {'q': str(keyword), 'tbm': 'nws', 'lr': 'lang_zh-TW',
                   'num': '100', 'tbs': 'qdr:y'}
    elif IgnoreDateFlag == 0:
        payload = {'q': str(keyword), 'tbm': 'nws', 'lr': 'lang_zh-TW',
                   'num': '100',
                   'tbs': 'cdr:1,cd_min:'
                   + str(snums[1]) + '/' + str(snums[2]) + '/' + str(snums[0])
                   + ',cd_max:'
                   + str(enums[1]) + '/' + str(enums[2]) + '/' + str(enums[0])}

    s = requests.Session()
    resp = s.get("https://www.google.com.tw/search", params=payload, headers=headers)
    print(resp.url)
    content = resp.text
    # print(content)
    soup = BeautifulSoup(content, "html.parser")
    title_list = soup.find_all('div', {'class': 'gG0TJc'})
    host_list = soup.find_all('div', {'class': 'gG0TJc'})
    url_list = soup.find_all('div', {'class': 'gG0TJc'})
    time_list = soup.find_all('div', {'class': 'gG0TJc'})

    for ti in title_list:  # extract the news headline
        a = ti.find_all('a')[0].text
        titletest.append(a)
        # print(a)
    for ho in host_list:  # extract the news outlet
        a = ho.find_all('span', {'class': 'xQ82C e8fRJf'})[0].text
        a = a.split(' ')[0]
        hosttest.append(a)
        # print(a)
    for url in url_list:  # extract the news URL
        a = url.find_all('a')[0]['href']
        urltest.append(a)
        # print(a)
    for time in time_list:  # extract the news publication time
        a = time.find_all('span', {'class': 'f nsa fwzPFf'})[0].text
        timetest.append(a)
        # print(a)

    count = TestIdCount
    for i in range(len(titletest)):
        dateflag = 0
        if hosttest[i] not in bad_host:
            count = count + 1
            if IgnoreDateFlag != 2:
                to_append = [int(count), titletest[i], "neutral", hosttest[i],
                             urltest[i], timetest[i], ""]
                a_series = pd.Series(to_append, index=df1.columns)
                df1 = df1.append(a_series, ignore_index=True)
                '''
                print('======[', i, ']=========')
                print(titletest[i])
                print(urltest[i])
                print(hosttest[i])
                print(timetest[i])
                print(" ")
                '''
                to_append.clear()
    TestIdCount = TestIdCount + len(titletest)
    count = 0
    # print(to_append)
    titletest.clear()
    hosttest.clear()
    urltest.clear()
    timetest.clear()
    df1.to_csv('test.csv', index=False, encoding='UTF-8_sig')
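# Usage sketch for crawl -- not part of the original code. The keyword and the
# YYYY/MM/DD date strings below are hypothetical placeholders; crawl() writes
# its results to test.csv as in the function above.
if __name__ == "__main__":
    crawl("奧運", "2021/01/01", "2021/12/31")  # hypothetical keyword and dates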
def scrapper(link):
    # Initialize Options to start Chrome as headless in selenium
    # Initialize the chrome webdriver as 'browser'
    # (sketch of the setup described by the two comments above; the headless
    #  flag is an assumption, adjust to your environment)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)

    # Get the login page for linkedin
    browser.get('https://www.linkedin.com/uas/login')
    # Open the file with the username and password for LinkedIn login
    file = open('config.txt')
    lines = file.readlines()
    username = lines[0].strip()  # strip trailing newlines from the credentials
    password = lines[1].strip()
    # Username and Password for login
    elementID = browser.find_element_by_id('username')
    elementID.send_keys(username)
    elementID = browser.find_element_by_id('password')
    elementID.send_keys(password)
    elementID.submit()
    global time
    time.sleep(5)

    # Profile Link to be scraped
    # link = "https://www.linkedin.com/in/rishab-saini/"
    browser.get(link)
    # pause before scrolling
    SCROLL_PAUSE_TIME = 6
    # Get the scroll height of the page
    last_height = browser.execute_script("return document.body.scrollHeight")
    # Because the page loads dynamically, scroll through it in stages so the
    # entire webpage gets loaded
    for i in range(3):
        # Scroll down to bottom
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight/3);")
        time.sleep(SCROLL_PAUSE_TIME / 2)
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight*(2/3));")
        time.sleep(SCROLL_PAUSE_TIME / 2)
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = browser.execute_script(
            "return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # try to expand sections (if available), else pass
    try:
        # click to expand education section
        education_expand_button = browser.find_element_by_xpath(
            "//section[@id='education-section']//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();", education_expand_button)
    except Exception as e:
        # print("education_expand_button Exception:", e)
        pass
    try:
        # click to expand projects section
        projects_expand_button = browser.find_element_by_xpath(
            "//div[@class='pv-accomplishments-block__content break-words']//button[@aria-label='Expand projects section' and @aria-expanded='false']"
        )
        browser.execute_script("arguments[0].click();", projects_expand_button)
    except Exception as e:
        # print("projects_expand_button Exception:", e)
        pass
    try:
        # click to expand certifications section
        certifications_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();",
                               certifications_expand_button)
    except Exception as e:
        # print("certifications_expand_button Exception:", e)
        pass
    try:
        # click to expand experience section
        experiences_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();", experiences_expand_button)
        time.sleep(2)
        # inline-show-more-text__button link
        experiences_show_more_expand_button = browser.find_element_by_xpath(
            "//button[@class='inline-show-more-text__button link']")
        # print(experiences_show_more_expand_button)
        browser.execute_script("arguments[0].click();",
                               experiences_show_more_expand_button)
    except Exception as e:
        # print("experiences_expand_button Exception:", e)
        pass
    try:
        # click to expand skills section
        skills_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar artdeco-button artdeco-button--tertiary artdeco-button--3 artdeco-button--fluid']"
        )
        browser.execute_script("arguments[0].click();", skills_expand_button)
    except Exception as e:
        # print("skills_expand_button Exception:", e)
        pass
    try:
        # click to expand volunteering section
        volunteer_expand_button = browser.find_element_by_xpath(
            "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
        )
        browser.execute_script("arguments[0].click();", volunteer_expand_button)
    except Exception as e:
        # print("volunteer_expand_button Exception:", e)
        pass

    # use beautiful soup for html parsing
    src = browser.page_source
    soup = BeautifulSoup(src, 'lxml')

    # BASIC INFO LIST
    basic_info_list = []
    name_div = soup.find('div', {'class': 'flex-1 mr5'})
    name_loc = name_div.find_all('ul')
    fullname = name_loc[0].find('li').get_text().strip()
    try:
        first_name, last_name = fullname.split()
        # above statement fails when a person has put their name as
        # firstname, middlename, lastname
    except:
        first_name, middle_name, last_name = fullname.split()
    basic_info_list.append(first_name)
    basic_info_list.append(last_name)
    headline = name_div.find('h2').get_text().strip()
    basic_info_list.append(headline)
    basic_info_list.append(link)
    # appending empty strings for email_id, phone_number, age and github_link
    basic_info_list.append('')
    basic_info_list.append('')
    basic_info_list.append('')
    basic_info_list.append('')
    # print(basic_info_list)

    # education section
    education_info_list = []
    try:
        edu_section = soup.find('section', {'id': 'education-section'}).find('ul')
        edu_section = edu_section.find_all(
            'div', {'class': 'pv-entity__summary-info pv-entity__summary-info--background-section'})
        college_names = []
        degree_names = []
        field_names = []
        grades = []
        dates = []
        for x in range(len(edu_section)):
            curr_section = edu_section[x]
            try:
                college_name = curr_section.find(
                    'h3', {'class': 'pv-entity__school-name t-16 t-black t-bold'})
                college_names.append(college_name.get_text())
            except Exception as e:
                # print("Education college_name Exception", e)
                college_names.append('')
            try:
                degree_name = curr_section.find(
                    'p', {'class': 'pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal'}
                ).find('span', {'class': 'pv-entity__comma-item'})
                degree_names.append(degree_name.get_text())
            except Exception as e:
                # print("Education degree_name Exception", e)
                degree_names.append('')
            try:
                field_name = curr_section.find(
                    'p', {'class': 'pv-entity__secondary-title pv-entity__fos t-14 t-black t-normal'}
                ).find('span', {'class': 'pv-entity__comma-item'})
                field_names.append(field_name.get_text())
            except Exception as e:
                # print("Education field_name Exception", e)
                field_names.append('')
            try:
                grade = curr_section.find(
                    'p', {'class': 'pv-entity__secondary-title pv-entity__grade t-14 t-black t-normal'}
                ).find('span', {'class': 'pv-entity__comma-item'})
                grades.append(grade.get_text())
            except Exception as e:
                # print("Education grade Exception", e)
                grades.append('')
            try:
                time = curr_section.find(
                    'p', {'class': 'pv-entity__dates t-14 t-black--light t-normal'})
                dates.append((time.find_all('time')[1].get_text()))
            except Exception as e:
                # print("Education time Exception", e)
                dates.append('')
        for i in range(len(edu_section)):
            education_info_list.append([
                college_names[i], degree_names[i], field_names[i], dates[i],
                grades[i]
            ])
    except Exception as e:
        # no education added
        # print("Education Section Exception", e)
        pass
    # print(education_info_list)

    # Project Section
    projects_info_list = []
    project_titles = []
    try:
        project_section = soup.find('div', {'id': 'projects-expandable-content'})
        project_section = project_section.find(
            'ul', {'class': 'pv-accomplishments-block__list'})
        projects = project_section.find_all(
            'h4', {'class': 'pv-accomplishment-entity__title t-14 t-bold'})
        for i in range(len(projects)):
            project_name = projects[i].get_text().split('\n')[2]
            project_name = re.sub(' +', ' ', project_name)
            project_titles.append(project_name.strip())
        projects = project_section.find_all(
            'p', {'class': 'pv-accomplishment-entity__date t-14'})
        project_time = []
        for i in range(len(project_titles)):
            try:
                project_date = projects[i].get_text().split('\n')[1]
                project_date = re.sub(' +', ' ', project_date)
                project_time.append(project_date[1:])
            except Exception as e:
                # print("project_date Exception", e)
                project_time.append('')
        project_descriptions = []
        projects2 = project_section.find_all(
            'p', {'class': 'pv-accomplishment-entity__description t-14'})
        for i in range(len(project_titles)):
            try:
                next_empty_elem = projects2[i].findNext('div')
                curr_proj_desc = next_empty_elem.next_sibling
                project_descriptions.append(curr_proj_desc.strip())
            except Exception as e:
                # print("curr_proj_desc Exception", e)
                project_descriptions.append('')
        # Construct projects_info_list from above data
        for i in range(len(project_titles)):
            projects_info_list.append(
                [project_titles[i], project_time[i], project_descriptions[i]])
    except Exception as e:
        # no projects added
        # print("Project Section Exception", e)
        pass
    # print(projects_info_list)

    # certifications section
    certifications_info_list = []
    try:
        certificates_section = soup.find('section', {'id': 'certifications-section'})
        list_items = certificates_section.find(
            'ul', {'class': 'pv-profile-section__section-info section-info pv-profile-section__section-info--has-more'})
    except Exception as e:
        # print("certificates_section Exception", e)
        pass
    try:
        if list_items is None:
            list_items = certificates_section.find(
                'ul', {'class': 'pv-profile-section__section-info section-info pv-profile-section__section-info--has-no-more'})
        items = list_items.find_all(
            'li', {'class': 'pv-profile-section__sortable-item pv-certification-entity ember-view'})
        cert_names_list = []
        cert_issuer_list = []
        cert_dates_list = []
        for i in range(len(items)):
            curr_cert_name = items[i].find('h3', {'class': 't-16 t-bold'})
            curr_cert_name = curr_cert_name.get_text().strip()
            cert_names_list.append(curr_cert_name)
            curr_issuer_name = items[i].find_all('p', {'class': 't-14'})[0]
            curr_issuer_name = curr_issuer_name.get_text().strip()
            curr_issuer_name = curr_issuer_name.replace('Issuing authority\n', '')
            cert_issuer_list.append(curr_issuer_name)
            curr_cert_date = items[i].find_all('p', {'class': 't-14'})[1]
            curr_cert_date = curr_cert_date.get_text().strip()
            curr_cert_date = curr_cert_date.replace(
                'Issued date and, if applicable, expiration date of the certification or license\n',
                '').replace('No Expiration Date', '').replace('Issued ', '')
            cert_dates_list.append(curr_cert_date)
        # adding elements in certifications_info_list as per schema
        for i in range(len(cert_names_list)):
            certifications_info_list.append(
                [cert_names_list[i], cert_dates_list[i], cert_issuer_list[i]])
    except Exception as e:
        # no certificates added
        # print("Certificates Section Exception", e)
        pass
    # print(certifications_info_list)

    # Experience Section
    experience_info_list = []
    list_items = []
    items = []
    try:
        experience_section = soup.find('section', {'class': 'experience-section'})
        # print(experience_section)
        list_items = experience_section.find(
            'ul', {'class': 'pv-profile-section__section-info section-info pv-profile-section__section-info--has-more'})
    except Exception as e:
        # print("experience_section Exception", e)
        pass
    try:
        if list_items is None:
            list_items = experience_section.find(
                'ul', {'class': 'pv-profile-section__section-info section-info pv-profile-section__section-info--has-no-more'})
        items = list_items.find_all(
            'li', {'class': 'pv-entity__position-group-pager pv-profile-section__list-item ember-view'})
        company_names_list = []
        position_list = []
        dates_employed_list = []
        description_list = []
        for i in range(len(items)):
            try:
                curr_name = items[i].find(
                    'p', {'class': 'pv-entity__secondary-title t-14 t-black t-normal'})
                curr_name = curr_name.get_text().strip()
                curr_name = curr_name.split('\n')[0].strip()
                # print("1st currname", curr_name)
                company_names_list.append(curr_name)
            except Exception as e:
                # print("Experience curr_name Exception:", e)
                pass
            try:
                if curr_name is None:
                    curr_name = items[i].find('h3', {'class': 't-16 t-black t-bold'})
                    curr_name = curr_name.get_text().strip()
                    curr_name = curr_name.replace("Company Name\n", '')
                    company_names_list.append(curr_name)
            except Exception as e:
                # print("Experience curr_name Exception:", e)
                pass
            try:
                curr_position = items[i].find('h3', {'class': 't-16 t-black t-bold'})
                curr_position = curr_position.get_text().strip()
                curr_position = curr_position.replace("Company Name\n", '')
                position_list.append(curr_position)
            except Exception as e:
                # print("Experience curr_position Exception:", e)
                pass
            try:
                curr_dates = items[i].find(
                    'h4', {'class': 'pv-entity__date-range t-14 t-black--light t-normal'})
                curr_dates = curr_dates.get_text().strip()
                curr_dates = curr_dates.replace('Dates Employed\n', '')
                dates_employed_list.append(curr_dates)
            except Exception as e:
                # print("Experience curr_dates Exception:", e)
                pass
            try:
                curr_description = items[i].find(
                    'div', {'class': 'pv-entity__extra-details t-14 t-black--light ember-view'})
                curr_description = curr_description.get_text().strip()
                curr_description = curr_description.replace(
                    '\n\n\n\n\n see less', '')
                curr_description = curr_description.replace(
                    '\n\n \n \n\n\n\n\n\n\n\n\n\n', ' ')
                curr_description = curr_description.replace(
                    '\n\n \n…\n\n see more', '')
                curr_description = curr_description.replace('\n ', '.')
                curr_description = curr_description.replace('\n\n', '.')
                description_list.append(curr_description)
            except Exception as e:
                # print("Experience curr_description Exception:", e)
                # Add empty description for normalization of data
                description_list.append('')
        # create company_names_list from above data
        for i in range(len(company_names_list)):
            experience_info_list.append([
                company_names_list[i], position_list[i],
                dates_employed_list[i], description_list[i]
            ])
    except Exception as e:
        # No Experience Added
        # print("Experience Section Exception:", e)
        pass
    # print(experience_info_list)

    # Skills Section
    skills_info_list = []
    try:
        skills_section = soup.find(
            'section', {'class': 'pv-profile-section pv-skill-categories-section artdeco-container-card ember-view'})
    except Exception as e:
        # print("skills_section Exception", e)
        pass
    try:
        if skills_section is None:
            skills_section = soup.find(
                'section', {'class': 'pv-profile-section pv-skill-categories-section artdeco-container-card artdeco-card ember-view'})
        all_skills = skills_section.find_all(
            'span', {'class': 'pv-skill-category-entity__name-text t-16 t-black t-bold'})
        # print(all_skills)
        for i in range(len(all_skills)):
            skills_info_list.append(all_skills[i].get_text().strip())
        print(skills_info_list)
    except Exception as e:
        # No skills added
        print("Skills Section Exception:", e)
        pass

    # Volunteering Section:
    volunteer_info_list = []
    items = []
    list_items = []
    try:
        volunteer_section = soup.find(
            'section', {'class': 'pv-profile-section volunteering-section ember-view'})
        list_items = volunteer_section.find(
            'ul', {'class': 'pv-profile-section__section-info section-info pv-profile-section__section-info--has-more ember-view'})
    except Exception as e:
        # print("Volunteering volunteer_section Exception:", e)
        pass
    try:
        if list_items is None:
            list_items = volunteer_section.find(
                'ul', {'class': 'pv-profile-section__section-info section-info pv-profile-section__section-info--has-no-more'})
    except Exception as e:
        # print("Volunteering list_items Exception:", e)
        pass
    try:
        items = list_items.find_all(
            'li', {'class': 'pv-profile-section__sortable-item pv-profile-section__section-info-item relative pv-profile-section__sortable-item--v2 pv-profile-section__list-item sortable-item ember-view'})
    except Exception as e:
        # print("Volunteering list_items Exception:", e)
        pass
    try:
        if items == []:
            items = list_items.find_all(
                'li', {'class': 'pv-profile-section__list-item pv-volunteering-entity pv-profile-section__card-item ember-view'})
    except Exception as e:
        # print("Volunteering items Exception:", e)
        pass
    try:
        for i in range(len(items)):
            curr_name = items[i].find('span', {'class': 'pv-entity__secondary-title'})
            curr_name = curr_name.get_text().strip()
            curr_role = items[i].find('h3', {'class': 't-16 t-black t-bold'})
            curr_role = curr_role.get_text().strip()
            try:
                curr_dates = items[i].find(
                    'h4', {'class': 'pv-entity__date-range detail-facet inline-block t-14 t-black--light t-normal'})
                curr_dates = curr_dates.get_text().strip()
                curr_dates = curr_dates.replace('Dates volunteered\n', '')
            except Exception as e:
                # print("curr_dates Exception", e)
                curr_dates = ''
            try:
                curr_description = items[i].find(
                    'p', {'class': 'pv-entity__description t-14 t-normal mt4'})
                curr_description = curr_description.get_text().strip()
            except Exception as e:
                # print("curr_description Exception", e)
                curr_description = ''
            # Construct volunteer_info_list from above data
            volunteer_info_list.append(
                [curr_name, curr_role, curr_dates, curr_description])
    except Exception as e:
        # no volunteering added
        # print("Volunteering Section Exception", e)
        pass

    try:
        # click to expand honors and awards section because only either
        # projects or honors and awards can be expanded at a time
        honors_and_awards_expand_button = browser.find_element_by_xpath(
            "//section[@class='pv-profile-section pv-accomplishments-section artdeco-container-card ember-view']//button[@aria-label='Expand honors & awards section']"
        )
        browser.execute_script("arguments[0].click();",
                               honors_and_awards_expand_button)
        # click to expand honors and awards section to show more
        honors_and_awards_expand_button2 = browser.find_element_by_xpath(
            "//section[@class='pv-profile-section pv-accomplishments-section artdeco-container-card ember-view']//button[@aria-controls='honors-expandable-content' and @aria-expanded='false']"
        )
        browser.execute_script("arguments[0].click();",
                               honors_and_awards_expand_button2)
    except Exception as e:
        # print("honors_and_awards_expand_button Exception", e)
        pass

    # accomplishments section
    accomplishments_info_list = []
    try:
        accomplishments_section = soup.find_all(
            'section', {'class': 'pv-profile-section pv-accomplishments-section artdeco-container-card ember-view'})
        honors_section = accomplishments_section[0].find(
            'div', {'aria-labelledby': 'honors-title'})
        list_items = honors_section.find_all(
            'li', {'class': 'pv-accomplishments-block__summary-list-item'})
        for i in range(len(list_items)):
            # appending empty string for year field
            accomplishments_info_list.append(
                [list_items[i].get_text().strip(), ""])
    except Exception as e:
        # No accomplishments added
        # print("Accomplishments Section Exception", e)
        pass

    # empty hobbies_info_list because it is not available on linkedin
    hobbies_info_list = []

    # Close the browser once scraping is done
    browser.close()

    # TESTING OUTPUTS
    # print("LISTS")
    # print(basic_info_list)
    # print(education_info_list)
    # print(projects_info_list)
    # print(certifications_info_list)
    # print(experience_info_list)
    # print(skills_info_list)
    # print(volunteer_info_list)
    # print(accomplishments_info_list)

    final_all_lists = [
        basic_info_list, education_info_list, projects_info_list,
        certifications_info_list, experience_info_list, skills_info_list,
        volunteer_info_list, accomplishments_info_list, hobbies_info_list
    ]
    json_data = {
        'basic_info_list': basic_info_list,
        'education_info_list': education_info_list,
        'projects_info_list': projects_info_list,
        'certifications_info_list': certifications_info_list,
        'experience_info_list': experience_info_list,
        'skills_info_list': skills_info_list,
        'volunteer_info_list': volunteer_info_list,
        'accomplishments_info_list': accomplishments_info_list,
        'hobbies_info_list': hobbies_info_list
    }
    final_json_string = json.dumps(json_data)
    # print(final_json_string)
    fileheader = open("static/test.json", 'w')
    fileheader.writelines(final_json_string)
    fileheader = open("output/test.json", 'w')
    fileheader.writelines(final_json_string)
    return json_data
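# Usage sketch for scrapper -- not part of the original code. The profile URL
# is the example that appears commented out inside scrapper; the call assumes
# config.txt with LinkedIn credentials sits next to this script and that the
# static/ and output/ directories exist for the JSON dumps.
if __name__ == "__main__":
    profile_data = scrapper("https://www.linkedin.com/in/rishab-saini/")
    print(profile_data.keys())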
def GetEducation(soup):
    education_info_list = []
    try:
        edu_section = soup.find('section', {'id': 'education-section'}).find('ul')
        edu_section = edu_section.find_all(
            'li', {'class': "pv-profile-section__sortable-item pv-profile-section__section-info-item relative pv-profile-section__sortable-item--v2 pv-profile-section__list-item sortable-item ember-view"})
        college_names = []
        degree_names = []
        major_names = []
        grades = []
        dates = []
        for x in range(len(edu_section)):
            curr_section = edu_section[x]
            try:
                college_name = curr_section.find(
                    'h3', {'class': 'pv-entity__school-name t-16 t-black t-bold'})
                college_names.append(college_name.get_text())
            except:
                college_names.append('')
            try:
                degree_name = curr_section.find(
                    'p', {'class': 'pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal'}
                ).find('span', {'class': 'pv-entity__comma-item'})
                degree_names.append(degree_name.get_text())
            except:
                degree_names.append('')
            try:
                major_name = curr_section.find(
                    'p', {'class': 'pv-entity__secondary-title pv-entity__fos t-14 t-black t-normal'}
                ).find('span', {'class': 'pv-entity__comma-item'})
                major_names.append(major_name.get_text())
            except:
                major_names.append('')
            try:
                grade = curr_section.find(
                    'p', {'class': 'pv-entity__secondary-title pv-entity__grade t-14 t-black t-normal'}
                ).find('span', {'class': 'pv-entity__comma-item'})
                grades.append(grade.get_text())
            except:
                grades.append('')
            try:
                time = curr_section.find(
                    'p', {'class': 'pv-entity__dates t-14 t-black--light t-normal'})
                dates.append((time.find_all('time')[1].get_text()))
            except:
                dates.append('')
        for i in range(len(edu_section)):
            education_info_list.append([
                college_names[i], degree_names[i], major_names[i], dates[i],
                grades[i]
            ])
    except:
        # If no education added
        pass
    # print(education_info_list)
    return education_info_list
url = 'https://twitter.com/realDonaldTrump'
chromedriver_path = "C:\\Program Files\\Python37\\chromedriver.exe"  # path to the ChromeDriver executable
options = webdriver.ChromeOptions()
# This step is important: exclude the automation switch so major sites are
# less likely to detect that Selenium is being used
options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
driver.get(url)

# Scroll down repeatedly so the timeline keeps loading more tweets
js = "var q=document.documentElement.scrollTop="
for i in range(100):
    driver.execute_script(js + str(10000 * (i + 1)))
    time.sleep(0.5)

page_source = driver.page_source
html = BeautifulSoup(page_source, 'html.parser')
tweets = html.find_all('div', class_="content")
text = ''
for tweet in tweets:
    try:
        time = tweet.find('small', class_="time")
        time = time.find_all('span')[0].text
        tweet = tweet.find('p')
        print(time)
        print(tweet.text.strip())
        text += tweet.text.strip()
        print('-----------------------------------------------')
    except:
        pass

# Build and display a word cloud from the collected tweet text
wordcloud = WordCloud(scale=16, background_color='white').generate(text)
image_produce = wordcloud.to_image()
image_produce.show()
driver.close()
def scrap_movie_details(movie_url):
    id1 = movie_url[27:-1]
    file_name = id1 + ".json"
    # Return the cached copy if this movie was scraped before
    if os.path.isfile(file_name):
        with open(file_name, "r") as file1:
            read = file1.read()
            data = json.loads(read)
            return (data)
    link = requests.get(movie_url)  # call api
    soup = BeautifulSoup(link.text, "html.parser")  # convert to html form using BeautifulSoup module
    name_tag = soup.find("div", class_="title_block")  # find movie name and poster using html
    name = name_tag.find("h1").text[:-8]
    poster_tag = soup.find("div", class_="poster")
    poster = poster_tag.find("img")["src"]
    a = []
    director = []
    language = []
    country = []
    time = soup.find("div", class_="subtext")
    run = time.find("time").text.strip()
    if run[1] == "h" and run[-3:] == "min":
        # find runtime using slicing, converting hours to minutes
        runtime = int(run[0]) * 60 + (int(run[3:-3]))
    else:
        runtime = run[0:-3]
    print(runtime)
    for j in time:
        if j in time.find_all("a"):  # find which type or genre of movie
            genre_list = j.text
            a.append(genre_list)
    genre = a
    # pop last element of the genre list
    genre.pop()
    summary = soup.find("div", class_="plot_summary")
    bio = summary.find("div", class_="summary_text").text.strip()  # find bio/description of the movie
    direct_tag = summary.find("div", class_="credit_summary_item")  # find all directors of the movie using find_all
    direct = direct_tag.find_all("a")
    for i in direct:
        director.append(i.text)
    article = soup.find("div", {'class': "article", 'id': "titleDetails"})
    txt_block = article.find_all("div", class_="txt-block")
    for m in txt_block:
        b = m.find("h4", class_="inline")
        # find the country and all languages in a list
        if (b != None):
            if b.text == "Country:":
                Country = m.find("a").text
            if b.text == "Language:":
                Language = m.find_all("a")
                for i in Language:
                    language.append(i.text)
    abc = scrap_movie_cast(movie_url)
    all_details = {  # all data is converted into a dictionary
        "name": name,
        "runtime": runtime,
        "genre": genre,
        "bio": bio,
        "Country": Country,
        "poster": poster,
        "director": director,
        "language": language,
        "cast": abc
    }
    print(all_details)
    with open(file_name, "w") as file1:
        json.dump(all_details, file1, indent=4)
    return (all_details)
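# Usage sketch for scrap_movie_details -- not part of the original code. The
# IMDb title URL below is a hypothetical example; the function caches its
# result as <id>.json and also calls scrap_movie_cast(), which is defined
# elsewhere in this project.
if __name__ == "__main__":
    details = scrap_movie_details("https://www.imdb.com/title/tt0111161/")  # hypothetical URL
    print(details["name"], details["runtime"])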
def get_urls(text):
    all_titles = []      # titles
    all_abstracts = []   # abstracts
    all_authors = []     # authors
    all_paper_urls = []  # preliminary paper URLs
    all_publish = []     # publication sources
    all_time = []        # publication dates
    soup = BeautifulSoup(text, 'lxml')
    title_datas = soup.select(
        'div.sc_content > h3 > a')  # select() returns a <class 'list'>
    author_datas = soup.find_all(
        'div', 'sc_info')  # find_all() returns a <class 'bs4.element.ResultSet'>
    abstract_datas = soup.find_all('div', 'c_abstract')
    publish_datas = soup.find_all('div', 'sc_info')
    time_datas = soup.find_all('div', 'sc_info')

    for item in title_datas:
        result = {
            'title': item.get_text(),
            'href': item.get('href')
            # The detailed paper URL; on inspection only part of it needs to be extracted, e.g.
            # http://xueshu.baidu.com/usercenter/paper/show?paperid=389ef371e5dae36e3a05b187f7eb2a95&site=xueshu_se
            # /s?wd=paperuri%3A%28389ef371e5dae36e3a05b187f7eb2a95%29&filter=sc_long_sign&sc_ks_para=q%3D%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E7%A0%94%E7%A9%B6%E7%BB%BC%E8%BF%B0&sc_us=11073893925633194305&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8
        }
        all_titles.append(item.get_text())
        wd = str(parse.urlparse(item.get('href')).query).split('&')[0]
        paperid = wd.split('%')[2][2:]
        params = {'paperid': paperid, 'site': 'xueshu_se'}
        url = 'http://xueshu.baidu.com/usercenter/paper/show?' + urlencode(params)
        all_paper_urls.append(url)
        # print(url)
        # print(result)

    for abs in abstract_datas:  # abs is a <class 'bs4.element.Tag'>
        str_list = []
        for l in abs.contents:  # l is a <class 'bs4.element.NavigableString'>
            if str(l).replace('\n', '').strip():
                str_list.append(str(l).replace('\n', '').strip())
            else:
                str_list.append("unknown")
        # print("".join(str_list).replace('<em>','').replace('</em>',''))
        all_abstracts.append("".join(str_list).replace('<em>', '').replace(
            '</em>', ''))

    for authors in author_datas:  # authors is a <class 'bs4.element.Tag'>
        for span in authors.find_all(
                'span', limit=1):  # span here is a <class 'bs4.element.Tag'>
            each_authors = []
            for alist in span.find_all('a'):
                if alist.string is None:
                    each_authors.append("unknown")
                else:
                    each_authors.append(alist.string)
            all_authors.append(each_authors)

    for publish in publish_datas:  # publish is a <class 'bs4.element.Tag'>
        each_publish = []
        spans = publish.find_all('span')  # spans is a <class 'bs4.element.ResultSet'>
        spans = str(spans)
        try:
            publish_name = re.search(r'《.*》', spans)
            publish_name = publish_name.group()
        except:
            publish_name = "unknown"
        each_publish.append(publish_name)
        all_publish.append(each_publish)

    for time in time_datas:  # time is a <class 'bs4.element.Tag'>
        each_time = []
        for span in time.find_all(
                'span', {"class": "sc_time"}):  # span here is a <class 'bs4.element.Tag'>
            time_name = "unknown"
            for alist in span:
                try:
                    alist.string = ((alist.string).strip())
                    time_name = alist.string
                except:
                    time_name = "unknown"
            each_time.append(time_name)
        all_time.append(each_time)

    return all_titles, all_authors, all_abstracts, all_paper_urls, all_publish, all_time
def movie_details():
    global time
    global serial_no
    time_limit = random.randint(1, 3)
    imdb_api = "https://www.imdb.com/india/top-rated-indian-movies/?ref_=nv_mv_250_in"
    # time.sleep(time_limit)
    imdb_url = requests.get(imdb_api)
    data = imdb_url.json
    soup = BeautifulSoup(imdb_url.text, "html.parser")
    dict_1 = {}
    div = soup.find("div", class_="lister")
    body = div.find("tbody", class_="lister-list")
    name = body.find_all("tr")
    for tr in name:
        list_1 = []
        director = []
        language = []
        genre = []
        movie_name = tr.find("td", class_="titleColumn").a.get_text()
        rating = tr.find("td", class_="ratingColumn imdbRating").strong.get_text()
        years = tr.find("td", class_="titleColumn").span.get_text()
        link = tr.find("td", class_="titleColumn").a['href']
        url = 'https://www.imdb.com' + str(link)
        serial_no += 1
        # time.sleep(time_limit)
        movie_url = requests.get(url)
        soup = BeautifulSoup(movie_url.text, "html.parser")
        director_name = soup.find("div", class_="credit_summary_item").a.get_text()
        director.append(director_name)
        movies_poster = soup.find("div", class_="poster").a['href']
        movie_poster = "https://www.imdb.com/" + movies_poster
        bio = soup.find("div", class_="plot_summary")
        movie_bio = bio.find("div", class_="summary_text").get_text().strip()
        detail = soup.find("div", attrs={"class": "article", "id": "titleDetails"})
        div1 = detail.find_all("div")
        for i in div1:
            run = i.find_all("h4")
            for j in run:
                if "Language:" in j:
                    lan = i.find_all("a")
                    for lang_uage in lan:
                        movie_language = lang_uage.get_text()
                        language.append(movie_language)
        time = soup.find("div", class_="subtext")
        runtime = time.find("time").get_text().strip()
        hour_to_min = (int(runtime[0])) * 60
        i = 0
        mins = ""
        a = runtime[3:]
        while i < len(a):
            if a[i] == "m":
                break
            mins = mins + a[i]
            i = i + 1
        runtime_of_movie = hour_to_min + int(mins)
        movie_genre = time.find_all("a")
        movie_genre.pop()
        for i in movie_genre:
            genre_1 = i.get_text()
            genre.append(genre_1)
        list1 = []
        api_cast = url
        url_cast = requests.get(api_cast)
        soup = BeautifulSoup(url_cast.text, "html.parser")
        table = soup.find("table", "cast_list")
        td = table.find_all("td", class_="")
        for i in td:
            my_dict = {}
            id = i.a["href"][6:15]
            artist = i.a.get_text().strip()
            my_dict["artist"] = artist
            my_dict["imbd_id"] = id
            list1.append(my_dict)
        dict_1["movie_name"] = movie_name
        dict_1["year"] = int(years[1:5])
        dict_1["rating"] = rating
        dict_1["position"] = int(serial_no)
        dict_1["url"] = url
        dict_1["director"] = director
        dict_1["country"] = "India"
        dict_1["poster_url"] = movie_poster
        dict_1["language"] = language
        dict_1["movie_bio"] = movie_bio
        dict_1["runtime"] = runtime_of_movie
        dict_1["movie_genre"] = genre
        dict_1["cast"] = list1
        list_1.append(dict_1)
        link = link[7:16]
        print(link)
        with open("movie/" + link + ".json", "w") as Data:
            json.dump(list_1, Data, indent=4)
    return (list_1)