def getListURL(self): sQuery = """ SELECT DISTINCT "Linkedin_URL" from "Linkedin_URL" WHERE "Is_Crawl" is NULL AND "Row_ID" < 20000 """ conn = MyConnection.getConnection() cursor = conn.cursor() cursor.execute(sQuery) result = cursor.fetchone() while result: self.listURL.append(result[0]) result = cursor.fetchone() conn.close()
def main(self): try: sQuery = """ SELECT DISTINCT "URL" ,"Head_Line", "Schools ", "Sumary", "Skill", "Experiences" from "Linkedin_Detail" where "Is_Picked" is null """ conn = MyConnection.getConnection() cursor = conn.cursor() cursor.execute(sQuery) result = cursor.fetchone() HeadLine = "" Schools = "" Sumary = "" Skill = "" Experiences = "" while result: url = result[0] HeadLine = result[1] Schools = result[2] # Sumary = result[3] Skill = result[4] Experiences = result[5] ## GET ALL DETAIL ! li_Schools = Schools.split("---BREAK---") li_Sumary = Sumary.split("---BREAK---") li_Skill = Skill.split("---BREAK---") li_Experiences = Experiences.split("---BREAK---") ## Solve HeadLine ! HeadLine = str(HeadLine).upper() # Kiểm tra trong Headline có từ "AT" không ? li_Headline = HeadLine.split() if ("AT" in li_Headline): index = int(li_Headline.index("AT")) HeadLine = ' '.join(li_Headline[0:(index - 1)]) ## Solve Company ! for x in li_Schools: try: temp = x.split("\n") school_Name = temp[0] title = temp[2] duration = temp[6] field_Study = temp[4] active = temp[7].replace('Activities and Societies:', '') sQuery = """ INSERT INTO public."Linkedin_School"( "School_Name", "Secondary_Title", "Date_Attended", "Activitive", "Linkedin_URL", "Field_Study") VALUES (?, ?, ?, ?, ?, ?) """ value = [ school_Name, title, duration, active, url, field_Study ] MyConnection.insertUpdateDB(sQuery, value) print("INSERT SCHOOL DONE ! ") except Exception as e: print("Pick Schools ERROR ", e) # Solving Experiences ! for x in li_Experiences: try: temp = x.split("\n") position = temp[0] company_Name = temp[2] date_Empl = temp[4] empl_Duration = temp[6] location = "" desctiption = "" if (7 < len(temp)): location = temp[8] desctiption = ' '.join(temp[9:(len(temp))]) print(desctiption) sQuery = """INSERT INTO public."Linkedin_Experiences"( "Position", "Company_Name", "Date_Employee", "Employee_Duration", "Location", "Description", "URL") VALUES (?, ?, ?, ?, ?, ?, ?) """ value = [ position, company_Name, date_Empl, empl_Duration, location, desctiption, url ] MyConnection.insertUpdateDB(sQuery, value) print("INSERT EXPERIENCES DONE ! ") except Exception as e: print("Pick EXPERIENCES ERROR ", e) # Solving Skill ! for x in li_Skill: try: temp = x.split("\n") for i in temp: sQuery = """INSERT INTO public."Linkedin_Skill"( "Skill", "URL") VALUES (?,?)""" value = [i.strip(), url] MyConnection.insertUpdateDB(sQuery, value) print("INSERT SKILL DONE ! ") except Exception as e: print("INSERT SKILL ERROR ! ", e) ## UPDATE Is_Picked for thí URL ! try: sQuery = """ UPDATE "Linkedin_Detail" SET "Is_Picked" = '1' WHERE "URL" = ?""" value = [url] MyConnection.insertUpdateDB(sQuery, value) print("Update DONE ! ") except Exception as e: print("UPDATE ERROR ! ", e) result = cursor.fetchone() conn.close() except Exception as e: print("ERROR ! ", e)
def main(self): try: sQuery = """ SELECT DISTINCT [URL] , [Skill] , [Row_ID], [Schools ] FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail] WHERE Row_ID = '1523' """ conn = MyConnection.getConnection() cursor = conn.cursor() cursor.execute(sQuery) result = cursor.fetchone() while result: url = result[0] skill = result[1] profile_ID = result[2] education = result[3] ## GET ALL Experiences DETAIL ! li_Skill = skill.split('---BREAK---') li_Edu = education.split('---BREAK---') ## Solve Skill ! for x in li_Skill: if (len(x) > 2 and None != x and '' != x): try: sQuery = """INSERT INTO [dbo].[Linkedin_Skill_Header] ([Profile_ID] ,[URL] ,[Skill_Name]) VALUES(?,?,?) """ value = [profile_ID, url, x] MyConnection.insertUpdateDB(sQuery, value) print("INSERT SKILL DONE ! ") except Exception as e: print("INSERT SKILL ERROR ! ", e) ## Solve Education ! for e in li_Edu: if (len(e) > 2 and None != e and '' != e): try: temp = e.split('\n') school_Name = temp[0] Degree_Name = temp[2] field_Of_Study = temp[4] dates_Attended = temp[6] description = ' '.join(temp[7:len(temp)]) sQuery = """INSERT INTO [dbo].[Linkedin_Education_Header] ([Profile_ID] ,[URL] ,[School_Name] ,[Degree_Name] ,[Field_Of_Study] ,[Dates_Attended_Or_Expected_graduation] ,[Education_Description]) VALUES (?,?,?,?,?,?,?) """ value = [ profile_ID, url, school_Name, Degree_Name, field_Of_Study, dates_Attended, description ] MyConnection.insertUpdateDB(sQuery, value) print("INSERT EDU DONE ! ") except Exception as e: print("INSERT EDU ERROR ! ", e) result = cursor.fetchone() conn.close() except Exception as e: print("ERROR ! ", e)
def main(self): try: # sQuery = """ SELECT DISTINCT [URL] , [Experiences] , [Location], [Avatar_URL], [Row_ID] # FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail] # WHERE Row_ID = '1507' """ sQuery = """SELECT DISTINCT [URL] , [Experiences] , [Location], [Avatar_URL], [Row_ID] FROM [DB_LINKEDIN].[dbo].[Linkedin_Detail] WHERE len (Experiences) > 5 and Row_ID < 1630 AND [Is_Tag] IS NULL """ conn = MyConnection.getConnection() cursor = conn.cursor() cursor.execute(sQuery) result = cursor.fetchone() while result: temp_Experiences = result[1] location = result[2] url = result[0] avatar = result[3] profile_ID = result[4] ## GET ALL Experiences DETAIL ! li_Experiences = temp_Experiences.split('---BREAK---') ## Solve HeadLine ! for e in li_Experiences: temp_Element = e.split('\n') try: position = temp_Element[0] company_Name = temp_Element[2] date_Employee = temp_Element[4] employee_Duration = temp_Element[6] location = temp_Element[8] description_Exp = ' '.join( temp_Element[9:(len(temp_Element))]) sQuery = """ INSERT INTO [dbo].[LinkedIn_Experience_Header] ( [URL] ,[Position] ,[Company_Name] ,[Dates_Employed] ,[Employment_Duration] ,[Location] ,[Experience_Description] , [Profile_Id]) VALUES (?,?,?,?,?,?,?,?) """ value = [ url, position, company_Name, date_Employee, employee_Duration, location, description_Exp, profile_ID ] MyConnection.insertUpdateDB(sQuery, value) print("INSERT DONE ! ") except Exception as e: print("NOT SAME PATTERN ! ", e) # UPDATE ! try: sQuery = """ UPDATE [Linkedin_Detail] SET Is_Tag = 1 WHERE [Row_ID] = ? """ value = [profile_ID] MyConnection.insertUpdateDB(sQuery, value) print("UPDATE DONE ") except Exception as e: print("UPDATE ERROR ! ", e) result = cursor.fetchone() conn.close() except Exception as e: print("ERROR ! ", e)
def main(self): sQuery = """ SELECT [Exp_Header_Id] , [Experience_Description], [URL] FROM [LinkedIn_Experience_Header] WHERE [Is_Crawl] IS NULL AND len([Experience_Description]) >2 """ conn = MyConnection.getConnection() cursor = conn.cursor() cursor.execute(sQuery) result = cursor.fetchone() while result: # GET exe_ID & exe_Description ! exe_ID = result[0] sentence = result[1] url = result[2] # Solving data ! temp = sentence.split('-') for x in temp: if (len(x) > 5): # Loại bỏ trường hợp bị null ! x = x.replace(',', '') x = x.replace('(', '') x = x.replace(')', '') x = x.replace('.', '') x = x.replace('*', '') x = x.replace('#', '') tokens = nltk.word_tokenize(x) tagged = nltk.pos_tag(tokens) entities = nltk.chunk.ne_chunk(tagged) # print(entities) for y in entities: a1 = "" a2 = "" if (1 == len(y)): a = str(y) a = str(y).replace('(', '') a = a.replace(')', '') a = a.replace("'", "") a = a.replace(',', '') a = a.split() a2 = a[1] a_temp = a2.split('/') a1 = a_temp[0] a2 = a_temp[1] if (len(y) > 2): for i in y: a = ' '.join(i[0:len(i)]) a = a.split() a1 = a[0] a2 = a[1] # print(a1 , a2) if (2 == len(y)): a = str(y).replace('(', '') a = a.replace(')', '') a = a.replace("'", "") a = a.replace(',', '') a = a.split() a1 = a[0] a2 = a[1] if (len(a1) > 1): self.insertDB(a1, a2, exe_ID, url) self.updateDB(exe_ID) result = cursor.fetchone() conn.close()
def main(self, country, start, end, ID): try: ## GET ALL COMPANY NAME TO SEARCH ! listKey = {} connection = MyConnection.getConnection() cursor = connection.cursor() ## GET COMMPANY NAME USED TO SEARCH WITH BING!! SQLCommand = """SELECT DISTINCT [D-U-N-S] , [Company_Name_Clean] FROM [Company] WHERE [Company_Name_Clean] is not null AND [Is_Crawl] is NULL and [Row_ID] >= ? and [Row_ID] < ? """ value = [start, end] cursor.execute(SQLCommand, value) results = cursor.fetchone() while results: DUNS_ID = int(results[0]) # this is DUNS Number this comapany listKey[DUNS_ID] = results[1] # Name of Company results = cursor.fetchone() connection.close() print("--------------> Total Size : ", len(listKey)) ## Config for browser do not open web browser ! options = webdriver.ChromeOptions() options.add_argument('headless') browser = webdriver.Chrome(chrome_options=options) browser.get("https://www.bing.com/") time.sleep(4) for duns in listKey.keys(): time.sleep(randint(8, 15)) print("Company Search : ", listKey[duns]) keys = " \"VietNam \" " + " site: linkedin.com/in " + "\"" + listKey[ duns] + " \"" KeySearch = browser.find_element_by_xpath( "//*[@class='b_searchbox']") KeySearch.clear() KeySearch.send_keys(keys) # Truyền tên cty vào để search time.sleep(randint(2, 6)) # KeySearch.send_keys(u'\ue007') # Enter search ! btnSearch = browser.find_element_by_xpath( "//*[@id='sb_form_go']").click() sleep = int(randint(3, 15)) time.sleep(sleep) try: allRow = browser.find_elements_by_xpath( "//*[@class='b_algo']") count_2 = 0 if (0 < len(allRow)): while True: timeSpleep = randint( 5, 10) # Random delay time from 3 - 20s print("TIME DELAY : ", timeSpleep) time.sleep(timeSpleep) if (50 == count_2): time.sleep(40) count_2 = 0 allRow = browser.find_elements_by_xpath( "//*[@class='b_algo']") browser.execute_script( "window.scrollTo(0, document.body.scrollHeight);" ) #kéo thanh cuộn xuống . print("--------> Page Size : ", len(allRow)) for x in allRow: txt = x.text temp = txt.split() findString = txt.find("LinkedIn") urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x.text) companyName = txt[0:int(findString)] urls = ''.join(urls) print(urls) print( "--------------------------------------------------" ) # Save to DB ! try: command = """INSERT INTO [Linkedin_URL] ([Linkedin_Name] ,[Linkedin_URL] ,[DUNS_NUMBER] ,[Linkedin_Type] ,[Country]) VALUES (?,?,?,?,? )""" value = [ companyName, urls, duns, "Profile", country ] MyConnection.insertUpdateDB(command, value) print("INSERT DONE ! ") count_2 += 1 except Exception as e: print("INSERT CRAWL ERROR : ", e) browser.find_element_by_xpath( "//*[@class='sb_pagN']").click() # NExt page! time.sleep(3) except Exception as e: print("ERROR WHEN CRAWLING ! ", e) # xác nhận đã search với keyword đó rồi : try: command = """UPDATE [Company] SET [Is_Crawl] = 1 WHERE [D-U-N-S] = ? """ value = [duns] MyConnection.insertUpdateDB(command, value) print("UPDATE Crawl DONE ! ") time.sleep(2) except Exception as e: print("UPDATE Crawl Error : ", e) except Exception as e: print("ERROR IS : ", e)